In [1]:
# 1. Imports
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

pd.set_option("display.max_columns", 100)
pd.set_option("display.width", 140)


In [2]:
# 2. Load data

general_path = "SMHS_general-1.csv"
validation_path = "SMHS_validation_students-1.csv"

general = pd.read_csv(general_path)
validation = pd.read_csv(validation_path)

general.shape, validation.shape


((1500, 29), (100, 28))

In [3]:
# 3. Basic structure and missingness
general.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1500 entries, 0 to 1499
Data columns (total 29 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   id                         1500 non-null   int64  
 1   age                        1500 non-null   int64  
 2   sex                        1500 non-null   object 
 3   ethnicity                  1500 non-null   object 
 4   latent_cluster             1500 non-null   int64  
 5   deprivation_index          1500 non-null   float64
 6   smoking_status             1500 non-null   object 
 7   alcohol_units_week         259 non-null    float64
 8   physical_activity_minutes  1500 non-null   float64
 9   diet_quality_score         1500 non-null   float64
 10  depression_score           1500 non-null   float64
 11  bmi                        1500 non-null   float64
 12  waist_hip_ratio            1500 non-null   float64
 13  sbp                        1500 non-null   float

In [4]:
general.head()


Unnamed: 0,id,age,sex,ethnicity,latent_cluster,deprivation_index,smoking_status,alcohol_units_week,physical_activity_minutes,diet_quality_score,depression_score,bmi,waist_hip_ratio,sbp,dbp,sleep_duration_hours,sleep_variability_hours,sleep_efficiency,steps_per_day,fasting_glucose,fasting_insulin,triglycerides,hdl,ldl,crp,alt,shift_worker,family_history_diabetes,metabolic_pressure
0,1,56,F,White,3,74.968216,Former,,92.872646,60.614177,5.584738,29.748838,1.046043,149.311516,93.937432,6.021335,1.887725,74.954053,5957.130999,7.367951,21.589497,2.607227,0.73726,2.7041,28.432087,37.267948,No,0,73.087968
1,2,59,M,White,2,34.912176,Never,,26.103352,72.10727,5.188831,34.552028,0.875128,133.590633,78.725356,6.182194,1.079962,95.105654,7237.822331,6.717636,16.379792,3.112712,1.654086,3.750358,1.62455,29.042661,No,0,59.54175
2,3,49,M,Black,2,54.207211,Former,,210.401582,49.249182,7.037791,32.157909,0.843049,123.986203,62.293022,5.295797,1.013001,88.639733,10968.377469,5.052053,17.104307,4.525291,1.031543,3.8125,2.236999,33.364052,No,1,54.864634
3,4,39,F,White,2,27.870993,Never,,214.605575,61.195129,1.599334,36.395585,0.859905,130.854414,76.428276,6.495996,0.52839,83.566288,8455.317304,6.018634,21.739572,1.322513,1.232799,4.086235,,22.496442,No,0,49.199538
4,5,45,F,Other,3,72.684155,Never,21.249861,135.063492,49.828539,15.358964,34.769534,1.052577,156.756304,98.913783,4.906051,1.685047,90.506972,6689.506787,5.826059,16.918793,2.312573,1.014607,3.390955,5.999061,51.853812,No,0,70.674299


In [5]:
missing_ratio = general.isna().mean().sort_values(ascending=False)
missing_ratio

Unnamed: 0,0
alcohol_units_week,0.827333
steps_per_day,0.144
crp,0.080667
sleep_variability_hours,0.046
fasting_insulin,0.024667
latent_cluster,0.0
deprivation_index,0.0
smoking_status,0.0
physical_activity_minutes,0.0
age,0.0


In [6]:
# 4. Define target and predictors
target_col = "metabolic_pressure"
cluster_col = "latent_cluster"
id_col = "id"


y = general[target_col]


X = general.drop(columns=[target_col, cluster_col, id_col])


numeric_features = X.select_dtypes(include=["int64", "float64"]).columns.tolist()
categorical_features = X.select_dtypes(include=["object"]).columns.tolist()

numeric_features, categorical_features


(['age',
  'deprivation_index',
  'alcohol_units_week',
  'physical_activity_minutes',
  'diet_quality_score',
  'depression_score',
  'bmi',
  'waist_hip_ratio',
  'sbp',
  'dbp',
  'sleep_duration_hours',
  'sleep_variability_hours',
  'sleep_efficiency',
  'steps_per_day',
  'fasting_glucose',
  'fasting_insulin',
  'triglycerides',
  'hdl',
  'ldl',
  'crp',
  'alt',
  'family_history_diabetes'],
 ['sex', 'ethnicity', 'smoking_status', 'shift_worker'])

In [7]:
# 5. Train–test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

X_train.shape, X_test.shape


((1200, 26), (300, 26))

In [8]:
# 6. Preprocessing pipeline
# Numerical features: Median imputation + Standardization
numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler())
    ]
)

# Classification features: Mode imputation + One-Hot encoding
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore"))
    ]
)

# Column Converter: Applying different transformers to different columns
preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features)
    ]
)


In [10]:
# 7. Baseline MLPRegressor (Improved version to reduce convergence warnings)
# The objective here is to allow the model to converge more reliably without drastically increasing training time.

mlp_baseline = MLPRegressor(
    hidden_layer_sizes=(64, 32),      # Two hidden layers: moderate model capacity
    activation="relu",                # Standard activation for regression
    solver="adam",                    # Adaptive optimizer
    alpha=0.001,                      # L2 regularization (keeps weights stable)
    learning_rate_init=0.001,         # Initial learning rate
    max_iter=600,                     # Increased from 300 → gives optimizer enough time to converge
    early_stopping=True,              # Stops training if validation score does not improve
    n_iter_no_change=20,              # Patience for early stopping
    validation_fraction=0.1,          # Portion of training data used as validation set
    random_state=42
)

# Pipeline with preprocessing steps + neural network model
mlp_pipe_baseline = Pipeline(
    steps=[
        ("preprocess", preprocess),   # Imputation + scaling + one-hot encoding
        ("model", mlp_baseline)
    ]
)

# Fit on training data
mlp_pipe_baseline.fit(X_train, y_train)

# Predict on test set
y_pred_test_baseline = mlp_pipe_baseline.predict(X_test)

# Evaluation metrics
mae_baseline = mean_absolute_error(y_test, y_pred_test_baseline)
rmse_baseline = np.sqrt(mean_squared_error(y_test, y_pred_test_baseline))

print("Baseline MLP (Improved)")
print("Test MAE :", mae_baseline)
print("Test RMSE:", rmse_baseline)


Baseline MLP (Improved)
Test MAE : 3.818477210489913
Test RMSE: 4.881295385235102


In [11]:
# 8. Hyperparameter tuning for MLPRegressor
# Improved to avoid convergence warnings and speed up tuning

mlp = MLPRegressor(
    activation="relu",
    solver="adam",
    early_stopping=True,         # Helps convergence
    n_iter_no_change=20,
    max_iter=800,                # Higher iteration limit
    random_state=42
)

mlp_pipe = Pipeline(
    steps=[
        ("preprocess", preprocess),
        ("model", mlp)
    ]
)

# Reduced search space — faster & still effective
param_grid = {
    "model__hidden_layer_sizes": [
        (64,),         # One hidden layer
        (64, 32),      # Two layers
        (128, 64)      # Larger model
    ],
    "model__alpha": [0.0001, 0.001, 0.01],        # L2 regularization strength
    "model__learning_rate_init": [0.0005, 0.001]  # Smaller LR → more stable
}

grid_search_mlp = GridSearchCV(
    estimator=mlp_pipe,
    param_grid=param_grid,
    scoring="neg_mean_absolute_error",
    cv=3,
    n_jobs=-1
)

# Run the tuning
grid_search_mlp.fit(X_train, y_train)

# Extract best parameters and CV score
best_params_mlp = grid_search_mlp.best_params_
best_cv_mae_mlp = -grid_search_mlp.best_score_

print("Best MLP parameters:", best_params_mlp)
print("Best CV MAE       :", best_cv_mae_mlp)


Best MLP parameters: {'model__alpha': 0.01, 'model__hidden_layer_sizes': (64,), 'model__learning_rate_init': 0.001}
Best CV MAE       : 4.07686857209119


In [12]:
# 9. Evaluation of tuned MLP
best_mlp_pipe = grid_search_mlp.best_estimator_

y_pred_test_mlp = best_mlp_pipe.predict(X_test)

test_mae_mlp = mean_absolute_error(y_test, y_pred_test_mlp)
test_rmse_mlp = np.sqrt(mean_squared_error(y_test, y_pred_test_mlp))

print("Tuned MLP – Test MAE :", test_mae_mlp)
print("Tuned MLP – Test RMSE:", test_rmse_mlp)

print("\nCompare with baseline:")
print("Baseline MLP – Test MAE :", mae_baseline)
print("Baseline MLP – Test RMSE:", rmse_baseline)


Tuned MLP – Test MAE : 3.8305377398479186
Tuned MLP – Test RMSE: 4.916744462606237

Compare with baseline:
Baseline MLP – Test MAE : 3.818477210489913
Baseline MLP – Test RMSE: 4.881295385235102


In [13]:
# 10. Cross-validation on full dataset (optional)
cv_scores_mae_mlp = cross_val_score(
    best_mlp_pipe,
    X, y,
    cv=3,
    scoring="neg_mean_absolute_error",
    n_jobs=-1
)

cv_scores_mse_mlp = cross_val_score(
    best_mlp_pipe,
    X, y,
    cv=3,
    scoring="neg_mean_squared_error",
    n_jobs=-1
)

cv_mae_mean_mlp = -cv_scores_mae_mlp.mean()
cv_mae_std_mlp = cv_scores_mae_mlp.std()

cv_rmse_mean_mlp = np.sqrt(-cv_scores_mse_mlp.mean())

print("MLP CV MAE  (mean ± sd):", cv_mae_mean_mlp, "+/-", cv_mae_std_mlp)
print("MLP CV RMSE (mean):     ", cv_rmse_mean_mlp)


MLP CV MAE  (mean ± sd): 3.9208029948351686 +/- 0.07995873765110977
MLP CV RMSE (mean):      5.374433038042395


In [14]:
# 11. Fit tuned MLP on full data and predict external validation set


best_mlp_pipe.fit(X, y)


X_val = validation.drop(columns=[id_col, cluster_col], errors="ignore")


val_pred_mlp = best_mlp_pipe.predict(X_val)

submission_mlp = pd.DataFrame({
    "id": validation[id_col],
    "predicted_metabolic_pressure": val_pred_mlp
})

submission_mlp.head()


Unnamed: 0,id,predicted_metabolic_pressure
0,New_1501,58.135607
1,New_1502,59.018483
2,New_1503,70.181418
3,New_1504,35.249937
4,New_1505,35.677752


In [15]:

mlp_submission_path = "SMHS_MLP_validation_predictions.csv"
submission_mlp.to_csv(mlp_submission_path, index=False)
mlp_submission_path


'SMHS_MLP_validation_predictions.csv'