# Part B – Random Forest Regression (Full Tuned Version)

This notebook implements a **complete machine learning pipeline** using **Random Forest regression**
to predict `metabolic_pressure` from the SMHS dataset.


In [1]:
# 1. Imports
import numpy as np
import pandas as pd
from scipy.stats import randint

from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import (
    GridSearchCV,
    RandomizedSearchCV,
    cross_val_score,
    cross_validate,
    train_test_split,
)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

# Relax pandas' display limits so the wide SMHS frames are readable in output.
pd.set_option("display.max_columns", 100)  # show up to 100 columns before eliding
pd.set_option("display.width", 140)        # allow 140 characters per output line

## 2. Load data

In [3]:
# Locations of the two SMHS CSV files, relative to the working directory.
general_path = "SMHS_general-1.csv"
validation_path = "SMHS_validation_students-1.csv"

# Read both files with pandas' default CSV settings.
general, validation = (
    pd.read_csv(csv_path) for csv_path in (general_path, validation_path)
)

# Cell output: (rows, columns) of each frame.
general.shape, validation.shape

((1500, 29), (100, 28))

## 3. Basic structure and missingness

In [4]:
# Print column names, non-null counts and dtypes of the general dataset.
general.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1500 entries, 0 to 1499
Data columns (total 29 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   id                         1500 non-null   int64  
 1   age                        1500 non-null   int64  
 2   sex                        1500 non-null   object 
 3   ethnicity                  1500 non-null   object 
 4   latent_cluster             1500 non-null   int64  
 5   deprivation_index          1500 non-null   float64
 6   smoking_status             1500 non-null   object 
 7   alcohol_units_week         259 non-null    float64
 8   physical_activity_minutes  1500 non-null   float64
 9   diet_quality_score         1500 non-null   float64
 10  depression_score           1500 non-null   float64
 11  bmi                        1500 non-null   float64
 12  waist_hip_ratio            1500 non-null   float64
 13  sbp                        1500 non-null   float

In [5]:
# Preview the first rows of the general dataset.
general.head()

Unnamed: 0,id,age,sex,ethnicity,latent_cluster,deprivation_index,smoking_status,alcohol_units_week,physical_activity_minutes,diet_quality_score,depression_score,bmi,waist_hip_ratio,sbp,dbp,sleep_duration_hours,sleep_variability_hours,sleep_efficiency,steps_per_day,fasting_glucose,fasting_insulin,triglycerides,hdl,ldl,crp,alt,shift_worker,family_history_diabetes,metabolic_pressure
0,1,56,F,White,3,74.968216,Former,,92.872646,60.614177,5.584738,29.748838,1.046043,149.311516,93.937432,6.021335,1.887725,74.954053,5957.130999,7.367951,21.589497,2.607227,0.73726,2.7041,28.432087,37.267948,No,0,73.087968
1,2,59,M,White,2,34.912176,Never,,26.103352,72.10727,5.188831,34.552028,0.875128,133.590633,78.725356,6.182194,1.079962,95.105654,7237.822331,6.717636,16.379792,3.112712,1.654086,3.750358,1.62455,29.042661,No,0,59.54175
2,3,49,M,Black,2,54.207211,Former,,210.401582,49.249182,7.037791,32.157909,0.843049,123.986203,62.293022,5.295797,1.013001,88.639733,10968.377469,5.052053,17.104307,4.525291,1.031543,3.8125,2.236999,33.364052,No,1,54.864634
3,4,39,F,White,2,27.870993,Never,,214.605575,61.195129,1.599334,36.395585,0.859905,130.854414,76.428276,6.495996,0.52839,83.566288,8455.317304,6.018634,21.739572,1.322513,1.232799,4.086235,,22.496442,No,0,49.199538
4,5,45,F,Other,3,72.684155,Never,21.249861,135.063492,49.828539,15.358964,34.769534,1.052577,156.756304,98.913783,4.906051,1.685047,90.506972,6689.506787,5.826059,16.918793,2.312573,1.014607,3.390955,5.999061,51.853812,No,0,70.674299


In [6]:
# Share of missing values per column, most incomplete columns first.
missing_ratio = (
    general
    .isna()
    .mean()
    .sort_values(ascending=False)
)
missing_ratio

Unnamed: 0,0
alcohol_units_week,0.827333
steps_per_day,0.144
crp,0.080667
sleep_variability_hours,0.046
fasting_insulin,0.024667
latent_cluster,0.0
deprivation_index,0.0
smoking_status,0.0
physical_activity_minutes,0.0
age,0.0


## 4. Define target and predictors

In [7]:
# Column roles: the value to predict, plus two columns that are excluded from
# the predictors below.
target_col = "metabolic_pressure"
cluster_col = "latent_cluster"
id_col = "id"

y = general[target_col]
X = general.drop(columns=[target_col, cluster_col, id_col])

# Split the predictors by dtype *family* instead of by exact dtype name.
# Matching only "int64"/"float64"/"object" would leave any other dtype
# (int32, float32, bool, category, string) out of both lists, and the
# ColumnTransformer would then drop that column without any warning.
# include/exclude of the same selector guarantees every predictor lands in
# exactly one list.
numeric_features = X.select_dtypes(include="number").columns.tolist()
categorical_features = X.select_dtypes(exclude="number").columns.tolist()

numeric_features, categorical_features

(['age',
  'deprivation_index',
  'alcohol_units_week',
  'physical_activity_minutes',
  'diet_quality_score',
  'depression_score',
  'bmi',
  'waist_hip_ratio',
  'sbp',
  'dbp',
  'sleep_duration_hours',
  'sleep_variability_hours',
  'sleep_efficiency',
  'steps_per_day',
  'fasting_glucose',
  'fasting_insulin',
  'triglycerides',
  'hdl',
  'ldl',
  'crp',
  'alt',
  'family_history_diabetes'],
 ['sex', 'ethnicity', 'smoking_status', 'shift_worker'])

## 5. Train–test split

In [8]:
# Hold out 20% of the rows as a test set; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Cell output: shapes of the training and test predictor frames.
X_train.shape, X_test.shape

((1200, 26), (300, 26))

## 6. Preprocessing pipeline

In [9]:
# Numeric branch: replace missing values with the column median.
numeric_transformer = Pipeline([("imputer", SimpleImputer(strategy="median"))])

# Categorical branch: fill gaps with the most frequent category, then one-hot
# encode. handle_unknown="ignore" lets categories unseen at fit time be encoded
# instead of raising at predict time.
categorical_transformer = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Send each column group through its own branch (numeric first, then categorical).
preprocess = ColumnTransformer(
    [
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

## 7. Baseline Random Forest model

In [10]:
# Untuned reference model: 300 trees, no depth limit, fixed seed, all CPU cores.
rf_baseline = RandomForestRegressor(
    n_estimators=300, max_depth=None, random_state=42, n_jobs=-1
)

# Preprocessing and model live in one pipeline so both are fit on the training rows only.
rf_pipe_baseline = Pipeline([("preprocess", preprocess), ("model", rf_baseline)])

# fit() returns the pipeline itself, so prediction can be chained onto it.
y_pred_test_baseline = rf_pipe_baseline.fit(X_train, y_train).predict(X_test)

# Test-set error: MAE, and RMSE as the square root of the MSE.
mae_baseline = mean_absolute_error(y_test, y_pred_test_baseline)
rmse_baseline = np.sqrt(mean_squared_error(y_test, y_pred_test_baseline))

print("Baseline Random Forest")
print("Test MAE :", mae_baseline)
print("Test RMSE:", rmse_baseline)

Baseline Random Forest
Test MAE : 3.831593505705094
Test RMSE: 4.9620909485138185


## 8. Hyperparameter tuning for Random Forest

In [13]:
# 8. Randomized hyperparameter search for Random Forest
# -----------------------------------------------------
# RandomizedSearchCV and randint are imported in the imports cell at the top
# of the notebook, together with every other dependency.
#
# The forest is wrapped in the same preprocessing pipeline as the baseline, so
# the search fits imputation and encoding together with the model on each
# training fold.

rf = RandomForestRegressor(random_state=42, n_jobs=-1)

rf_pipe = Pipeline(
    steps=[
        ("preprocess", preprocess),
        ("model", rf)
    ]
)

# Define distributions for random search
# randint(a, b) gives integers in [a, b)
param_distributions = {
    "model__n_estimators": randint(200, 801),      # 200–800 trees
    "model__max_depth": randint(4, 21),            # depth 4–20 (unlimited depth, None, is not searched)
    "model__min_samples_split": randint(2, 11),    # 2–10
    "model__min_samples_leaf": randint(1, 6),      # 1–5
    "model__max_features": ["sqrt", "log2", 0.5]   # try different feature subsets
}

# RandomizedSearchCV: faster than full grid search
# n_iter controls how many random combinations are tested
random_search_rf = RandomizedSearchCV(
    estimator=rf_pipe,
    param_distributions=param_distributions,
    n_iter=15,                      # number of sampled combinations; lower it if the search is too slow
    scoring="neg_mean_absolute_error",
    cv=5,
    n_jobs=-1,
    random_state=42
)

# Run the random search
random_search_rf.fit(X_train, y_train)

# The scorer is negated MAE (higher is better), so flip the sign for reporting.
print("Best parameters (RandomizedSearchCV):")
print(random_search_rf.best_params_)
print("Best CV MAE:", -random_search_rf.best_score_)

# Evaluate the tuned model on the held-out test set
best_rf_pipe = random_search_rf.best_estimator_
y_pred_test_rf = best_rf_pipe.predict(X_test)

test_mae_rf = mean_absolute_error(y_test, y_pred_test_rf)
test_rmse_rf = np.sqrt(mean_squared_error(y_test, y_pred_test_rf))

print("Tuned RF (Random Search) – Test MAE :", test_mae_rf)
print("Tuned RF (Random Search) – Test RMSE:", test_rmse_rf)


Best parameters (RandomizedSearchCV):
{'model__max_depth': 12, 'model__max_features': 0.5, 'model__min_samples_leaf': 2, 'model__min_samples_split': 5, 'model__n_estimators': 800}
Best CV MAE: 4.000538464908535
Tuned RF (Random Search) – Test MAE : 3.8599251121575686
Tuned RF (Random Search) – Test RMSE: 4.965251136689602


## 9. Evaluation of tuned Random Forest model

In [15]:
# The search's best_estimator_ is the pipeline carrying the winning hyperparameters.
best_rf_pipe = random_search_rf.best_estimator_

# Score the held-out test split, which the search never saw.
y_pred_test_rf = best_rf_pipe.predict(X_test)
test_mae_rf = mean_absolute_error(y_test, y_pred_test_rf)
test_rmse_rf = np.sqrt(mean_squared_error(y_test, y_pred_test_rf))  # RMSE = sqrt(MSE)

print("Tuned Random Forest (Random Search) – Test MAE :", test_mae_rf)
print("Tuned Random Forest (Random Search) – Test RMSE:", test_rmse_rf)


Tuned Random Forest (Random Search) – Test MAE : 3.8599251121575686
Tuned Random Forest (Random Search) – Test RMSE: 4.965251136689602


## 10. Cross-validation on the full dataset

In [16]:
# Cross-validate the tuned pipeline once and read both metrics from the same
# five fits. Running a separate 5-fold CV per metric would refit the identical
# folds a second time only to change the scorer.
cv_results_rf = cross_validate(
    best_rf_pipe,
    X, y,
    cv=5,
    scoring={
        "mae": "neg_mean_absolute_error",
        "mse": "neg_mean_squared_error",
    },
    n_jobs=-1
)

# Per-fold scores; scikit-learn reports errors negated (higher is better).
cv_scores_mae_rf = cv_results_rf["test_mae"]
cv_scores_mse_rf = cv_results_rf["test_mse"]

# Flip the sign for the mean; the standard deviation is unaffected by negation.
cv_mae_mean = -cv_scores_mae_rf.mean()
cv_mae_std = cv_scores_mae_rf.std()

# Square root of the mean fold MSE (not the mean of per-fold RMSEs).
cv_rmse_mean = np.sqrt(-cv_scores_mse_rf.mean())

print("RF CV MAE  (mean ± sd):", cv_mae_mean, "+/-", cv_mae_std)
print("RF CV RMSE (mean):     ", cv_rmse_mean)

RF CV MAE  (mean ± sd): 3.9577465875652513 +/- 0.1980933808469395
RF CV RMSE (mean):      5.031460154705991


## 11. Feature importance

In [17]:
# Pull the fitted preprocessing step and the fitted forest out of the tuned pipeline.
preprocessor = best_rf_pipe["preprocess"]
model = best_rf_pipe["model"]

# Numeric columns are only imputed, so their names are reused as-is.
# NOTE(review): this assumes the imputer keeps every numeric column - confirm
# if a column could be entirely missing in the training data.
num_features_out = numeric_features

# One-hot encoding expands each categorical column; ask the encoder for the names.
cat_encoder = preprocessor.named_transformers_["cat"]["onehot"]
cat_features_out = cat_encoder.get_feature_names_out(categorical_features)

# Same order as the transformers in `preprocess`: numeric first, then categorical.
all_feature_names = np.concatenate((num_features_out, cat_features_out))

importances = model.feature_importances_

# Pair each name with its importance and rank from most to least important.
fi_df = pd.DataFrame(
    {"feature": all_feature_names, "importance": importances}
).sort_values("importance", ascending=False)

fi_df.head(20)

Unnamed: 0,feature,importance
14,fasting_glucose,0.369147
6,bmi,0.203687
10,sleep_duration_hours,0.095503
11,sleep_variability_hours,0.073429
16,triglycerides,0.065848
15,fasting_insulin,0.062124
20,alt,0.030326
7,waist_hip_ratio,0.020758
17,hdl,0.008377
4,diet_quality_score,0.008174


## 12. Fit tuned RF on full data and predict external validation set

In [18]:
# Fit a fresh, unfitted copy of the tuned pipeline on all labelled rows.
# Cloning leaves `best_rf_pipe` (fitted on the training split only) untouched,
# so re-running the test-set evaluation or feature-importance cells after this
# one still uses a model that has never seen the test rows.
final_rf_pipe = clone(best_rf_pipe).fit(X, y)

# Drop the columns that were also excluded from the training predictors;
# errors="ignore" tolerates either of them being absent from the validation file.
X_val = validation.drop(columns=[id_col, cluster_col], errors="ignore")
val_pred_rf = final_rf_pipe.predict(X_val)

# One row per validation record: its id and the predicted target value.
submission_rf = pd.DataFrame({
    "id": validation[id_col],
    "predicted_metabolic_pressure": val_pred_rf
})

submission_rf.head()

Unnamed: 0,id,predicted_metabolic_pressure
0,New_1501,57.109257
1,New_1502,53.745403
2,New_1503,66.299493
3,New_1504,33.973939
4,New_1505,37.295216


In [19]:
# Save the predictions as CSV (without the DataFrame index) and echo the file name.
rf_submission_path = "SMHS_RF_validation_predictions.csv"
submission_rf.to_csv(path_or_buf=rf_submission_path, index=False)
rf_submission_path

'SMHS_RF_validation_predictions.csv'

## 13. Summary

- Built a full Random Forest regression pipeline with preprocessing.
- Tuned key hyperparameters with RandomizedSearchCV (15 sampled combinations, 5-fold CV).
- Evaluated performance with test-set metrics and cross-validation.
- Interpreted the model via feature importance.
- Generated predictions for the external validation set.
