In [1]:
import os
import pickle
import time

import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, roc_auc_score, recall_score, f1_score,
    classification_report, confusion_matrix, log_loss
)
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split

# Name of the label column; every other column is used as a feature.
TARGET_COL = "DIABETE4"

# Load the pre-split, model-ready train and test tables.
train_df = pd.read_csv(
    "../../Data/BRFSS_2024_model_ready_train.csv",
    low_memory=False,
)
test_df = pd.read_csv(
    "../../Data/BRFSS_2024_model_ready_test.csv",
    low_memory=False,
)

# Split each table into a feature frame and an integer-cast label series.
X_train, y_train = train_df.drop(columns=TARGET_COL), train_df[TARGET_COL].astype(int)
X_test, y_test = test_df.drop(columns=TARGET_COL), test_df[TARGET_COL].astype(int)


In [2]:
# Baseline: a random forest with default hyperparameters (only the seed is fixed),
# fitted on the full training set and scored on the test set.
rf_baseline = RandomForestClassifier(random_state=42).fit(X_train, y_train)

y_pred_base = rf_baseline.predict(X_test)          # hard class labels
y_proba_base = rf_baseline.predict_proba(X_test)   # per-class probabilities (for log loss)

# Macro averaging gives every class the same weight regardless of its support.
macro_kwargs = {"average": "macro", "zero_division": 0}
baseline_lines = [
    "=== BASELINE RANDOM FOREST ===",
    f"Accuracy:  {accuracy_score(y_test, y_pred_base):.4f}",
    f"Precision (macro): {precision_score(y_test, y_pred_base, **macro_kwargs):.4f}",
    f"Recall (macro):    {recall_score(y_test, y_pred_base, **macro_kwargs):.4f}",
    f"F1 Score (macro):  {f1_score(y_test, y_pred_base, **macro_kwargs):.4f}",
    f"Log Loss:          {log_loss(y_test, y_proba_base):.4f}",
]
print("\n".join(baseline_lines))

# Per-class breakdown and raw confusion counts.
print("\nClassification Report:\n", classification_report(y_test, y_pred_base, zero_division=0))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred_base))


=== BASELINE RANDOM FOREST ===
Accuracy:  0.8353
Precision (macro): 0.4621
Recall (macro):    0.3832
F1 Score (macro):  0.3903
Log Loss:          0.5165

Classification Report:
               precision    recall  f1-score   support

           1       0.54      0.17      0.26     13162
           3       0.85      0.98      0.91     75226
           4       0.00      0.00      0.00      2261

    accuracy                           0.84     90649
   macro avg       0.46      0.38      0.39     90649
weighted avg       0.78      0.84      0.79     90649


Confusion Matrix:
 [[ 2284 10878     0]
 [ 1794 73432     0]
 [  180  2081     0]]


In [3]:
# Draw a stratified 30% subset of the training data for hyperparameter tuning.
# train_test_split is used purely as a stratified sampler here: the 70% "test"
# halves it returns are discarded, and X_test / y_test are not involved.
X_train_tune, _, y_train_tune, _ = train_test_split(
    X_train,
    y_train,
    test_size=0.7,      # fraction discarded, i.e. 30% is kept for tuning
    random_state=42,
    stratify=y_train,   # keep the class proportions of y_train in the subset
)

# Sanity check: class shares inside the tuning subset.
print("Tuning subset class distribution:")
print(y_train_tune.value_counts(normalize=True))


Tuning subset class distribution:
DIABETE4
4    0.336080
1    0.332061
3    0.331859
Name: proportion, dtype: float64


In [4]:


# Search space for the random-forest grid search: 2 x 3 x 2 x 2 = 24 combinations.
# A single-element list pins that argument to one value for every candidate.
# NOTE(review): the tuning-subset distribution printed above is already ~1/3 per
# class, so class_weight="balanced" probably changes little here - confirm.
param_grid = dict(
    n_estimators=[200, 400],
    max_depth=[10, 20, None],
    min_samples_split=[2, 10],
    min_samples_leaf=[1, 4],
    max_features=["sqrt"],
    class_weight=["balanced"],
)


In [5]:


# Template estimator for the grid search; the searched hyperparameters are filled
# in per candidate, only the seed and the worker count are fixed here.
rf_tuned = RandomForestClassifier(n_jobs=-1, random_state=42)

# Three class-stratified folds, shuffled with a fixed seed so the fold
# assignment is the same on every run.
cv = StratifiedKFold(3, shuffle=True, random_state=42)


In [6]:


# Exhaustive search over param_grid on the tuning subset, ranked by macro F1.
# n_jobs=1 at the search level - presumably because rf_tuned already runs with
# n_jobs=-1, which avoids nesting two layers of parallelism.
search_config = {
    "estimator": rf_tuned,
    "param_grid": param_grid,
    "scoring": "f1_macro",
    "cv": cv,
    "verbose": 1,
    "n_jobs": 1,
}
grid_search = GridSearchCV(**search_config).fit(X_train_tune, y_train_tune)

print("\nBest Parameters Found:", grid_search.best_params_, sep="\n")


Fitting 3 folds for each of 24 candidates, totalling 72 fits

Best Parameters Found:
{'class_weight': 'balanced', 'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 400}


In [7]:


# Rebuild a fresh forest from the winning hyperparameters and fit it on the FULL
# training set (the grid search itself only saw the tuning subset).
best_params = grid_search.best_params_
print(f"\nTraining final model with params: {best_params}")

rf_final = RandomForestClassifier(n_jobs=-1, random_state=42, **best_params)

# Left as the cell's last expression so the fitted estimator is displayed.
rf_final.fit(X_train, y_train)



Training final model with params: {'class_weight': 'balanced', 'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 400}


0,1,2
,n_estimators,400
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [8]:


# Score the tuned forest on the test set.
y_pred_final = rf_final.predict(X_test)          # hard class labels
y_proba_final = rf_final.predict_proba(X_test)   # per-class probabilities (for log loss)

# Macro averaging gives every class the same weight regardless of its support.
macro_kwargs = {"average": "macro", "zero_division": 0}
headline_scores = (
    ("Accuracy:          ", accuracy_score(y_test, y_pred_final)),
    ("Precision (macro): ", precision_score(y_test, y_pred_final, **macro_kwargs)),
    ("Recall  (macro):   ", recall_score(y_test, y_pred_final, **macro_kwargs)),
    ("F1 Score (macro):  ", f1_score(y_test, y_pred_final, **macro_kwargs)),
    ("Log Loss:          ", log_loss(y_test, y_proba_final)),
)

print("\n=== FINAL TUNED RANDOM FOREST PERFORMANCE ===")
for metric_label, metric_value in headline_scores:
    print(f"{metric_label}{metric_value:.4f}")

# Per-class breakdown and raw confusion counts.
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred_final, zero_division=0))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_final))



=== FINAL TUNED RANDOM FOREST PERFORMANCE ===
Accuracy:          0.8364
Precision (macro): 0.4683
Recall  (macro):   0.3806
F1 Score (macro):  0.3869
Log Loss:          0.4508

Classification Report:

              precision    recall  f1-score   support

           1       0.56      0.16      0.25     13162
           3       0.85      0.98      0.91     75226
           4       0.00      0.00      0.00      2261

    accuracy                           0.84     90649
   macro avg       0.47      0.38      0.39     90649
weighted avg       0.79      0.84      0.79     90649

Confusion Matrix:
[[ 2136 11026     0]
 [ 1542 73684     0]
 [  164  2097     0]]


In [9]:
# SAVE PICKLE BUNDLE FOR RANDOM FOREST MODEL (rf_final)
#
# Gathers the tuned model, its test-set predictions, summary metrics and small
# data samples into one dict and pickles it for later visualization.
# rf_final, X_train, y_train, X_test, y_test, y_pred_final, y_proba_final
# are all defined earlier in the notebook.


def _first_rows(data, n_rows):
    """Return the first ``n_rows`` rows of a pandas object or a plain array."""
    return data.iloc[:n_rows] if hasattr(data, "iloc") else data[:n_rows]


# 1. Confusion matrix of the final tuned RF on the test set
cm = confusion_matrix(y_test, y_pred_final)

# Columns of y_proba_final follow rf_final.classes_. Passing that order explicitly
# stops the probability-based metrics from inferring the label set from y_test;
# without it, log_loss raises when a class seen in training is absent from y_test.
class_labels = rf_final.classes_

# Macro averaging gives every class the same weight regardless of its support.
macro_kwargs = {"average": "macro", "zero_division": 0}


# 2. Build bundle dictionary with everything we need later
feature_names = X_train.columns.to_numpy() if hasattr(X_train, "columns") else None

bundle = {
    "model_name": "Random Forest (tuned)",

    # ---- Core evaluation arrays (for common visualizations) ----
    "y_test": y_test,
    "y_pred": y_pred_final,
    "y_proba": y_proba_final,
    "confusion_matrix": cm,

    # ---- Scalar performance metrics (for comparison plots) ----
    "accuracy": accuracy_score(y_test, y_pred_final),
    "precision_macro": precision_score(y_test, y_pred_final, **macro_kwargs),
    "recall_macro": recall_score(y_test, y_pred_final, **macro_kwargs),
    "f1_macro": f1_score(y_test, y_pred_final, **macro_kwargs),
    "log_loss": log_loss(y_test, y_proba_final, labels=class_labels),
    "roc_auc_ovr": roc_auc_score(
        y_test, y_proba_final, multi_class="ovr", labels=class_labels
    ),

    # ---- Hyperparameters ----
    "params": rf_final.get_params(),

    # ---- RF-specific: Gini feature importance ----
    "rf_feature_importance": rf_final.feature_importances_,
    "rf_feature_names": feature_names,

    # ---- Common keys for interactive feature explorer ----
    "feature_names": feature_names,
    "X_test_sample": _first_rows(X_test, 2000),

    # ---- For partial dependence plots later ----
    # store the trained model + a sample of training data
    # NOTE(review): these are the FIRST rows of the training frame, not a random
    # draw - confirm the file order is not grouped by class or by origin.
    "rf_final_model": rf_final,
    "X_train_sample": _first_rows(X_train, 5000),
    "y_train_sample": _first_rows(y_train, 5000),
}


# 3. Save bundle to ../../Results/Visualizations
save_path = "../../Results/Visualizations"
os.makedirs(save_path, exist_ok=True)

bundle_filename = os.path.join(save_path, "rf_bundle.pkl")

# The bundle embeds the fitted estimator; only unpickle it from a trusted location.
with open(bundle_filename, "wb") as f:
    pickle.dump(bundle, f)

print(f"\nRandom Forest pickle bundle saved to:\n{bundle_filename}")
print("Bundle keys:", list(bundle.keys()))



Random Forest pickle bundle saved to:
../../Results/Visualizations/rf_bundle.pkl
Bundle keys: ['model_name', 'y_test', 'y_pred', 'y_proba', 'confusion_matrix', 'accuracy', 'precision_macro', 'recall_macro', 'f1_macro', 'log_loss', 'roc_auc_ovr', 'params', 'rf_feature_importance', 'rf_feature_names', 'feature_names', 'X_test_sample', 'rf_final_model', 'X_train_sample', 'y_train_sample']
