In [14]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RepeatedStratifiedKFold, cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer, f1_score, precision_score, recall_score, balanced_accuracy_score
import joblib

# =========================
# LOAD DATA
# =========================
# Feature matrix is read from the working directory; one row per sample.
df = pd.read_csv("ML_Feature_Matrix_Updated.csv")

# =========================
# SELECTED FEATURES
# =========================
# The five predictor columns used by the final model (order is preserved
# all the way through to the feature-importance report).
FEATURES = [
    "sync_delta_A_frontal",
    "Feature_Theta_Global_Abs",
    "Feature_HubPLI_Beta_Delta",
    "Feature_DeltaBeta_Global",
    "Feature_Theta_Asymmetry_Idx",
]

feature_frame = df[FEATURES]
X = feature_frame.values
y = df["Label_Impaired"].values

# =========================
# CONFIG
# =========================
# 5 stratified folds repeated 4 times -> 20 held-out test folds in total.
cv = RepeatedStratifiedKFold(
    n_splits=5,
    n_repeats=4,
    random_state=0,
)

# Two built-in scorer names plus four metric functions wrapped with
# make_scorer; insertion order determines the order of the printed report.
scoring_metrics = {
    'AUC': 'roc_auc',
    'Accuracy': 'accuracy',
    **{
        label: make_scorer(metric_fn)
        for label, metric_fn in (
            ('F1', f1_score),
            ('Precision', precision_score),
            ('Recall', recall_score),
            ('BalancedAcc', balanced_accuracy_score),
        )
    },
}

# =========================
# FINAL MODEL PIPELINE
# =========================
# Scaling is kept as a pipeline step even though it is optional for a
# random forest.
final_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('rf', RandomForestClassifier(max_depth=10, n_estimators=500, random_state=0)),
])

# =========================
# CROSS-VALIDATED EVALUATION
# =========================
cv_results = cross_validate(
    final_pipeline,
    X,
    y,
    cv=cv,
    scoring=scoring_metrics,
    return_train_score=False,
)

print("\n--- Final 5-Feature Model Repeated CV Performance ---")
test_prefix = 'test_'
for metric_name, scores in cv_results.items():
    # cross_validate also returns timing entries; only report test metrics.
    if not metric_name.startswith(test_prefix):
        continue
    mean_score = np.mean(scores)
    score_spread = np.std(scores)
    print(f"{metric_name[len(test_prefix):]}: {mean_score:.3f} ± {score_spread:.3f}")

# =========================
# TRAIN FINAL MODEL ON FULL DATA
# =========================
# cross_validate fits clones, so the pipeline itself is still unfitted here.
final_pipeline.fit(X, y)

# =========================
# FEATURE IMPORTANCE
# =========================
rf_model = final_pipeline.named_steps['rf']
raw_importances = pd.Series(rf_model.feature_importances_, index=FEATURES)
importances = raw_importances.sort_values(ascending=False)

print("\n--- Feature Importances ---")
print(importances)

# =========================
# SAVE MODEL
# =========================
# Persists the whole fitted pipeline (scaler + forest) in one artifact.
joblib.dump(final_pipeline, "PD_MoCA_RF_5Feature_Final.pkl")
print("\nFinal model saved as PD_MoCA_RF_5Feature_Final.pkl")

# Individual fold scores (one entry per CV test fold)
fold_aucs = cv_results['test_AUC']
fold_accs = cv_results['test_Accuracy']
print(f"Individual Fold AUCs: {fold_aucs}")
print(f"Individual Fold Accs: {fold_accs}")



--- Final 5-Feature Model Repeated CV Performance ---
AUC: 0.756 ± 0.126
Accuracy: 0.682 ± 0.125
F1: 0.704 ± 0.114
Precision: 0.706 ± 0.129
Recall: 0.713 ± 0.131
BalancedAcc: 0.679 ± 0.129

--- Feature Importances ---
Feature_Theta_Global_Abs       0.258083
Feature_Theta_Asymmetry_Idx    0.219448
sync_delta_A_frontal           0.182426
Feature_HubPLI_Beta_Delta      0.176390
Feature_DeltaBeta_Global       0.163653
dtype: float64

Final model saved as PD_MoCA_RF_5Feature_Final.pkl
Individual Fold AUCs: [0.66666667 0.90909091 0.82828283 0.92       0.52       0.73737374
 0.68686869 0.78787879 0.64       0.82       0.49494949 0.78787879
 0.7020202  0.84       0.91       0.83838384 0.77777778 0.54545455
 0.88       0.82      ]
Individual Fold Accs: [0.5  0.9  0.75 0.8  0.5  0.7  0.65 0.65 0.65 0.7  0.45 0.65 0.65 0.85
 0.8  0.75 0.7  0.45 0.8  0.75]
