In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import (roc_auc_score, accuracy_score, f1_score,
                             precision_score, recall_score, balanced_accuracy_score)
import joblib

# =========================
# LOAD DATA
# =========================
# Location of the feature-matrix CSV. Kept as a named constant so the path is
# edited in exactly one place when the notebook is run on another machine.
DATA_PATH = r"C:\Users\User\Documents\EEG_Project\dataSheets\ML_Feature_Matrix.csv"
df = pd.read_csv(DATA_PATH)

# =========================
# SELECTED FEATURES
# =========================
# Predictor columns; the column order of X follows the order of this list.
FEATURES = [
    'Feature_Sync_Delta_ClassA_Frontal',
    'Feature_Theta_Global_Abs',
    'Feature_HubPLI_Beta_Delta',
    'Feature_DeltaBeta_Global',
    'Feature_Theta_Asymmetry_Idx',
]
# Target column used to build y.
LABEL = 'Label_Impaired'

# Fail fast with a readable message when the CSV schema does not match.
# The column selection below would raise KeyError as well, but this check
# lists every missing column (features and label) in one message.
missing_cols = [col for col in FEATURES + [LABEL] if col not in df.columns]
if missing_cols:
    raise KeyError(f"Columns missing from {DATA_PATH}: {missing_cols}")

X = df[FEATURES].values  # 2-D array, one column per entry in FEATURES
y = df[LABEL].values     # 1-D array holding the LABEL column

# =========================
# 5 x 80/20 STRATIFIED SPLITS
# =========================
# Five independent stratified shuffle splits, each holding out 20% of the rows.
sss = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=4224)

# Metrics computed from hard class predictions, keyed by their name in `results`.
# AUC is handled separately because it is computed from predicted probabilities.
label_metrics = {
    'Accuracy': accuracy_score,
    'F1': f1_score,
    'Precision': precision_score,
    'Recall': recall_score,
    'BalancedAcc': balanced_accuracy_score,
}

# One list of per-split scores per metric; AUC first, then the label metrics.
results = {'AUC': []}
results.update({name: [] for name in label_metrics})

for fold, (train_idx, test_idx) in enumerate(sss.split(X, y)):
    # A fresh pipeline per split: the scaler and the forest are fitted on the
    # training rows of this split only.
    model = Pipeline(steps=[
        ('scaler', StandardScaler()),
        ('rf', RandomForestClassifier(n_estimators=500, max_depth=5, random_state=4224)),
    ])
    model.fit(X[train_idx], y[train_idx])

    held_out_X, held_out_y = X[test_idx], y[test_idx]
    hard_pred = model.predict(held_out_X)
    # Second column of predict_proba, i.e. the probability of the second class
    # in the fitted classifier's class ordering.
    pos_prob = model.predict_proba(held_out_X)[:, 1]

    results['AUC'].append(roc_auc_score(held_out_y, pos_prob))
    for name, metric_fn in label_metrics.items():
        results[name].append(metric_fn(held_out_y, hard_pred))

    print(f"Fold {fold+1} | AUC: {results['AUC'][-1]:.3f} | "
          f"F1: {results['F1'][-1]:.3f} | "
          f"BalAcc: {results['BalancedAcc'][-1]:.3f}")

# =========================
# SUMMARY
# =========================
# Mean and standard deviation (np.std default, i.e. ddof=0) across the splits.
print("\n--- 5-Split 80/20 Holdout Performance ---")
for metric in results:
    scores = results[metric]
    print(f"{metric}: {np.mean(scores):.3f} ± {np.std(scores):.3f}")

# =========================
# TRAIN FINAL MODEL ON FULL DATA
# =========================
# Scaler + random-forest pipeline fitted on every row of X / y (no hold-out).
final_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('rf', RandomForestClassifier(n_estimators=500, max_depth=5, random_state=4224)),
])
final_pipeline.fit(X, y)

# =========================
# FEATURE IMPORTANCE
# =========================
# Pull the fitted forest out of the pipeline and label its importances with
# the feature names, largest first.
rf_model = final_pipeline.named_steps['rf']
importances = pd.Series(rf_model.feature_importances_, index=FEATURES)
importances = importances.sort_values(ascending=False)
print("\n--- Feature Importances ---")
print(importances)

# =========================
# SAVE MODEL
# =========================
# Saving is opt-in so that re-running the notebook does not silently overwrite
# an existing artifact. The message printed below always reflects what actually
# happened (previously "saved" was printed even though nothing was written).
MODEL_PATH = "PD_MoCA_RF_5Feature_Final.pkl"
SAVE_MODEL = False  # set to True to write the fitted pipeline to MODEL_PATH

if SAVE_MODEL:
    # NOTE: joblib files are pickle-based; only load them from trusted sources.
    joblib.dump(final_pipeline, MODEL_PATH)
    print(f"\nFinal model saved as {MODEL_PATH}")
else:
    print(f"\nFinal model NOT saved (SAVE_MODEL is False); target path: {MODEL_PATH}")

Fold 1 | AUC: 0.818 | F1: 0.727 | BalAcc: 0.697
Fold 2 | AUC: 0.747 | F1: 0.762 | BalAcc: 0.753
Fold 3 | AUC: 0.616 | F1: 0.353 | BalAcc: 0.470
Fold 4 | AUC: 0.727 | F1: 0.636 | BalAcc: 0.596
Fold 5 | AUC: 0.808 | F1: 0.750 | BalAcc: 0.687

--- 5-Split 80/20 Holdout Performance ---
AUC: 0.743 ± 0.072
Accuracy: 0.640 ± 0.107
F1: 0.646 ± 0.153
Precision: 0.671 ± 0.101
Recall: 0.636 ± 0.191
BalancedAcc: 0.640 ± 0.099

--- Feature Importances ---
Feature_Theta_Global_Abs             0.262712
Feature_Theta_Asymmetry_Idx          0.208826
Feature_Sync_Delta_ClassA_Frontal    0.206298
Feature_HubPLI_Beta_Delta            0.174051
Feature_DeltaBeta_Global             0.148114
dtype: float64

Final model saved as PD_MoCA_RF_5Feature_Final.pkl


In [2]:
from sklearn.model_selection import RepeatedStratifiedKFold

# Repeated stratified k-fold: 5 folds x 5 repeats -> 25 train/test evaluations.
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=5, random_state=4224)

# One list of per-fold scores for each metric name.
metric_names = ['AUC', 'Accuracy', 'F1', 'Precision', 'Recall', 'BalancedAcc']
results = {name: [] for name in metric_names}

for fold, (train_idx, test_idx) in enumerate(cv.split(X, y)):
    # A fresh pipeline per fold: the scaler and the forest are fitted on the
    # training rows of this fold only.
    pipeline = Pipeline(steps=[
        ('scaler', StandardScaler()),
        ('rf', RandomForestClassifier(n_estimators=500, max_depth=5, random_state=4224)),
    ])
    pipeline.fit(X[train_idx], y[train_idx])

    X_test, y_test = X[test_idx], y[test_idx]
    y_pred = pipeline.predict(X_test)
    # Second column of predict_proba, used as the score for ROC AUC.
    y_prob = pipeline.predict_proba(X_test)[:, 1]

    # Score this fold once, then append each value to its running list.
    fold_scores = {
        'AUC': roc_auc_score(y_test, y_prob),
        'Accuracy': accuracy_score(y_test, y_pred),
        'F1': f1_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred),
        'Recall': recall_score(y_test, y_pred),
        'BalancedAcc': balanced_accuracy_score(y_test, y_pred),
    }
    for name, value in fold_scores.items():
        results[name].append(value)

    print(f"Fold {fold+1} | AUC: {results['AUC'][-1]:.3f} | "
          f"F1: {results['F1'][-1]:.3f} | "
          f"BalAcc: {results['BalancedAcc'][-1]:.3f}")

# Summary: mean and standard deviation (np.std default, ddof=0) over all folds.
for metric in results:
    scores = results[metric]
    print(f"{metric}: {np.mean(scores):.3f} ± {np.std(scores):.3f}")

Fold 1 | AUC: 0.879 | F1: 0.857 | BalAcc: 0.854
Fold 2 | AUC: 0.677 | F1: 0.667 | BalAcc: 0.717
Fold 3 | AUC: 0.929 | F1: 0.846 | BalAcc: 0.778
Fold 4 | AUC: 0.820 | F1: 0.667 | BalAcc: 0.700
Fold 5 | AUC: 0.780 | F1: 0.700 | BalAcc: 0.700
Fold 6 | AUC: 0.889 | F1: 0.786 | BalAcc: 0.667
Fold 7 | AUC: 0.828 | F1: 0.762 | BalAcc: 0.753
Fold 8 | AUC: 0.768 | F1: 0.667 | BalAcc: 0.652
Fold 9 | AUC: 0.820 | F1: 0.750 | BalAcc: 0.800
Fold 10 | AUC: 0.810 | F1: 0.783 | BalAcc: 0.750
Fold 11 | AUC: 0.566 | F1: 0.588 | BalAcc: 0.672
Fold 12 | AUC: 0.909 | F1: 0.769 | BalAcc: 0.677
Fold 13 | AUC: 0.646 | F1: 0.696 | BalAcc: 0.641
Fold 14 | AUC: 0.810 | F1: 0.700 | BalAcc: 0.700
Fold 15 | AUC: 0.830 | F1: 0.667 | BalAcc: 0.700
Fold 16 | AUC: 0.899 | F1: 0.818 | BalAcc: 0.798
Fold 17 | AUC: 0.949 | F1: 0.870 | BalAcc: 0.843
Fold 18 | AUC: 0.707 | F1: 0.696 | BalAcc: 0.641
Fold 19 | AUC: 0.690 | F1: 0.667 | BalAcc: 0.650
Fold 20 | AUC: 0.760 | F1: 0.588 | BalAcc: 0.650
Fold 21 | AUC: 0.889 | F1: 0.

In [3]:
from sklearn.model_selection import cross_validate

# Single pipeline definition; it is passed to cross_validate together with the
# `cv` splitter.
rf_step = RandomForestClassifier(n_estimators=500, max_depth=5, random_state=4224)
pipeline = Pipeline(steps=[('scaler', StandardScaler()), ('rf', rf_step)])

# Report name -> scikit-learn scorer string, in the order they are printed.
scoring = dict([
    ('AUC', 'roc_auc'),
    ('Accuracy', 'accuracy'),
    ('F1', 'f1'),
    ('Precision', 'precision'),
    ('Recall', 'recall'),
    ('BalancedAcc', 'balanced_accuracy'),
])

# Evaluate every metric over all CV splits in one call, using all available cores.
cv_results = cross_validate(pipeline, X, y, cv=cv, scoring=scoring, n_jobs=-1)

# cross_validate stores each metric's per-split test scores under 'test_<name>'.
for metric in scoring:
    test_metric = f'test_{metric}'
    print(f"{metric}: {cv_results[test_metric].mean():.3f} ± {cv_results[test_metric].std():.3f}")

AUC: 0.804 ± 0.092
Accuracy: 0.724 ± 0.060
F1: 0.733 ± 0.075
Precision: 0.753 ± 0.088
Recall: 0.737 ± 0.146
BalancedAcc: 0.721 ± 0.060
