### DONT TOUCH

In [6]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer, f1_score, precision_score, recall_score, balanced_accuracy_score
from sklearn.inspection import permutation_importance
import joblib

# =========================
# LOAD DATA
# =========================
df = pd.read_csv(r"C:\Users\User\Documents\EEG_Project\dataSheets\ML_Feature_Matrix.csv")

# Check for NaNs (Best practice for EEG datasets)
if df.isnull().values.any():
    print("Warning: NaNs detected. Dropping rows with missing values.")
    df = df.dropna()

FEATURES = [
    'Feature_Sync_Delta_ClassA_Frontal',
    'Feature_Theta_Global_Abs',
    'Feature_HubPLI_Beta_Delta',
    'Feature_DeltaBeta_Global',
    'Feature_Theta_Asymmetry_Idx',
    'Feature_Instab_Theta_duration_Var'
]

X = df[FEATURES].values
y = df['Label_Impaired'].values

# =========================
# CONFIG
# =========================
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

scoring_metrics = {
    'AUC': 'roc_auc',
    'Accuracy': 'accuracy',
    'F1': make_scorer(f1_score),
    'Precision': make_scorer(precision_score),
    'Recall': make_scorer(recall_score),
    'BalancedAcc': make_scorer(balanced_accuracy_score)
}

# =========================
# BUILD FINAL MODEL PIPELINE
# =========================
final_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('rf', RandomForestClassifier(
        n_estimators=800,
        max_depth=None,
        min_samples_leaf=4,
        min_samples_split=8,
        class_weight='balanced',
        random_state=42,
        n_jobs=-1
    ))
])

# =========================
# CROSS-VALIDATED EVALUATION
# =========================
cv_results = cross_validate(final_pipeline, X, y, cv=cv, scoring=scoring_metrics, return_train_score=False)

print("\n--- Final 6-Feature Model CV Performance ---")
for metric_name, scores in cv_results.items():
    if metric_name.startswith('test_'):
        print(f"{metric_name}: {np.mean(scores):.3f} ± {np.std(scores):.3f}")

# =========================
# TRAIN FINAL MODEL ON FULL DATA
# =========================
final_pipeline.fit(X, y)

# =========================
# ADVANCED FEATURE IMPORTANCE
# =========================
rf_model = final_pipeline.named_steps['rf']

# 1. Gini Importance with Variance
importances_mean = rf_model.feature_importances_
importances_std = np.std([tree.feature_importances_ for tree in rf_model.estimators_], axis=0)

# 2. Permutation Importance (More reliable)
# This measures how much the model performance drops when a feature is shuffled
perm_res = permutation_importance(final_pipeline, X, y, n_repeats=10, random_state=42, n_jobs=-1)

importance_df = pd.DataFrame({
    'Gini_Importance': importances_mean,
    'Gini_Std': importances_std,
    'Permutation_Mean': perm_res.importances_mean,
    'Permutation_Std': perm_res.importances_std
}, index=FEATURES).sort_values(by='Permutation_Mean', ascending=False)

print("\n--- Detailed Feature Importances ---")
print(importance_df)

# =========================
# SAVE MODEL
# =========================
# joblib.dump(final_pipeline, "PD_MoCA_RF_6Feature_Final.pkl")
print("\nFinal model training complete.")


--- Final 6-Feature Model CV Performance ---
test_AUC: 0.767 ± 0.099
test_Accuracy: 0.730 ± 0.040
test_F1: 0.732 ± 0.063
test_Precision: 0.769 ± 0.063
test_Recall: 0.715 ± 0.113
test_BalancedAcc: 0.729 ± 0.040

--- Detailed Feature Importances ---
                                   Gini_Importance  Gini_Std  \
Feature_Theta_Global_Abs                  0.250456  0.181750   
Feature_Theta_Asymmetry_Idx               0.199631  0.153365   
Feature_HubPLI_Beta_Delta                 0.151082  0.124445   
Feature_Sync_Delta_ClassA_Frontal         0.178227  0.158656   
Feature_DeltaBeta_Global                  0.135197  0.127653   
Feature_Instab_Theta_duration_Var         0.085407  0.096514   

                                   Permutation_Mean  Permutation_Std  
Feature_Theta_Global_Abs                      0.141         0.024678  
Feature_Theta_Asymmetry_Idx                   0.113         0.031953  
Feature_HubPLI_Beta_Delta                     0.105         0.023345  
Feature_Sync_Delta

### DONT TOUCH


In [4]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RepeatedStratifiedKFold, cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer, f1_score, precision_score, recall_score, balanced_accuracy_score
import joblib

# =========================
# LOAD DATA
# =========================
df = pd.read_csv("ML_Feature_Matrix.csv")

# =========================
# SELECTED FEATURES (5 best)
# =========================
FEATURES = [
    #'Feature_Sync_Delta_ClassA_Frontal',
    'Feature_Theta_Global_Abs',
    'Feature_HubPLI_Beta_Delta',
    'Feature_HubPLI_Beta_Frontal',
    'Feature_DeltaBeta_Global',
    'Feature_Theta_Asymmetry_Idx',
    'Feature_ThetaAlpha_Global'

]

X = df[FEATURES].values
y = df['Label_Impaired'].values

# =========================
# CONFIG
# =========================
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=4, random_state=321)  # Repeated 5x5 CV

scoring_metrics = {
    'AUC': 'roc_auc',
    'Accuracy': 'accuracy',
    'F1': make_scorer(f1_score),
    'Precision': make_scorer(precision_score),
    'Recall': make_scorer(recall_score),
    'BalancedAcc': make_scorer(balanced_accuracy_score)
}

# =========================
# FINAL MODEL PIPELINE
# =========================
final_pipeline = Pipeline([
    ('scaler', StandardScaler()),  # optional for RF
    ('rf', RandomForestClassifier(
        n_estimators=500,
        random_state=42,
        max_depth=10
    ))
])

# =========================
# CROSS-VALIDATED EVALUATION
# =========================
cv_results = cross_validate(final_pipeline, X, y, cv=cv, scoring=scoring_metrics, return_train_score=False)

print("\n--- Final 5-Feature Model Repeated CV Performance ---")
for metric_name, scores in cv_results.items():
    if metric_name.startswith('test_'):
        print(f"{metric_name[5:]}: {np.mean(scores):.3f} ± {np.std(scores):.3f}")

# =========================
# TRAIN FINAL MODEL ON FULL DATA
# =========================
final_pipeline.fit(X, y)

# =========================
# FEATURE IMPORTANCE
# =========================
rf_model = final_pipeline.named_steps['rf']
importances = pd.Series(rf_model.feature_importances_, index=FEATURES).sort_values(ascending=False)

print("\n--- Feature Importances ---")
print(importances)

# =========================
# SAVE MODEL
# =========================
#joblib.dump(final_pipeline, "PD_MoCA_RF_5Feature_Final.pkl")
print("\nFinal model saved as PD_MoCA_RF_5Feature_Final.pkl")

# Individual fold scores
print(f"Individual Fold AUCs: {cv_results['test_AUC']}")
print(f"Individual Fold Accs: {cv_results['test_Accuracy']}")



--- Final 5-Feature Model Repeated CV Performance ---
AUC: 0.767 ± 0.088
Accuracy: 0.740 ± 0.087
F1: 0.749 ± 0.086
Precision: 0.768 ± 0.097
Recall: 0.743 ± 0.117
BalancedAcc: 0.737 ± 0.089

--- Feature Importances ---
Feature_Theta_Asymmetry_Idx    0.200401
Feature_Theta_Global_Abs       0.199450
Feature_ThetaAlpha_Global      0.168162
Feature_HubPLI_Beta_Frontal    0.155497
Feature_HubPLI_Beta_Delta      0.147449
Feature_DeltaBeta_Global       0.129042
dtype: float64

Final model saved as PD_MoCA_RF_5Feature_Final.pkl
Individual Fold AUCs: [0.80808081 0.88888889 0.65656566 0.67       0.72       0.86868687
 0.62626263 0.62626263 0.88       0.84       0.81818182 0.85858586
 0.83838384 0.74       0.7        0.80808081 0.67676768 0.7979798
 0.84       0.68      ]
Individual Fold Accs: [0.8  0.8  0.65 0.75 0.65 0.8  0.65 0.6  0.9  0.65 0.8  0.8  0.85 0.75
 0.65 0.85 0.6  0.75 0.8  0.7 ]


In [104]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer, f1_score, precision_score, recall_score, balanced_accuracy_score
import joblib

# =========================
# LOAD DATA
# =========================
df = pd.read_csv("ML_Feature_Matrix.csv")

# =========================
# SELECTED FEATURES (5 best)
# =========================
FEATURES = [
    'Feature_Sync_Delta_ClassA_Frontal',
    'Feature_Theta_Global_Abs',
    'Feature_HubPLI_Beta_Delta',
    'Feature_DeltaBeta_Global',
    'Feature_Theta_Asymmetry_Idx',
    'Feature_ThetaAlpha_Global'

]

X = df[FEATURES].values
y = df['Label_Impaired'].values

# =========================
# CONFIG
# =========================
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

scoring_metrics = {
    'AUC': 'roc_auc',
    'Accuracy': 'accuracy',
    'F1': make_scorer(f1_score),
    'Precision': make_scorer(precision_score),
    'Recall': make_scorer(recall_score),
    'BalancedAcc': make_scorer(balanced_accuracy_score)
}

# =========================
# FINAL MODEL PIPELINE
# =========================
final_pipeline = Pipeline([
    ('scaler', StandardScaler()),  # optional for RF
    ('rf', RandomForestClassifier(
        n_estimators=800,
        random_state=42,
        max_depth=5,
        class_weight='balanced'
    ))
])

# =========================
# CROSS-VALIDATED EVALUATION
# =========================
cv_results = cross_validate(final_pipeline, X, y, cv=cv, scoring=scoring_metrics, return_train_score=False)

print("\n--- Final 5-Feature Model CV Performance ---")
for metric_name, scores in cv_results.items():
    if metric_name.startswith('test_'):
        print(f"{metric_name[5:]}: {np.mean(scores):.3f} ± {np.std(scores):.3f}")

# =========================
# TRAIN FINAL MODEL ON FULL DATA
# =========================
final_pipeline.fit(X, y)

# =========================
# FEATURE IMPORTANCE
# =========================
rf_model = final_pipeline.named_steps['rf']
importances = pd.Series(rf_model.feature_importances_, index=FEATURES).sort_values(ascending=False)

print("\n--- Feature Importances ---")
print(importances)

# =========================
# SAVE MODEL
# =========================
joblib.dump(final_pipeline, "PD_MoCA_RF_5Feature_Final.pkl")
print("\nFinal model saved as PD_MoCA_RF_5Feature_Final.pkl")


# Add this line after your print loop to see the individual fold scores
print(f"Individual Fold AUCs: {cv_results['test_AUC']}")
print(f"Maximum (Peak) AUC: {np.max(cv_results['test_AUC']):.3f}")

print(f"Individual Fold Accs: {cv_results['test_Accuracy']}")
print(f"Maximum (Peak) Acc: {np.max(cv_results['test_Accuracy']):.3f}")


--- Final 5-Feature Model CV Performance ---
AUC: 0.783 ± 0.136
Accuracy: 0.750 ± 0.100
F1: 0.757 ± 0.115
Precision: 0.772 ± 0.132
Recall: 0.751 ± 0.126
BalancedAcc: 0.750 ± 0.103

--- Feature Importances ---
Feature_Theta_Global_Abs             0.214786
Feature_Theta_Asymmetry_Idx          0.195194
Feature_ThetaAlpha_Global            0.165182
Feature_Sync_Delta_ClassA_Frontal    0.159197
Feature_HubPLI_Beta_Delta            0.148849
Feature_DeltaBeta_Global             0.116792
dtype: float64

Final model saved as PD_MoCA_RF_5Feature_Final.pkl
Individual Fold AUCs: [0.6969697  0.8989899  0.83838384 0.56       0.92      ]
Maximum (Peak) AUC: 0.920
Individual Fold Accs: [0.75 0.8  0.9  0.6  0.7 ]
Maximum (Peak) Acc: 0.900


### Hyper Parameter tuning

In [1]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
#RANDOM STATE 3
# =========================
# Split data into training and test sets
# =========================
X_train, X_test, y_train, y_test = train_test_split(
    df[FEATURES], df['Label_Impaired'], 
    test_size=0.25, stratify=df['Label_Impaired'], random_state=000
)

# =========================
# Train final pipeline on training set
# =========================
final_pipeline.fit(X_train, y_train)

# =========================
# Get predictions on test set
# =========================
y_pred = final_pipeline.predict(X_test)

# =========================
# Compute confusion matrix
# =========================
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(cm, display_labels=['Unimpaired', 'Impaired'])
disp.plot(cmap='Blues', values_format='d')
disp.ax_.set_title("Confusion Matrix - Random Forest on Test Set")


NameError: name 'df' is not defined

In [12]:
from sklearn.feature_selection import RFECV

# 1. DEFINE SELECTOR (Linear SVM is best for RFE coefficients)
selector_model = SVC(kernel='linear', class_weight='balanced', random_state=42)

# 2. RFECV PIPELINE
# min_features_to_select=3 prevents the model from dropping too many clinical markers
rfecv = RFECV(
    estimator=selector_model,
    step=1,
    cv=StratifiedKFold(5), 
    scoring='roc_auc',
    min_features_to_select=3,
    n_jobs=-1
)

# 3. FULL PIPELINE: Scale -> RFECV -> Final SVM
robust_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('feature_selection', rfecv),
    ('svm', SVC(probability=True, class_weight='balanced', random_state=42))
])

# 4. ROBUST EVALUATION (Repeated Nested CV)
cv_robust = RepeatedStratifiedKFold(n_splits=5, n_repeats=10, random_state=42)
cv_results = cross_validate(
    robust_pipeline, X, y, 
    cv=cv_robust, 
    scoring=scoring_metrics, 
    n_jobs=-1,
    return_estimator=True
)

# 5. PRINT RESULTS
print(f"RFECV Robust AUC: {np.mean(cv_results['test_AUC']):.3f} ± {np.std(cv_results['test_AUC']):.3f}")

# 6. IDENTIFY CONSENSUS FEATURES
# Check which features were selected most often across the 50 folds
feature_support = np.array([est.named_steps['feature_selection'].support_ for est in cv_results['estimator']])
print("\n--- Feature Selection Frequency ---")
for feat, count in zip(FEATURES, feature_support.sum(axis=0)):
    print(f"{feat}: Selected in {count}/50 folds")


RFECV Robust AUC: 0.721 ± 0.103

--- Feature Selection Frequency ---
Feature_Sync_Delta_ClassA_Frontal: Selected in 15/50 folds
Feature_Theta_Global_Abs: Selected in 18/50 folds
Feature_HubPLI_Beta_Delta: Selected in 48/50 folds
Feature_DeltaBeta_Global: Selected in 21/50 folds
Feature_Theta_Asymmetry_Idx: Selected in 50/50 folds
Feature_ThetaAlpha_Global: Selected in 50/50 folds


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import RepeatedStratifiedKFold, cross_validate, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV
from sklearn.metrics import make_scorer, f1_score, precision_score, recall_score, balanced_accuracy_score

# =========================
# LOAD DATA
# =========================
df = pd.read_csv("ML_Feature_Matrix.csv")
FEATURES = [
    'Feature_Sync_Delta_ClassA_Frontal',
    'Feature_Theta_Global_Abs',
    'Feature_HubPLI_Beta_Delta',
    'Feature_DeltaBeta_Global',
    'Feature_Theta_Asymmetry_Idx'
]
X = df[FEATURES].values
y = df['Label_Impaired'].values

# =========================
# SCORING METRICS
# =========================
scoring_metrics = {
    'AUC': 'roc_auc',
    'Accuracy': 'accuracy',
    'F1': make_scorer(f1_score),
    'Precision': make_scorer(precision_score, zero_division=0),
    'Recall': make_scorer(recall_score),
    'BalancedAcc': make_scorer(balanced_accuracy_score)
}

# =========================
# RFECV Selector
# =========================
selector_rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=5,
    class_weight='balanced',
    random_state=42
)

rfecv = RFECV(
    estimator=selector_rf,
    step=1,
    cv=StratifiedKFold(5, shuffle=True, random_state=42),
    scoring='roc_auc',
    min_features_to_select=3,
    n_jobs=1  # safe on Windows
)

# =========================
# FULL PIPELINE
# =========================
rf_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('feature_selection', rfecv),
    ('rf', RandomForestClassifier(
        n_estimators=300,
        max_depth=5,
        class_weight='balanced',
        random_state=42
    ))
])

# =========================
# REPEATED CV
# =========================
cv_robust = RepeatedStratifiedKFold(n_splits=5, n_repeats=10, random_state=42)

cv_results = cross_validate(
    rf_pipeline, X, y,
    cv=cv_robust,
    scoring=scoring_metrics,
    n_jobs=1,  # prevent nested parallelism crash
    return_estimator=True
)

# =========================
# REPORT METRICS
# =========================
print("\n--- RF-RFECV Robust CV Performance ---")
for metric in scoring_metrics.keys():
    scores = cv_results[f'test_{metric}']
    print(f"{metric}: {np.mean(scores):.3f} ± {np.std(scores):.3f}")

# =========================
# FEATURE STABILITY
# =========================
feature_support = np.array([est.named_steps['feature_selection'].support_ for est in cv_results['estimator']])
print("\n--- Random Forest Feature Selection Frequency ---")
for feat, count in zip(FEATURES, feature_support.sum(axis=0)):
    print(f"{feat}: Selected in {count}/{len(cv_results['estimator'])} folds")


RF-RFECV Robust AUC: 0.778 ± 0.081

--- Random Forest Feature Selection Frequency ---
Feature_Sync_Delta_ClassA_Frontal: Selected in 49/50 folds
Feature_Theta_Global_Abs: Selected in 50/50 folds
Feature_HubPLI_Beta_Delta: Selected in 43/50 folds
Feature_DeltaBeta_Global: Selected in 41/50 folds
Feature_Theta_Asymmetry_Idx: Selected in 50/50 folds
Feature_ThetaAlpha_Global: Selected in 49/50 folds


In [2]:
import pandas as pd
import numpy as np
from collections import Counter

from sklearn.model_selection import RepeatedStratifiedKFold, StratifiedKFold, cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV
from sklearn.metrics import make_scorer, f1_score, precision_score, recall_score, balanced_accuracy_score

# =========================
# LOAD DATA
# =========================
df = pd.read_csv(r"C:\Users\User\Documents\EEG_Project\dataSheets\ML_Feature_Matrix.csv")

# Automatically select all features except ID/labels
X = df.drop(columns=['participant_id', 'Target_MoCA', 'Label_Impaired']).values
FEATURES = df.drop(columns=['participant_id', 'Target_MoCA', 'Label_Impaired']).columns
y = df['Label_Impaired'].values

# =========================
# SCORING METRICS
# =========================
scoring_metrics = {
    'AUC': 'roc_auc',
    'Accuracy': 'accuracy',
    'F1': make_scorer(f1_score),
    'Precision': make_scorer(precision_score, zero_division=0),
    'Recall': make_scorer(recall_score),
    'BalancedAcc': make_scorer(balanced_accuracy_score)
}

# =========================
# BASE RF FOR RFECV
# =========================
selector_rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=5,
    class_weight='balanced',
    random_state=42
)

# =========================
# RFECV FEATURE SELECTOR
# =========================
rfecv = RFECV(
    estimator=selector_rf,
    step=1,
    cv=StratifiedKFold(5, shuffle=True, random_state=42),
    scoring='roc_auc',
    min_features_to_select=3,
    n_jobs=1
)

# =========================
# FULL PIPELINE
# =========================
rf_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('feature_selection', rfecv),
    ('rf', RandomForestClassifier(
        n_estimators=300,
        max_depth=5,
        class_weight='balanced',
        random_state=42
    ))
])

# =========================
# REPEATED CV
# =========================
cv_robust = RepeatedStratifiedKFold(n_splits=5, n_repeats=10, random_state=42)

cv_results = cross_validate(
    rf_pipeline, X, y,
    cv=cv_robust,
    scoring=scoring_metrics,
    n_jobs=1,
    return_estimator=True
)

# =========================
# REPORT METRICS
# =========================
print("\n--- RF-RFECV Robust CV Performance ---")
for metric in scoring_metrics.keys():
    scores = cv_results[f'test_{metric}']
    print(f"{metric}: {np.mean(scores):.3f} ± {np.std(scores):.3f}")

# =========================
# FEATURE STABILITY
# =========================
feature_support = np.array([est.named_steps['feature_selection'].support_ for est in cv_results['estimator']])
print("\n--- Random Forest Feature Selection Frequency ---")
for feat, count in zip(FEATURES, feature_support.sum(axis=0)):
    print(f"{feat}: Selected in {count}/{len(cv_results['estimator'])} folds")


--- RF-RFECV Robust CV Performance ---
AUC: 0.679 ± 0.092
Accuracy: 0.617 ± 0.088
F1: 0.629 ± 0.101
Precision: 0.649 ± 0.106
Recall: 0.630 ± 0.150
BalancedAcc: 0.616 ± 0.088

--- Random Forest Feature Selection Frequency ---
Feature_ThetaAlpha_Global: Selected in 43/50 folds
Feature_DeltaBeta_Global: Selected in 30/50 folds
Feature_PLI_Beta_C3P3: Selected in 29/50 folds
Feature_PLI_Beta_F3P4: Selected in 17/50 folds
Feature_Network_FrontPost_Beta_PLI: Selected in 30/50 folds
Feature_DPBF_Alpha: Selected in 17/50 folds
Feature_HubPLI_Beta_Frontal: Selected in 47/50 folds
Feature_HubPLI_Alpha_Posterior: Selected in 26/50 folds
Feature_HubPLI_Beta_Posterior: Selected in 22/50 folds
Feature_HubPLI_Beta_Delta: Selected in 40/50 folds
Feature_Delta_CentralParietal_Abs: Selected in 29/50 folds
Feature_Theta_F5_Abs: Selected in 27/50 folds
Feature_Theta_Global_Abs: Selected in 49/50 folds
Feature_Theta_Frontal_ROI_Abs: Selected in 48/50 folds
Feature_ThetaAlpha_Peak_Freq: Selected in 27/50 fo

In [38]:
import pandas as pd
import numpy as np
from sklearn.model_selection import RepeatedStratifiedKFold, cross_validate
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, f1_score, precision_score, recall_score, balanced_accuracy_score, roc_auc_score
import joblib

# =========================
# LOAD DATA
# =========================
df = pd.read_csv("ML_Feature_Matrix.csv")

FEATURES = [
    'Feature_Sync_Delta_ClassA_Frontal',
    'Feature_Theta_Global_Abs',
    'Feature_HubPLI_Beta_Delta',
    'Feature_DeltaBeta_Global',
    'Feature_Theta_Asymmetry_Idx',
    'Feature_ThetaAlpha_Global',
]

X = df[FEATURES].values
y = df['Label_Impaired'].values

# =========================
# CONFIG
# =========================
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=5, random_state=42)

scoring_metrics = {
    'AUC': 'roc_auc',
    'Accuracy': 'accuracy',
    'F1': make_scorer(f1_score),
    'Precision': make_scorer(precision_score),
    'Recall': make_scorer(recall_score),
    'BalancedAcc': make_scorer(balanced_accuracy_score)
}

# =========================
# RANDOM FOREST PIPELINE
# =========================
rf = RandomForestClassifier(
    n_estimators=500,        # More trees for stability
    max_depth=10,            # Limit depth to reduce overfitting
    min_samples_split=5,     # Minimum samples per split
    max_features=None,       # Use all features per tree
    bootstrap=True,          # Standard RF bootstrap
    class_weight='balanced', # Handle class imbalance
    random_state=42
)

# =========================
# CROSS-VALIDATED EVALUATION
# =========================
cv_results = cross_validate(
    rf, X, y,
    cv=cv,
    scoring=scoring_metrics,
    return_train_score=False,
    n_jobs=-1
)

print("\n--- RF 6-Feature Model CV Performance (Repeated 5x10) ---")
for key, scores in cv_results.items():
    if key.startswith('test_'):
        print(f"{key[5:]}: {np.mean(scores):.3f} ± {np.std(scores):.3f}")

# =========================
# TRAIN FINAL MODEL
# =========================
rf.fit(X, y)
#joblib.dump(rf, "PD_MoCA_RF_6Feature_Final.pkl")



--- RF 6-Feature Model CV Performance (Repeated 5x10) ---
AUC: 0.739 ± 0.099
Accuracy: 0.728 ± 0.084
F1: 0.737 ± 0.096
Precision: 0.752 ± 0.097
Recall: 0.739 ± 0.142
BalancedAcc: 0.727 ± 0.085


0,1,2
,"n_estimators  n_estimators: int, default=100 The number of trees in the forest. .. versionchanged:: 0.22  The default value of ``n_estimators`` changed from 10 to 100  in 0.22.",500
,"criterion  criterion: {""gini"", ""entropy"", ""log_loss""}, default=""gini"" The function to measure the quality of a split. Supported criteria are ""gini"" for the Gini impurity and ""log_loss"" and ""entropy"" both for the Shannon information gain, see :ref:`tree_mathematical_formulation`. Note: This parameter is tree-specific.",'gini'
,"max_depth  max_depth: int, default=None The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples.",10
,"min_samples_split  min_samples_split: int or float, default=2 The minimum number of samples required to split an internal node: - If int, then consider `min_samples_split` as the minimum number. - If float, then `min_samples_split` is a fraction and  `ceil(min_samples_split * n_samples)` are the minimum  number of samples for each split. .. versionchanged:: 0.18  Added float values for fractions.",5
,"min_samples_leaf  min_samples_leaf: int or float, default=1 The minimum number of samples required to be at a leaf node. A split point at any depth will only be considered if it leaves at least ``min_samples_leaf`` training samples in each of the left and right branches. This may have the effect of smoothing the model, especially in regression. - If int, then consider `min_samples_leaf` as the minimum number. - If float, then `min_samples_leaf` is a fraction and  `ceil(min_samples_leaf * n_samples)` are the minimum  number of samples for each node. .. versionchanged:: 0.18  Added float values for fractions.",1
,"min_weight_fraction_leaf  min_weight_fraction_leaf: float, default=0.0 The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Samples have equal weight when sample_weight is not provided.",0.0
,"max_features  max_features: {""sqrt"", ""log2"", None}, int or float, default=""sqrt"" The number of features to consider when looking for the best split: - If int, then consider `max_features` features at each split. - If float, then `max_features` is a fraction and  `max(1, int(max_features * n_features_in_))` features are considered at each  split. - If ""sqrt"", then `max_features=sqrt(n_features)`. - If ""log2"", then `max_features=log2(n_features)`. - If None, then `max_features=n_features`. .. versionchanged:: 1.1  The default of `max_features` changed from `""auto""` to `""sqrt""`. Note: the search for a split does not stop until at least one valid partition of the node samples is found, even if it requires to effectively inspect more than ``max_features`` features.",
,"max_leaf_nodes  max_leaf_nodes: int, default=None Grow trees with ``max_leaf_nodes`` in best-first fashion. Best nodes are defined as relative reduction in impurity. If None then unlimited number of leaf nodes.",
,"min_impurity_decrease  min_impurity_decrease: float, default=0.0 A node will be split if this split induces a decrease of the impurity greater than or equal to this value. The weighted impurity decrease equation is the following::  N_t / N * (impurity - N_t_R / N_t * right_impurity  - N_t_L / N_t * left_impurity) where ``N`` is the total number of samples, ``N_t`` is the number of samples at the current node, ``N_t_L`` is the number of samples in the left child, and ``N_t_R`` is the number of samples in the right child. ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum, if ``sample_weight`` is passed. .. versionadded:: 0.19",0.0
,"bootstrap  bootstrap: bool, default=True Whether bootstrap samples are used when building trees. If False, the whole dataset is used to build each tree.",True


In [None]:
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
RANDOM_STATE   = 4223
# ============================================================
# 1. DATA SPLITTING — 80/20, Test Set Locked Away
# ============================================================
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, stratify=y, random_state=RANDOM_STATE
)

print(f"Data Split: Train={len(X_train)} | Test={len(X_test)}")

# ============================================================
# 2. FORWARD FEATURE SELECTION (CV on Train only)
#    Uses 5-fold stratified CV instead of a single val set
#    — much more stable estimates, no data wasted
# ============================================================
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

selected_idx = []
remaining_idx = list(range(X.shape[1]))
best_auc = 0.0

print("\n--- Running Forward Selection (5-Fold CV) ---")

for step in range(MAX_FEATURES):
    step_results = []

    for i in remaining_idx:
        trial_idx = selected_idx + [i]

        # Pipeline handles scaler + model inside each CV fold — no leakage
        pipe = Pipeline([
            ("scaler", StandardScaler()),
            ("clf", RandomForestClassifier(
                n_estimators=300, max_depth=5,
                class_weight="balanced", random_state=RANDOM_STATE,
                n_jobs=-1
            ))
        ])

        scores = cross_val_score(
            pipe, X_train[:, trial_idx], y_train,
            cv=cv, scoring="roc_auc", n_jobs=-1
        )
        step_results.append((i, scores.mean()))

    best_i, best_step_auc = max(step_results, key=lambda x: x[1])

    if best_step_auc <= best_auc + 1e-4:
        print("No further improvement — stopping.")
        break

    selected_idx.append(best_i)
    remaining_idx.remove(best_i)
    best_auc = best_step_auc
    print(f"Step {step+1}: Added '{FEATURES[best_i]}' | CV AUC: {best_auc:.4f}")

# ============================================================
# 3. FINAL MODEL — Retrain on ALL train data, evaluate on test
# ============================================================
pipe_final = Pipeline([
    ("scaler", StandardScaler()),
    ("clf", RandomForestClassifier(
        n_estimators=300, max_depth=5,
        class_weight="balanced", random_state=RANDOM_STATE,
        n_jobs=-1
    ))
])

pipe_final.fit(X_train[:, selected_idx], y_train)

final_probs = pipe_final.predict_proba(X_test[:, selected_idx])[:, 1]
final_preds = pipe_final.predict(X_test[:, selected_idx])

auc = roc_auc_score(y_test, final_probs)
acc = accuracy_score(y_test, final_preds)
f1  = f1_score(y_test, final_preds)

print("\n" + "=" * 45)
print(f"SELECTED FEATURES : {FEATURES[selected_idx].tolist()}")
print(f"TEST AUC          : {auc:.4f}")
print(f"TEST ACCURACY     : {acc:.4f}")
print(f"TEST F1           : {f1:.4f}")
print("=" * 45)

NameError: name 'FEATURES' is not defined

In [19]:
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
RANDOM_STATE   = 42
# ============================================================
# 1. DATA SPLITTING — 80/20, Test Set Locked Away
# ============================================================
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, stratify=y, random_state=RANDOM_STATE
)

print(f"Data Split: Train={len(X_train)} | Test={len(X_test)}")

# ============================================================
# 2. FORWARD FEATURE SELECTION (CV on Train only)
#    Uses 5-fold stratified CV instead of a single val set
#    — much more stable estimates, no data wasted
# ============================================================
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

selected_idx = []
remaining_idx = list(range(X.shape[1]))
best_auc = 0.0

print("\n--- Running Forward Selection (5-Fold CV) ---")

for step in range(MAX_FEATURES):
    step_results = []

    for i in remaining_idx:
        trial_idx = selected_idx + [i]

        # Pipeline handles scaler + model inside each CV fold — no leakage
        pipe = Pipeline([
            ("scaler", StandardScaler()),
            ("clf", RandomForestClassifier(
                n_estimators=300, max_depth=5,
                class_weight="balanced", random_state=RANDOM_STATE,
                n_jobs=-1
            ))
        ])

        scores = cross_val_score(
            pipe, X_train[:, trial_idx], y_train,
            cv=cv, scoring="roc_auc", n_jobs=-1
        )
        step_results.append((i, scores.mean()))

    best_i, best_step_auc = max(step_results, key=lambda x: x[1])

    if best_step_auc <= best_auc + 1e-4:
        print("No further improvement — stopping.")
        break

    selected_idx.append(best_i)
    remaining_idx.remove(best_i)
    best_auc = best_step_auc
    print(f"Step {step+1}: Added '{FEATURES[best_i]}' | CV AUC: {best_auc:.4f}")

# ============================================================
# 3. FINAL MODEL — Retrain on ALL train data, evaluate on test
# ============================================================
pipe_final = Pipeline([
    ("scaler", StandardScaler()),
    ("clf", RandomForestClassifier(
        n_estimators=300, max_depth=5,
        class_weight="balanced", random_state=RANDOM_STATE,
        n_jobs=-1
    ))
])

pipe_final.fit(X_train[:, selected_idx], y_train)

final_probs = pipe_final.predict_proba(X_test[:, selected_idx])[:, 1]
final_preds = pipe_final.predict(X_test[:, selected_idx])

auc = roc_auc_score(y_test, final_probs)
acc = accuracy_score(y_test, final_preds)
f1  = f1_score(y_test, final_preds)

print("\n" + "=" * 45)
print(f"SELECTED FEATURES : {FEATURES[selected_idx].tolist()}")
print(f"TEST AUC          : {auc:.4f}")
print(f"TEST ACCURACY     : {acc:.4f}")
print(f"TEST F1           : {f1:.4f}")
print("=" * 45)

Data Split: Train=80 | Test=20

--- Running Forward Selection (5-Fold CV) ---
Step 1: Added 'Feature_HubPLI_Beta_Delta' | CV AUC: 0.6930
Step 2: Added 'Feature_Theta_Global_Abs' | CV AUC: 0.7390
Step 3: Added 'Feature_Theta_Asymmetry_Idx' | CV AUC: 0.7576
No further improvement — stopping.



TypeError: list indices must be integers or slices, not list

In [31]:
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

RANDOM_STATE = 000
MAX_FEATURES = 5  # cap forward selection at 5

# ============================================================
# 1. DATA SPLITTING — 80/20
# ============================================================
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=RANDOM_STATE
)
print(f"Data Split: Train={len(X_train)} | Test={len(X_test)}")

# ============================================================
# 2. FORWARD FEATURE SELECTION (Transparent Step-by-Step)
# ============================================================
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

selected_idx = []
remaining_idx = list(range(X.shape[1]))
best_auc = 0.0

print("\n--- Running Forward Selection (5-Fold CV) ---")
for step in range(MAX_FEATURES):
    step_results = []

    for i in remaining_idx:
        trial_idx = selected_idx + [i]

        pipe = Pipeline([
            ("scaler", StandardScaler()),
            ("clf", RandomForestClassifier(
                n_estimators=300,
                max_depth=5,
                class_weight="balanced",
                random_state=4,
                n_jobs=-1
            ))
        ])

        scores = cross_val_score(
            pipe, X_train[:, trial_idx], y_train,
            cv=cv, scoring="roc_auc", n_jobs=-1
        )
        step_results.append((i, scores.mean()))

    best_i, best_step_auc = max(step_results, key=lambda x: x[1])

    # stop if no improvement
    if best_step_auc <= best_auc + 1e-4:
        print("No further improvement — stopping.")
        break

    selected_idx.append(best_i)
    remaining_idx.remove(best_i)
    best_auc = best_step_auc
    print(f"Step {step+1}: Added '{FEATURES[best_i]}' | CV AUC: {best_auc:.4f}")

# ============================================================
# 3. FINAL MODEL
# ============================================================
pipe_final = Pipeline([
    ("scaler", StandardScaler()),
    ("clf", RandomForestClassifier(
        n_estimators=300,
        max_depth=5,
        class_weight="balanced",
        random_state=RANDOM_STATE,
        n_jobs=-1
    ))
])

pipe_final.fit(X_train[:, selected_idx], y_train)
final_probs = pipe_final.predict_proba(X_test[:, selected_idx])[:, 1]
final_preds = pipe_final.predict(X_test[:, selected_idx])

auc = roc_auc_score(y_test, final_probs)
acc = accuracy_score(y_test, final_preds)
f1  = f1_score(y_test, final_preds)

print("\n" + "="*45)
print(f"SELECTED FEATURES : {[FEATURES[i] for i in selected_idx]}")
print(f"TEST AUC          : {auc:.4f}")
print(f"TEST ACCURACY     : {acc:.4f}")
print(f"TEST F1           : {f1:.4f}")
print("="*45)

Data Split: Train=80 | Test=20

--- Running Forward Selection (5-Fold CV) ---
Step 1: Added 'Feature_Theta_Asymmetry_Idx' | CV AUC: 0.6096
Step 2: Added 'Feature_Theta_Global_Abs' | CV AUC: 0.7293
Step 3: Added 'Feature_HubPLI_Beta_Delta' | CV AUC: 0.7823
No further improvement — stopping.

SELECTED FEATURES : ['Feature_Theta_Asymmetry_Idx', 'Feature_Theta_Global_Abs', 'Feature_HubPLI_Beta_Delta']
TEST AUC          : 0.6869
TEST ACCURACY     : 0.6000
TEST F1           : 0.6364
