In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer, f1_score, precision_score, recall_score, balanced_accuracy_score
import joblib

# =========================
# LOAD DATA
# =========================
df = pd.read_csv("ML_Feature_Matrix.csv")

# Define the 6 features from forward selection
FEATURES = [
    'Feature_Sync_Delta_ClassA_Frontal',
    'Feature_Theta_Global_Abs',
    'Feature_HubPLI_Beta_Delta',
    'Feature_DeltaBeta_Global',
    'Feature_Theta_Asymmetry_Idx',
    'Feature_Instab_Theta_duration_Var'
]

X = df[FEATURES].values
y = df['Label_Impaired'].values

# =========================
# CONFIG
# =========================
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

scoring_metrics = {
    'AUC': 'roc_auc',
    'Accuracy': 'accuracy',
    'F1': make_scorer(f1_score),
    'Precision': make_scorer(precision_score),
    'Recall': make_scorer(recall_score),
    'BalancedAcc': make_scorer(balanced_accuracy_score)
}

# =========================
# BUILD FINAL MODEL PIPELINE
# =========================
final_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('rf', RandomForestClassifier(
        n_estimators=800,
        max_depth=6,
        min_samples_leaf=4,
        min_samples_split=8,
        class_weight='balanced',
        random_state=42,
        n_jobs=-1
    ))
])

# =========================
# CROSS-VALIDATED EVALUATION
# =========================
cv_results = cross_validate(final_pipeline, X, y, cv=cv, scoring=scoring_metrics, return_train_score=False)

print("\n--- Final 6-Feature Model CV Performance ---")
for metric_name, scores in cv_results.items():
    if metric_name.startswith('test_'):
        print(f"{metric_name}: {np.mean(scores):.3f} ± {np.std(scores):.3f}")

# =========================
# TRAIN FINAL MODEL ON FULL DATA
# =========================
final_pipeline.fit(X, y)

# =========================
# FEATURE IMPORTANCE EXTRACTION
# =========================
rf_model = final_pipeline.named_steps['rf']
importances = pd.Series(rf_model.feature_importances_, index=FEATURES).sort_values(ascending=False)

print("\n--- Feature Importances ---")
print(importances)

# =========================
# SAVE MODEL
# =========================
joblib.dump(final_pipeline, "PD_MoCA_RF_6Feature_Final.pkl")
print("\nFinal model saved as PD_MoCA_RF_6Feature_Final.pkl")



--- Final 6-Feature Model CV Performance ---
test_AUC: 0.767 ± 0.099
test_Accuracy: 0.730 ± 0.040
test_F1: 0.732 ± 0.063
test_Precision: 0.769 ± 0.063
test_Recall: 0.715 ± 0.113
test_BalancedAcc: 0.729 ± 0.040

--- Feature Importances ---
Feature_Theta_Global_Abs             0.250293
Feature_Theta_Asymmetry_Idx          0.201214
Feature_Sync_Delta_ClassA_Frontal    0.178198
Feature_HubPLI_Beta_Delta            0.150434
Feature_DeltaBeta_Global             0.134646
Feature_Instab_Theta_duration_Var    0.085216
dtype: float64

Final model saved as PD_MoCA_RF_6Feature_Final.pkl


In [3]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer, roc_auc_score, f1_score, accuracy_score, precision_score, recall_score, balanced_accuracy_score
import joblib

# =========================
# LOAD DATA
# =========================
df = pd.read_csv("ML_Feature_Matrix.csv")

# Use your 6 selected features
FEATURES = [
    'Feature_Sync_Delta_ClassA_Frontal',
    'Feature_Theta_Global_Abs',
    'Feature_HubPLI_Beta_Delta',
    'Feature_DeltaBeta_Global',
    'Feature_Theta_Asymmetry_Idx',
    'Feature_Instab_Theta_duration_Var'
]

X = df[FEATURES].values
y = df['Label_Impaired'].values  # binary target

# =========================
# CONFIG
# =========================
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

scoring_metrics = {
    'AUC': 'roc_auc',
    'Accuracy': 'accuracy',
    'F1': make_scorer(f1_score),
    'Precision': make_scorer(precision_score),
    'Recall': make_scorer(recall_score),
    'BalancedAcc': make_scorer(balanced_accuracy_score)
}

# =========================
# BUILD PIPELINE
# =========================
rf_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('rf', RandomForestClassifier(
        n_estimators=800,
        max_depth=6,
        min_samples_leaf=4,
        min_samples_split=8,
        class_weight='balanced',
        random_state=42,
        n_jobs=-1
    ))
])

# =========================
# CROSS-VALIDATED EVALUATION
# =========================
cv_results = cross_validate(rf_pipeline, X, y, cv=cv, scoring=scoring_metrics, return_train_score=False)

print("\n--- CV Performance ---")
for metric_name, scores in cv_results.items():
    if metric_name.startswith('test_'):
        print(f"{metric_name}: {np.mean(scores):.3f} ± {np.std(scores):.3f}")

# =========================
# TRAIN FINAL MODEL ON FULL DATA
# =========================
rf_pipeline.fit(X, y)

# =========================
# CONTINUOUS RISK SCORE
# =========================
# This is the probability of being impaired (continuous 0–1)
risk_scores = rf_pipeline.predict_proba(X)[:, 1]
df['MoCA_Risk_Score'] = risk_scores
print("\nSample continuous cognitive risk scores:")
print(df[['participant_id', 'MoCA_Risk_Score']].head())

# =========================
# FEATURE IMPORTANCES
# =========================
rf_model = rf_pipeline.named_steps['rf']
importances = pd.Series(rf_model.feature_importances_, index=FEATURES).sort_values(ascending=False)
print("\n--- Feature Importances ---")
print(importances)

# =========================
# SAVE MODEL
# =========================
joblib.dump(rf_pipeline, "PD_MoCA_RF_6Feature_RiskScore.pkl")
print("\nFinal model saved as PD_MoCA_RF_6Feature_RiskScore.pkl")



--- CV Performance ---
test_AUC: 0.767 ± 0.099
test_Accuracy: 0.730 ± 0.040
test_F1: 0.732 ± 0.063
test_Precision: 0.769 ± 0.063
test_Recall: 0.715 ± 0.113
test_BalancedAcc: 0.729 ± 0.040

Sample continuous cognitive risk scores:
  participant_id  MoCA_Risk_Score
0        sub-001         0.503580
1        sub-002         0.862633
2        sub-003         0.585766
3        sub-004         0.702518
4        sub-005         0.806243

--- Feature Importances ---
Feature_Theta_Global_Abs             0.250293
Feature_Theta_Asymmetry_Idx          0.201214
Feature_Sync_Delta_ClassA_Frontal    0.178198
Feature_HubPLI_Beta_Delta            0.150434
Feature_DeltaBeta_Global             0.134646
Feature_Instab_Theta_duration_Var    0.085216
dtype: float64

Final model saved as PD_MoCA_RF_6Feature_RiskScore.pkl


### Optimize

In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer, f1_score, accuracy_score, precision_score, recall_score, balanced_accuracy_score, roc_auc_score
import joblib

# =========================
# LOAD DATA
# =========================
df = pd.read_csv("ML_Feature_Matrix.csv")

FEATURES = [
    'Feature_Sync_Delta_ClassA_Frontal',
    'Feature_Theta_Global_Abs',
    'Feature_HubPLI_Beta_Delta',
    'Feature_DeltaBeta_Global',
    'Feature_Theta_Asymmetry_Idx',
    'Feature_Instab_Theta_duration_Var'
]

X = df[FEATURES].values
y = df['Label_Impaired'].values

# =========================
# PIPELINE
# =========================
rf_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('rf', RandomForestClassifier(random_state=42, n_jobs=-1, class_weight='balanced'))
])

# =========================
# HYPERPARAMETER SPACE
# =========================
param_dist = {
    'rf__n_estimators': [500, 800, 1000, 1200],
    'rf__max_depth': [4, 6, 8, 10, None],
    'rf__min_samples_leaf': [2, 4, 6, 8],
    'rf__min_samples_split': [2, 4, 6, 8],
    'rf__max_features': ['sqrt', 'log2', 0.8, 1.0]
}

# =========================
# CROSS-VALIDATION & RANDOM SEARCH
# =========================
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

search = RandomizedSearchCV(
    rf_pipeline, 
    param_distributions=param_dist, 
    n_iter=30, 
    cv=cv, 
    scoring='roc_auc', 
    n_jobs=-1, 
    random_state=42,
    verbose=1
)

search.fit(X, y)

print("\n=== Best Hyperparameters ===")
print(search.best_params_)
print(f"Best CV AUC: {search.best_score_:.3f}")

# =========================
# EVALUATE CV METRICS
# =========================
best_model = search.best_estimator_

# Use cross_validate to report multiple metrics
scoring_metrics = {
    'AUC': 'roc_auc',
    'Accuracy': 'accuracy',
    'F1': make_scorer(f1_score),
    'Precision': make_scorer(precision_score),
    'Recall': make_scorer(recall_score),
    'BalancedAcc': make_scorer(balanced_accuracy_score)
}

cv_results = cross_validate(best_model, X, y, cv=cv, scoring=scoring_metrics, return_train_score=False)

print("\n--- Optimized CV Performance ---")
for metric_name, scores in cv_results.items():
    if metric_name.startswith('test_'):
        print(f"{metric_name}: {np.mean(scores):.3f} ± {np.std(scores):.3f}")

# =========================
# TRAIN FINAL MODEL ON FULL DATA
# =========================
best_model.fit(X, y)

# =========================
# CONTINUOUS RISK SCORE
# =========================
risk_scores = best_model.predict_proba(X)[:, 1]
df['MoCA_Risk_Score'] = risk_scores
print("\nSample continuous cognitive risk scores:")
print(df[['participant_id', 'MoCA_Risk_Score']].tail())

# =========================
# FEATURE IMPORTANCES
# =========================
rf_model = best_model.named_steps['rf']
importances = pd.Series(rf_model.feature_importances_, index=FEATURES).sort_values(ascending=False)
print("\n--- Feature Importances ---")
print(importances)

# =========================
# SAVE FINAL MODEL
# =========================
joblib.dump(best_model, "PD_MoCA_RF_6Feature_Optimized.pkl")
print("\nFinal optimized model saved as PD_MoCA_RF_6Feature_Optimized.pkl")


Fitting 5 folds for each of 30 candidates, totalling 150 fits


KeyboardInterrupt: 