In [27]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFE
from sklearn.model_selection import KFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer, mean_squared_error, r2_score, mean_absolute_error
import joblib

# =========================
# LOAD DATA
# =========================
df = pd.read_csv("ML_Feature_Matrix.csv")

FEATURES = [
    'Feature_ThetaAlpha_Global','Feature_DeltaBeta_Global','Feature_PLI_Beta_C3P3','Feature_PLI_Beta_F3P4',
    'Feature_Network_FrontPost_Beta_PLI','Feature_DPBF_Alpha','Feature_HubPLI_Beta_Frontal','Feature_HubPLI_Alpha_Posterior',
    'Feature_HubPLI_Beta_Posterior','Feature_HubPLI_Beta_Delta','Feature_Delta_CentralParietal_Abs','Feature_Theta_F5_Abs',
    'Feature_Theta_Global_Abs','Feature_Theta_Frontal_ROI_Abs','Feature_ThetaAlpha_Peak_Freq','Feature_Gamma_Posterior_Abs',
    'Feature_Theta_Asymmetry_Idx','Feature_DWT_Theta_energy_var','Feature_DWT_Theta_over_alpha_frac','Feature_Theta_Temporal_Correlation',
    'Feature_Delta_Functional_Strength_Posterior','Feature_Sync_Delta_ClassA_Frontal','Feature_Sync_Theta_ClassD_Central',
    'Feature_Instab_Alpha_duration_Var','Feature_Instab_Delta_duration_Var','Feature_Instab_Theta_duration_Var',
    'Feature_Instab_Alpha_occurrence_Var','Feature_Instab_Delta_occurrence_Var','Feature_Instab_Theta_occurrence_Var',
    'Feature_Instab_Alpha_coverage_CV','Feature_Instab_Delta_coverage_CV','Feature_Instab_Theta_coverage_CV'
]

X = df[FEATURES].values
y = df['Target_MoCA'].values  # continuous MoCA

# =========================
# CONFIG
# =========================
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# Regression scorers
scoring_metrics = {
    'R2': make_scorer(r2_score),
    'MAE': make_scorer(mean_absolute_error, greater_is_better=False),
    'RMSE': make_scorer(lambda y_true, y_pred: mean_squared_error(y_true, y_pred, squared=False), greater_is_better=False)
}

# =========================
# BUILD PIPELINE
# =========================
rf_regressor = RandomForestRegressor(
    n_estimators=800,
    max_depth=6,
    min_samples_leaf=4,
    min_samples_split=8,
    random_state=42,
    n_jobs=-1
)

pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('regressor', rf_regressor)
])

# =========================
# APPLY RFE
# =========================
# Let's select top 10 features for illustration
rfe_selector = RFE(estimator=rf_regressor, n_features_to_select=10, step=1)
pipeline_rfe = Pipeline([
    ('scaler', StandardScaler()),
    ('rfe', rfe_selector),
    ('regressor', rf_regressor)
])

# =========================
# CROSS-VALIDATION
# =========================
r2_scores = cross_val_score(pipeline_rfe, X, y, cv=cv, scoring='r2')
mae_scores = -cross_val_score(pipeline_rfe, X, y, cv=cv, scoring=make_scorer(mean_absolute_error))
rmse_scores = -cross_val_score(pipeline_rfe, X, y, cv=cv, scoring=make_scorer(lambda y_true, y_pred: mean_squared_error(y_true, y_pred, squared=False)))

print("\n--- RFE RandomForest Regression CV ---")
print(f"R2: {np.mean(r2_scores):.3f} ± {np.std(r2_scores):.3f}")
print(f"MAE: {np.mean(mae_scores):.3f} ± {np.std(mae_scores):.3f}")
print(f"RMSE: {np.mean(rmse_scores):.3f} ± {np.std(rmse_scores):.3f}")

# =========================
# TRAIN FINAL MODEL ON FULL DATA
# =========================
pipeline_rfe.fit(X, y)

# Get selected features
selected_features = [FEATURES[i] for i in rfe_selector.get_support(indices=True)]
print("\nSelected Features after RFE:")
print(selected_features)

# Feature importances
importances = pd.Series(pipeline_rfe.named_steps['regressor'].feature_importances_, index=selected_features).sort_values(ascending=False)
print("\n--- Feature Importances ---")
print(importances)

# =========================
# SAVE MODEL
# =========================
joblib.dump(pipeline_rfe, "PD_MoCA_RF_Regression_RFE.pkl")
print("\nFinal RFE regression model saved as PD_MoCA_RF_Regression_RFE.pkl")


Traceback (most recent call last):
  File "c:\Users\User\anaconda3\envs\eeg_ml\Lib\site-packages\sklearn\metrics\_scorer.py", line 166, in __call__
    score = scorer._score(
            ^^^^^^^^^^^^^^
  File "c:\Users\User\anaconda3\envs\eeg_ml\Lib\site-packages\sklearn\metrics\_scorer.py", line 417, in _score
    return self._sign * self._score_func(y_true, y_pred, **scoring_kwargs)
                        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\User\AppData\Local\Temp\ipykernel_41232\3697628036.py", line 76, in <lambda>
    rmse_scores = -cross_val_score(pipeline_rfe, X, y, cv=cv, scoring=make_scorer(lambda y_true, y_pred: mean_squared_error(y_true, y_pred, squared=False)))
                                                                                                         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\User\anaconda3\envs\eeg_ml\Lib\site-packages\sklearn\utils\_param_validation.py", line 196, in wrapper
    params 


--- RFE RandomForest Regression CV ---
R2: 0.038 ± 0.111
MAE: -3.045 ± 0.496
RMSE: nan ± nan

Selected Features after RFE:
['Feature_ThetaAlpha_Global', 'Feature_Theta_Global_Abs', 'Feature_Gamma_Posterior_Abs', 'Feature_Theta_Asymmetry_Idx', 'Feature_DWT_Theta_energy_var', 'Feature_DWT_Theta_over_alpha_frac', 'Feature_Theta_Temporal_Correlation', 'Feature_Sync_Theta_ClassD_Central', 'Feature_Instab_Alpha_occurrence_Var', 'Feature_Instab_Theta_occurrence_Var']

--- Feature Importances ---
Feature_DWT_Theta_over_alpha_frac      0.248130
Feature_Instab_Theta_occurrence_Var    0.132009
Feature_Theta_Asymmetry_Idx            0.096947
Feature_ThetaAlpha_Global              0.096719
Feature_Gamma_Posterior_Abs            0.084700
Feature_DWT_Theta_energy_var           0.070852
Feature_Theta_Global_Abs               0.070575
Feature_Theta_Temporal_Correlation     0.070173
Feature_Sync_Theta_ClassD_Central      0.065649
Feature_Instab_Alpha_occurrence_Var    0.064247
dtype: float64

Final RFE