### Simple Logistic Reggresion

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# =========================
# 1. LOAD DATA
# =========================
df = pd.read_csv("ML_Feature_Matrix.csv").dropna()
X = df.drop(columns=['participant_id', 'Target_MoCA', 'Label_Impaired'])
y = df['Label_Impaired'].values
feature_names = X.columns.tolist()

# =========================
# 2. SCALE FEATURES
# =========================
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# =========================
# 3. CONFIG
# =========================
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
clf = LogisticRegression(max_iter=1000, solver='liblinear', random_state=42)

# =========================
# 4. CROSS-VALIDATED METRICS
# =========================
auc_scores = cross_val_score(clf, X_scaled, y, cv=cv, scoring='roc_auc')
acc_scores = cross_val_score(clf, X_scaled, y, cv=cv, scoring='accuracy')

print(f"Logistic Regression CV AUC: {np.mean(auc_scores):.3f} ± {np.std(auc_scores):.3f}")
print(f"Logistic Regression CV Accuracy: {np.mean(acc_scores):.3f} ± {np.std(acc_scores):.3f}")


Logistic Regression CV AUC: 0.648 ± 0.117
Logistic Regression CV Accuracy: 0.550 ± 0.095


### Most supported features

In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import make_scorer, f1_score, precision_score, recall_score, balanced_accuracy_score

# =========================
# LOAD DATA
# =========================
df = pd.read_csv("ML_Feature_Matrix.csv")
X = df.drop(columns=['participant_id', 'Target_MoCA', 'Label_Impaired'])
y = df['Label_Impaired'].values
feature_names = X.columns.tolist()

# =========================
# SCALE FEATURES
# =========================
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# =========================
# CONFIG
# =========================
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

classifiers = {
    "RandomForest": RandomForestClassifier(n_estimators=200, random_state=42),
}

scoring_metrics = {
    'AUC': 'roc_auc',
    'Accuracy': 'accuracy',
    'F1': make_scorer(f1_score),
    'Precision': make_scorer(precision_score),
    'Recall': make_scorer(recall_score),
    'BalancedAcc': make_scorer(balanced_accuracy_score)
}

# =========================
# FORWARD SELECTION FUNCTION
# =========================
def forward_selection(X, y, clf, max_features=None):
    selected = []
    remaining = list(range(X.shape[1]))
    best_score = 0.0
    best_metrics = {}
    max_features = max_features or X.shape[1]

    while remaining and len(selected) < max_features:
        trial_results = []
        for i in remaining:
            trial_features = selected + [i]
            metric_scores = {name: np.mean(cross_val_score(clf, X[:, trial_features], y, cv=cv, scoring=sc)) 
                             for name, sc in scoring_metrics.items()}
            trial_results.append((metric_scores, i))
        # sort by AUC
        trial_results.sort(key=lambda x: x[0]['AUC'], reverse=True)
        if trial_results[0][0]['AUC'] > best_score:
            best_score = trial_results[0][0]['AUC']
            best_feat = trial_results[0][1]
            best_metrics = trial_results[0][0]
            selected.append(best_feat)
            remaining.remove(best_feat)
            print(f"Added: {feature_names[best_feat]} | AUC: {best_metrics['AUC']:.3f} | Acc: {best_metrics['Accuracy']:.3f} | "
                  f"F1: {best_metrics['F1']:.3f} | Precision: {best_metrics['Precision']:.3f} | Recall: {best_metrics['Recall']:.3f} | "
                  f"BalancedAcc: {best_metrics['BalancedAcc']:.3f} | Total features: {len(selected)}")
        else:
            break
    return [feature_names[i] for i in selected], best_metrics

# =========================
# RUN FOR EACH CLASSIFIER
# =========================
for name, clf in classifiers.items():
    print(f"\nStarting forward feature selection for {name}...")
    sel_features, metrics = forward_selection(X_scaled, y, clf)
    print(f"\n{name} selected features: {sel_features}")
    print("Final CV metrics:")
    for metric_name, value in metrics.items():
        print(f"{metric_name}: {value:.3f}")



Starting forward feature selection for RandomForest...
Added: Feature_Sync_Delta_ClassA_Frontal | AUC: 0.638 | Acc: 0.570 | F1: 0.619 | Precision: 0.588 | Recall: 0.662 | BalancedAcc: 0.568 | Total features: 1
Added: Feature_Theta_Global_Abs | AUC: 0.764 | Acc: 0.690 | F1: 0.688 | Precision: 0.729 | Recall: 0.660 | BalancedAcc: 0.693 | Total features: 2
Added: Feature_HubPLI_Beta_Delta | AUC: 0.793 | Acc: 0.730 | F1: 0.739 | Precision: 0.776 | Recall: 0.715 | BalancedAcc: 0.731 | Total features: 3
Added: Feature_DeltaBeta_Global | AUC: 0.797 | Acc: 0.740 | F1: 0.746 | Precision: 0.775 | Recall: 0.735 | BalancedAcc: 0.741 | Total features: 4
Added: Feature_Theta_Asymmetry_Idx | AUC: 0.812 | Acc: 0.740 | F1: 0.743 | Precision: 0.761 | Recall: 0.733 | BalancedAcc: 0.739 | Total features: 5
Added: Feature_Instab_Theta_duration_Var | AUC: 0.813 | Acc: 0.720 | F1: 0.726 | Precision: 0.755 | Recall: 0.715 | BalancedAcc: 0.717 | Total features: 6

RandomForest selected features: ['Feature_Syn

In [23]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RepeatedStratifiedKFold, cross_validate
from sklearn.metrics import make_scorer, f1_score, precision_score, recall_score, balanced_accuracy_score, roc_auc_score
import joblib

# =========================
# CONFIG: Set features here
# =========================
FEATURES = [
    'Feature_Instab_Delta_occurrence_Var',
    'Feature_HubPLI_Beta_Frontal',
    'Feature_Instab_Delta_coverage_CV',
    'Feature_Theta_Asymmetry_Idx',
    'Feature_DWT_Theta_over_alpha_frac',
    'Feature_Gamma_Posterior_Abs',

]

# =========================
# LOAD DATA
# =========================
df = pd.read_csv("ML_Feature_Matrix.csv")
X = df[FEATURES].values
y = df['Label_Impaired'].values

# =========================
# MODEL CONFIG
# =========================
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=4, random_state=42)

metrics = {
    'AUC': 'roc_auc',
    'Accuracy': 'accuracy',
    'F1': make_scorer(f1_score),
    'Precision': make_scorer(precision_score),
    'Recall': make_scorer(recall_score),
    'BalancedAcc': make_scorer(balanced_accuracy_score)
}

# =========================
# BUILD PIPELINE
# =========================
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('svc', SVC(kernel='linear', class_weight='balanced', probability=True, random_state=42))
])

# =========================
# CROSS-VALIDATED EVALUATION
# =========================
cv_results = cross_validate(pipeline, X, y, cv=cv, scoring=metrics, return_train_score=False)

print("\n--- SVM Classification Performance ---")
for metric_name, scores in cv_results.items():
    if metric_name.startswith('test_'):
        print(f"{metric_name[5:]}: {np.mean(scores):.3f} ± {np.std(scores):.3f}")

# =========================
# TRAIN FINAL MODEL ON FULL DATA
# =========================
pipeline.fit(X, y)
joblib.dump(pipeline, "PD_MoCA_SVM_SelectedFeatures.pkl")
print("\nFinal SVM model saved as PD_MoCA_SVM_SelectedFeatures.pkl")



--- SVM Classification Performance ---
AUC: 0.806 ± 0.083
Accuracy: 0.715 ± 0.076
F1: 0.702 ± 0.097
Precision: 0.782 ± 0.106
Recall: 0.651 ± 0.137
BalancedAcc: 0.719 ± 0.076

Final SVM model saved as PD_MoCA_SVM_SelectedFeatures.pkl
