### Simple Logistic Reggresion

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# =========================
# 1. LOAD DATA
# =========================
df = pd.read_csv("ML_Feature_Matrix.csv").dropna()
X = df.drop(columns=['participant_id', 'Target_MoCA', 'Label_Impaired'])
y = df['Label_Impaired'].values
feature_names = X.columns.tolist()

# =========================
# 2. SCALE FEATURES
# =========================
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# =========================
# 3. CONFIG
# =========================
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
clf = LogisticRegression(max_iter=1000, solver='liblinear', random_state=42)

# =========================
# 4. CROSS-VALIDATED METRICS
# =========================
auc_scores = cross_val_score(clf, X_scaled, y, cv=cv, scoring='roc_auc')
acc_scores = cross_val_score(clf, X_scaled, y, cv=cv, scoring='accuracy')

print(f"Logistic Regression CV AUC: {np.mean(auc_scores):.3f} ± {np.std(auc_scores):.3f}")
print(f"Logistic Regression CV Accuracy: {np.mean(acc_scores):.3f} ± {np.std(acc_scores):.3f}")


Logistic Regression CV AUC: 0.648 ± 0.117
Logistic Regression CV Accuracy: 0.550 ± 0.095


### Most supported features

In [23]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RepeatedStratifiedKFold, cross_validate
from sklearn.metrics import make_scorer, f1_score, precision_score, recall_score, balanced_accuracy_score, roc_auc_score
import joblib

# =========================
# CONFIG: Set features here
# =========================
FEATURES = [
    'Feature_Instab_Delta_occurrence_Var',
    'Feature_HubPLI_Beta_Frontal',
    'Feature_Instab_Delta_coverage_CV',
    'Feature_Theta_Asymmetry_Idx',
    'Feature_DWT_Theta_over_alpha_frac',
    'Feature_Gamma_Posterior_Abs',

]

# =========================
# LOAD DATA
# =========================
df = pd.read_csv("ML_Feature_Matrix.csv")
X = df[FEATURES].values
y = df['Label_Impaired'].values

# =========================
# MODEL CONFIG
# =========================
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=4, random_state=42)

metrics = {
    'AUC': 'roc_auc',
    'Accuracy': 'accuracy',
    'F1': make_scorer(f1_score),
    'Precision': make_scorer(precision_score),
    'Recall': make_scorer(recall_score),
    'BalancedAcc': make_scorer(balanced_accuracy_score)
}

# =========================
# BUILD PIPELINE
# =========================
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('svc', SVC(kernel='linear', class_weight='balanced', probability=True, random_state=42))
])

# =========================
# CROSS-VALIDATED EVALUATION
# =========================
cv_results = cross_validate(pipeline, X, y, cv=cv, scoring=metrics, return_train_score=False)

print("\n--- SVM Classification Performance ---")
for metric_name, scores in cv_results.items():
    if metric_name.startswith('test_'):
        print(f"{metric_name[5:]}: {np.mean(scores):.3f} ± {np.std(scores):.3f}")

# =========================
# TRAIN FINAL MODEL ON FULL DATA
# =========================
pipeline.fit(X, y)
joblib.dump(pipeline, "PD_MoCA_SVM_SelectedFeatures.pkl")
print("\nFinal SVM model saved as PD_MoCA_SVM_SelectedFeatures.pkl")



--- SVM Classification Performance ---
AUC: 0.806 ± 0.083
Accuracy: 0.715 ± 0.076
F1: 0.702 ± 0.097
Precision: 0.782 ± 0.106
Recall: 0.651 ± 0.137
BalancedAcc: 0.719 ± 0.076

Final SVM model saved as PD_MoCA_SVM_SelectedFeatures.pkl


In [3]:
import numpy as np
import pandas as pd
from collections import Counter

from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, balanced_accuracy_score

# =========================
# LOAD DATA
# =========================
df = pd.read_csv(r"C:\Users\User\Documents\EEG_Project\dataSheets\ML_Feature_Matrix.csv")

X = df.drop(columns=["participant_id", "Target_MoCA", "Label_Impaired"])
y = df["Label_Impaired"].values
feature_names = np.array(X.columns)

# =========================
# CV SETUP
# =========================
outer_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=4224)
inner_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=4242)

# =========================
# BASE MODEL
# =========================
base_model = LogisticRegression(
    penalty="l2",
    solver="liblinear",
    max_iter=1000
)

# =========================
# STORAGE
# =========================
outer_auc = []
outer_bal_acc = []
feature_counter = Counter()

# =========================
# NESTED CV LOOP
# =========================
for fold, (train_idx, test_idx) in enumerate(outer_cv.split(X, y), 1):

    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    # ----- Inner pipeline: scaling + feature selection + model -----
    pipe = Pipeline([
        ("scaler", StandardScaler()),
        ("sfs", SequentialFeatureSelector(
            base_model,
            n_features_to_select=5,
            direction="forward",
            scoring="roc_auc",
            cv=inner_cv,
            n_jobs=-1
        )),
        ("clf", base_model)
    ])

    # ----- Fit ONLY on training data -----
    pipe.fit(X_train, y_train)

    # ----- Evaluate on held-out outer fold -----
    y_prob = pipe.predict_proba(X_test)[:, 1]
    y_pred = pipe.predict(X_test)

    outer_auc.append(roc_auc_score(y_test, y_prob))
    outer_bal_acc.append(balanced_accuracy_score(y_test, y_pred))

    # ----- Track selected features (from training only) -----
    selected_mask = pipe.named_steps["sfs"].get_support()
    selected_features = feature_names[selected_mask]
    feature_counter.update(selected_features)

    print(f"Fold {fold} complete | AUC={outer_auc[-1]:.3f} | BalAcc={outer_bal_acc[-1]:.3f}")

# =========================
# RESULTS
# =========================
print("\n===== Nested CV Performance =====")
print(f"Mean AUC: {np.mean(outer_auc):.3f} ± {np.std(outer_auc):.3f}")
print(f"Mean Balanced Acc: {np.mean(outer_bal_acc):.3f} ± {np.std(outer_bal_acc):.3f}")

print("\n===== Feature Selection Frequency =====")
for feat, count in feature_counter.most_common():
    print(f"{feat}: selected in {count}/5 folds")



Fold 1 complete | AUC=0.616 | BalAcc=0.596




Fold 2 complete | AUC=0.677 | BalAcc=0.596




Fold 3 complete | AUC=0.626 | BalAcc=0.485




Fold 4 complete | AUC=0.760 | BalAcc=0.700
Fold 5 complete | AUC=0.640 | BalAcc=0.600

===== Nested CV Performance =====
Mean AUC: 0.664 ± 0.052
Mean Balanced Acc: 0.595 ± 0.068

===== Feature Selection Frequency =====
Feature_Gamma_Posterior_Abs: selected in 4/5 folds
Feature_HubPLI_Beta_Frontal: selected in 3/5 folds
Feature_Theta_Asymmetry_Idx: selected in 3/5 folds
Feature_Delta_CentralParietal_Abs: selected in 2/5 folds
Feature_DWT_Theta_energy_var: selected in 2/5 folds
Feature_Theta_F5_Abs: selected in 1/5 folds
Feature_Instab_Alpha_occurrence_Var: selected in 1/5 folds
Feature_DWT_Theta_over_alpha_frac: selected in 1/5 folds
Feature_Sync_Delta_ClassA_Frontal: selected in 1/5 folds
Feature_DeltaBeta_Global: selected in 1/5 folds
Feature_Instab_Theta_duration_Var: selected in 1/5 folds
Feature_Instab_Delta_occurrence_Var: selected in 1/5 folds
Feature_ThetaAlpha_Global: selected in 1/5 folds
Feature_Theta_Temporal_Correlation: selected in 1/5 folds
Feature_Instab_Theta_occurrence

