In [3]:
import re
import numpy as np
import pandas as pd
from pathlib import Path

from sklearn.model_selection import GroupKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.base import clone

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

# --- Load data ---
# Each cohort is read from its own CSV and tagged with a binary class label
# (0 = control file, 1 = PD file) before the two are stacked into one frame.
con_df = pd.read_csv(r"E:\USA_PD_2024\Analysis\ppr6\COP\Data\Walk\Full\Controlled.csv")
con_df["label"] = 0

pd_df = pd.read_csv(r"E:\USA_PD_2024\Analysis\ppr6\COP\Data\Walk\Full\PD.csv")
pd_df["label"] = 1

# ignore_index=True gives the combined frame a fresh 0..n-1 row index.
df = pd.concat([con_df, pd_df], ignore_index=True)

# --- Extract SubjectID from File name ---
def get_id(fname):
    """Return the subject identifier embedded in a file name.

    The identifier is the digit run captured by the leftmost match of
    ``_<optional letters><digits>`` in the file's stem (the name without
    directory and final extension). If the stem contains no such pattern,
    the stem itself is returned unchanged.
    """
    stem = Path(fname).stem
    match = re.search(r"_[a-zA-Z]*([0-9]+)", stem)
    if match is None:
        return stem
    return match.group(1)

df["SubjectID"] = df["File"].apply(get_id)

# --- Targets and grouping keys ---
y = df["label"].to_numpy()
groups = df["SubjectID"].to_numpy()

# --- Feature matrix: drop label, File, SubjectID and Age, then one-hot encode Sex ---
X = pd.get_dummies(
    df.drop(columns=["label", "File", "SubjectID", "Age"]),
    columns=["Sex"],
    drop_first=True,
)

# --- Cross-validation setup ---
# GroupKFold keeps all rows sharing a SubjectID in the same fold.
cv = GroupKFold(n_splits=5)

# --- Forward Selection based on F1-score ---
def groupwise_sfs(pipe, X, y, groups, splitter=None):
    """Greedy forward feature selection scored by grouped cross-validated F1.

    Starting from an empty set, each round tries every remaining column in
    turn, fits a fresh clone of ``pipe`` on each training fold and scores the
    held-out fold with ``f1_score``. The column with the highest mean F1 is
    kept, but only if it strictly beats the best mean F1 seen so far (the
    initial bar is 0). Selection stops when no column improves the score or
    when every column has been selected.

    Parameters
    ----------
    pipe : estimator
        Unfitted estimator or pipeline; it is cloned for every fit and never
        fitted itself.
    X : pandas.DataFrame
        Candidate features, one column per feature.
    y : array-like
        Labels, indexed positionally with the fold indices.
    groups : array-like
        Group key per row, forwarded to the splitter.
    splitter : cross-validator, optional
        Object exposing ``split(X, y, groups)``. Defaults to the module-level
        ``cv`` so existing callers are unaffected.

    Returns
    -------
    list
        Selected column labels in the order they were added (possibly empty).
    """
    if splitter is None:
        splitter = cv

    best_score = 0
    selected = []
    remaining = list(X.columns)
    while remaining:
        best_feat = None
        for feat in remaining:
            # Build the candidate subset once instead of once per fold/use.
            X_trial = X[selected + [feat]]
            fold_scores = []
            for tr, te in splitter.split(X_trial, y, groups):
                model = clone(pipe)
                model.fit(X_trial.iloc[tr], y[tr])
                preds = model.predict(X_trial.iloc[te])
                fold_scores.append(f1_score(y[te], preds))
            mean_score = np.mean(fold_scores)
            # Strict ">" means ties keep the earlier candidate.
            if mean_score > best_score:
                best_score = mean_score
                best_feat = feat
        # Compare against None explicitly: a falsy column label such as 0 or
        # "" is a legitimate winner and must not end the search.
        if best_feat is None:
            break
        selected.append(best_feat)
        remaining.remove(best_feat)
    return selected

# --- Define models and hyperparameters ---
# Each entry is (display name, unfitted pipeline, parameter grid for
# GridSearchCV). Grid keys use the "clf__" prefix because the classifier is
# always the pipeline step named "clf".
models = [
    (
        "SVM (RBF)",
        Pipeline(steps=[
            ("scaler", StandardScaler()),
            ("clf", SVC(kernel="rbf", probability=True, random_state=42)),
        ]),
        {"clf__C": [0.1, 1, 10], "clf__gamma": ["scale", 0.1]},
    ),
    (
        "Random Forest",
        Pipeline(steps=[
            ("clf", RandomForestClassifier(random_state=42, n_jobs=-1)),
        ]),
        {"clf__n_estimators": [100], "clf__max_depth": [None]},
    ),
    (
        "Logistic Regression",
        Pipeline(steps=[
            ("scaler", StandardScaler()),
            ("clf", LogisticRegression(max_iter=500, random_state=42)),
        ]),
        {"clf__C": [1]},
    ),
    (
        "k-NN",
        Pipeline(steps=[
            ("scaler", StandardScaler()),
            ("clf", KNeighborsClassifier()),
        ]),
        {"clf__n_neighbors": [5, 7]},
    ),
    (
        "Gaussian NB",
        Pipeline(steps=[
            ("clf", GaussianNB()),
        ]),
        {},  # nothing to tune; the grid search evaluates the defaults only
    ),
]

# --- Train and Evaluate each model ---
# NOTE(review): feature selection, hyperparameter tuning and the final scoring
# all use the same GroupKFold splits of the same data, so the metrics printed
# below are optimistically biased. An unbiased estimate would need nested
# cross-validation (selection + tuning inside each outer training fold).
for name, pipe, param_grid in models:
    print(f"\n--- {name} Feature Selection ---")
    selected_feats = groupwise_sfs(pipe, X, y, groups)
    print(f"Selected features: {selected_feats}")

    if not selected_feats:
        # Forward selection returns an empty list when no candidate reaches a
        # mean F1 above 0; there is nothing to fit, so skip this model instead
        # of failing inside the grid search.
        print(f"=== {name} ===")
        print("No features selected; skipping evaluation.")
        continue

    X_sel = X[selected_feats]

    # Hyperparameter tuning
    gs = GridSearchCV(pipe, param_grid, scoring="f1", cv=cv, n_jobs=-1)
    gs.fit(X_sel, y, groups=groups)
    best_est = gs.best_estimator_

    # Evaluation with the best configuration. A fresh clone is fitted per fold
    # so that gs.best_estimator_ (refit on all rows) is not overwritten by the
    # model of the last fold.
    accs, precs, recs, f1s, aucs = [], [], [], [], []
    for tr, te in cv.split(X_sel, y, groups):
        X_tr, X_te = X_sel.iloc[tr], X_sel.iloc[te]
        y_te = y[te]

        fold_est = clone(best_est)
        fold_est.fit(X_tr, y[tr])
        preds = fold_est.predict(X_te)
        # Prefer class-1 probabilities for ROC-AUC; fall back to the raw
        # decision scores for estimators without predict_proba.
        probs = (fold_est.predict_proba(X_te)[:, 1]
                 if hasattr(fold_est, "predict_proba")
                 else fold_est.decision_function(X_te))

        accs.append(accuracy_score(y_te, preds))
        precs.append(precision_score(y_te, preds))
        recs.append(recall_score(y_te, preds))
        f1s.append(f1_score(y_te, preds))
        aucs.append(roc_auc_score(y_te, probs))

    # Report mean ± standard deviation across the folds.
    print(f"=== {name} ===")
    print(f"Best params: {gs.best_params_}")
    print(f"Accuracy : {np.mean(accs):.3f} ± {np.std(accs):.3f}")
    print(f"Precision: {np.mean(precs):.3f} ± {np.std(precs):.3f}")
    print(f"Recall   : {np.mean(recs):.3f} ± {np.std(recs):.3f}")
    print(f"F1-score : {np.mean(f1s):.3f} ± {np.std(f1s):.3f}")
    print(f"ROC-AUC  : {np.mean(aucs):.3f} ± {np.std(aucs):.3f}")



--- SVM (RBF) Feature Selection ---
Selected features: ['Feature_avg_centroid_frequency_Power_Spectrum_Density_ML', 'Feature_asym_peak_velocity_pos_SPD_ML', 'Feature_avg_peak_velocity_all_SPD_ML', 'Sex_Male']
=== SVM (RBF) ===
Best params: {'clf__C': 1, 'clf__gamma': 'scale'}
Accuracy : 0.757 ± 0.076
Precision: 0.740 ± 0.114
Recall   : 0.734 ± 0.078
F1-score : 0.734 ± 0.086
ROC-AUC  : 0.755 ± 0.082

--- Random Forest Feature Selection ---
Selected features: ['Feature_asym_range_ML', 'Feature_avg_frequency_mode_Power_Spectrum_Density_ML', 'Feature_asym_confidence_ellipse_area_ML_AND_AP', 'Feature_asym_mean_distance_AP', 'Feature_avg_centroid_frequency_Power_Spectrum_Density_ML']
=== Random Forest ===
Best params: {'clf__max_depth': None, 'clf__n_estimators': 100}
Accuracy : 0.730 ± 0.089
Precision: 0.736 ± 0.158
Recall   : 0.695 ± 0.145
F1-score : 0.699 ± 0.103
ROC-AUC  : 0.755 ± 0.103

--- Logistic Regression Feature Selection ---
Selected features: ['Sex_Male', 'Feature_asym_long_tim