In [5]:
"""
Embedded selection → union of kept features → evaluation of five classifiers
Author : 2025‑07‑18

Edit the two CSV paths (CTRL, PD) to match your location.
"""

# ── Imports ──────────────────────────────────────────────────
import re, numpy as np, pandas as pd
from pathlib import Path
import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import GroupKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import (accuracy_score, precision_score,
                             recall_score, f1_score, roc_auc_score)
from sklearn.feature_selection import SelectFromModel
from sklearn.base import clone

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC, SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

# ── 1. Load & tidy data ─────────────────────────────────────
# Absolute locations of the two per-cohort feature tables (edit to suit).
CTRL = r"E:\USA_PD_2024\Analysis\ppr6\COP\ML\Feature_Selection_2\Data\Controlled1.csv"
PD = r"E:\USA_PD_2024\Analysis\ppr6\COP\ML\Feature_Selection_2\Data\PD1.csv"

# Control cohort: read, trim stray whitespace from headers, tag as class 0.
con_df = pd.read_csv(CTRL)
con_df.columns = con_df.columns.str.strip()
con_df["label"] = 0

# PD cohort: same treatment, tagged as class 1.
pd_df = pd.read_csv(PD)
pd_df.columns = pd_df.columns.str.strip()
pd_df["label"] = 1

# Stack both cohorts into one frame with a fresh 0..n-1 index.
df = pd.concat([con_df, pd_df], ignore_index=True)

def subj_id(fname):
    """Return the subject number embedded in a file name.

    The file's stem (name without directory or final extension) is searched
    for the first underscore that is followed by optional ASCII letters and
    then at least one digit; the digit run is returned as a string.  When no
    such pattern exists the whole stem is returned instead.
    """
    stem = Path(fname).stem
    match = re.search(r"_[a-zA-Z]*([0-9]+)", stem)
    if match is None:
        return stem
    return match.group(1)
# Tag every row with the subject it came from; the IDs are later passed as
# `groups` so that cross-validation folds never split one subject.
# NOTE(review): subj_id keeps only the digit run, so a control file and a PD
# file carrying the same number would share a group — confirm IDs are unique
# across the two cohorts.
df["SubjectID"] = df["File"].apply(subj_id)

# Every column except the bookkeeping ones is treated as a feature.
X = df.drop(columns=["label", "File", "SubjectID"], errors="ignore")
y = df["label"].values
groups = df["SubjectID"].values
feat_names = X.columns

# Subject-wise 5-fold splitter shared by selection and evaluation.
cv = GroupKFold(n_splits=5)

# ── 2. Embedded selectors ───────────────────────────────────
# Regularisation strengths tried for both L1-penalised linear selectors.
_C_VALUES = (0.01, 0.1, 1, 10)

# L1 logistic regression on standardised features.
_logreg_l1_pipe = Pipeline(steps=[
    ("scale", StandardScaler()),
    ("clf", LogisticRegression(penalty="l1", solver="saga",
                               max_iter=500, random_state=42)),
])

# L1 linear SVM on standardised features (primal form, required for L1).
_linsvc_l1_pipe = Pipeline(steps=[
    ("scale", StandardScaler()),
    ("clf", LinearSVC(penalty="l1", dual=False,
                      max_iter=5000, random_state=42)),
])

# Random-forest importance filter feeding a second random forest.
_rf_select_pipe = Pipeline(steps=[
    ("selector", SelectFromModel(
        RandomForestClassifier(n_estimators=100,
                               random_state=42, n_jobs=-1))),
    ("clf", RandomForestClassifier(n_estimators=100,
                                   random_state=42, n_jobs=-1)),
])

# Each entry: (display name, pipeline, hyper-parameter grid for GridSearchCV).
embed_models = [
    ("LogReg_L1", _logreg_l1_pipe, {"clf__C": list(_C_VALUES)}),
    ("LinSVC_L1", _linsvc_l1_pipe, {"clf__C": list(_C_VALUES)}),
    ("RF_SFModel", _rf_select_pipe,
     {"selector__threshold": ["median", "mean", 0.005, 0.01]}),
]

# Boolean mask over feat_names; an entry turns True once any selector keeps
# the corresponding feature.
union_mask = np.zeros(len(feat_names), dtype=bool)

print("\n=== Embedded selection ===")
for name, pipe, grid in embed_models:
    # Tune the selector with the subject-grouped splitter, optimising F1.
    gs = GridSearchCV(estimator=pipe, param_grid=grid,
                      scoring="f1", cv=cv, n_jobs=-1)
    gs.fit(X, y, groups=groups)
    best = gs.best_estimator_

    # The random-forest pipeline reports its choice through SelectFromModel;
    # the L1 linear models keep whichever features have non-zero weights.
    mask = (best.named_steps["selector"].get_support()
            if name.startswith("RF")
            else best.named_steps["clf"].coef_.ravel() != 0)

    union_mask = union_mask | mask
    selected = list(feat_names[mask])
    print(f"{name}: kept {len(selected)} features")

# ── 3. Build reduced dataset ─────────────────────────────────
# Names of the features kept by at least one embedded selector, and the
# feature table restricted to those columns.
sel_feats = feat_names[union_mask]
X_sel = X.loc[:, sel_feats]
n_union = len(sel_feats)
print(f"\nUnion of embedded‑kept features: {n_union} features")

# ── 4. Define five classifiers ───────────────────────────────
# Display name -> (standardise inputs first?, classifier).  Insertion order
# is the order in which the models are evaluated and reported.
_final_classifiers = {
    "SVM_RBF": (True, SVC(kernel="rbf", probability=True,
                          C=1, gamma="scale", random_state=42)),
    "RandomForest": (False, RandomForestClassifier(n_estimators=100,
                                                   random_state=42,
                                                   n_jobs=-1)),
    "LogReg_L1": (True, LogisticRegression(penalty="l1", solver="saga",
                                           C=1, max_iter=500,
                                           random_state=42)),
    "kNN": (True, KNeighborsClassifier(n_neighbors=5)),
    "GaussianNB": (False, GaussianNB()),
}

# Wrap every classifier in a Pipeline, prepending a StandardScaler step only
# for the models flagged as needing scaled inputs.
final_models = [
    (label,
     Pipeline(([("scale", StandardScaler())] if needs_scaling else [])
              + [("clf", estimator)]))
    for label, (needs_scaling, estimator) in _final_classifiers.items()
]

# ── 5. Evaluate and print ────────────────────────────────────
# NOTE(review): X_sel was chosen by selectors fitted on every row of X, so
# each test fold below already influenced which features were kept.  The
# scores printed here are therefore optimistically biased; for an unbiased
# estimate the feature selection must be repeated inside each training fold.
print("\n=== Performance on union feature subset (5 models) ===")
for name, model in final_models:
    # One list of per-fold scores per metric.
    accs, precs, recs, f1s, aucs = [], [], [], [], []
    for tr, te in cv.split(X_sel, y, groups):
        # Slice once per fold instead of re-indexing for every call.
        X_tr, X_te = X_sel.iloc[tr], X_sel.iloc[te]
        y_te = y[te]

        # Fit a fresh, unfitted copy so folds never share learned state.
        mdl = clone(model).fit(X_tr, y[tr])
        pred = mdl.predict(X_te)
        # Continuous scores for ROC-AUC: positive-class probability when the
        # model offers it, otherwise the decision-function margin.
        scores = (mdl.predict_proba(X_te)[:, 1]
                  if hasattr(mdl, "predict_proba")
                  else mdl.decision_function(X_te))

        accs.append(accuracy_score(y_te, pred))
        # zero_division=0 makes explicit the value these metrics take when a
        # fold has no predicted (or no true) positives.
        precs.append(precision_score(y_te, pred, zero_division=0))
        recs.append(recall_score(y_te, pred, zero_division=0))
        f1s.append(f1_score(y_te, pred, zero_division=0))
        # GroupKFold does not stratify, so a test fold may contain a single
        # class; ROC-AUC is undefined there and is recorded as NaN instead
        # of aborting the whole run.
        aucs.append(roc_auc_score(y_te, scores)
                    if np.unique(y_te).size > 1 else np.nan)

    print(f"\n{name}")
    # NaN-aware statistics ignore folds where a metric was undefined.
    for label, vals in (("Accuracy", accs), ("Precision", precs),
                        ("Recall", recs), ("F1‑score", f1s),
                        ("ROC‑AUC", aucs)):
        print(f"  {label:<9}: {np.nanmean(vals):.3f} ± {np.nanstd(vals):.3f}")

print("\n📌 Selected feature list:")
for i, f in enumerate(sel_feats, 1):
    print(f"{i:2d}. {f}")



=== Embedded selection ===
LogReg_L1: kept 52 features
LinSVC_L1: kept 37 features
RF_SFModel: kept 72 features

Union of embedded‑kept features: 90 features

=== Performance on union feature subset (5 models) ===

SVM_RBF
  Accuracy : 0.687 ± 0.058
  Precision: 0.681 ± 0.113
  Recall   : 0.664 ± 0.124
  F1‑score : 0.657 ± 0.064
  ROC‑AUC  : 0.779 ± 0.133

RandomForest
  Accuracy : 0.704 ± 0.121
  Precision: 0.740 ± 0.205
  Recall   : 0.624 ± 0.164
  F1‑score : 0.659 ± 0.136
  ROC‑AUC  : 0.773 ± 0.146

LogReg_L1
  Accuracy : 0.722 ± 0.076
  Precision: 0.761 ± 0.198
  Recall   : 0.680 ± 0.105
  F1‑score : 0.695 ± 0.064
  ROC‑AUC  : 0.770 ± 0.119

kNN
  Accuracy : 0.643 ± 0.043
  Precision: 0.677 ± 0.169
  Recall   : 0.491 ± 0.084
  F1‑score : 0.556 ± 0.063
  ROC‑AUC  : 0.718 ± 0.078

GaussianNB
  Accuracy : 0.687 ± 0.111
  Precision: 0.679 ± 0.156
  Recall   : 0.655 ± 0.093
  F1‑score : 0.661 ± 0.110
  ROC‑AUC  : 0.719 ± 0.141

📌 Selected feature list:
 1. Feature_avg_mean_value_AP
 2.