In [5]:
import re
import numpy as np
import pandas as pd
from pathlib import Path
import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import GroupKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import (accuracy_score, precision_score,
                             recall_score, f1_score, roc_auc_score)
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.base import clone

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

# ──────────────────────────────────────────────────────────────
# 1. Load and clean data
# ──────────────────────────────────────────────────────────────
con_df = pd.read_csv(r"E:\USA_PD_2024\Analysis\ppr6\COP\ML\Feature_Selection_2\Data\Controlled1.csv")
pd_df = pd.read_csv(r"E:\USA_PD_2024\Analysis\ppr6\COP\ML\Feature_Selection_2\Data\PD1.csv")

# Remove leading/trailing whitespace from column headers
con_df.columns = con_df.columns.str.strip()
pd_df.columns  = pd_df.columns.str.strip()

con_df["label"] = 0
pd_df["label"] = 1
df = pd.concat([con_df, pd_df], ignore_index=True)

# ──────────────────────────────────────────────────────────────
# 2. Extract subject IDs
# ──────────────────────────────────────────────────────────────
def get_id(fname):
    base = Path(fname).stem
    m = re.search(r"_[a-zA-Z]*([0-9]+)", base)
    return m.group(1) if m else base

df["SubjectID"] = df["File"].apply(get_id)

# ──────────────────────────────────────────────────────────────
# 3. Build feature matrix and targets
# ──────────────────────────────────────────────────────────────
X = df.drop(columns=["label", "File", "SubjectID"])  # Only valid feature columns
y = df["label"].values
groups = df["SubjectID"].values

# ──────────────────────────────────────────────────────────────
# 4. GroupKFold for subject-wise cross-validation
# ──────────────────────────────────────────────────────────────
cv = GroupKFold(n_splits=5)

# ──────────────────────────────────────────────────────────────
# 5. Define classifiers with SelectKBest
# ──────────────────────────────────────────────────────────────
models = [
    ("SVM (RBF)",
     Pipeline([
         ("scaler", StandardScaler()),
         ("selector", SelectKBest(score_func=f_classif)),
         ("clf", SVC(kernel="rbf", probability=True, random_state=42))
     ]),
     {
         "selector__k": [5, 10, 15, 20],
         "clf__C": [0.1, 1, 10],
         "clf__gamma": ["scale", 0.1]
     }),

    ("Random Forest",
     Pipeline([
         ("selector", SelectKBest(score_func=f_classif)),
         ("clf", RandomForestClassifier(random_state=42, n_jobs=-1))
     ]),
     {
         "selector__k": [5, 10, 20, 30],
         "clf__n_estimators": [100],
         "clf__max_depth": [None]
     }),

    ("Logistic Regression",
     Pipeline([
         ("scaler", StandardScaler()),
         ("selector", SelectKBest(score_func=f_classif)),
         ("clf", LogisticRegression(max_iter=500, random_state=42))
     ]),
     {
         "selector__k": [5, 10, 15, 20],
         "clf__C": [1, 10]
     }),

    ("k-NN",
     Pipeline([
         ("scaler", StandardScaler()),
         ("selector", SelectKBest(score_func=f_classif)),
         ("clf", KNeighborsClassifier())
     ]),
     {
         "selector__k": [5, 10, 20],
         "clf__n_neighbors": [5, 7]
     }),

    ("Gaussian NB",
     Pipeline([
         ("selector", SelectKBest(score_func=f_classif)),
         ("clf", GaussianNB())
     ]),
     {
         "selector__k": [5, 10, 20, 30]
     })
]

# ──────────────────────────────────────────────────────────────
# 6. Train, evaluate, and print selected features
# ──────────────────────────────────────────────────────────────
for name, pipe, param_grid in models:
    print(f"\n=== {name} | Filter Selection (ANOVA F-test) ===")

    # Inner CV for hyperparameters
    gs = GridSearchCV(pipe, param_grid, scoring="f1", cv=cv, n_jobs=-1)
    gs.fit(X, y, groups=groups)

    best_est = gs.best_estimator_
    best_k = gs.best_params_.get("selector__k", "N/A")

    # Get selected feature names
    selector = best_est.named_steps["selector"]
    mask = selector.get_support()
    selected_features = X.columns[mask].tolist()

    # Outer CV evaluation
    accs, precs, recs, f1s, aucs = [], [], [], [], []
    for tr, te in cv.split(X, y, groups):
        model = clone(best_est)
        model.fit(X.iloc[tr], y[tr])
        preds = model.predict(X.iloc[te])
        probs = (model.predict_proba(X.iloc[te])[:, 1]
                 if hasattr(model, "predict_proba")
                 else model.decision_function(X.iloc[te]))

        accs.append(accuracy_score(y[te], preds))
        precs.append(precision_score(y[te], preds))
        recs.append(recall_score(y[te], preds))
        f1s.append(f1_score(y[te], preds))
        aucs.append(roc_auc_score(y[te], probs))

    # Print results
    print(f"Top-k features   : {best_k}")
    print(f"Best parameters  : {gs.best_params_}")
    print(f"Selected features: {selected_features}")
    print(f"Accuracy         : {np.mean(accs):.3f} ± {np.std(accs):.3f}")
    print(f"Precision        : {np.mean(precs):.3f} ± {np.std(precs):.3f}")
    print(f"Recall           : {np.mean(recs):.3f} ± {np.std(recs):.3f}")
    print(f"F1-score         : {np.mean(f1s):.3f} ± {np.std(f1s):.3f}")
    print(f"ROC-AUC          : {np.mean(aucs):.3f} ± {np.std(aucs):.3f}")



=== SVM (RBF) | Filter Selection (ANOVA F-test) ===
Top-k features   : 10
Best parameters  : {'clf__C': 1, 'clf__gamma': 'scale', 'selector__k': 10}
Selected features: ['Feature_avg_fractal_dimension_ML_AND_AP', 'Feature_avg_power_frequency_95_Power_Spectrum_Density_ML', 'Feature_avg_power_frequency_95_Power_Spectrum_Density_AP', 'Feature_avg_centroid_frequency_Power_Spectrum_Density_AP', 'Feature_avg_frequency_quotient_Power_Spectrum_Density_AP', 'Feature_asym_power_frequency_95_Power_Spectrum_Density_AP', 'Feature_asym_centroid_frequency_Power_Spectrum_Density_AP', 'Feature_asym_energy_content_below_05_Power_Spectrum_Density_AP', 'Feature_asym_frequency_quotient_Power_Spectrum_Density_AP', 'Feature_asym_short_time_diffusion_Diffusion_ML']
Accuracy         : 0.722 ± 0.059
Precision        : 0.709 ± 0.110
Recall           : 0.700 ± 0.147
F1-score         : 0.693 ± 0.085
ROC-AUC          : 0.781 ± 0.088

=== Random Forest | Filter Selection (ANOVA F-test) ===
Top-k features   : 10
Best