In [4]:
"""
Binary PD vs Control classification using manually selected CoP + demographic features
──────────────────────────────────────────────────────────────────────────────────────
Features: 15 CoP-derived + Height, Weight, Sex (if available)
Modeling: SVM, RF, LR, k-NN, GNB
Validation: 5-fold GroupKFold using SubjectID
"""

# ──────────────────────────────────────────────────────────────
# 1. Imports
# ──────────────────────────────────────────────────────────────
import re
import numpy as np
import pandas as pd
from pathlib import Path

from sklearn.model_selection import GroupKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.base import clone

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

# ──────────────────────────────────────────────────────────────
# 2. Load and concatenate data
# ──────────────────────────────────────────────────────────────
con_df = pd.read_csv(r"E:\USA_PD_2024\Analysis\ppr6\COP\Data\Combine_Activity\Full\Controlled1.csv")
pd_df  = pd.read_csv(r"E:\USA_PD_2024\Analysis\ppr6\COP\Data\Combine_Activity\Full\PD1.csv")

con_df["label"] = 0
pd_df ["label"] = 1
df = pd.concat([con_df, pd_df], ignore_index=True)

# ──────────────────────────────────────────────────────────────
# 3. Clean column names
# ──────────────────────────────────────────────────────────────
df.columns = df.columns.str.strip().str.replace(" ", "_")

# ──────────────────────────────────────────────────────────────
# 4. Extract Subject ID from filename
# ──────────────────────────────────────────────────────────────
def get_id(fname: str) -> str:
    base = Path(fname).stem
    m = re.search(r"_[a-zA-Z]*([0-9]+)", base)
    return m.group(1) if m else base

df["SubjectID"] = df["File"].apply(get_id)

# ──────────────────────────────────────────────────────────────
# 5. Manually selected features
# ──────────────────────────────────────────────────────────────
manual_features = [
    "Feature_asym_energy_content_below_05_Power_Spectrum_Density_AP",
    "Feature_avg_frequency_quotient_Power_Spectrum_Density_ML",
    "Feature_avg_fractal_dimension_ML_AND_AP",
    "Feature_avg_phase_plane_parameter_ML",
    "Feature_avg_short_time_diffusion_Diffusion_ML",
    "Feature_asym_mean_distance_Radius",
    "Height"
]

# Check availability
available_features = [f for f in manual_features if f in df.columns]
missing = set(manual_features) - set(available_features)
if missing:
    print(f"⚠️ Warning: Missing features skipped: {missing}")
else:
    print("✅ All selected features are present.")

# ──────────────────────────────────────────────────────────────
# 6. Prepare feature matrix X
# ──────────────────────────────────────────────────────────────
if "Sex" in df.columns:
    X = df[available_features + ["Sex"]].copy()
    X = pd.get_dummies(X, columns=["Sex"], drop_first=True)
else:
    X = df[available_features].copy()
    print("⚠️ 'Sex' column not found. Proceeding without it.")

y = df["label"].values
groups = df["SubjectID"].values

# ──────────────────────────────────────────────────────────────
# 7. Cross-validation setup
# ──────────────────────────────────────────────────────────────
cv = GroupKFold(n_splits=5)

# ──────────────────────────────────────────────────────────────
# 8. Models and parameter grids
# ──────────────────────────────────────────────────────────────
models = [
    ("SVM (RBF)", Pipeline([
        ("scaler", StandardScaler()),
        ("clf", SVC(kernel="rbf", probability=True, random_state=42))
    ]), {"clf__C": [0.1, 1, 10], "clf__gamma": ["scale", 0.1]}),

    ("Random Forest", Pipeline([
        ("clf", RandomForestClassifier(random_state=42, n_jobs=-1))
    ]), {"clf__n_estimators": [100], "clf__max_depth": [None]}),

    ("Logistic Regression", Pipeline([
        ("scaler", StandardScaler()),
        ("clf", LogisticRegression(max_iter=500, random_state=42))
    ]), {"clf__C": [1]}),

    ("k-NN", Pipeline([
        ("scaler", StandardScaler()),
        ("clf", KNeighborsClassifier())
    ]), {"clf__n_neighbors": [5, 7]}),

    ("Gaussian NB", Pipeline([
        ("clf", GaussianNB())
    ]), {})
]

# ──────────────────────────────────────────────────────────────
# 9. Train, tune and evaluate
# ──────────────────────────────────────────────────────────────
for name, pipe, param_grid in models:
    print(f"\n🔹 {name} Model 🔹")
    
    # Grid Search CV
    gs = GridSearchCV(pipe, param_grid, scoring="f1", cv=cv, n_jobs=-1)
    gs.fit(X, y, groups=groups)
    best_est = gs.best_estimator_

    # Evaluate with group-wise CV
    accs, precs, recs, f1s, aucs = [], [], [], [], []
    for tr, te in cv.split(X, y, groups):
        best_est.fit(X.iloc[tr], y[tr])
        preds = best_est.predict(X.iloc[te])
        probs = (best_est.predict_proba(X.iloc[te])[:, 1]
                 if hasattr(best_est, "predict_proba")
                 else best_est.decision_function(X.iloc[te]))

        accs.append(accuracy_score(y[te], preds))
        precs.append(precision_score(y[te], preds, zero_division=0))
        recs.append(recall_score(y[te], preds, zero_division=0))
        f1s.append(f1_score(y[te], preds, zero_division=0))
        aucs.append(roc_auc_score(y[te], probs))

    # Print metrics
    print(f"Best Params: {gs.best_params_}")
    print(f"Accuracy : {np.mean(accs):.3f} ± {np.std(accs):.3f}")
    print(f"Precision: {np.mean(precs):.3f} ± {np.std(precs):.3f}")
    print(f"Recall   : {np.mean(recs):.3f} ± {np.std(recs):.3f}")
    print(f"F1-score : {np.mean(f1s):.3f} ± {np.std(f1s):.3f}")
    print(f"ROC-AUC  : {np.mean(aucs):.3f} ± {np.std(aucs):.3f}")


✅ All selected features are present.

🔹 SVM (RBF) Model 🔹
Best Params: {'clf__C': 1, 'clf__gamma': 0.1}
Accuracy : 0.870 ± 0.095
Precision: 0.887 ± 0.153
Recall   : 0.852 ± 0.065
F1-score : 0.862 ± 0.093
ROC-AUC  : 0.881 ± 0.082

🔹 Random Forest Model 🔹
Best Params: {'clf__max_depth': None, 'clf__n_estimators': 100}
Accuracy : 0.835 ± 0.101
Precision: 0.836 ± 0.164
Recall   : 0.832 ± 0.062
F1-score : 0.826 ± 0.099
ROC-AUC  : 0.862 ± 0.116

🔹 Logistic Regression Model 🔹
Best Params: {'clf__C': 1}
Accuracy : 0.774 ± 0.084
Precision: 0.788 ± 0.135
Recall   : 0.739 ± 0.083
F1-score : 0.753 ± 0.075
ROC-AUC  : 0.827 ± 0.102

🔹 k-NN Model 🔹
Best Params: {'clf__n_neighbors': 7}
Accuracy : 0.852 ± 0.081
Precision: 0.877 ± 0.158
Recall   : 0.834 ± 0.086
F1-score : 0.842 ± 0.075
ROC-AUC  : 0.873 ± 0.101

🔹 Gaussian NB Model 🔹
Best Params: {}
Accuracy : 0.809 ± 0.081
Precision: 0.797 ± 0.141
Recall   : 0.832 ± 0.109
F1-score : 0.801 ± 0.074
ROC-AUC  : 0.874 ± 0.106


In [6]:
"""
Binary PD vs Control classification using manually selected CoP + demographic features
──────────────────────────────────────────────────────────────────────────────────────
Features: 15 CoP-derived + Height, Weight, Sex (if available)
Modeling: SVM, RF, LR, k-NN, GNB
Validation: 5-fold GroupKFold using SubjectID
"""

# ──────────────────────────────────────────────────────────────
# 1. Imports
# ──────────────────────────────────────────────────────────────
import re
import numpy as np
import pandas as pd
from pathlib import Path

from sklearn.model_selection import GroupKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.base import clone

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

# ──────────────────────────────────────────────────────────────
# 2. Load and concatenate data
# ──────────────────────────────────────────────────────────────
con_df = pd.read_csv(r"E:\USA_PD_2024\Analysis\ppr6\COP\Data\Combine_Activity\Full\Controlled1.csv")
pd_df  = pd.read_csv(r"E:\USA_PD_2024\Analysis\ppr6\COP\Data\Combine_Activity\Full\PD1.csv")

con_df["label"] = 0
pd_df ["label"] = 1
df = pd.concat([con_df, pd_df], ignore_index=True)

# ──────────────────────────────────────────────────────────────
# 3. Clean column names
# ──────────────────────────────────────────────────────────────
df.columns = df.columns.str.strip().str.replace(" ", "_")

# ──────────────────────────────────────────────────────────────
# 4. Extract Subject ID from filename
# ──────────────────────────────────────────────────────────────
def get_id(fname: str) -> str:
    base = Path(fname).stem
    m = re.search(r"_[a-zA-Z]*([0-9]+)", base)
    return m.group(1) if m else base

df["SubjectID"] = df["File"].apply(get_id)

# ──────────────────────────────────────────────────────────────
# 5. Manually selected features
# ──────────────────────────────────────────────────────────────
manual_features = [
    "Feature_asym_energy_content_below_05_Power_Spectrum_Density_AP",
    "Feature_avg_frequency_quotient_Power_Spectrum_Density_ML",
    "Feature_avg_fractal_dimension_ML_AND_AP",
    "Feature_avg_phase_plane_parameter_ML",
    "Feature_avg_short_time_diffusion_Diffusion_ML",
    "Feature_asym_mean_distance_Radius"
]

# Check availability
available_features = [f for f in manual_features if f in df.columns]
missing = set(manual_features) - set(available_features)
if missing:
    print(f"⚠️ Warning: Missing features skipped: {missing}")
else:
    print("✅ All selected features are present.")

# ──────────────────────────────────────────────────────────────
# 6. Prepare feature matrix X
# ──────────────────────────────────────────────────────────────
if "Sex" in df.columns:
    X = df[available_features + ["Sex"]].copy()
    X = pd.get_dummies(X, columns=["Sex"], drop_first=True)
else:
    X = df[available_features].copy()
    print("⚠️ 'Sex' column not found. Proceeding without it.")

y = df["label"].values
groups = df["SubjectID"].values

# ──────────────────────────────────────────────────────────────
# 7. Cross-validation setup
# ──────────────────────────────────────────────────────────────
cv = GroupKFold(n_splits=5)

# ──────────────────────────────────────────────────────────────
# 8. Models and parameter grids
# ──────────────────────────────────────────────────────────────
models = [
    ("SVM (RBF)", Pipeline([
        ("scaler", StandardScaler()),
        ("clf", SVC(kernel="rbf", probability=True, random_state=42))
    ]), {"clf__C": [0.1, 1, 10], "clf__gamma": ["scale", 0.1]}),

    ("Random Forest", Pipeline([
        ("clf", RandomForestClassifier(random_state=42, n_jobs=-1))
    ]), {"clf__n_estimators": [100], "clf__max_depth": [None]}),

    ("Logistic Regression", Pipeline([
        ("scaler", StandardScaler()),
        ("clf", LogisticRegression(max_iter=500, random_state=42))
    ]), {"clf__C": [1]}),

    ("k-NN", Pipeline([
        ("scaler", StandardScaler()),
        ("clf", KNeighborsClassifier())
    ]), {"clf__n_neighbors": [5, 7]}),

    ("Gaussian NB", Pipeline([
        ("clf", GaussianNB())
    ]), {})
]

# ──────────────────────────────────────────────────────────────
# 9. Train, tune and evaluate
# ──────────────────────────────────────────────────────────────
for name, pipe, param_grid in models:
    print(f"\n🔹 {name} Model 🔹")
    
    # Grid Search CV
    gs = GridSearchCV(pipe, param_grid, scoring="f1", cv=cv, n_jobs=-1)
    gs.fit(X, y, groups=groups)
    best_est = gs.best_estimator_

    # Evaluate with group-wise CV
    accs, precs, recs, f1s, aucs = [], [], [], [], []
    for tr, te in cv.split(X, y, groups):
        best_est.fit(X.iloc[tr], y[tr])
        preds = best_est.predict(X.iloc[te])
        probs = (best_est.predict_proba(X.iloc[te])[:, 1]
                 if hasattr(best_est, "predict_proba")
                 else best_est.decision_function(X.iloc[te]))

        accs.append(accuracy_score(y[te], preds))
        precs.append(precision_score(y[te], preds, zero_division=0))
        recs.append(recall_score(y[te], preds, zero_division=0))
        f1s.append(f1_score(y[te], preds, zero_division=0))
        aucs.append(roc_auc_score(y[te], probs))

    # Print metrics
    print(f"Best Params: {gs.best_params_}")
    print(f"Accuracy : {np.mean(accs):.3f} ± {np.std(accs):.3f}")
    print(f"Precision: {np.mean(precs):.3f} ± {np.std(precs):.3f}")
    print(f"Recall   : {np.mean(recs):.3f} ± {np.std(recs):.3f}")
    print(f"F1-score : {np.mean(f1s):.3f} ± {np.std(f1s):.3f}")
    print(f"ROC-AUC  : {np.mean(aucs):.3f} ± {np.std(aucs):.3f}")


✅ All selected features are present.

🔹 SVM (RBF) Model 🔹
Best Params: {'clf__C': 1, 'clf__gamma': 'scale'}
Accuracy : 0.852 ± 0.076
Precision: 0.883 ± 0.145
Recall   : 0.817 ± 0.086
F1-score : 0.839 ± 0.077
ROC-AUC  : 0.894 ± 0.088

🔹 Random Forest Model 🔹
Best Params: {'clf__max_depth': None, 'clf__n_estimators': 100}
Accuracy : 0.817 ± 0.111
Precision: 0.829 ± 0.178
Recall   : 0.815 ± 0.088
F1-score : 0.810 ± 0.106
ROC-AUC  : 0.852 ± 0.127

🔹 Logistic Regression Model 🔹
Best Params: {'clf__C': 1}
Accuracy : 0.765 ± 0.081
Precision: 0.777 ± 0.141
Recall   : 0.739 ± 0.083
F1-score : 0.746 ± 0.070
ROC-AUC  : 0.845 ± 0.089

🔹 k-NN Model 🔹
Best Params: {'clf__n_neighbors': 5}
Accuracy : 0.852 ± 0.105
Precision: 0.868 ± 0.173
Recall   : 0.854 ± 0.088
F1-score : 0.848 ± 0.096
ROC-AUC  : 0.871 ± 0.108

🔹 Gaussian NB Model 🔹
Best Params: {}
Accuracy : 0.817 ± 0.127
Precision: 0.821 ± 0.184
Recall   : 0.848 ± 0.101
F1-score : 0.817 ± 0.105
ROC-AUC  : 0.876 ± 0.097


In [12]:
"""
Binary PD vs Control classification using manually selected CoP + demographic features
──────────────────────────────────────────────────────────────────────────────────────
Features: 15 CoP-derived + Height, Weight, Sex (if available)
Modeling: SVM, RF, LR, k-NN, GNB
Validation: 5-fold GroupKFold using SubjectID
"""

# ──────────────────────────────────────────────────────────────
# 1. Imports
# ──────────────────────────────────────────────────────────────
import re
import numpy as np
import pandas as pd
from pathlib import Path

from sklearn.model_selection import GroupKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.base import clone

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

# ──────────────────────────────────────────────────────────────
# 2. Load and concatenate data
# ──────────────────────────────────────────────────────────────
con_df = pd.read_csv(r"E:\USA_PD_2024\Analysis\ppr6\COP\Data\Combine_Activity\Full\Controlled1.csv")
pd_df  = pd.read_csv(r"E:\USA_PD_2024\Analysis\ppr6\COP\Data\Combine_Activity\Full\PD1.csv")

con_df["label"] = 0
pd_df ["label"] = 1
df = pd.concat([con_df, pd_df], ignore_index=True)

# ──────────────────────────────────────────────────────────────
# 3. Clean column names
# ──────────────────────────────────────────────────────────────
df.columns = df.columns.str.strip().str.replace(" ", "_")

# ──────────────────────────────────────────────────────────────
# 4. Extract Subject ID from filename
# ──────────────────────────────────────────────────────────────
def get_id(fname: str) -> str:
    base = Path(fname).stem
    m = re.search(r"_[a-zA-Z]*([0-9]+)", base)
    return m.group(1) if m else base

df["SubjectID"] = df["File"].apply(get_id)

# ──────────────────────────────────────────────────────────────
# 5. Manually selected features
# ──────────────────────────────────────────────────────────────
manual_features = [
    "Feature_asym_energy_content_below_05_Power_Spectrum_Density_AP",
    "Feature_avg_frequency_quotient_Power_Spectrum_Density_ML",
    "Feature_avg_fractal_dimension_ML_AND_AP",
    "Feature_avg_phase_plane_parameter_ML",
    "Feature_avg_short_time_diffusion_Diffusion_ML",
    "Feature_asym_mean_distance_Radius",
    "Height",
    "Age",
    "Weight"
]

# Check availability
available_features = [f for f in manual_features if f in df.columns]
missing = set(manual_features) - set(available_features)
if missing:
    print(f"⚠️ Warning: Missing features skipped: {missing}")
else:
    print("✅ All selected features are present.")

# ──────────────────────────────────────────────────────────────
# 6. Prepare feature matrix X
# ──────────────────────────────────────────────────────────────
if "Sex" in df.columns:
    X = df[available_features + ["Sex"]].copy()
    X = pd.get_dummies(X, columns=["Sex"], drop_first=True)
else:
    X = df[available_features].copy()
    print("⚠️ 'Sex' column not found. Proceeding without it.")

y = df["label"].values
groups = df["SubjectID"].values

# ──────────────────────────────────────────────────────────────
# 7. Cross-validation setup
# ──────────────────────────────────────────────────────────────
cv = GroupKFold(n_splits=5)

# ──────────────────────────────────────────────────────────────
# 8. Models and parameter grids
# ──────────────────────────────────────────────────────────────
models = [
    ("SVM (RBF)", Pipeline([
        ("scaler", StandardScaler()),
        ("clf", SVC(kernel="rbf", probability=True, random_state=42))
    ]), {"clf__C": [0.1, 1, 10], "clf__gamma": ["scale", 0.1]}),

    ("Random Forest", Pipeline([
        ("clf", RandomForestClassifier(random_state=42, n_jobs=-1))
    ]), {"clf__n_estimators": [100], "clf__max_depth": [None]}),

    ("Logistic Regression", Pipeline([
        ("scaler", StandardScaler()),
        ("clf", LogisticRegression(max_iter=500, random_state=42))
    ]), {"clf__C": [1]}),

    ("k-NN", Pipeline([
        ("scaler", StandardScaler()),
        ("clf", KNeighborsClassifier())
    ]), {"clf__n_neighbors": [5, 7]}),

    ("Gaussian NB", Pipeline([
        ("clf", GaussianNB())
    ]), {})
]

# ──────────────────────────────────────────────────────────────
# 9. Train, tune and evaluate
# ──────────────────────────────────────────────────────────────
for name, pipe, param_grid in models:
    print(f"\n🔹 {name} Model 🔹")
    
    # Grid Search CV
    gs = GridSearchCV(pipe, param_grid, scoring="f1", cv=cv, n_jobs=-1)
    gs.fit(X, y, groups=groups)
    best_est = gs.best_estimator_

    # Evaluate with group-wise CV
    accs, precs, recs, f1s, aucs = [], [], [], [], []
    for tr, te in cv.split(X, y, groups):
        best_est.fit(X.iloc[tr], y[tr])
        preds = best_est.predict(X.iloc[te])
        probs = (best_est.predict_proba(X.iloc[te])[:, 1]
                 if hasattr(best_est, "predict_proba")
                 else best_est.decision_function(X.iloc[te]))

        accs.append(accuracy_score(y[te], preds))
        precs.append(precision_score(y[te], preds, zero_division=0))
        recs.append(recall_score(y[te], preds, zero_division=0))
        f1s.append(f1_score(y[te], preds, zero_division=0))
        aucs.append(roc_auc_score(y[te], probs))

    # Print metrics
    print(f"Best Params: {gs.best_params_}")
    print(f"Accuracy : {np.mean(accs):.3f} ± {np.std(accs):.3f}")
    print(f"Precision: {np.mean(precs):.3f} ± {np.std(precs):.3f}")
    print(f"Recall   : {np.mean(recs):.3f} ± {np.std(recs):.3f}")
    print(f"F1-score : {np.mean(f1s):.3f} ± {np.std(f1s):.3f}")
    print(f"ROC-AUC  : {np.mean(aucs):.3f} ± {np.std(aucs):.3f}")


✅ All selected features are present.

🔹 SVM (RBF) Model 🔹
Best Params: {'clf__C': 1, 'clf__gamma': 'scale'}
Accuracy : 0.870 ± 0.055
Precision: 0.888 ± 0.138
Recall   : 0.850 ± 0.042
F1-score : 0.860 ± 0.050
ROC-AUC  : 0.928 ± 0.050

🔹 Random Forest Model 🔹
Best Params: {'clf__max_depth': None, 'clf__n_estimators': 100}
Accuracy : 0.809 ± 0.044
Precision: 0.825 ± 0.135
Recall   : 0.795 ± 0.087
F1-score : 0.795 ± 0.027
ROC-AUC  : 0.897 ± 0.048

🔹 Logistic Regression Model 🔹
Best Params: {'clf__C': 1}
Accuracy : 0.791 ± 0.051
Precision: 0.794 ± 0.118
Recall   : 0.779 ± 0.079
F1-score : 0.776 ± 0.040
ROC-AUC  : 0.871 ± 0.067

🔹 k-NN Model 🔹
Best Params: {'clf__n_neighbors': 5}
Accuracy : 0.809 ± 0.071
Precision: 0.806 ± 0.111
Recall   : 0.792 ± 0.113
F1-score : 0.789 ± 0.080
ROC-AUC  : 0.872 ± 0.040

🔹 Gaussian NB Model 🔹
Best Params: {}
Accuracy : 0.800 ± 0.059
Precision: 0.799 ± 0.137
Recall   : 0.795 ± 0.059
F1-score : 0.787 ± 0.052
ROC-AUC  : 0.881 ± 0.058
