In [None]:
# Setup and imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GroupShuffleSplit
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from lightgbm import LGBMClassifier, early_stopping, log_evaluation
# Install TabPFN at runtime; this has to run before the import on the next line.
!pip install tabpfn --quiet
from tabpfn import TabPFNClassifier
import warnings
# Silence every warning for the rest of the session. Note that this also hides
# convergence and deprecation warnings raised by the models fitted below.
warnings.filterwarnings('ignore')

In [None]:
# Read the acoustic-feature table from disk and report its size and column names
csv_path = "/content/sample_data/ReplicatedAcousticFeatures-ParkinsonDatabase.csv"
df = pd.read_csv(csv_path)
print("Dataset shape:", df.shape)
print("Columns:", list(df.columns))

In [None]:
# Split features and labels.
# "ID" is the patient identifier used for grouping only. It is removed from the
# feature matrix as well, so that an identifier can never be selected as a
# predictor (it would otherwise survive the numeric-column filter whenever the
# column is stored as a number).
X = df.drop(columns=["Status", "ID"])
y = df["Status"]
groups = df["ID"]  # group by patient ID

In [None]:
# Grouped train/test split: GroupShuffleSplit keeps all rows of one ID on the same side.
# Only one split is requested, so it is pulled directly from the generator.
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, test_idx = next(gss.split(X, y, groups))

X_train_full, y_train_full = X.iloc[train_idx], y.iloc[train_idx]
X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]
groups_train = groups.iloc[train_idx]

n_train_ids = groups_train.nunique()
n_test_ids = groups.iloc[test_idx].nunique()
print("Train IDs:", n_train_ids, "| Test IDs:", n_test_ids)

In [None]:
# Carve a validation set out of the training portion, again grouped by ID.
# tr_idx / val_idx are positions within X_train_full (not within the full frame).
gss_val = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
tr_idx, val_idx = next(gss_val.split(X_train_full, y_train_full, groups_train))

X_train, y_train = X_train_full.iloc[tr_idx], y_train_full.iloc[tr_idx]
X_val, y_val = X_train_full.iloc[val_idx], y_train_full.iloc[val_idx]

print(f"Shapes → Train: {X_train.shape}, Val: {X_val.shape}, Test: {X_test.shape}")

In [None]:
# Restrict every split to its numeric columns (non-numeric columns are discarded)
X_train, X_val, X_test = (
    split.select_dtypes(include=[np.number])
    for split in (X_train, X_val, X_test)
)

In [None]:
# Feature selection with Random Forest.
# The selector is fitted on the training split only, so validation and test
# rows do not influence which features are kept.
fs = SelectFromModel(RandomForestClassifier(n_estimators=500, random_state=42))
fs.fit(X_train, y_train)

selected_features = X_train.columns[fs.get_support()]
print(f"Selected features: {len(selected_features)} / {X_train.shape[1]}")
print("Top selected features:", selected_features.tolist())

# Transform datasets.
# fs.transform returns a bare array, so the original row index is re-attached;
# without it the *_sel frames get a fresh 0..n-1 index and no longer line up
# with y_train / y_val / y_test for label-based pandas operations.
X_train_sel = pd.DataFrame(fs.transform(X_train), columns=selected_features, index=X_train.index)
X_val_sel = pd.DataFrame(fs.transform(X_val), columns=selected_features, index=X_val.index)
X_test_sel = pd.DataFrame(fs.transform(X_test), columns=selected_features, index=X_test.index)

In [None]:

# MODEL 1: LIGHTGBM (Baseline)

print("\n================ LIGHTGBM BASELINE ================\n")

lgbm = LGBMClassifier(
    n_estimators=600,
    learning_rate=0.01,
    num_leaves=20,
    min_child_samples=60,
    subsample=0.8,
    # Row subsampling is only applied when subsample_freq > 0; with the default
    # of 0 the subsample=0.8 setting above is silently ignored.
    subsample_freq=1,
    colsample_bytree=1.0,
    reg_alpha=1.0,
    reg_lambda=6.0,
    class_weight='balanced',
    random_state=42
)

# Early stopping monitors log-loss on the validation split (patience: 100 rounds);
# log_evaluation(0) turns off the per-iteration log output.
lgbm.fit(
    X_train_sel, y_train,
    eval_set=[(X_val_sel, y_val)],
    eval_metric="logloss",
    callbacks=[early_stopping(100), log_evaluation(0)]
)

y_pred_test = lgbm.predict(X_test_sel)
print("Train acc:", accuracy_score(y_train, lgbm.predict(X_train_sel)))
print("Test  acc:", accuracy_score(y_test, y_pred_test))
print("\nTest report:\n", classification_report(y_test, y_pred_test))

In [None]:

# MODEL 2: SVM with PCA


from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, classification_report

print("\n================ SVM MODEL ================\n")

# Define the pipeline
pipe_pca_svm = Pipeline([
    ("scaler", StandardScaler()),
    ("pca", PCA()),
    ("svc", SVC(kernel="rbf", class_weight="balanced", random_state=42))
])

# Compare PCA variance thresholds on the VALIDATION split.
# The test set must not be used for this choice: picking the threshold by test
# accuracy would make the final test score an optimistic, biased estimate.
best_comp, best_val_acc = None, -1.0
for comp in [0.90, 0.95, 0.98]:
    pipe_pca_svm.set_params(
        pca__n_components=comp,
        svc__C=5,
        svc__gamma=0.002
    )
    pipe_pca_svm.fit(X_train_sel, y_train)
    val_acc = pipe_pca_svm.score(X_val_sel, y_val)
    print(f"PCA variance {comp*100:.0f}% → Val acc: {val_acc:.3f}")
    # Strict '>' keeps the smallest threshold (fewest components) on ties.
    if val_acc > best_val_acc:
        best_comp, best_val_acc = comp, val_acc

print(f"Selected PCA variance: {best_comp*100:.0f}% (Val acc: {best_val_acc:.3f})")

# Train final model with the PCA threshold selected on the validation split
best_svm = Pipeline([
    ("scaler", StandardScaler()),
    ("pca", PCA(n_components=best_comp)),
    ("svc", SVC(kernel="rbf", C=5, gamma=0.002, class_weight="balanced", random_state=42))
])

best_svm.fit(X_train_sel, y_train)

# Evaluate final model (the test set is touched only here); predict once and reuse
y_pred_svm_test = best_svm.predict(X_test_sel)
print("\nFinal Evaluation:")
print("Train acc:", accuracy_score(y_train, best_svm.predict(X_train_sel)))
print("Test  acc:", accuracy_score(y_test, y_pred_svm_test))
print("\nClassification report (Test):\n", classification_report(y_test, y_pred_svm_test))

TabPFN

In [None]:
from tabpfn import TabPFNClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report

print("\n================ TABPFN MODEL ================\n")

# Standardize the input: the scaler learns its statistics from the training
# split and the same transform is then applied to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_sel)
X_test_scaled = scaler.transform(X_test_sel)

# Fit TabPFN on CPU using the standardized training features
tabpfn = TabPFNClassifier(device="cpu")
tabpfn.fit(X_train_scaled, y_train)

# Predict the test split first, then the training split, and report both
y_pred_tabpfn = tabpfn.predict(X_test_scaled)
train_acc_tabpfn = accuracy_score(y_train, tabpfn.predict(X_train_scaled))
test_acc_tabpfn = accuracy_score(y_test, y_pred_tabpfn)

print("Train acc:", train_acc_tabpfn)
print("Test acc:", test_acc_tabpfn)
print("\nClassification report:\n", classification_report(y_test, y_pred_tabpfn))

In [None]:
from sklearn.linear_model import LogisticRegression

# Untuned logistic-regression baseline on the selected features
logreg = LogisticRegression(
    max_iter=2000,
    class_weight='balanced',
    random_state=42,
)
logreg.fit(X_train_sel, y_train)

# Mean accuracy on each split, followed by the per-class test report
for label, features, targets in (
    ("Train acc:", X_train_sel, y_train),
    ("Test  acc:", X_test_sel, y_test),
):
    print(label, logreg.score(features, targets))

print("\nReport:\n", classification_report(y_test, logreg.predict(X_test_sel)))

In [None]:
from sklearn.ensemble import VotingClassifier

# SVM pipeline with probability=True so it exposes predict_proba,
# which soft voting needs in order to average class probabilities.
svm_steps = [
    ("scaler", StandardScaler()),
    ("pca", PCA(n_components=0.98)),
    ("svc", SVC(kernel="rbf", C=5, gamma=0.002, class_weight="balanced", probability=True, random_state=42)),
]
svm_clf = Pipeline(steps=svm_steps)
svm_clf.fit(X_train_sel, y_train)

# Soft-voting fusion of the LightGBM and SVM models
members = [('lgbm', lgbm), ('svm', svm_clf)]
ensemble = VotingClassifier(estimators=members, voting='soft')
ensemble.fit(X_train_sel, y_train)

# Evaluate: test predictions first, then training predictions
y_pred_ens = ensemble.predict(X_test_sel)
print("\n=== ENSEMBLE MODEL RESULTS ===")
ens_train_acc = accuracy_score(y_train, ensemble.predict(X_train_sel))
print("Train acc:", ens_train_acc)
print("Test  acc:", accuracy_score(y_test, y_pred_ens))
print("\nClassification Report:\n", classification_report(y_test, y_pred_ens))


In [None]:
from sklearn.model_selection import GridSearchCV, StratifiedGroupKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.svm import SVC

# Define pipeline
pipe_svm = Pipeline([
    ("scaler", StandardScaler()),
    ("pca", PCA(n_components=0.98)),
    ("svc", SVC(kernel="rbf", class_weight="balanced", probability=True, random_state=42))
])

# Define search space
param_grid = {
    'svc__C': [1, 5, 10, 25, 50, 100],
    'svc__gamma': [0.0005, 0.001, 0.002, 0.005, 0.01]
}

# Patient IDs for the rows of X_train_sel (tr_idx indexes into the training portion).
# A plain cv=5 would place recordings of the same patient in both the fitting
# and the validation folds, inflating the CV score; the folds are therefore
# built per patient, still stratified by class.
cv_groups = groups_train.iloc[tr_idx].to_numpy()
group_cv = StratifiedGroupKFold(n_splits=5)

# Perform grid search with grouped cross-validation
svm_grid = GridSearchCV(pipe_svm, param_grid, cv=group_cv, scoring='accuracy', n_jobs=-1)
svm_grid.fit(X_train_sel, y_train, groups=cv_groups)

# Evaluate best model (refitted on the whole training split by GridSearchCV)
print("Best params:", svm_grid.best_params_)
print("Best CV acc:", svm_grid.best_score_)
print("Train acc:", svm_grid.best_estimator_.score(X_train_sel, y_train))
print("Test  acc:", svm_grid.best_estimator_.score(X_test_sel, y_test))

y_pred = svm_grid.best_estimator_.predict(X_test_sel)
print("\nClassification report:\n", classification_report(y_test, y_pred))

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, StratifiedGroupKFold
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report, accuracy_score

print("\n================ LOGISTIC REGRESSION (TUNED) ================\n")

# Define pipeline
pipe_log = Pipeline([
    ("scaler", StandardScaler()),
    ("pca", PCA(n_components=0.98)),
    ("logreg", LogisticRegression(class_weight='balanced', max_iter=5000, random_state=42))
])


# Define search space
param_grid = {
    'logreg__C': [0.5, 1, 2, 5, 10],
    'logreg__solver': ['liblinear'],
    'logreg__penalty': ['l2']
}

# Patient IDs for the rows of X_train_sel (tr_idx indexes into the training portion).
# Folds are built per patient so that no patient contributes recordings to both
# the fitting and the validation side of a fold; a plain cv=5 would leak
# patient identity across folds and overstate the CV accuracy.
log_cv_groups = groups_train.iloc[tr_idx].to_numpy()
log_group_cv = StratifiedGroupKFold(n_splits=5)

# Grid search with grouped cross-validation
log_grid = GridSearchCV(pipe_log, param_grid, cv=log_group_cv, scoring='accuracy', n_jobs=-1)
log_grid.fit(X_train_sel, y_train, groups=log_cv_groups)

# Evaluate best model
best_log = log_grid.best_estimator_
y_pred_log = best_log.predict(X_test_sel)

print("Best params:", log_grid.best_params_)
print("Best CV acc:", log_grid.best_score_)
print("Train acc:", best_log.score(X_train_sel, y_train))
print("Test  acc:", best_log.score(X_test_sel, y_test))

print("\nClassification report:\n", classification_report(y_test, y_pred_log))

In [None]:
# ROC CURVES FOR EACH MODEL

from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

# RBF-SVM pipeline with probability=True so that predict_proba is available
svm_clf = Pipeline([
    ("scaler", StandardScaler()),
    ("pca", PCA(n_components=0.98)),
    ("svc", SVC(kernel="rbf", C=5, gamma=0.002, class_weight="balanced",
                probability=True, random_state=42))
])
svm_clf.fit(X_train_sel, y_train)

# Predicted probabilities for the second class (column 1) on the test split.
# TabPFN was fitted on standardized features (X_train_scaled), so it has to be
# scored on X_test_scaled; feeding it the raw X_test_sel gives meaningless
# probabilities. The other models take the unscaled selected features.
y_probs = {
    "LightGBM": lgbm.predict_proba(X_test_sel)[:, 1],
    "SVM": svm_clf.predict_proba(X_test_sel)[:, 1],
    "Logistic Regression": best_log.predict_proba(X_test_sel)[:, 1],
    "TabPFN": tabpfn.predict_proba(X_test_scaled)[:, 1],
    "Ensemble": ensemble.predict_proba(X_test_sel)[:, 1],
}

# ROC curves: one figure per model, with the chance diagonal for reference
for model_name, y_prob in y_probs.items():
    fpr, tpr, _ = roc_curve(y_test, y_prob)
    auc_score = auc(fpr, tpr)

    fig, ax = plt.subplots(figsize=(6, 5))
    ax.plot(fpr, tpr, color='blue', lw=2, label=f"AUC = {auc_score:.3f}")
    ax.plot([0, 1], [0, 1], 'k--', lw=1.2)
    ax.set_xlabel("False Positive Rate", fontsize=11)
    ax.set_ylabel("True Positive Rate", fontsize=11)
    ax.set_title(f"ROC Curve — {model_name}", fontsize=13, fontweight="bold")
    ax.legend(loc="lower right")
    ax.grid(True, linestyle='--', alpha=0.6)
    fig.tight_layout()
    # Render each figure as it is produced instead of relying on end-of-cell display
    plt.show()

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

def _plot_confusion(y_true, y_pred, title, cmap):
    """Compute, draw and show a confusion matrix for one model.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, same length as ``y_true``.
    title : str
        Figure title.
    cmap : str
        Matplotlib colormap name used for the matrix cells.

    Returns
    -------
    tuple
        ``(cm, disp)``: the confusion-matrix array and the
        ``ConfusionMatrixDisplay`` that was plotted.
    """
    cm = confusion_matrix(y_true, y_pred)
    # NOTE(review): the tick labels assume the lower label value means healthy
    # and the higher one Parkinson — confirm against the dataset's Status coding.
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['Healthy', 'Parkinson'])
    disp.plot(cmap=cmap)
    plt.title(title)
    plt.show()
    return cm, disp


# CONFUSION MATRIX - LIGHTGBM
y_pred_lgbm = lgbm.predict(X_test_sel)
cm_lgbm, disp_lgbm = _plot_confusion(y_test, y_pred_lgbm, "Confusion Matrix — LightGBM", 'Blues')


# CONFUSION MATRIX - SVM
y_pred_svm = svm_clf.predict(X_test_sel)
cm_svm, disp_svm = _plot_confusion(y_test, y_pred_svm, "Confusion Matrix — SVM", 'Greens')


# CONFUSION MATRIX - LOGISTIC REGRESSION
y_pred_log = best_log.predict(X_test_sel)
cm_log, disp_log = _plot_confusion(y_test, y_pred_log, "Confusion Matrix — Logistic Regression", 'Oranges')


# CONFUSION MATRIX - TABPFN
# TabPFN was fitted on standardized features, so it must predict on
# X_test_scaled rather than the raw X_test_sel.
y_pred_tab = tabpfn.predict(X_test_scaled)
cm_tab, disp_tab = _plot_confusion(y_test, y_pred_tab, "Confusion Matrix — TabPFN", 'Purples')
