In [1]:
# ----------------------------------------------
# Random Forest Baseline for Parkinson's Detection (LOPO)
# ----------------------------------------------

import os
import urllib.request

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, matthews_corrcoef
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.preprocessing import MinMaxScaler

# ------------------------------
# Step 1: Load and Preprocess Dataset
# ------------------------------

# Download dataset (only when no usable local copy exists, so that
# Restart & Run All does not hit the network on every run).
url = 'https://raw.githubusercontent.com/dcleres/Parkinson_Disease_ML/refs/heads/master/pd_speech_features.csv'
file_name = 'pd_speech_features.csv'
if not (os.path.isfile(file_name) and os.path.getsize(file_name) > 0):
    # Pure-Python download: no dependency on an external `wget` binary, and
    # network/HTTP failures raise instead of being silently ignored.
    # Write to a temporary name first so an interrupted transfer never
    # leaves a truncated file under the final name.
    partial_name = file_name + '.part'
    urllib.request.urlretrieve(url, partial_name)
    os.replace(partial_name, file_name)

# Load CSV and fix header: the real column names are on the second line of
# the file, so tell the parser to use that line as the header. The line
# above it is skipped, the index starts at 0, and numeric columns are
# parsed as numbers straight away.
# NOTE(review): assumes the header row has no duplicate names (pandas would
# rename duplicates) -- confirm against the file.
pd_speech_features = pd.read_csv(file_name, header=1)

# Convert types: coerce every column to a number (stored as float) first,
# then narrow the identifier/count columns to int and mark the two
# two-valued columns as categoricals in a single astype call.
pd_speech_features = pd_speech_features.apply(pd.to_numeric).astype(float)
pd_speech_features = pd_speech_features.astype({
    'id': int,
    'numPulses': int,
    'numPeriodsPulses': int,
    'gender': 'category',
    'class': 'category',
})

# ------------------------------
# Step 2: Feature Selection (All or Specific)
# ------------------------------

# Use all features except id, gender, class
all_features = pd_speech_features.drop(['id', 'gender', 'class'], axis=1)

# Rescale every feature column to [0, 1].
# NOTE(review): the scaler is fit on all rows, i.e. before any train/test
# split, so per-feature min/max also reflect rows that are later held out.
# Presumably harmless for tree-based models, but fit it inside each fold if
# a scale-sensitive model is ever evaluated on these features.
scaler = MinMaxScaler()
all_features_scaled = pd.DataFrame(
    scaler.fit_transform(all_features),
    columns=all_features.columns,
    # Keep the source index so the scaled frame stays label-aligned with
    # `labels` and `person_ids` (both taken from pd_speech_features);
    # without it the frame gets a fresh 0-based index and any label-based
    # operation (boolean masks, concat, .loc) would pair the wrong rows.
    index=all_features.index,
)

# Labels and groups
labels = pd_speech_features['class'].astype(int)      # target, one value per recording
person_ids = pd_speech_features['id'].astype(int)     # grouping key for leave-one-person-out

# ------------------------------


In [None]:
# Step 3: LOPO-CV with Random Forest
# ------------------------------

# Leave-one-person-out CV: every fold holds out all rows that share one
# value of `person_ids` and trains on the rest.
logo = LeaveOneGroupOut()
splits = list(logo.split(all_features_scaled, labels, groups=person_ids))

# Loop-invariant model settings. n_jobs=-1 builds the trees of each forest on
# all available cores; the seed is unchanged, so the fitted trees are the same.
rf_params = dict(n_estimators=100, random_state=42, n_jobs=-1)

# Predictions / true labels pooled over all folds; the final metrics are
# computed on these pooled lists rather than averaged per fold.
all_preds = []
all_labels = []

for i, (train_idx, test_idx) in enumerate(splits):
    print(f"Fold {i+1}/{len(splits)}")

    # The split yields positional indices, hence .iloc.
    X_train, X_test = all_features_scaled.iloc[train_idx], all_features_scaled.iloc[test_idx]
    y_train, y_test = labels.iloc[train_idx], labels.iloc[test_idx]

    # Define and train Random Forest (a fresh model per fold)
    clf = RandomForestClassifier(**rf_params)
    clf.fit(X_train, y_train)

    # Predict and evaluate
    y_pred = clf.predict(X_test)

    all_preds.extend(y_pred)
    all_labels.extend(y_test)

    # Per-fold accuracy is computed on the held-out group's rows only.
    print(f"Fold {i+1} done. Accuracy = {accuracy_score(y_test, y_pred):.4f}")

# Sanity check: every row must have been predicted exactly once.
assert len(all_preds) == len(all_labels) == len(labels), "fold predictions do not cover the dataset"

# ------------------------------
# Step 4: Final Metrics
# ------------------------------

# Score the predictions pooled across all folds with each metric in turn.
accuracy, f1, mcc = (
    score_fn(all_labels, all_preds)
    for score_fn in (accuracy_score, f1_score, matthews_corrcoef)
)

# Assemble the report first, then emit it in one write.
report_lines = [
    "\nFinal Random Forest Results:",
    f"Accuracy : {accuracy:.4f}",
    f"F1-Score : {f1:.4f}",
    f"MCC      : {mcc:.4f}",
]
print("\n".join(report_lines))

Fold 1/252
Fold 1 done. Accuracy = 1.0000
Fold 2/252
Fold 2 done. Accuracy = 1.0000
Fold 3/252
Fold 3 done. Accuracy = 1.0000
Fold 4/252
Fold 4 done. Accuracy = 1.0000
Fold 5/252
Fold 5 done. Accuracy = 1.0000
Fold 6/252
Fold 6 done. Accuracy = 1.0000
Fold 7/252
Fold 7 done. Accuracy = 1.0000
Fold 8/252
Fold 8 done. Accuracy = 0.6667
Fold 9/252
Fold 9 done. Accuracy = 1.0000
Fold 10/252
Fold 10 done. Accuracy = 1.0000
Fold 11/252
Fold 11 done. Accuracy = 1.0000
Fold 12/252
Fold 12 done. Accuracy = 0.0000
Fold 13/252
Fold 13 done. Accuracy = 1.0000
Fold 14/252
Fold 14 done. Accuracy = 1.0000
Fold 15/252
Fold 15 done. Accuracy = 0.0000
Fold 16/252
Fold 16 done. Accuracy = 1.0000
Fold 17/252
Fold 17 done. Accuracy = 1.0000
Fold 18/252
Fold 18 done. Accuracy = 1.0000
Fold 19/252
Fold 19 done. Accuracy = 1.0000
Fold 20/252
