#RQ: RQ1.1: What are the most important features in detecting PD from speech?


In [None]:
import zipfile, os
import glob
from sklearn.model_selection import train_test_split
import librosa
import numpy as np
from tqdm import tqdm
from sklearn.svm import SVC
from scipy.stats import entropy
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score

In [None]:
# in a code cell
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


#step 1: extract zip

In [None]:
zip_path = '/content/drive/MyDrive/Italian_Parkinsons_Voice_and_speech.zip'

extract_path = '/content/audio_dataset'
os.makedirs(extract_path, exist_ok=True)

#extract zip
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
  zip_ref.extractall(extract_path)

#step 2: list all .wav files recursively

In [None]:
# Define root folder of unzipped data
dataset_root = '/content/audio_dataset'

# Assign labels
label_map = {
    '28 People with Parkinson\'s disease': 1,
    '15 Young Healthy Control': 0,
    '22 Elderly Healthy Control': 0
}

# Gather all .wav file paths and corresponding labels
file_paths = []
labels = []

for group_folder, label in label_map.items():
    group_path = os.path.join(dataset_root, group_folder)
    wavs = glob.glob(os.path.join(group_path, '**', '*.wav'), recursive=True)
    file_paths.extend(wavs)
    labels.extend([label] * len(wavs))

print(f'Total files: {len(file_paths)}')

Total files: 831


#step 3: train/test split

In [None]:
X_train_paths, X_test_paths, y_train, y_test = train_test_split(
    file_paths, labels, test_size=0.2, stratify=labels, random_state=42
)

#step 4: extract features

In [None]:
def extract_mfcc(file_path, n_mfcc=13, max_len=100):
    y, sr = librosa.load(file_path, sr=None)
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)

    #Pad to fixed length
    if mfcc.shape[1] < max_len:
        pad_width = max_len - mfcc.shape[1]
        mfcc = np.pad(mfcc, pad_width=((0,0),(0, pad_width)), mode='constant')
    else:
        mfcc = mfcc[:, :max_len]

    return mfcc.flatten()

# Extract for train and test
X_train_mfcc = np.array([extract_mfcc(p) for p in tqdm(X_train_paths)])
X_test_mfcc = np.array([extract_mfcc(p) for p in tqdm(X_test_paths)])

100%|██████████| 664/664 [00:53<00:00, 12.51it/s]
100%|██████████| 167/167 [00:09<00:00, 17.42it/s]


In [None]:
def summarize_coefficients(coeffs, prefix):
    summary = {}
    for i, coeff in enumerate(coeffs):
        summary[f'{prefix}_{i+1}_mean'] = np.mean(coeff)
        summary[f'{prefix}_{i+1}_std'] = np.std(coeff)
        summary[f'{prefix}_{i+1}_min'] = np.min(coeff)
        summary[f'{prefix}_{i+1}_max'] = np.max(coeff)
    return summary

In [None]:
def extract_mfb(file_path, n_mels=40):
    y, sr = librosa.load(file_path, sr=None)
    y = librosa.util.normalize(y)

    # Mel Filter Banks (MFB)
    mel_spec = librosa.feature.melspectrogram(
        y=y, sr=sr, n_fft=2048, hop_length=512,
        n_mels=n_mels, fmin=0, fmax=sr // 2
    )
    log_mel_spec = librosa.power_to_db(mel_spec, ref=np.max)

    # Summarize (e.g., mean, std, skewness, etc.)
    mfb_summary = summarize_coefficients(log_mel_spec, "mfb")

    return mfb_summary

In [None]:
X_train_mfb_df = pd.DataFrame([extract_mfb(p) for p in tqdm(X_train_paths)])
X_test_mfb_df = pd.DataFrame([extract_mfb(p) for p in tqdm(X_test_paths)])

100%|██████████| 664/664 [00:32<00:00, 20.73it/s]
100%|██████████| 167/167 [00:07<00:00, 22.06it/s]


In [None]:
# Summarize entropy features
def summarize_entropy(signal, sr, frame_length=2048, hop_length=512, n_bins=30):
    frames = librosa.util.frame(signal, frame_length=frame_length, hop_length=hop_length)
    time_entropy = []
    for frame in frames.T:
        hist, _ = np.histogram(np.abs(frame), bins=n_bins, density=True)
        hist += 1e-10
        time_entropy.append(entropy(hist, base=2))
    return {
        'entropy_mean': np.mean(time_entropy),
        'entropy_std': np.std(time_entropy),
        'entropy_min': np.min(time_entropy),
        'entropy_max': np.max(time_entropy)
    }

# Extract entropy feature
def extract_features(file_path):
    y, sr = librosa.load(file_path, sr=None)
    y = librosa.util.normalize(y)
    entropy_summary = summarize_entropy(y, sr)
    return entropy_summary

# Apply feature extraction to train/test sets
X_train_entropy = [extract_features(p) for p in tqdm(X_train_paths)]
X_test_entropy = [extract_features(p) for p in tqdm(X_test_paths)]

X_train_entropy_df = pd.DataFrame(X_train_entropy)
X_test_df = pd.DataFrame(X_test_entropy)

100%|██████████| 664/664 [05:24<00:00,  2.04it/s]
100%|██████████| 167/167 [01:36<00:00,  1.73it/s]


#Train SVM

In [None]:
clf = SVC(kernel='rbf', C=1, gamma='scale')
clf.fit(X_train_mfcc, y_train)

# Evaluate
y_pred = clf.predict(X_test_mfcc)
print("SVM statistics mfcc")
print(classification_report(y_test, y_pred))

SVM statistics mfcc
              precision    recall  f1-score   support

           0       0.92      0.90      0.91        79
           1       0.91      0.93      0.92        88

    accuracy                           0.92       167
   macro avg       0.92      0.92      0.92       167
weighted avg       0.92      0.92      0.92       167



In [None]:
clf = SVC(kernel='rbf', C=1, gamma='scale')
clf.fit(X_train_mfb_df, y_train)
y_pred = clf.predict(X_test_mfb_df)

print("SVM statistics mfb")
print(classification_report(y_test, y_pred))

SVM statistics mfb
              precision    recall  f1-score   support

           0       0.96      0.94      0.95        79
           1       0.94      0.97      0.96        88

    accuracy                           0.95       167
   macro avg       0.95      0.95      0.95       167
weighted avg       0.95      0.95      0.95       167



In [None]:
svm = SVC(kernel='linear', C=1.0, random_state=42)
svm.fit(X_train_entropy_df, y_train)
y_pred = svm.predict(X_test_df)

print("SVM statistics time entropy")
print(classification_report(y_test, y_pred))

SVM statistics time entropy
              precision    recall  f1-score   support

           0       0.57      0.63      0.60        79
           1       0.64      0.58      0.61        88

    accuracy                           0.60       167
   macro avg       0.61      0.61      0.60       167
weighted avg       0.61      0.60      0.60       167



#Train Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [None]:
# Random Forest on MFCC features
rf_mfcc = RandomForestClassifier(n_estimators=100, random_state=42)
rf_mfcc.fit(X_train_mfcc, y_train)
y_pred_mfcc = rf_mfcc.predict(X_test_mfcc)
print("Random Forest statistics MFCC")
print(classification_report(y_test, y_pred_mfcc))

Random Forest statistics MFCC
              precision    recall  f1-score   support

           0       0.99      0.96      0.97        79
           1       0.97      0.99      0.98        88

    accuracy                           0.98       167
   macro avg       0.98      0.98      0.98       167
weighted avg       0.98      0.98      0.98       167



In [None]:
# Random Forest on MFB features
rf_mfb = RandomForestClassifier(n_estimators=100, random_state=42)
rf_mfb.fit(X_train_mfb_df, y_train)
y_pred_mfb = rf_mfb.predict(X_test_mfb_df)
print("Random Forest statistics MFB")
print(classification_report(y_test, y_pred_mfb))

Random Forest statistics MFB
              precision    recall  f1-score   support

           0       0.99      1.00      0.99        79
           1       1.00      0.99      0.99        88

    accuracy                           0.99       167
   macro avg       0.99      0.99      0.99       167
weighted avg       0.99      0.99      0.99       167



In [None]:
# Random Forest on time entropy features
rf_entropy = RandomForestClassifier(n_estimators=100, random_state=42)
rf_entropy.fit(X_train_entropy_df, y_train)
y_pred_entropy = rf_entropy.predict(X_test_df)
print("Random Forest statistics Time Entropy")
print(classification_report(y_test, y_pred_entropy))

Random Forest statistics Time Entropy
              precision    recall  f1-score   support

           0       0.82      0.70      0.75        79
           1       0.76      0.86      0.81        88

    accuracy                           0.78       167
   macro avg       0.79      0.78      0.78       167
weighted avg       0.79      0.78      0.78       167

