In [1]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
import librosa

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
import joblib
import seaborn as sns
import matplotlib.pyplot as plt

def extract_audio_features(path, sr_target=16000):
    """Summarise one audio file as a fixed-length vector of spectral statistics.

    The file is loaded as mono audio resampled to ``sr_target``. Per-frame
    features are collapsed over time and concatenated in this order:

    * MFCC (13 coefficients): mean, then standard deviation
    * first-order MFCC delta: mean
    * second-order MFCC delta: mean
    * chroma (STFT based): mean
    * spectral contrast: mean
    * tonnetz of the harmonic component: mean

    With librosa's default settings this yields 77 values per file.

    Parameters
    ----------
    path : str
        Location of the audio file to read.
    sr_target : int, default 16000
        Sampling rate the audio is resampled to on load.

    Returns
    -------
    numpy.ndarray or None
        1-D ``float32`` feature vector, or ``None`` when the clip is shorter
        than one second or when loading / feature extraction fails.
    """
    try:
        y, sr = librosa.load(path, sr=sr_target, mono=True)

        # `sr` samples correspond to one second of audio; anything shorter is
        # treated as too short to be useful.
        if len(y) < sr:
            return None

        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
        mfcc_delta = librosa.feature.delta(mfcc)
        mfcc_delta2 = librosa.feature.delta(mfcc, order=2)

        chroma = librosa.feature.chroma_stft(y=y, sr=sr)
        contrast = librosa.feature.spectral_contrast(y=y, sr=sr)
        # Tonnetz is computed on the harmonic part of the signal only.
        tonnetz = librosa.feature.tonnetz(
            y=librosa.effects.harmonic(y), sr=sr
        )

        # Collapse the time axis (axis=1) so every file yields the same length.
        feat = np.hstack([
            mfcc.mean(axis=1),
            mfcc.std(axis=1),
            mfcc_delta.mean(axis=1),
            mfcc_delta2.mean(axis=1),
            chroma.mean(axis=1),
            contrast.mean(axis=1),
            tonnetz.mean(axis=1),
        ])

        return feat.astype(np.float32)

    except Exception as e:
        # Deliberately best-effort: one unreadable file must not abort a long
        # batch run. Report the actual cause so that genuine corruption can be
        # told apart from e.g. a missing decoder or a wrong path.
        print(f"⚠️ Skipping corrupted file: {path} ({type(e).__name__}: {e})")
        return None



## Build feature matrix


In [2]:
AUDIO_ROOT = "../dataset/talkbank"
TASKS = ["cookie", "fluency", "recall", "sentence"]


def collect_audio_files(root, groups, tasks):
    """Collect audio file paths and their group labels from ``root/<group>/<task>/``.

    Parameters
    ----------
    root : str
        Dataset root directory.
    groups : iterable of str
        Sub-directory names directly under ``root``; each name is also used as
        the label of every file found beneath it.
    tasks : iterable of str
        Sub-directory names under each group directory.

    Returns
    -------
    (list of str, list of str)
        Parallel lists of file paths and labels. Only ``.wav`` / ``.mp3`` files
        (case-insensitive) are kept. Folders that do not exist are reported
        and skipped.
    """
    found_paths = []
    found_labels = []

    for group in groups:
        for task in tasks:
            folder = os.path.join(root, group, task)
            print("Reading:", folder)

            if not os.path.isdir(folder):
                print("  -> folder not found, skipping:", folder)
                continue

            # os.listdir returns entries in arbitrary order; sorting keeps the
            # sample order (and anything derived from it later, such as a
            # seeded split) reproducible across machines and runs.
            for name in sorted(os.listdir(folder)):
                if name.lower().endswith((".wav", ".mp3")):
                    found_paths.append(os.path.join(folder, name))
                    found_labels.append(group)

    return found_paths, found_labels


paths, labels = collect_audio_files(AUDIO_ROOT, ["Control", "Dementia"], TASKS)

len(paths), len(labels)

Reading: ../dataset/talkbank\Control\cookie
Reading: ../dataset/talkbank\Control\fluency
Reading: ../dataset/talkbank\Control\recall
Reading: ../dataset/talkbank\Control\sentence
Reading: ../dataset/talkbank\Dementia\cookie
Reading: ../dataset/talkbank\Dementia\fluency
Reading: ../dataset/talkbank\Dementia\recall
Reading: ../dataset/talkbank\Dementia\sentence


(1361, 1361)

In [3]:
# Feature extraction is very slow (the recorded run of this cell took more than
# six hours), so the result is cached on disk and reused on later runs.
# Delete the cache file to force a fresh extraction, e.g. after changing
# extract_audio_features or replacing audio files in place.
FEATURE_CACHE = "audio_features_cache.npz"

features = None
valid_labels = None
skipped = 0

if os.path.exists(FEATURE_CACHE):
    with np.load(FEATURE_CACHE) as cache:
        # Only trust the cache when it was built from exactly the same file list.
        if "paths" in cache.files and list(cache["paths"]) == list(paths):
            features = list(cache["X_audio"])
            valid_labels = cache["y_audio_text"].tolist()
            skipped = int(cache["skipped"])
            print(f"Loaded cached features from {FEATURE_CACHE}")

if features is None:
    features = []
    valid_labels = []
    skipped = 0

    for p, label in tqdm(zip(paths, labels), total=len(paths), desc="Extracting audio features"):
        feat = extract_audio_features(p)

        # None means the file was too short or could not be processed.
        if feat is None:
            skipped += 1
            continue

        features.append(feat)
        valid_labels.append(label)

    np.savez(
        FEATURE_CACHE,
        X_audio=np.array(features),
        y_audio_text=np.array(valid_labels),
        paths=np.array(paths),
        skipped=skipped,
    )

print("\n✅ Finished")
print(f"✔ Used files: {len(features)}")
print(f"⚠️ Skipped corrupted files: {skipped}")


X_audio = np.array(features)
y_audio_text = np.array(valid_labels)

print(X_audio.shape, y_audio_text.shape)



  y, sr = librosa.load(path, sr=sr_target, mono=True)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
Extracting audio features:  36%|█████████████████████████████████████████████████████▊                                                                                                 | 485/1361 [18:19<32:57,  2.26s/it]

⚠️ Skipping corrupted file: ../dataset/talkbank\Control\fluency\332-0.mp3


Extracting audio features:  94%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏        | 1280/1361 [6:12:29<05:36,  4.15s/it]

⚠️ Skipping corrupted file: ../dataset/talkbank\Dementia\sentence\269-1.mp3


Extracting audio features: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1361/1361 [6:18:45<00:00, 16.70s/it]


✅ Finished
✔ Used files: 1359
⚠️ Skipped corrupted files: 2
(1359, 77) (1359,)





## Encode labels + scale + train/test split