In [1]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
import librosa

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
import joblib
import seaborn as sns
import matplotlib.pyplot as plt

def extract_audio_features(path, sr_target=16000):
    try:
        y, sr = librosa.load(path, sr=sr_target, mono=True)

        if len(y) < sr:  # < 1 second audio ‚Üí useless
            return None

        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
        mfcc_delta = librosa.feature.delta(mfcc)
        mfcc_delta2 = librosa.feature.delta(mfcc, order=2)

        chroma = librosa.feature.chroma_stft(y=y, sr=sr)
        contrast = librosa.feature.spectral_contrast(y=y, sr=sr)
        tonnetz = librosa.feature.tonnetz(
            y=librosa.effects.harmonic(y), sr=sr
        )

        feat = np.hstack([
            mfcc.mean(axis=1),
            mfcc.std(axis=1),
            mfcc_delta.mean(axis=1),
            mfcc_delta2.mean(axis=1),
            chroma.mean(axis=1),
            contrast.mean(axis=1),
            tonnetz.mean(axis=1),
        ])

        return feat.astype(np.float32)

    except Exception as e:
        print(f"‚ö†Ô∏è Skipping corrupted file: {path}")
        return None



## Build feature matrix


In [2]:
AUDIO_ROOT = "../dataset/talkbank"
TASKS = ["cookie", "fluency", "recall", "sentence"]

paths = []
labels = []

for label in ["Control", "Dementia"]:
    for task in TASKS:
        folder = os.path.join(AUDIO_ROOT, label, task)
        print("Reading:", folder)

        if not os.path.exists(folder):
            continue

        for f in os.listdir(folder):
            if f.lower().endswith((".wav", ".mp3")):
                paths.append(os.path.join(folder, f))
                labels.append(label)

len(paths), len(labels)

Reading: ../dataset/talkbank\Control\cookie
Reading: ../dataset/talkbank\Control\fluency
Reading: ../dataset/talkbank\Control\recall
Reading: ../dataset/talkbank\Control\sentence
Reading: ../dataset/talkbank\Dementia\cookie
Reading: ../dataset/talkbank\Dementia\fluency
Reading: ../dataset/talkbank\Dementia\recall
Reading: ../dataset/talkbank\Dementia\sentence


(1361, 1361)

In [None]:
features = []
valid_labels = []

skipped = 0

for p, label in tqdm(zip(paths, labels), total=len(paths), desc="Extracting audio features"):
    feat = extract_audio_features(p)

    if feat is None:
        skipped += 1
        continue

    features.append(feat)
    valid_labels.append(label)

print(f"\n Finished")
print(f"Used files: {len(features)}")
print(f" Skipped corrupted files: {skipped}")


X_audio = np.array(features)
y_audio_text = np.array(valid_labels)

print(X_audio.shape, y_audio_text.shape)



  y, sr = librosa.load(path, sr=sr_target, mono=True)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
Extracting audio features:  36%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä                                                                                                 | 485/1361 [18:19<32:57,  2.26s/it]

‚ö†Ô∏è Skipping corrupted file: ../dataset/talkbank\Control\fluency\332-0.mp3


Extracting audio features:  94%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè        | 1280/1361 [6:12:29<05:36,  4.15s/it]

‚ö†Ô∏è Skipping corrupted file: ../dataset/talkbank\Dementia\sentence\269-1.mp3


Extracting audio features: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1361/1361 [6:18:45<00:00, 16.70s/it]


‚úÖ Finished
‚úî Used files: 1359
‚ö†Ô∏è Skipped corrupted files: 2
(1359, 77) (1359,)





## Encode labels + scale + train/test split

In [5]:
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

le = LabelEncoder()
y = le.fit_transform(y_audio_text)  # Control=0, Dementia=1

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_audio)

X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

In [6]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

In [7]:
models = {
    "SVM_RBF": SVC(kernel="rbf", probability=True, class_weight="balanced"),
    
    "LogisticRegression": LogisticRegression(
        max_iter=2000,
        class_weight="balanced"
    ),
    
    "RandomForest": RandomForestClassifier(
        n_estimators=400,
        class_weight="balanced",
        random_state=42
    ),
    
    "XGBoost": XGBClassifier(
        n_estimators=300,
        max_depth=5,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        eval_metric="logloss",
        random_state=42
    )
}


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
import pandas as pd

def compute_metrics(y_true, y_pred):
    return {
        "Accuracy": accuracy_score(y_true, y_pred),
        "Precision": precision_score(y_true, y_pred),
        "Recall": recall_score(y_true, y_pred),
        "F1": f1_score(y_true, y_pred)
    }

results = []
trained_models = {}

for name, model in models.items():
    print(f"\n Training {name}...")
    
    # Train
    model.fit(X_train, y_train)
    
    # Predictions
    train_preds = model.predict(X_train)
    test_preds = model.predict(X_test)
    
    # Metrics
    train_metrics = compute_metrics(y_train, train_preds)
    test_metrics = compute_metrics(y_test, test_preds)
    
    # Store results
    results.append({
        "Model": name,
        
        "Train_Accuracy": train_metrics["Accuracy"],
        "Train_Precision": train_metrics["Precision"],
        "Train_Recall": train_metrics["Recall"],
        "Train_F1": train_metrics["F1"],
        
        "Test_Accuracy": test_metrics["Accuracy"],
        "Test_Precision": test_metrics["Precision"],
        "Test_Recall": test_metrics["Recall"],
        "Test_F1": test_metrics["F1"]
    })
    
    trained_models[name] = model
    
    # Optional: print detailed test report
    print("\n Test Classification Report:")
    print(classification_report(
        y_test, test_preds,
        target_names=le.classes_
    ))



üöÄ Training SVM_RBF...

üìä Test Classification Report:
              precision    recall  f1-score   support

     Control       0.70      0.93      0.80        97
    Dementia       0.95      0.78      0.86       175

    accuracy                           0.83       272
   macro avg       0.82      0.85      0.83       272
weighted avg       0.86      0.83      0.83       272


üöÄ Training LogisticRegression...

üìä Test Classification Report:
              precision    recall  f1-score   support

     Control       0.61      0.73      0.66        97
    Dementia       0.83      0.74      0.78       175

    accuracy                           0.74       272
   macro avg       0.72      0.73      0.72       272
weighted avg       0.75      0.74      0.74       272


üöÄ Training RandomForest...

üìä Test Classification Report:
              precision    recall  f1-score   support

     Control       0.85      0.82      0.84        97
    Dementia       0.90      0.92      0.

In [13]:
results_df = pd.DataFrame(results)

# Sort by Test F1
results_df = results_df.sort_values(by="Test_F1", ascending=False)

results_df


Unnamed: 0,Model,Train_Accuracy,Train_Precision,Train_Recall,Train_F1,Test_Accuracy,Test_Precision,Test_Recall,Test_F1
2,RandomForest,1.0,1.0,1.0,1.0,0.886029,0.904494,0.92,0.912181
3,XGBoost,1.0,1.0,1.0,1.0,0.886029,0.923529,0.897143,0.910145
0,SVM_RBF,0.915363,0.98871,0.878223,0.930197,0.830882,0.951049,0.777143,0.855346
1,LogisticRegression,0.801288,0.896382,0.780802,0.834609,0.735294,0.832258,0.737143,0.781818


In [None]:

best_row = results_df.iloc[0]
best_model_name = best_row["Model"]
best_model = trained_models[best_model_name]

print("Best model based on TEST F1:", best_model_name)


üèÜ Best model based on TEST F1: RandomForest


In [18]:
import joblib
import os

os.makedirs("../models", exist_ok=True)

joblib.dump(best_model, "../models/best_audio_model.pkl")
joblib.dump(scaler, "../models/audio_scaler.pkl")
joblib.dump(le, "../models/audio_label_encoder.pkl")


['../models/audio_label_encoder.pkl']

In [19]:
results_df["Overfitting_Gap"] = results_df["Train_F1"] - results_df["Test_F1"]
results_df[["Model", "Train_F1", "Test_F1", "Overfitting_Gap"]]


Unnamed: 0,Model,Train_F1,Test_F1,Overfitting_Gap
2,RandomForest,1.0,0.912181,0.087819
3,XGBoost,1.0,0.910145,0.089855
0,SVM_RBF,0.930197,0.855346,0.074851
1,LogisticRegression,0.834609,0.781818,0.052791
