Dataset: 4Q Audio Emotion Dataset (Russell's model, 2018), available at https://mir.dei.uc.pt/downloads.html

In [5]:
from __future__ import annotations
import os
from pathlib import Path
from dataclasses import dataclass
import numpy as np
import pandas as pd
import librosa
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier


@dataclass
class FeatureConfig:
    """Settings for audio loading and feature extraction.

    The fields are read by ``extract_acoustic_features``.
    """
    sr: int = 22050                        # sample rate passed to librosa.load
    mono: bool = True                      # passed to librosa.load as ``mono``
    n_mfcc: int = 20                       # number of MFCC coefficients requested
    n_fft: int = 2048                      # FFT size / frame length used by the frame-based features
    hop_length: int = 512                  # hop between successive frames, in samples
    enforce_seconds: float | None = 30.0   # crop/zero-pad each clip to this duration; None keeps the loaded length
    top_db_trim: int | None = None         # ``top_db`` for librosa.effects.trim; None skips trimming


# Maps each quadrant name (Q1..Q4, also used as the sub-folder name) to its integer class label.
LABEL_MAP = {"Q1": 0, "Q2": 1, "Q3": 2, "Q4": 3}



def _fix_length(y: np.ndarray, sr: int, seconds: float) -> np.ndarray:
    target = int(sr * seconds)
    if len(y) == target:
        return y
    if len(y) > target:
        return y[:target]
    return np.pad(y, (0, target - len(y)), mode="constant")


def _agg_stats(x: np.ndarray, prefix: str) -> dict:
    feats = {}
    if x.ndim == 1:
        feats[f"{prefix}_mean"] = float(np.mean(x))
        feats[f"{prefix}_std"]  = float(np.std(x))
        feats[f"{prefix}_min"]  = float(np.min(x))
        feats[f"{prefix}_max"]  = float(np.max(x))
        return feats

    # x: (F, T)
    means = np.mean(x, axis=1)
    stds  = np.std(x, axis=1)
    mins  = np.min(x, axis=1)
    maxs  = np.max(x, axis=1)

    for i in range(x.shape[0]):
        feats[f"{prefix}{i+1:02d}_mean"] = float(means[i])
        feats[f"{prefix}{i+1:02d}_std"]  = float(stds[i])
        feats[f"{prefix}{i+1:02d}_min"]  = float(mins[i])
        feats[f"{prefix}{i+1:02d}_max"]  = float(maxs[i])
    return feats


#Feature extraction
def extract_acoustic_features(
    audio_path: str | Path,
    cfg: FeatureConfig | None = None,
) -> dict:
    """Load one audio file and reduce it to a flat dict of summary features.

    The signal is loaded with librosa, optionally silence-trimmed and forced to
    a fixed duration, then MFCC, spectral centroid, spectral roll-off, onset
    strength (used as spectral flux), zero-crossing rate, chroma and RMS are
    computed and summarised with ``_agg_stats``. A tempo estimate is stored
    under ``"tempo_bpm"``.

    Args:
        audio_path: Path of the audio file to analyse.
        cfg: Extraction settings. ``None`` builds a fresh ``FeatureConfig()``
            per call, so no instance is shared between calls.

    Returns:
        Dict mapping feature names to floats.

    Raises:
        ValueError: If no samples remain after loading/trimming.
    """
    if cfg is None:
        cfg = FeatureConfig()

    audio_path = str(audio_path)

    y, sr = librosa.load(audio_path, sr=cfg.sr, mono=cfg.mono)
    # NOTE(review): the aggregation below looks written for a mono signal;
    # confirm the behaviour before running with cfg.mono=False.

    #trim silence
    if cfg.top_db_trim is not None:
        y, _ = librosa.effects.trim(y, top_db=cfg.top_db_trim)

    #force fixed length
    if cfg.enforce_seconds is not None:
        y = _fix_length(y, sr, cfg.enforce_seconds)

    # Fail early with the offending path instead of an opaque error further down.
    if y.size == 0:
        raise ValueError(f"No audio samples to analyse in {audio_path!r}")

    #features
    n_fft = cfg.n_fft
    hop = cfg.hop_length

    #MFCC
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=cfg.n_mfcc, n_fft=n_fft, hop_length=hop)

    #Spectral
    spec_centroid = librosa.feature.spectral_centroid(y=y, sr=sr, n_fft=n_fft, hop_length=hop)
    spec_rolloff  = librosa.feature.spectral_rolloff(y=y, sr=sr, n_fft=n_fft, hop_length=hop, roll_percent=0.85)

    #Spectral Flux (onset strength envelope); n_fft is passed so this feature
    #uses the same analysis window as the others
    flux = librosa.onset.onset_strength(y=y, sr=sr, n_fft=n_fft, hop_length=hop)

    #ZCR
    zcr = librosa.feature.zero_crossing_rate(y, frame_length=n_fft, hop_length=hop)

    #Chroma
    chroma = librosa.feature.chroma_stft(y=y, sr=sr, n_fft=n_fft, hop_length=hop)

    #Tempo (fall back to 0.0 when no estimate is returned)
    tempo = np.atleast_1d(librosa.feature.tempo(y=y, sr=sr, hop_length=hop))
    tempo_val = float(tempo[0]) if tempo.size else 0.0

    #RMS Energy
    rms = librosa.feature.rms(y=y, frame_length=n_fft, hop_length=hop)

    #Aggregate to static vector (mean/std/min/max)
    feats = {}
    feats.update(_agg_stats(mfcc, "mfcc_"))
    feats.update(_agg_stats(spec_centroid, "spec_centroid"))
    feats.update(_agg_stats(spec_rolloff, "spec_rolloff"))
    feats.update(_agg_stats(flux, "spec_flux"))
    feats.update(_agg_stats(zcr, "zcr"))
    feats.update(_agg_stats(chroma, "chroma_"))
    feats.update(_agg_stats(rms, "rms"))
    feats["tempo_bpm"] = tempo_val

    return feats


def build_dataset_from_folders(
    root_dir: str | Path,
    cfg: FeatureConfig | None = None,
) -> tuple[pd.DataFrame, np.ndarray]:
    """Extract features for every ``*.mp3`` under ``root_dir/<quadrant>``.

    Each key of ``LABEL_MAP`` is used as a sub-folder name; files are searched
    recursively and processed in sorted order so the row order is stable.

    Args:
        root_dir: Folder containing one sub-folder per ``LABEL_MAP`` key.
        cfg: Extraction settings. ``None`` builds a fresh ``FeatureConfig()``
            per call, so no instance is shared between calls.

    Returns:
        ``(X, y)`` where ``X`` has one row per file (feature columns plus the
        bookkeeping columns ``"path"`` and ``"q"``) and ``y`` holds the
        integer labels in the same row order.

    Raises:
        FileNotFoundError: If ``root_dir`` is not an existing directory.
    """
    root_dir = Path(root_dir)
    # A wrong root would otherwise yield an empty dataset without any error.
    if not root_dir.is_dir():
        raise FileNotFoundError(f"Dataset root is not a directory: {root_dir}")

    if cfg is None:
        cfg = FeatureConfig()

    rows = []
    labels = []

    for q_name, label in LABEL_MAP.items():

        files = sorted((root_dir / q_name).rglob("*.mp3"))

        for fp in tqdm(files, desc=f"Feature engineering: {q_name}"):

            feats = extract_acoustic_features(fp, cfg=cfg)

            feats["path"] = str(fp)
            feats["q"] = q_name

            rows.append(feats)
            labels.append(label)

    X = pd.DataFrame(rows)
    y = np.asarray(labels, dtype=int)

    return X, y

In [6]:
# Folder that holds one sub-folder per quadrant.
data_root = r"/kaggle/input/datasets/rkhalm/mer-q4-dataset"

# Extraction settings, spelled out explicitly for the record.
cfg = FeatureConfig(sr=22050, n_mfcc=20, enforce_seconds=30.0, top_db_trim=None)

X_df, y = build_dataset_from_folders(data_root, cfg=cfg)

# "path" and "q" are bookkeeping columns, not model inputs; absent ones are ignored.
meta_cols = ["path", "q"]
X = X_df.drop(columns=meta_cols, errors="ignore")

# Stratified 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

X_train.shape

Feature engineering: Q1: 100%|██████████| 225/225 [02:02<00:00,  1.84it/s]
Feature engineering: Q2: 100%|██████████| 225/225 [02:02<00:00,  1.83it/s]
Feature engineering: Q3: 100%|██████████| 225/225 [02:02<00:00,  1.83it/s]
Feature engineering: Q4: 100%|██████████| 225/225 [02:03<00:00,  1.83it/s]


(720, 149)

In [11]:
from skopt import BayesSearchCV
from skopt.space import Real, Integer, Categorical

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from xgboost import XGBClassifier


# Shuffled, seeded stratified splitter used by every search below.
# NOTE: despite the name, this is a 5-fold splitter (n_splits=5).
cv3 = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Maps a display name to (estimator, hyper-parameter space) for BayesSearchCV.
# Pipelines prepend a StandardScaler, so their parameters carry the "clf__" step prefix.
search_spaces = {

    "XGBoost": (
        XGBClassifier(
            objective="multi:softprob",
            num_class=4,  # one class per LABEL_MAP entry
            eval_metric="mlogloss",
            tree_method="hist",
            random_state=42,
            n_jobs=-1,
            verbosity=0
        ),
        {
            "n_estimators": Integer(200, 1800),
            "max_depth": Integer(3, 18),
    
            "learning_rate": Real(0.01, 0.05, prior="log-uniform"),
    
            "subsample": Real(0.6, 1.0),
            "colsample_bytree": Real(0.6, 1.0),
    
            "gamma": Real(0.0, 5.0),
            "min_child_weight": Integer(1, 20),
    
            "reg_alpha": Real(1e-8, 1.0, prior="log-uniform"),
            "reg_lambda": Real(1e-3, 10.0, prior="log-uniform"),
        },
    ),
    "SVM": (
        Pipeline([("scaler", StandardScaler()), ("clf", SVC())]),
        {
            "clf__kernel": Categorical(["rbf", "poly", "sigmoid"]),
            "clf__C": Real(1e-2, 1e2, prior="log-uniform"),
            "clf__gamma": Real(1e-4, 1e-1, prior="log-uniform"),
        },
    ),
    "KNN": (
        Pipeline([("scaler", StandardScaler()), ("clf", KNeighborsClassifier())]),
        {
            "clf__n_neighbors": Integer(1, 30),
            "clf__weights": Categorical(["uniform", "distance"]),
            "clf__p": Integer(1, 2),
        },
    ),
    # Tree ensembles (this one and XGBoost) are searched on unscaled features.
    "RandomForest": (
        RandomForestClassifier(random_state=42, n_jobs=-1),
        {
            "n_estimators": Integer(100, 1800),
            "max_depth": Integer(3, 40),
            "min_samples_split": Integer(2, 30),
            "min_samples_leaf": Integer(1, 20),
            "max_features": Categorical(["sqrt", "log2", None]),
        },
    ),
    "LogReg": (
        Pipeline([("scaler", StandardScaler()), ("clf", LogisticRegression(max_iter=5000))]),
        {
            "clf__C": Real(1e-3, 1e2, prior="log-uniform"),
            "clf__solver": Categorical(["lbfgs", "liblinear"]),
        },
    ),
    "Ridge": (
        Pipeline([("scaler", StandardScaler()), ("clf", RidgeClassifier())]),
        {
            "clf__alpha": Real(1e-3, 1e3, prior="log-uniform"),
        },
    ),
}


def calc_metrics(y_true, y_pred):
    """Return ``(accuracy, precision, recall, f1)`` for a set of predictions.

    Precision, recall and F1 are macro-averaged; ``zero_division=0`` is passed
    so a class that is never predicted contributes 0 rather than a warning.
    """
    macro_precision, macro_recall, macro_f1, _support = precision_recall_fscore_support(
        y_true, y_pred, average="macro", zero_division=0
    )
    return accuracy_score(y_true, y_pred), macro_precision, macro_recall, macro_f1


results = []

# Settings shared by every Bayesian search.
search_kwargs = dict(
    n_iter=50,
    scoring="accuracy",
    cv=cv3,
    n_jobs=-1,
    random_state=42,
    refit=True,
)

for name, (estimator, space) in tqdm(search_spaces.items()):

    opt = BayesSearchCV(estimator=estimator, search_spaces=space, **search_kwargs)
    opt.fit(X_train, y_train)

    # refit=True: predict() uses the best configuration refitted on X_train.
    y_pred = opt.predict(X_test)
    acc, precision, recall, f1 = calc_metrics(y_test, y_pred)

    # Build the result row in the column order used by the results table.
    record = {"model": name, "best_cv_acc": float(opt.best_score_)}
    for metric, value in zip(
        ("accuracy", "precision", "recall", "f1"),
        (acc, precision, recall, f1),
    ):
        record[metric] = float(value)
    record["best_params"] = opt.best_params_
    results.append(record)

    print(f"{name:12s} acc={acc:.4f}  f1={f1:.4f}")


 17%|█▋        | 1/6 [26:09<2:10:49, 1569.91s/it]

XGBoost      acc=0.6333  f1=0.6292


 33%|███▎      | 2/6 [27:42<46:43, 700.80s/it]   

SVM          acc=0.6500  f1=0.6479


 50%|█████     | 3/6 [28:37<20:18, 406.07s/it]

KNN          acc=0.6111  f1=0.6016


 67%|██████▋   | 4/6 [1:31:40<57:58, 1739.37s/it]

RandomForest acc=0.6333  f1=0.6287


 83%|████████▎ | 5/6 [1:32:31<18:50, 1130.29s/it]

LogReg       acc=0.6500  f1=0.6456


100%|██████████| 6/6 [1:33:11<00:00, 931.84s/it] 

Ridge        acc=0.6556  f1=0.6509





In [13]:
# Rank the models by hold-out macro-F1, best first.
res_df = pd.DataFrame(results).sort_values(by="f1", ascending=False)

print(res_df.loc[:, ["model", "accuracy", "precision", "recall", "f1"]])

          model  accuracy  precision    recall        f1
5         Ridge  0.655556   0.653771  0.655556  0.650945
1           SVM  0.650000   0.656352  0.650000  0.647944
4        LogReg  0.650000   0.645817  0.650000  0.645566
0       XGBoost  0.633333   0.628889  0.633333  0.629177
3  RandomForest  0.633333   0.628275  0.633333  0.628695
2           KNN  0.611111   0.609422  0.611111  0.601599


In [15]:
# Save the ranked results table (all columns, without the index) to the working directory.
res_df.to_csv("bayessearch_metrics_results.csv", index=False)

In [19]:
from sklearn.linear_model import LinearRegression

# Baseline: ordinary least squares fitted directly on the integer class labels in y_train.
# The features are used as-is here (no scaling step).
linreg = LinearRegression()
linreg.fit(X_train, y_train)

In [20]:
# Turn the continuous regression output into class labels:
# round to the nearest integer, then clamp into the valid label range 0..3.
y_pred_cont = linreg.predict(X_test)
y_pred = np.clip(np.rint(y_pred_cont), 0, 3).astype(int)

acc, precision, recall, f1 = calc_metrics(y_test, y_pred)

# Remove any row left by an earlier run of this cell, so re-running it
# does not add duplicate "Linear Regression" entries to the results.
results[:] = [row for row in results if row["model"] != "Linear Regression"]

results.append({
    "model": "Linear Regression",
    "best_cv_acc": None,  # no hyper-parameter search was run for this baseline
    "accuracy": float(acc),
    "precision": float(precision),
    "recall": float(recall),
    "f1": float(f1),
    "best_params": {"postprocess": "round+clip"}
})

print(f"{'Linear Regression':12s} acc={acc:.4f}  f1={f1:.4f}")

Linear Regression acc=0.4111  f1=0.3861


In [21]:
# Rebuild the ranking from the current contents of `results`, best macro-F1 first.
res_df = pd.DataFrame(results).sort_values(by="f1", ascending=False)

print(res_df.loc[:, ["model", "accuracy", "precision", "recall", "f1"]])

               model  accuracy  precision    recall        f1
5              Ridge  0.655556   0.653771  0.655556  0.650945
1                SVM  0.650000   0.656352  0.650000  0.647944
4             LogReg  0.650000   0.645817  0.650000  0.645566
0            XGBoost  0.633333   0.628889  0.633333  0.629177
3       RandomForest  0.633333   0.628275  0.633333  0.628695
2                KNN  0.611111   0.609422  0.611111  0.601599
6  Linear Regression  0.411111   0.449748  0.411111  0.386075


In [22]:
# Keep every row except the one whose model name is 'LogReg'.
res_df_no_log = res_df.loc[res_df["model"].ne("LogReg")]
res_df_no_log

Unnamed: 0,model,best_cv_acc,accuracy,precision,recall,f1,best_params
5,Ridge,0.6375,0.655556,0.653771,0.655556,0.650945,{'clf__alpha': 237.89603259971724}
1,SVM,0.638889,0.65,0.656352,0.65,0.647944,"{'clf__C': 25.351918429872093, 'clf__gamma': 0..."
0,XGBoost,0.629167,0.633333,0.628889,0.633333,0.629177,"{'colsample_bytree': 0.6598038165316674, 'gamm..."
3,RandomForest,0.622222,0.633333,0.628275,0.633333,0.628695,"{'max_depth': 21, 'max_features': None, 'min_s..."
2,KNN,0.584722,0.611111,0.609422,0.611111,0.601599,"{'clf__n_neighbors': 24, 'clf__p': 1, 'clf__we..."
6,Linear Regression,,0.411111,0.449748,0.411111,0.386075,{'postprocess': 'round+clip'}


In [23]:
# Save the table without the 'LogReg' row (all columns, without the index).
res_df_no_log.to_csv("bayessearch_metrics_results_CLASSIFICATION.csv", index=False)