In [1]:
import pandas as pd

# Load the per-clip metadata table; later cells read its "file_path" column.
df = pd.read_csv("modelling_metadata.csv")
# NOTE: a notebook cell only displays its last expression, so this head() call
# renders nothing; only df.shape below is shown.
df.head()
df.shape

(7431, 10)

In [20]:
import os
import numpy as np

# Audio settings: every clip is brought to SAMPLE_RATE and to exactly
# TARGET_SAMPLES samples (16000 * 2.5 = 40000) before feature extraction.
SAMPLE_RATE = 16000
DURATION_SEC = 2.5
TARGET_SAMPLES = int(SAMPLE_RATE * DURATION_SEC)

# Spectrogram settings (speech-friendly)
N_FFT = 512
WIN_LENGTH = 400      # 25 ms at 16 kHz
HOP_LENGTH = 160      # 10 ms at 16 kHz
N_MELS = 80

# Expected time frames (approx; librosa can differ by 1 frame depending on centering).
# With the values above this is 1 + 40000 // 160 = 251.
TARGET_FRAMES = 1 + (TARGET_SAMPLES // HOP_LENGTH)

# Output folder, created eagerly. The later precompute cells visible in this
# notebook write to the per-split TRAIN/VAL/TEST_NPY_DIR folders instead.
OUT_NPY_DIR = "mel_new_npy"
os.makedirs(OUT_NPY_DIR, exist_ok=True)


In [21]:
import numpy as np
import librosa

def load_and_fix_length_train(path, sample_rate=SAMPLE_RATE, target_samples=TARGET_SAMPLES, top_db=25):
    """Load a clip for training and return exactly ``target_samples`` samples.

    The file is loaded as mono at its native rate, resampled to
    ``sample_rate`` when needed, and leading/trailing silence is trimmed with
    ``librosa.effects.trim(top_db=top_db)``.

    Length handling (uses the global NumPy RNG, so results are random):
      * long clips  -> a random crop of ``target_samples`` samples;
      * short clips -> the clip is repeated and a window is cut starting at a
        random offset inside the first repetition (a circular shift);
      * empty clips -> all zeros.

    Parameters
    ----------
    path : str
        Audio file to read.
    sample_rate : int
        Rate the waveform is resampled to.
    target_samples : int
        Exact number of samples in the returned waveform.
    top_db : float
        Threshold passed to ``librosa.effects.trim``.

    Returns
    -------
    numpy.ndarray
        1-D float32 array of length ``target_samples``.
    """
    y, sr = librosa.load(path, sr=None, mono=True)
    if sr != sample_rate:
        y = librosa.resample(y, orig_sr=sr, target_sr=sample_rate)

    y, _ = librosa.effects.trim(y, top_db=top_db)

    n = len(y)
    if n >= target_samples:
        start = np.random.randint(0, n - target_samples + 1)
        y = y[start:start + target_samples]
    elif n == 0:
        y = np.zeros(target_samples, dtype=np.float32)
    else:
        # The window may start anywhere in [0, n), so the tiled buffer needs
        # one repetition more than ceil(target / n). Without it the slice can
        # come up short and the tail would have to be padded, which breaks the
        # wrap-around continuity of the repeated clip.
        reps = int(np.ceil(target_samples / n)) + 1
        start = np.random.randint(0, n)
        y = np.tile(y, reps)[start:start + target_samples]

    # Every branch above yields exactly target_samples samples.
    return y.astype(np.float32)


In [22]:
import numpy as np
import librosa

def load_and_fix_length_eval(path, sample_rate=SAMPLE_RATE, target_samples=TARGET_SAMPLES, top_db=25):
    """Load a clip for validation/testing and return exactly ``target_samples`` samples.

    Deterministic counterpart of the training loader: the file is read as mono
    at its native rate, resampled to ``sample_rate`` if necessary and
    silence-trimmed with ``librosa.effects.trim(top_db=top_db)``. Long clips
    are centre-cropped, short clips are reflect-padded on the right, and an
    empty clip becomes all zeros.

    Returns a 1-D float32 array of length ``target_samples``.
    """
    signal, native_sr = librosa.load(path, sr=None, mono=True)
    if native_sr != sample_rate:
        signal = librosa.resample(signal, orig_sr=native_sr, target_sr=sample_rate)

    signal, _ = librosa.effects.trim(signal, top_db=top_db)
    n_samples = len(signal)

    if n_samples >= target_samples:
        # Centre crop: drop the same amount (up to one sample) from both ends.
        offset = (n_samples - target_samples) // 2
        signal = signal[offset:offset + target_samples]
    elif n_samples == 0:
        signal = np.zeros(target_samples, dtype=np.float32)
    else:
        signal = np.pad(signal, (0, target_samples - n_samples), mode="reflect")

    # Safety net: force the exact length even if a branch above came up
    # short or long.
    deficit = target_samples - len(signal)
    if deficit < 0:
        signal = signal[:target_samples]
    elif deficit > 0:
        signal = np.pad(signal, (0, deficit), mode="reflect")

    return signal.astype(np.float32)


In [23]:
import numpy as np
import librosa

def wav_to_logmel(y, sample_rate, n_fft, win_length, hop_length, n_mels, target_frames, eps=1e-6):
    """Turn a waveform into a normalised log-mel spectrogram of fixed width.

    Steps: power mel spectrogram -> ``log(S + eps)`` -> per-mel-bin mean and
    variance normalisation over time -> zero-pad or truncate the time axis to
    ``target_frames``.

    Returns a float32 array of shape ``(1, n_mels_axis, target_frames)``,
    where the middle axis is whatever ``librosa.feature.melspectrogram``
    produced for ``n_mels``.
    """
    mel_power = librosa.feature.melspectrogram(
        y=y,
        sr=sample_rate,
        n_fft=n_fft,
        win_length=win_length,
        hop_length=hop_length,
        n_mels=n_mels,
        power=2.0,
    )

    # eps keeps the logarithm finite for silent bins.
    log_mel = np.log(mel_power + eps)

    # Statistics are taken along the time axis, one pair per mel bin.
    bin_mean = log_mel.mean(axis=1, keepdims=True)
    bin_std = log_mel.std(axis=1, keepdims=True) + eps
    normed = (log_mel - bin_mean) / bin_std

    # Padding happens after normalisation, so padded frames are exact zeros.
    missing = target_frames - normed.shape[1]
    if missing > 0:
        normed = np.pad(normed, ((0, 0), (0, missing)), mode="constant")
    elif missing < 0:
        normed = normed[:, :target_frames]

    # Leading axis of size 1 serves as the channel dimension.
    return normed[np.newaxis, :, :].astype(np.float32)


In [4]:
import os
import re
import random
from pathlib import Path

import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader


In [3]:
# Global seed shared by Python's random, NumPy and PyTorch.
SEED = 42
import random, numpy as np, torch

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", DEVICE)

# Emotion mapping (keep as-is): filename emotion code -> integer class id.
EMO_MAP = {
    "ANG": 0,
    "DIS": 1,
    "FEA": 2,
    "HAP": 3,
    "NEU": 4,
    "SAD": 5
}
NUM_CLASSES = len(EMO_MAP)

# Data locations (separate per split)
TRAIN_NPY_DIR = "mel_npy_train"
VAL_NPY_DIR   = "mel_npy_val"
TEST_NPY_DIR  = "mel_npy_test"

# Create the folders up front so later save calls cannot fail on a missing directory.
import os
os.makedirs(TRAIN_NPY_DIR, exist_ok=True)
os.makedirs(VAL_NPY_DIR, exist_ok=True)
os.makedirs(TEST_NPY_DIR, exist_ok=True)


Device: cpu


In [5]:
import os
import re
import random

# EMO_MAP maps emotion codes to integer class ids, e.g. {"ANG": 0, "DIS": 1, "FEA": 2, "HAP": 3, "NEU": 4, "SAD": 5}.
# It must be defined (see the configuration cell) before parse_actor_and_emotion is called.

def parse_actor_and_emotion(filename: str):
    """
    Extract the actor ID and the mapped emotion from a CREMA-D-like filename
    such as ``1001_IEO_HAP_HI.wav``.

    The actor ID is the four leading digits followed by an underscore; if the
    name does not start that way, the first run of digits anywhere is used.
    The emotion is the ``EMO_MAP`` value of the first code (in ``EMO_MAP``
    order) that appears as ``_CODE_`` or as a ``CODE_`` prefix.

    Returns: (actor_id, emotion) with ``actor_id`` as a digit string.
    Raises: ValueError if no digits or no known emotion code are found.
    """
    leading = re.match(r"^(\d{4})_", filename)
    if leading is None:
        # Fallback for names that do not follow the strict NNNN_ prefix.
        any_digits = re.search(r"\d+", filename)
        if any_digits is None:
            raise ValueError(f"No actor ID in {filename}")
        actor_id = any_digits.group(0)
    else:
        actor_id = leading.group(1)

    emotion = next(
        (
            mapped
            for code, mapped in EMO_MAP.items()
            if f"_{code}_" in filename or filename.startswith(f"{code}_")
        ),
        None,
    )
    if emotion is None:
        raise ValueError(f"No emotion code in {filename}")

    return actor_id, emotion


def actor_independent_split(paths, train_ratio=0.7, val_ratio=0.15, seed=42):
    """
    Speaker-independent split: all files from an actor go into the same split.

    Actors are sorted, shuffled with ``random.Random(seed)`` and cut into
    train / val / test slices of ``int(n * train_ratio)``,
    ``int(n * val_ratio)`` and the remaining actors.

    The returned path lists follow the shuffled actor order (and, within an
    actor, the order in ``paths``), so for a given ``seed`` and input the
    result is identical across interpreter sessions. Iterating the actor
    *sets* instead would make the path order depend on string hash
    randomisation.

    Parameters:
      paths        iterable of file paths whose basenames carry the actor ID
      train_ratio  fraction of actors used for training
      val_ratio    fraction of actors used for validation
      seed         seed for the actor shuffle

    Returns:
      train_paths, val_paths, test_paths, train_actors, val_actors, test_actors
      (three lists of paths followed by three sets of actor IDs)

    Raises:
      ValueError if a ratio is negative or the two ratios sum to more than 1.
    """
    if train_ratio < 0 or val_ratio < 0 or (train_ratio + val_ratio) > 1.0:
        raise ValueError("train_ratio and val_ratio must be >= 0 and sum to <= 1.0")

    by_actor = {}
    for p in paths:
        actor_id, _ = parse_actor_and_emotion(os.path.basename(p))
        by_actor.setdefault(actor_id, []).append(p)

    # Sort first so the shuffle result depends only on the seed and the set
    # of actors, not on the order of the input paths.
    actors = sorted(by_actor)
    random.Random(seed).shuffle(actors)

    n = len(actors)
    n_train = int(n * train_ratio)
    n_val = int(n * val_ratio)

    train_ids = actors[:n_train]
    val_ids = actors[n_train:n_train + n_val]
    test_ids = actors[n_train + n_val:]

    def collect(actor_ids):
        # Walk the ordered slice (not a set) to keep the output order stable.
        return [p for a in actor_ids for p in by_actor[a]]

    return (
        collect(train_ids),
        collect(val_ids),
        collect(test_ids),
        set(train_ids),
        set(val_ids),
        set(test_ids),
    )


# ----- Example usage -----
# Split the metadata file paths by actor; the three actor-ID sets returned
# after the path lists are discarded via *_.
train_paths, val_paths, test_paths, *_ = actor_independent_split(df["file_path"], seed=42)
print(len(train_paths), len(val_paths), len(test_paths))


5141 1066 1224


In [27]:
import os
import numpy as np

# Output folders (separate per split), created if they do not exist yet.
TRAIN_NPY_DIR = "mel_npy_train"
VAL_NPY_DIR   = "mel_npy_val"
TEST_NPY_DIR  = "mel_npy_test"
os.makedirs(TRAIN_NPY_DIR, exist_ok=True)
os.makedirs(VAL_NPY_DIR, exist_ok=True)
os.makedirs(TEST_NPY_DIR, exist_ok=True)

# precompute_mels prints a progress line every LOG_EVERY files.
LOG_EVERY = 500

def precompute_mels(paths, out_dir, load_fn):
    """Compute and save one log-mel ``.npy`` file per input audio path.

    Each file is loaded with ``load_fn`` (the train or eval length handling),
    converted with ``wav_to_logmel`` using the notebook-level spectrogram
    settings, and saved as ``<out_dir>/<basename without extension>.npy``.

    Processing is best-effort: a file that raises is reported and skipped.
    The final summary reports how many files were actually written, plus how
    many were skipped when that number is non-zero.

    Parameters
    ----------
    paths : sequence of str
        Audio files to process.
    out_dir : str
        Existing directory that receives the ``.npy`` files.
    load_fn : callable
        ``load_fn(path)`` must return the fixed-length waveform.
    """
    saved = 0
    skipped = 0

    for i, wav_path in enumerate(paths, start=1):
        file_name = os.path.splitext(os.path.basename(wav_path))[0]
        out_path = os.path.join(out_dir, f"{file_name}.npy")

        try:
            y = load_fn(wav_path)  # train or eval length handling

            # Parameters are passed explicitly; wav_to_logmel enforces TARGET_FRAMES.
            mel = wav_to_logmel(
                y,
                SAMPLE_RATE,
                N_FFT,
                WIN_LENGTH,
                HOP_LENGTH,
                N_MELS,
                TARGET_FRAMES
            )

            np.save(out_path, mel)
            saved += 1

        except Exception as e:
            print(f"Skipping: {wav_path}\n  -> {e}")
            skipped += 1
            continue

        if i % LOG_EVERY == 0:
            print(f"[{out_dir}] Processed {i} files...")

    # Report what was really written, not the number of inputs.
    print(f"Done. Saved {saved} files to: {out_dir}")
    if skipped:
        print(f"  ({skipped} of {len(paths)} files were skipped because of errors)")


# 1) Make the split (speaker-independent). The actor-ID sets returned after
#    the three path lists are discarded via *_.
train_paths, val_paths, test_paths, *_ = actor_independent_split(
    df["file_path"].tolist(),
    train_ratio=0.7,
    val_ratio=0.15,
    seed=SEED
)

print("Split sizes:", len(train_paths), len(val_paths), len(test_paths))

# 2) Precompute. Training clips use the randomised length handling; note that
#    each clip is cropped once here and the saved array is reused afterwards.
print("Precomputing TRAIN mels...")
precompute_mels(train_paths, TRAIN_NPY_DIR, load_and_fix_length_train)

# Validation and test clips use the deterministic centre-crop / pad loader.
print("Precomputing VAL mels...")
precompute_mels(val_paths, VAL_NPY_DIR, load_and_fix_length_eval)

print("Precomputing TEST mels...")
precompute_mels(test_paths, TEST_NPY_DIR, load_and_fix_length_eval)


Split sizes: 5141 1066 1224
Precomputing TRAIN mels...
[mel_npy_train] Processed 500 files...
[mel_npy_train] Processed 1000 files...
[mel_npy_train] Processed 1500 files...
[mel_npy_train] Processed 2000 files...
[mel_npy_train] Processed 2500 files...
[mel_npy_train] Processed 3000 files...
[mel_npy_train] Processed 3500 files...
[mel_npy_train] Processed 4000 files...
[mel_npy_train] Processed 4500 files...
[mel_npy_train] Processed 5000 files...
Done. Saved 5141 files to: mel_npy_train
Precomputing VAL mels...
[mel_npy_val] Processed 500 files...
[mel_npy_val] Processed 1000 files...
Done. Saved 1066 files to: mel_npy_val
Precomputing TEST mels...
[mel_npy_test] Processed 500 files...
[mel_npy_test] Processed 1000 files...
Done. Saved 1224 files to: mel_npy_test


In [28]:
import numpy as np, os
# Sanity check: load the first .npy file listed in the training folder and
# confirm its shape.
fn = next(f for f in os.listdir(TRAIN_NPY_DIR) if f.endswith(".npy"))
x = np.load(os.path.join(TRAIN_NPY_DIR, fn))
print(x.shape)  # should be (1, 80, TARGET_FRAMES)


(1, 80, 251)


In [31]:
class MelNPYDataset(Dataset):
    """Dataset over precomputed log-mel ``.npy`` files.

    Each item is ``(spectrogram, label)``: the array stored at the path as a
    float32 tensor (a 2-D array gets a leading axis of size 1), and the
    emotion returned by ``parse_actor_and_emotion`` for the file's basename
    as a ``torch.long`` tensor.
    """

    def __init__(self, paths):
        # Paths to the .npy files; the label is derived from each basename.
        self.paths = paths

    def __len__(self):
        return len(self.paths)

    def __getitem__(self, idx):
        npy_path = self.paths[idx]

        mel = np.load(npy_path).astype(np.float32)
        if mel.ndim == 2:
            # Add the leading axis when the file was saved without it.
            mel = mel[np.newaxis, :, :]

        # Only the emotion is needed here; the actor ID is discarded.
        label = parse_actor_and_emotion(os.path.basename(npy_path))[1]

        return torch.from_numpy(mel), torch.tensor(label, dtype=torch.long)


In [32]:
import os

def wav_paths_to_npy_paths(wav_paths, npy_dir):
    """Map audio paths to the matching ``.npy`` paths inside ``npy_dir``.

    For every input path the directory and the final extension are dropped
    and ``<npy_dir>/<stem>.npy`` is produced. Returns a list in input order.
    """
    return [
        os.path.join(npy_dir, os.path.splitext(os.path.basename(p))[0] + ".npy")
        for p in wav_paths
    ]


In [33]:
# Translate each split's audio paths into the .npy paths written by the precompute step.
train_npy_paths = wav_paths_to_npy_paths(train_paths, TRAIN_NPY_DIR)
val_npy_paths   = wav_paths_to_npy_paths(val_paths, VAL_NPY_DIR)
test_npy_paths  = wav_paths_to_npy_paths(test_paths, TEST_NPY_DIR)

# One dataset per split.
train_ds = MelNPYDataset(train_npy_paths)
val_ds   = MelNPYDataset(val_npy_paths)
test_ds  = MelNPYDataset(test_npy_paths)


In [34]:
# Sanity check: spectrogram shape and label of the first training item.
print(train_ds[0][0].shape, train_ds[0][1])


torch.Size([1, 80, 251]) tensor(0)


In [35]:
from torch.utils.data import DataLoader

BATCH_SIZE = 32

# Only the training loader shuffles; validation and test keep a fixed order.
# NOTE: each loader builds a fresh MelNPYDataset rather than reusing
# train_ds / val_ds / test_ds from the earlier cell.
train_loader = DataLoader(
    MelNPYDataset(train_npy_paths),
    batch_size=BATCH_SIZE,
    shuffle=True
)

val_loader = DataLoader(
    MelNPYDataset(val_npy_paths),
    batch_size=BATCH_SIZE,
    shuffle=False
)

test_loader = DataLoader(
    MelNPYDataset(test_npy_paths),
    batch_size=BATCH_SIZE,
    shuffle=False
)


In [36]:
# Sanity check: pull one training batch and show its shape and first labels.
xb, yb = next(iter(train_loader))
print(xb.shape, yb[:5])


torch.Size([32, 1, 80, 251]) tensor([5, 5, 0, 2, 0])


In [37]:
class CNNBiLSTM(nn.Module):
    """CNN front-end + bidirectional LSTM + statistics pooling classifier.

    Input is ``(B, 1, n_mels, T)``. Three Conv-BatchNorm-ReLU-MaxPool stages
    produce 128 channels; the first pool halves both the mel and time axes,
    the other two halve the mel axis only. Each remaining time step is
    flattened to ``C * F`` features and fed to a bidirectional LSTM. The LSTM
    outputs are summarised by their mean and standard deviation over time and
    classified by a small MLP.

    Parameters
    ----------
    lstm_hidden : int
        Hidden size per LSTM direction.
    n_mels, target_frames : int, optional
        Input spectrogram size used to derive the LSTM input width. Default
        to the notebook-level ``N_MELS`` / ``TARGET_FRAMES``.
    num_classes : int, optional
        Number of output logits. Defaults to the notebook-level
        ``NUM_CLASSES``.
    """

    def __init__(self, lstm_hidden=128, n_mels=None, target_frames=None, num_classes=None):
        super().__init__()

        # Fall back to the notebook configuration when sizes are not given.
        n_mels = N_MELS if n_mels is None else n_mels
        target_frames = TARGET_FRAMES if target_frames is None else target_frames
        num_classes = NUM_CLASSES if num_classes is None else num_classes

        self.cnn = nn.Sequential(
            nn.Conv2d(1, 32, 3, padding=1),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d((2,2)),

            nn.Conv2d(32, 64, 3, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d((2,1)),

            nn.Conv2d(64, 128, 3, padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU(),
            nn.MaxPool2d((2,1))
        )

        # Derive the LSTM input width from a dummy forward pass. The probe
        # runs in eval mode so the all-zero input does not update the
        # BatchNorm running statistics; training mode is restored afterwards.
        self.cnn.eval()
        with torch.no_grad():
            probe = self.cnn(torch.zeros(1, 1, n_mels, target_frames))
        self.cnn.train()
        lstm_in = probe.shape[1] * probe.shape[2]   # channels * pooled mel bins

        self.lstm = nn.LSTM(
            input_size=lstm_in,
            hidden_size=lstm_hidden,
            batch_first=True,
            bidirectional=True
        )

        # mean+std pooling doubles the feature size -> 4H instead of 2H
        self.fc = nn.Sequential(
            nn.Dropout(0.5),
            nn.Linear(4 * lstm_hidden, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, num_classes)
        )

    def forward(self, x):
        """Return class logits of shape ``(B, num_classes)`` for input ``(B, 1, n_mels, T)``."""
        z = self.cnn(x)                 # (B, C, F, T)
        z = z.permute(0, 3, 1, 2)       # (B, T, C, F)
        z = z.flatten(2)                # (B, T, C*F)
        out, _ = self.lstm(z)           # (B, T, 2H)

        # Stats pooling over time (mean and standard deviation)
        mu = out.mean(dim=1)            # (B, 2H)
        sigma = out.std(dim=1)          # (B, 2H)
        pooled = torch.cat([mu, sigma], dim=1)  # (B, 4H)

        return self.fc(pooled)


In [38]:
from sklearn.metrics import accuracy_score, f1_score


def train_one_epoch(model, loader, optimizer, criterion):
    """Run one training pass over ``loader`` and return the mean loss per example.

    Batches are moved to ``DEVICE``, gradients are clipped to a global norm of
    1.0 before every optimizer step, and each batch loss is weighted by its
    batch size so the result is an average over ``len(loader.dataset)``.
    """
    model.train()
    loss_sum = 0.0

    for inputs, targets in loader:
        inputs = inputs.to(DEVICE, non_blocking=True)
        targets = targets.to(DEVICE, non_blocking=True)

        optimizer.zero_grad(set_to_none=True)
        batch_loss = criterion(model(inputs), targets)
        batch_loss.backward()

        # Clip before stepping to limit the size of a single update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

        # Weight by batch size: the last batch may be smaller.
        loss_sum += batch_loss.item() * inputs.size(0)

    n_examples = len(loader.dataset)
    return loss_sum / n_examples


from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

@torch.no_grad()
def evaluate(model, loader):
    """Score ``model`` on ``loader`` without tracking gradients.

    Returns ``(accuracy, macro_f1, confusion_matrix)`` computed over all
    batches with scikit-learn.
    """
    model.eval()
    pred_chunks = []
    true_chunks = []

    for inputs, targets in loader:
        scores = model(inputs.to(DEVICE, non_blocking=True))
        # Predicted class = index of the largest logit.
        pred_chunks.append(torch.argmax(scores, dim=1).detach().cpu().numpy())
        true_chunks.append(targets.detach().cpu().numpy())

    y_pred = np.concatenate(pred_chunks)
    y_true = np.concatenate(true_chunks)

    return (
        accuracy_score(y_true, y_pred),
        f1_score(y_true, y_pred, average="macro"),
        confusion_matrix(y_true, y_pred),
    )



In [39]:
# Build the CNN-BiLSTM and its training objects.
model = CNNBiLSTM(lstm_hidden=128).to(DEVICE)

criterion = nn.CrossEntropyLoss(label_smoothing=0.1)
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4, weight_decay=1e-3)

# mode="max": the monitored value (validation macro-F1) should increase.
# The learning rate is halved after 2 epochs without improvement.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode="max", factor=0.5, patience=2, min_lr=1e-6
)

max_epochs = 120
early_patience = 8          # epochs without improvement before stopping
best_val = -1.0             # best validation macro-F1 seen so far
wait = 0                    # epochs since the last improvement
best_path = "best_cremad.pt"

for epoch in range(1, max_epochs + 1):
    train_loss = train_one_epoch(model, train_loader, optimizer, criterion)
    val_acc, val_f1, _ = evaluate(model, val_loader)

    scheduler.step(val_f1)

    lr = optimizer.param_groups[0]["lr"]
    print(f"Epoch {epoch:03d} | train_loss {train_loss:.4f} | val_acc {val_acc:.4f} | val_f1 {val_f1:.4f} | lr {lr:.2e}")

    # Checkpoint only on a gain of more than 1e-4, so tiny fluctuations do
    # not reset the early-stopping counter.
    if val_f1 > best_val + 1e-4:
        best_val = val_f1
        wait = 0
        torch.save(model.state_dict(), best_path)
    else:
        wait += 1
        if wait >= early_patience:
            print(f"Early stopping. Best val_f1={best_val:.4f}")
            break

# Restore the best checkpoint and report its validation metrics.
model.load_state_dict(torch.load(best_path, map_location=DEVICE))
best_acc, best_f1, cm = evaluate(model, val_loader)
print(f"BEST CHECKPOINT | val_acc={best_acc:.4f} | val_macro_f1={best_f1:.4f}")
print("Confusion matrix:\n", cm)


Epoch 001 | train_loss 1.6105 | val_acc 0.4071 | val_f1 0.3455 | lr 3.00e-04
Epoch 002 | train_loss 1.5386 | val_acc 0.4212 | val_f1 0.3707 | lr 3.00e-04
Epoch 003 | train_loss 1.5078 | val_acc 0.4634 | val_f1 0.4262 | lr 3.00e-04
Epoch 004 | train_loss 1.4788 | val_acc 0.4794 | val_f1 0.4526 | lr 3.00e-04
Epoch 005 | train_loss 1.4546 | val_acc 0.4475 | val_f1 0.4096 | lr 3.00e-04
Epoch 006 | train_loss 1.4259 | val_acc 0.5019 | val_f1 0.4825 | lr 3.00e-04
Epoch 007 | train_loss 1.4073 | val_acc 0.3837 | val_f1 0.3435 | lr 3.00e-04
Epoch 008 | train_loss 1.3727 | val_acc 0.4681 | val_f1 0.4455 | lr 3.00e-04
Epoch 009 | train_loss 1.3479 | val_acc 0.5291 | val_f1 0.5175 | lr 3.00e-04
Epoch 010 | train_loss 1.3098 | val_acc 0.5328 | val_f1 0.5357 | lr 3.00e-04
Epoch 011 | train_loss 1.2744 | val_acc 0.5760 | val_f1 0.5701 | lr 3.00e-04
Epoch 012 | train_loss 1.2389 | val_acc 0.5816 | val_f1 0.5737 | lr 3.00e-04
Epoch 013 | train_loss 1.2182 | val_acc 0.5469 | val_f1 0.5368 | lr 3.00e-04

In [40]:
import numpy as np
from sklearn.metrics import confusion_matrix, classification_report

@torch.no_grad()
def get_preds(model, loader, device):
    """Collect true labels and argmax predictions for every batch in ``loader``.

    The model is switched to eval mode and inputs are moved to ``device``.
    Returns ``(y_true, y_pred)`` as two concatenated NumPy arrays.
    """
    model.eval()
    labels, predictions = [], []
    for batch_x, batch_y in loader:
        scores = model(batch_x.to(device))
        predictions.append(scores.argmax(1).cpu().numpy())
        labels.append(batch_y.numpy())
    return np.concatenate(labels), np.concatenate(predictions)

# Evaluate the current model on the held-out test split.
y_true, y_pred = get_preds(model, test_loader, DEVICE)

# target_names are the EMO_MAP codes ordered by their integer class id.
print(classification_report(y_true, y_pred, target_names=[k for k,v in sorted(EMO_MAP.items(), key=lambda x:x[1])]))
print(confusion_matrix(y_true, y_pred))


              precision    recall  f1-score   support

         ANG       0.67      0.69      0.68       209
         DIS       0.51      0.56      0.53       209
         FEA       0.54      0.55      0.54       209
         HAP       0.50      0.64      0.56       209
         NEU       0.70      0.61      0.65       179
         SAD       0.64      0.46      0.54       209

    accuracy                           0.58      1224
   macro avg       0.59      0.58      0.58      1224
weighted avg       0.59      0.58      0.58      1224

[[144  28   7  28   2   0]
 [ 31 116  20  21   7  14]
 [  7  23 114  41   6  18]
 [ 20  15  28 133   9   4]
 [  9  18   3  22 109  18]
 [  3  26  41  20  22  97]]


In [41]:
FINAL_PATH = "cnn_bilstm_emotion_new.pt"

# Saves whatever weights `model` currently holds. The training cell reloads
# best_path at its end, so run that cell (or load best_path) first.
torch.save(model.state_dict(), FINAL_PATH)
print("Saved weights to:", FINAL_PATH)


Saved weights to: cnn_bilstm_emotion_new.pt


In [6]:
import os, re, random
import numpy as np
import torch

# Re-seed Python, NumPy and PyTorch for the WavLM part of the notebook.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

DEVICE = "cpu"  # this part of the notebook is run on CPU only

# Emotion code -> integer class id (same mapping as in the earlier config cell).
EMO_MAP = {"ANG": 0, "DIS": 1, "FEA": 2, "HAP": 3, "NEU": 4, "SAD": 5}
NUM_CLASSES = len(EMO_MAP)

def parse_actor_and_emotion(filename: str):
    """Return ``(actor_id, emotion)`` parsed from a CREMA-D filename.

    Stricter redefinition that shadows the earlier function of the same name
    in this notebook: the name must start with four digits and an underscore,
    and the emotion code must appear as ``_CODE_``. The extension (.wav or
    .npy) does not matter because the code sits inside the basename.

    Raises ValueError when the prefix or the emotion code is missing.
    """
    actor_match = re.match(r"^(\d{4})_", filename)
    if actor_match is None:
        raise ValueError(f"Bad CREMA-D filename: {filename}")

    # First EMO_MAP entry (in mapping order) whose code occurs in the name.
    emotion = next(
        (mapped for code, mapped in EMO_MAP.items() if f"_{code}_" in filename),
        None,
    )
    if emotion is None:
        raise ValueError(f"No emotion code in {filename}")

    return actor_match.group(1), emotion


In [7]:
# Recreate the speaker-independent split with the same ratios and seed;
# the actor-ID sets are discarded via *_.
train_paths, val_paths, test_paths, *_ = actor_independent_split(
    df["file_path"].tolist(),
    train_ratio=0.7,
    val_ratio=0.15,
    seed=SEED
)
print(len(train_paths), len(val_paths), len(test_paths))


5141 1066 1224


In [21]:
from transformers import Wav2Vec2FeatureExtractor, WavLMModel
import numpy as np
import torch
import soundfile as sf
import librosa

MODEL_NAME = "microsoft/wavlm-base"
SAMPLE_RATE = 16000
DEVICE = "cpu"

# Load WavLM together with its matching feature extractor.
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(MODEL_NAME)
wavlm = WavLMModel.from_pretrained(MODEL_NAME).to(DEVICE)
# WavLM is used as a frozen feature extractor: eval mode, no gradients.
wavlm.eval()
for p in wavlm.parameters():
    p.requires_grad = False


def load_wav_16k(path: str) -> np.ndarray:
    """
    Read an audio file with soundfile and return a mono float32 waveform at
    ``SAMPLE_RATE``.

    Multi-channel audio is averaged over channels, other sample rates are
    resampled with librosa, and the result is scaled by its peak absolute
    value (plus 1e-9 so a silent clip does not divide by zero).
    """
    samples, native_sr = sf.read(path, always_2d=False)

    # Down-mix: average across the channel axis.
    if samples.ndim > 1:
        samples = np.mean(samples, axis=1)

    if native_sr != SAMPLE_RATE:
        samples = librosa.resample(samples, orig_sr=native_sr, target_sr=SAMPLE_RATE)

    # Peak normalisation.
    peak = np.max(np.abs(samples))
    samples = samples / (peak + 1e-9)

    return samples.astype(np.float32)


In [24]:
@torch.no_grad()
def wavlm_embedding(wav_1d: np.ndarray) -> np.ndarray:
    """Embed one waveform with the frozen WavLM model.

    The waveform goes through ``feature_extractor`` (no padding) and
    ``wavlm``; the last hidden states are pooled over time into their mean
    and standard deviation, which are concatenated.

    Returns a 1-D float32 NumPy array (1536 values for the 768-dimensional
    hidden states noted in this notebook).
    """
    encoded = feature_extractor(
        wav_1d,
        sampling_rate=SAMPLE_RATE,
        return_tensors="pt",
        padding=False
    )
    hidden = wavlm(input_values=encoded["input_values"].to(DEVICE)).last_hidden_state  # (1, T, 768)

    # Statistics pooling over the time axis.
    stats = torch.cat([hidden.mean(dim=1), hidden.std(dim=1)], dim=1)  # (1, 1536)

    return stats.squeeze(0).cpu().numpy().astype(np.float32)


In [25]:
import os, torch
# Leave one core free for the rest of the system. os.cpu_count() can return
# None when the count cannot be determined, so fall back to 1 in that case
# instead of failing on `None - 1`.
torch.set_num_threads(max(1, (os.cpu_count() or 1) - 1))


In [8]:
import os
import numpy as np

# One output folder per split for the precomputed WavLM embeddings.
EMB_TRAIN_DIR = "wavlm_emb_train"
EMB_VAL_DIR   = "wavlm_emb_val"
EMB_TEST_DIR  = "wavlm_emb_test"
os.makedirs(EMB_TRAIN_DIR, exist_ok=True)
os.makedirs(EMB_VAL_DIR, exist_ok=True)
os.makedirs(EMB_TEST_DIR, exist_ok=True)

def precompute_embeddings(wav_paths, out_dir, log_every=50):
    """Compute and cache one WavLM embedding per audio file.

    For each path the embedding is stored as ``<out_dir>/<stem>.npy``. Files
    whose output already exists are skipped, so the function can be re-run to
    resume an interrupted job. A file that raises is reported and skipped.

    Each embedding is written to a temporary ``.tmp`` file and then moved
    into place with ``os.replace``. An interrupted run therefore never leaves
    a truncated ``.npy`` behind that a later run would treat as finished.

    Parameters
    ----------
    wav_paths : sequence of str
        Audio files to embed.
    out_dir : str
        Existing directory that receives the ``.npy`` files.
    log_every : int
        A progress line is printed when the 1-based index of a newly saved
        file is a multiple of this value.
    """
    saved = 0
    for i, wav_path in enumerate(wav_paths, start=1):
        base = os.path.splitext(os.path.basename(wav_path))[0]
        out_path = os.path.join(out_dir, base + ".npy")

        if os.path.exists(out_path):
            continue

        # Not ending in ".npy", so listings that filter on that suffix never
        # pick up a leftover temporary file.
        tmp_path = out_path + ".tmp"

        try:
            w = load_wav_16k(wav_path)
            emb = wavlm_embedding(w)          # (1536,)
            # A file object is passed so np.save does not append ".npy".
            with open(tmp_path, "wb") as fh:
                np.save(fh, emb)
            os.replace(tmp_path, out_path)
            saved += 1
        except Exception as e:
            print(f"Skip {wav_path}: {e}")
            continue
        finally:
            # After a successful replace the temp file is gone; this only
            # cleans up after a failure or an interrupt.
            if os.path.exists(tmp_path):
                os.remove(tmp_path)

        if i % log_every == 0:
            print(f"[{out_dir}] {i}/{len(wav_paths)} processed | {saved} saved")

    print(f"Done {out_dir}: saved {saved} embeddings")


In [27]:
# Embed every split; files that already have an embedding are skipped.
precompute_embeddings(train_paths, EMB_TRAIN_DIR)
precompute_embeddings(val_paths,   EMB_VAL_DIR)
precompute_embeddings(test_paths,  EMB_TEST_DIR)


[wavlm_emb_train] 50/5141 processed | 50 saved
[wavlm_emb_train] 100/5141 processed | 100 saved
[wavlm_emb_train] 150/5141 processed | 150 saved
[wavlm_emb_train] 200/5141 processed | 200 saved
[wavlm_emb_train] 250/5141 processed | 250 saved
[wavlm_emb_train] 300/5141 processed | 300 saved
[wavlm_emb_train] 350/5141 processed | 350 saved
[wavlm_emb_train] 400/5141 processed | 400 saved
[wavlm_emb_train] 450/5141 processed | 450 saved
[wavlm_emb_train] 500/5141 processed | 500 saved
[wavlm_emb_train] 550/5141 processed | 550 saved
[wavlm_emb_train] 600/5141 processed | 600 saved
[wavlm_emb_train] 650/5141 processed | 650 saved
[wavlm_emb_train] 700/5141 processed | 700 saved
[wavlm_emb_train] 750/5141 processed | 750 saved
[wavlm_emb_train] 800/5141 processed | 800 saved
[wavlm_emb_train] 850/5141 processed | 850 saved
[wavlm_emb_train] 900/5141 processed | 900 saved
[wavlm_emb_train] 950/5141 processed | 950 saved
[wavlm_emb_train] 1000/5141 processed | 1000 saved
[wavlm_emb_train] 10

In [9]:
# Sanity check: load the first .npy file listed in the training embedding
# folder and confirm its shape.
fn = next(f for f in os.listdir(EMB_TRAIN_DIR) if f.endswith(".npy"))
x = np.load(os.path.join(EMB_TRAIN_DIR, fn))
print(x.shape)   # (1536,)


(1536,)


In [10]:
from torch.utils.data import Dataset, DataLoader
import torch

class EmbDataset(Dataset):
    """Dataset over precomputed embedding files.

    At construction every audio path is mapped to ``<emb_dir>/<stem>.npy``
    and paired with the emotion parsed from ``<stem>.wav``. Items are
    ``(embedding, label)`` with the embedding as a float32 tensor and the
    label as a ``torch.long`` tensor.
    """

    def __init__(self, wav_paths, emb_dir):
        self.items = []
        for wav_path in wav_paths:
            stem = os.path.splitext(os.path.basename(wav_path))[0]
            npy_path = os.path.join(emb_dir, stem + ".npy")
            # Labels are resolved once here, not on every __getitem__ call.
            label = parse_actor_and_emotion(stem + ".wav")[1]
            self.items.append((npy_path, label))

    def __len__(self):
        return len(self.items)

    def __getitem__(self, idx):
        npy_path, label = self.items[idx]
        vector = np.load(npy_path).astype(np.float32)   # (1536,)
        return torch.from_numpy(vector), torch.tensor(label, dtype=torch.long)

# These rebind train_loader / val_loader / test_loader, replacing the mel-spectrogram
# loaders defined earlier in the notebook. Only the training loader shuffles.
train_loader = DataLoader(EmbDataset(train_paths, EMB_TRAIN_DIR), batch_size=64, shuffle=True)
val_loader   = DataLoader(EmbDataset(val_paths,   EMB_VAL_DIR),   batch_size=128, shuffle=False)
test_loader  = DataLoader(EmbDataset(test_paths,  EMB_TEST_DIR),  batch_size=128, shuffle=False)


In [11]:
# Sanity check: pull one training batch and show its shape and first labels.
xb, yb = next(iter(train_loader))
print(xb.shape, yb[:5])  # torch.Size([64, 1536])


torch.Size([64, 1536]) tensor([5, 5, 0, 3, 0])


In [12]:
import torch.nn as nn
from sklearn.metrics import accuracy_score, f1_score, classification_report

class MLPClassifier(nn.Module):
    """Two-layer MLP head: Dropout -> Linear(in_dim, 256) -> ReLU -> Dropout -> Linear(256, num_classes).

    ``forward`` returns raw logits of shape ``(batch, num_classes)``.
    """

    def __init__(self, in_dim=1536, num_classes=6):
        super().__init__()
        hidden_dim = 256
        # The layer order fixes the state-dict keys (net.1.*, net.4.*).
        layers = [
            nn.Dropout(0.2),
            nn.Linear(in_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(hidden_dim, num_classes),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        logits = self.net(x)
        return logits

def train_one_epoch(model, loader, optimizer, criterion):
    """Train ``model`` for one pass over ``loader``; return the mean loss per example.

    Redefines the earlier ``train_one_epoch`` in this notebook; this version
    applies no gradient clipping. Batch losses are weighted by batch size and
    divided by ``len(loader.dataset)``.
    """
    model.train()
    running = 0.0
    for features, targets in loader:
        features, targets = features.to(DEVICE), targets.to(DEVICE)
        optimizer.zero_grad(set_to_none=True)
        batch_loss = criterion(model(features), targets)
        batch_loss.backward()
        optimizer.step()
        running += batch_loss.item() * features.size(0)
    n_examples = len(loader.dataset)
    return running / n_examples

@torch.no_grad()
def evaluate(model, loader):
    """Score ``model`` on ``loader`` without tracking gradients.

    Redefines the earlier ``evaluate`` in this notebook with a different
    return value: ``(accuracy, macro_f1, y_true, y_pred)``, where the last
    two are the concatenated label and prediction arrays.
    """
    model.eval()
    pred_chunks = []
    true_chunks = []
    for features, targets in loader:
        scores = model(features.to(DEVICE))
        pred_chunks.append(torch.argmax(scores, dim=1).cpu().numpy())
        true_chunks.append(targets.cpu().numpy())

    y_pred = np.concatenate(pred_chunks)
    y_true = np.concatenate(true_chunks)

    accuracy = accuracy_score(y_true, y_pred)
    macro_f1 = f1_score(y_true, y_pred, average="macro")
    return accuracy, macro_f1, y_true, y_pred

# Build the MLP head on top of the 1536-dimensional WavLM embeddings.
model = MLPClassifier(in_dim=1536, num_classes=NUM_CLASSES).to(DEVICE)
criterion = nn.CrossEntropyLoss(label_smoothing=0.1)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-3)

best_f1 = -1.0      # best validation macro-F1 seen so far
patience = 8        # epochs without improvement before stopping
wait = 0            # epochs since the last improvement
best_path = "best_wavlm_mlp.pt"

for epoch in range(1, 81):
    tr_loss = train_one_epoch(model, train_loader, optimizer, criterion)
    val_acc, val_f1, _, _ = evaluate(model, val_loader)
    print(f"Epoch {epoch:03d} | train_loss {tr_loss:.4f} | val_acc {val_acc:.4f} | val_f1 {val_f1:.4f}")

    # Checkpoint only on a gain of more than 1e-4, so tiny fluctuations do
    # not reset the early-stopping counter.
    if val_f1 > best_f1 + 1e-4:
        best_f1 = val_f1
        wait = 0
        torch.save(model.state_dict(), best_path)
    else:
        wait += 1
        if wait >= patience:
            print(f"Early stopping. Best val_f1={best_f1:.4f}")
            break

# Restore the best checkpoint and report its validation metrics.
model.load_state_dict(torch.load(best_path, map_location=DEVICE))
val_acc, val_f1, y_true, y_pred = evaluate(model, val_loader)
print(f"BEST | val_acc={val_acc:.4f} val_f1={val_f1:.4f}")
print(classification_report(y_true, y_pred, digits=4))


Epoch 001 | train_loss 1.6394 | val_acc 0.4812 | val_f1 0.4307
Epoch 002 | train_loss 1.4711 | val_acc 0.4841 | val_f1 0.4447
Epoch 003 | train_loss 1.3780 | val_acc 0.5403 | val_f1 0.5138
Epoch 004 | train_loss 1.3230 | val_acc 0.5675 | val_f1 0.5532
Epoch 005 | train_loss 1.2758 | val_acc 0.5844 | val_f1 0.5756
Epoch 006 | train_loss 1.2636 | val_acc 0.5854 | val_f1 0.5777
Epoch 007 | train_loss 1.2406 | val_acc 0.6154 | val_f1 0.6242
Epoch 008 | train_loss 1.2189 | val_acc 0.6454 | val_f1 0.6469
Epoch 009 | train_loss 1.2116 | val_acc 0.6229 | val_f1 0.6278
Epoch 010 | train_loss 1.1960 | val_acc 0.6473 | val_f1 0.6496
Epoch 011 | train_loss 1.1871 | val_acc 0.6370 | val_f1 0.6417
Epoch 012 | train_loss 1.1782 | val_acc 0.6229 | val_f1 0.6293
Epoch 013 | train_loss 1.1714 | val_acc 0.6210 | val_f1 0.6086
Epoch 014 | train_loss 1.1729 | val_acc 0.6220 | val_f1 0.6282
Epoch 015 | train_loss 1.1653 | val_acc 0.6379 | val_f1 0.6372
Epoch 016 | train_loss 1.1588 | val_acc 0.6604 | val_f1

In [20]:
FINAL_PATH = "wavlm_mlp_emotion.pt"

# Saves whatever weights `model` currently holds. The training cell reloads
# best_path at its end, so run that cell (or load best_path) first.
torch.save(model.state_dict(), FINAL_PATH)
print("Saved weights to:", FINAL_PATH)


Saved weights to: wavlm_mlp_emotion.pt


In [15]:
# Reload the best checkpoint and switch to eval mode (disables dropout).
model.load_state_dict(torch.load(best_path, map_location=DEVICE))
model.eval()


MLPClassifier(
  (net): Sequential(
    (0): Dropout(p=0.2, inplace=False)
    (1): Linear(in_features=1536, out_features=256, bias=True)
    (2): ReLU()
    (3): Dropout(p=0.2, inplace=False)
    (4): Linear(in_features=256, out_features=6, bias=True)
  )
)

In [16]:
# Evaluate the best checkpoint on the held-out test split.
model.load_state_dict(torch.load(best_path, map_location=DEVICE))
test_acc, test_f1, y_true, y_pred = evaluate(model, test_loader)

print(f"TEST | acc={test_acc:.4f} | macro_f1={test_f1:.4f}")
# target_names are listed in class-id order (ANG=0 ... SAD=5), matching EMO_MAP.
print(classification_report(
    y_true, y_pred,
    target_names=["ANG","DIS","FEA","HAP","NEU","SAD"],
    digits=4
))


TEST | acc=0.6258 | macro_f1=0.6234
              precision    recall  f1-score   support

         ANG     0.7067    0.7608    0.7327       209
         DIS     0.5525    0.6794    0.6094       209
         FEA     0.6460    0.4976    0.5622       209
         HAP     0.5669    0.6890    0.6220       209
         NEU     0.6910    0.6872    0.6891       179
         SAD     0.6309    0.4498    0.5251       209

    accuracy                         0.6258      1224
   macro avg     0.6323    0.6273    0.6234      1224
weighted avg     0.6309    0.6258    0.6218      1224



In [17]:
from sklearn.metrics import recall_score

# y_true: ground truth emotion labels (from the test evaluation above)
# y_pred: predicted emotion labels
# UAR (unweighted average recall) is the macro-averaged recall: the mean of
# the per-class recalls, each class counting equally.
uar = recall_score(y_true, y_pred, average='macro')
print(f"UAR: {uar:.4f}")


UAR: 0.6273


In [18]:
from sklearn.metrics import classification_report, recall_score
import numpy as np

# Overall UAR (macro-averaged recall)
uar = recall_score(y_true, y_pred, average='macro')

# Per-class recall: average=None returns one value per class
recalls = recall_score(y_true, y_pred, average=None)

print(f"UAR: {uar:.4f}")
print("Per-class recall:")
# Class indices are the integer labels from EMO_MAP (0=ANG ... 5=SAD).
for i, r in enumerate(recalls):
    print(f"Class {i}: {r:.3f}")


UAR: 0.6273
Per-class recall:
Class 0: 0.761
Class 1: 0.679
Class 2: 0.498
Class 3: 0.689
Class 4: 0.687
Class 5: 0.450


In [19]:
from sklearn.metrics import accuracy_score

# accuracy_score is the plain fraction of correct predictions; it is reported
# here under the label "Weighted Accuracy (WA)".
wa = accuracy_score(y_true, y_pred)
print(f"Weighted Accuracy (WA): {wa:.4f}")


Weighted Accuracy (WA): 0.6258
