# Speech Emotion Recognition with BDH SNN

This notebook implements a Speech Emotion Recognition (SER) system trained on the CREMA-D dataset, using Pathway's BDH (Bio-Digital Hebbian) Spiking Neural Network architecture as the classifier.

## Objectives
1. Load and Preprocess Crema-D Audio Data
2. Extract Acoustic Features (MFCCs)
3. Train a BDH SNN Model
4. Save Learned Hebbian Synapses

In [1]:
import os
import numpy as np
import pandas as pd
import librosa
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt

# Add src to path to import BDH SNN
import sys
sys.path.append(os.path.abspath('src'))
from bdh_snn.network import SpikingNeuralNetwork

## Configuration

In [None]:
# --- Audio input ---------------------------------------------------------
DATASET_PATH = "src/Crema-D/AudioWAV"  # directory scanned for .wav clips
SAMPLE_RATE = 16_000  # rate passed to the audio loader when reading clips
DURATION = 2.5  # seconds; every clip is padded/truncated to this length

# --- Feature sizes -------------------------------------------------------
# NOTE(review): none of the three values below is read by the cells visible
# in this notebook. The feature extractor requests 40 MFCCs itself, and the
# model is built from the feature matrix's own width. Confirm whether they
# are still needed before relying on them.
N_MFCC = 13
INPUT_DIM = 40  # matches the 40 mean-MFCC values produced per clip
TARGET_INPUT_DIM = 128  # per the original note: the SNN's default input size

# --- Model / optimisation ------------------------------------------------
HIDDEN_DIM = 256
BATCH_SIZE = 32
EPOCHS = 20
LEARNING_RATE = 0.001

## Data Loading and Feature Extraction

In [None]:
def extract_features(file_path, n_mfcc=40):
    """Return the time-averaged MFCC vector of one audio file.

    The clip is loaded at ``SAMPLE_RATE``, then zero-padded at the end or
    truncated so that it holds exactly ``SAMPLE_RATE * DURATION`` samples.
    MFCCs are computed on that fixed-length signal and averaged over frames.

    Args:
        file_path: Path of the audio file to read.
        n_mfcc: Number of MFCC coefficients to compute. Defaults to 40, the
            value this function previously hard-coded, so existing callers
            get the same features as before.

    Returns:
        A 1-D NumPy array with ``n_mfcc`` entries, or ``None`` when the file
        could not be processed (the error is printed).
    """
    try:
        # Load audio, resampled to the notebook-wide sample rate.
        audio, sample_rate = librosa.load(file_path, sr=SAMPLE_RATE, res_type='kaiser_fast')

        # Pad or truncate so every clip yields the same number of samples.
        target_len = int(SAMPLE_RATE * DURATION)
        if len(audio) < target_len:
            audio = np.pad(audio, (0, target_len - len(audio)))
        else:
            audio = audio[:target_len]

        # One row per coefficient, one column per frame; average over frames.
        mfccs = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=n_mfcc)
        mfccs_mean = np.mean(mfccs.T, axis=0)

        return mfccs_mean
    except Exception as e:
        # Deliberately best-effort: one unreadable clip must not abort a
        # whole dataset scan. Callers are expected to skip ``None``.
        print(f"Error processing {file_path}: {e}")
        return None

def load_crema_d(data_path):
    """Build the feature matrix and label vector from a folder of clips.

    Every ``.wav`` file (extension matched case-insensitively) directly inside
    ``data_path`` is passed to ``extract_features``. The emotion code is taken
    from the third underscore-separated field of the file name.

    Args:
        data_path: Directory containing the audio files.

    Returns:
        A ``(features, labels)`` tuple of NumPy arrays, with one entry per
        clip that was processed successfully. Files with fewer than three
        name fields, and files for which feature extraction returned
        ``None``, are skipped.
    """
    features = []
    labels = []

    # Crema-D Filename: 1001_DFA_ANG_XX.wav
    # Indices: 0: Actor, 1: Sentence, 2: Emotion, 3: Intensity

    # os.listdir() returns entries in arbitrary order; sorting keeps the sample
    # order, and therefore any seeded downstream split, reproducible.
    files = sorted(f for f in os.listdir(data_path) if f.lower().endswith('.wav'))

    print(f"Found {len(files)} files.")

    for i, file in enumerate(files):
        if i % 500 == 0:
            print(f"Processing {i}/{len(files)}...")

        # Split the stem, not the full name, so the extension can never end
        # up inside a field (e.g. "1001_DFA_ANG.wav" must give "ANG").
        parts = os.path.splitext(file)[0].split('_')
        if len(parts) < 3:
            continue

        emotion = parts[2]

        # Feature extraction
        feat = extract_features(os.path.join(data_path, file))

        if feat is not None:
            features.append(feat)
            labels.append(emotion)

    return np.array(features), np.array(labels)

In [None]:
# Load Data
X, y = load_crema_d(DATASET_PATH)
print(f"Features shape: {X.shape}, Labels shape: {y.shape}")

# Fail here with a clear message: an empty result would otherwise only
# surface later as an unrelated-looking error in the split/model cells.
if len(X) == 0:
    raise ValueError(f"No usable .wav files were loaded from {DATASET_PATH!r}")

In [None]:
# Encode Labels: map the emotion strings to integer class ids.
lb = LabelEncoder().fit(y)
y_encoded = lb.transform(y)
classes = lb.classes_
print(f"Classes: {classes}")

# Split Data: 80/20 hold-out with a fixed seed.
# NOTE(review): the split is not stratified; consider stratify=y_encoded if
# the per-emotion proportions should match between train and test.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42
)

# Standardize Features: statistics come from the training split only and
# are then applied unchanged to the test split.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

# To Tensor: float32 features, int64 class ids.
X_train_tensor, X_test_tensor = (
    torch.tensor(split, dtype=torch.float32) for split in (X_train, X_test)
)
y_train_tensor, y_test_tensor = (
    torch.tensor(split, dtype=torch.long) for split in (y_train, y_test)
)

# DataLoaders: only the training batches are shuffled.
train_dataset = torch.utils.data.TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = torch.utils.data.TensorDataset(X_test_tensor, y_test_tensor)

train_loader = DataLoader(train_dataset, shuffle=True, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=BATCH_SIZE)

## BDH SNN Model Initialization

In [None]:
# Initialize SNN
# The input width is read from the feature matrix itself (one column per
# mean-MFCC value, expected to be 40) rather than from a constant.
feature_dim = X.shape[1]
model = SpikingNeuralNetwork(hidden_dim=HIDDEN_DIM, input_dim=feature_dim)
print(model)

## Training Loop with Hebbian Updates

In [None]:
# Supervised objective and optimizer over all model parameters.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

model.train()

for epoch in range(EPOCHS):
    # Per-epoch running statistics.
    running_loss = 0
    n_correct = 0
    n_seen = 0

    for inputs, labels in train_loader:
        optimizer.zero_grad()

        # Forward pass: the model returns a dict with the class scores and
        # the hidden state that the Hebbian rule consumes.
        result = model(inputs)
        class_scores = result['output']
        hidden_state = result['hidden_state']

        # Gradient step on the cross-entropy loss.
        loss = criterion(class_scores, labels)
        loss.backward()
        optimizer.step()

        # Hebbian update, applied after the gradient step and kept out of
        # autograd tracking.
        with torch.no_grad():
            model.update_synapses(inputs, hidden_state)

        # Accuracy uses the scores from the forward pass above, i.e. the
        # predictions made before this batch's parameter update.
        running_loss += loss.item()
        predicted = class_scores.argmax(dim=1)
        n_seen += labels.size(0)
        n_correct += (predicted == labels).sum().item()

    mean_loss = running_loss / len(train_loader)
    accuracy = 100 * n_correct / n_seen
    print(f'Epoch {epoch+1}/{EPOCHS}, Loss: {mean_loss:.4f}, Accuracy: {accuracy:.2f}%')

## Save Learned Synapses

In [None]:
# Output locations, relative to the current working directory.
SYNAPSE_FILE = 'learned_synapses.pt'
MODEL_FILE = 'ser_bdh_model.pth'

# Save only the synapse weight tensor (the Hebbian layer, per the original
# note) so it can be inspected or reused on its own.
synapse_weights = model.synapses.weights.data
torch.save(synapse_weights, SYNAPSE_FILE)
print(f"Saved learned synapses to '{SYNAPSE_FILE}'")

# Also save the full state dict so the whole model can be restored.
torch.save(model.state_dict(), MODEL_FILE)
print(f"Saved full model to '{MODEL_FILE}'")

## Verification

In [None]:
# Score the held-out split: eval mode, no gradient tracking, no updates.
model.eval()
n_correct = 0
n_seen = 0
with torch.no_grad():
    for inputs, labels in test_loader:
        # Predicted class = index of the highest score in each row.
        predicted = model(inputs)['output'].argmax(dim=1)
        n_seen += labels.size(0)
        n_correct += (predicted == labels).sum().item()

test_accuracy = 100 * n_correct / n_seen
print(f'Test Accuracy: {test_accuracy:.2f}%')