In [29]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchaudio
import librosa
import soundfile as sf
import glob
import os
import numpy as np
import wandb
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Processor, Wav2Vec2Model

# Load Wav2Vec2 model

In [30]:
# Load feature extractor and model
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("superb/wav2vec2-base-superb-er")
wav2vec2_model = Wav2Vec2Model.from_pretrained("superb/wav2vec2-base-superb-er")

# Ensure the model is on GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
wav2vec2_model.to(device)




Wav2Vec2Model(
  (feature_extractor): Wav2Vec2FeatureEncoder(
    (conv_layers): ModuleList(
      (0): Wav2Vec2GroupNormConvLayer(
        (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,), bias=False)
        (activation): GELUActivation()
        (layer_norm): GroupNorm(512, 512, eps=1e-05, affine=True)
      )
      (1-4): 4 x Wav2Vec2NoLayerNormConvLayer(
        (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
        (activation): GELUActivation()
      )
      (5-6): 2 x Wav2Vec2NoLayerNormConvLayer(
        (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,), bias=False)
        (activation): GELUActivation()
      )
    )
  )
  (feature_projection): Wav2Vec2FeatureProjection(
    (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
    (projection): Linear(in_features=512, out_features=768, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): Wav2Vec2Encoder(
    (pos_conv_embed): Wav2Vec2PositionalConvEmbedding(
  

# Features extraction

In [31]:
# Path to the RAVDESS dataset
ravdess_path = "data/ravdess"

# Function to process a single audio file
def extract_features(audio_path):
    # Load audio (mono, 16kHz)
    speech_array, sr = librosa.load(audio_path, sr=16000, mono=True)
    # Normalize
    speech_array = speech_array / np.max(np.abs(speech_array))
    
    # Convert audio to tensor and move to GPU
    input_values = torch.tensor(speech_array).unsqueeze(0).to(device)
    
    # Extract features
    with torch.no_grad():
        outputs = wav2vec2_model(input_values)

    # Get last hidden state (contextualized features)
    contextualized_features = outputs.last_hidden_state.squeeze(0)  # Shape: (n, 768)
    
    # Mean pooling to get a single 768-dim vector
    feature_vector = contextualized_features.mean(dim=0)  # Shape: (768,)

    return feature_vector.cpu().numpy()

# Preprocessing

In [32]:
# Function to extract emotion label from filename
def get_emotion_label(filename):
    parts = filename.split("-")
    emotion_label = int(parts[2]) - 1  # Convert emotion from (01-08) → (0-7)
    return emotion_label

# Process all audio files
data = []
labels = []

# Find all .wav files
wav_files = glob.glob(os.path.join(ravdess_path, "**", "*.wav"), recursive=True)

for file in wav_files:
    emotion_label = get_emotion_label(os.path.basename(file))
    
    # Extract feature vector
    feature_vector = extract_features(file)
    
    # Append to dataset
    data.append(feature_vector)
    labels.append(emotion_label)

# Convert to NumPy arrays
data = np.array(data)
labels = np.array(labels)

# Split into train, validation, and test sets
X_train, X_temp, y_train, y_temp = train_test_split(data, labels, test_size=0.2, stratify=labels, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42)

# Save processed data
np.save("X_train.npy", X_train)
np.save("y_train.npy", y_train)
np.save("X_val.npy", X_val)
np.save("y_val.npy", y_val)
np.save("X_test.npy", X_test)
np.save("y_test.npy", y_test)

print(f"Dataset Processed!")
print(f"Train: {X_train.shape}, Val: {X_val.shape}, Test: {X_test.shape}")

Dataset Processed!
Train: (2304, 768), Val: (288, 768), Test: (288, 768)


# Classification model

In [33]:
class EmotionClassifier(nn.Module):
    def __init__(self, num_classes):
        super(EmotionClassifier, self).__init__()
        self.fc1 = nn.Linear(768, 512)  # First FC layer
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(512, num_classes)  # Final classification layer

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x

# Initialize the model
num_classes = 8  # RAVDESS has 8 emotions
model = EmotionClassifier(num_classes).to("cuda")

# Define loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4)


# Training

In [34]:
# Initialize Weights & Biases
wandb.init(project="emotion-classification", name="wav2vec2-training")

# Load processed dataset
X_train = np.load("X_train.npy")
y_train = np.load("y_train.npy")
X_val = np.load("X_val.npy")
y_val = np.load("y_val.npy")

# Convert to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32).to(device)
y_train_tensor = torch.tensor(y_train, dtype=torch.long).to(device)
X_val_tensor = torch.tensor(X_val, dtype=torch.float32).to(device)
y_val_tensor = torch.tensor(y_val, dtype=torch.long).to(device)
# Create DataLoaders
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)

# Define model, loss, optimizer
num_classes = 8
model = EmotionClassifier(num_classes).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4)

# Training function
def train_model(epochs=10):
    for epoch in range(epochs):
        model.train()
        total_loss, total_correct = 0, 0
        
        for features, labels in train_loader:
            optimizer.zero_grad()
            
            # Forward pass
            outputs = model(features)
            loss = criterion(outputs, labels)
            
            # Backward pass
            loss.backward()
            optimizer.step()

            # Compute accuracy
            preds = torch.argmax(outputs, dim=1)
            total_correct += (preds == labels).sum().item()
            total_loss += loss.item()

        # Calculate metrics
        train_acc = total_correct / len(train_loader.dataset)
        avg_loss = total_loss / len(train_loader)

        # Log to W&B
        wandb.log({"Train Loss": avg_loss, "Train Accuracy": train_acc})

        # Validation
        model.eval()
        val_loss, val_correct = 0, 0

        with torch.no_grad():
            for features, labels in val_loader:
                outputs = model(features)
                loss = criterion(outputs, labels)
                val_loss += loss.item()
                preds = torch.argmax(outputs, dim=1)
                val_correct += (preds == labels).sum().item()

        val_acc = val_correct / len(val_loader.dataset)
        avg_val_loss = val_loss / len(val_loader)

        # Log validation results
        wandb.log({"Val Loss": avg_val_loss, "Val Accuracy": val_acc})

        print(f"Epoch {epoch+1}: Train Loss = {avg_loss:.4f}, Train Acc = {train_acc:.4f}, Val Loss = {avg_val_loss:.4f}, Val Acc = {val_acc:.4f}")
        
# Train the model
train_model(epochs=2000)

# Finish W&B run
wandb.finish()

Epoch 1: Train Loss = 2.0168, Train Acc = 0.2201, Val Loss = 1.9607, Val Acc = 0.3125
Epoch 2: Train Loss = 1.9145, Train Acc = 0.3234, Val Loss = 1.8781, Val Acc = 0.2465
Epoch 3: Train Loss = 1.8357, Train Acc = 0.3420, Val Loss = 1.8113, Val Acc = 0.2951
Epoch 4: Train Loss = 1.7723, Train Acc = 0.3750, Val Loss = 1.7649, Val Acc = 0.3438
Epoch 5: Train Loss = 1.7230, Train Acc = 0.3880, Val Loss = 1.7287, Val Acc = 0.3438
Epoch 6: Train Loss = 1.6790, Train Acc = 0.3976, Val Loss = 1.6966, Val Acc = 0.3576
Epoch 7: Train Loss = 1.6418, Train Acc = 0.4080, Val Loss = 1.6660, Val Acc = 0.3750
Epoch 8: Train Loss = 1.6048, Train Acc = 0.4297, Val Loss = 1.6346, Val Acc = 0.3854
Epoch 9: Train Loss = 1.5649, Train Acc = 0.4323, Val Loss = 1.6051, Val Acc = 0.3993
Epoch 10: Train Loss = 1.5351, Train Acc = 0.4622, Val Loss = 1.5768, Val Acc = 0.4132
Epoch 11: Train Loss = 1.5005, Train Acc = 0.4653, Val Loss = 1.5626, Val Acc = 0.4167
Epoch 12: Train Loss = 1.4654, Train Acc = 0.4939, V

0,1
Train Accuracy,▁▁▅▆▆▇▇█████████████████████████████████
Train Loss,█▅▄▃▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
Val Accuracy,▁▁▅▅▆▇▇▇▇▇▇█████████████████████████████
Val Loss,█▅▄▃▃▁▁▁▁▁▁▁▁▁▁▂▂▂▂▂▃▃▃▄▄▅▅▅▅▅▅▅▆▆▆▆▅▆▆▆

0,1
Train Accuracy,1.0
Train Loss,0.0
Val Accuracy,0.93056
Val Loss,0.92219
