# Purpose of this notebook

Basically all my experimentation and everything goes here.

In [9]:
# Beat tracking example
import librosa

filename = librosa.example('nutcracker')

# load the audio as a waveform 'y', store sampling rate as 'sr'
y, sr = librosa.load(filename)

# Run the default beat tracker
tempo, beat_frames = librosa.beat.beat_track(y=y, sr=sr)

print(f'Estimated tempo: {tempo} beats per minute')

# convert the frame indices of beat events into timestamps
beat_times = librosa.frames_to_time(beat_frames, sr=sr)
print(beat_times)

Estimated tempo: [107.66601562] beats per minute
[  1.18421769   1.71827664   2.32199546   2.87927438   3.45977324
   4.01705215   4.59755102   5.13160998   5.7353288    6.29260771
   6.84988662   7.40716553   7.9876644    8.54494331   9.12544218
   9.65950113  10.21678005  10.72761905  11.28489796  11.79573696
  12.32979592  12.86385488  13.42113379  13.95519274  14.4892517
  15.02331066  15.55736961  16.09142857  16.62548753  17.15954649
  17.69360544  18.25088435  18.80816327  19.31900227  19.87628118
  20.38712018  20.92117914  21.4552381   21.98929705  22.52335601
  23.05741497  23.59147392  24.12553288  24.65959184  25.19365079
  25.72770975  26.26176871  26.81904762  27.35310658  27.88716553
  28.44444444  29.00172336  29.55900227  30.11628118  30.67356009
  31.20761905  31.78811791  32.34539683  32.85623583  33.36707483
  33.90113379  34.43519274  34.94603175  35.4800907   35.99092971
  36.52498866  37.03582766  37.56988662  38.12716553  38.66122449
  39.2185034   39.75256236  

In [13]:
# Integrating harmonic-percussive seperation, multiple spectral features
# And beat synchronous feature aggregation

import numpy as np
import librosa

# load the example clip
y, sr = librosa.load(librosa.ex('nutcracker'))

# set the hop length; at 22050 Hz, 512 Samples ~= 23ms
hop_length = 512

# seperate harmonics and percussives into two waveforms
y_harmoic, y_percussive = librosa.effects.hpss(y)

# beat track on percussive signal
tempo, beat_frames = librosa.beat.beat_track(y=y_percussive, sr=sr)

# Compute MFCC features from the raw signal
mfcc = librosa.feature.mfcc(y=y, sr=sr, hop_length=hop_length, n_mfcc=13)

# and first order differences (delta features)
mfcc_delta = librosa.feature.delta(mfcc)

# stack and synchronize between beat events
# this time, we'll use the mean value (default) instead of medium
beat_mfcc_delta = librosa.util.sync(np.vstack([mfcc, mfcc_delta]), beat_frames)

# compute chroma features from the harmonic signal
chromagram = librosa.feature.chroma_cqt(y=y_harmoic, sr=sr)

# aggregate chroma features between beat events
# we'll use the median value of each feature between beat frames
beat_chroma = librosa.util.sync(chromagram, beat_frames, aggregate=np.median)

# finally stack all beat-synchronous features together
beat_features = np.vstack([beat_chroma, beat_mfcc_delta])
print(beat_features)

[[ 0.3054334   0.03334978  0.10536107 ...  0.07464188  0.08429885
   0.07849725]
 [ 0.2604909   0.04063905  0.08386959 ...  0.1279302   0.07352865
   0.09528373]
 [ 0.46387547  0.05806328  0.09554327 ...  0.18667205  0.23257327
   0.10365877]
 ...
 [-0.06548888 -0.14644979  0.05339237 ...  0.62217766  0.67595893
   0.01785581]
 [-0.02321988 -0.13737921 -0.0114388  ...  0.24913478  0.27645436
  -0.04623426]
 [ 0.02864812 -0.20102465 -0.05542236 ... -0.6156198   0.07662217
   0.07621391]]


### Basic Feature extraction from audio data

In [39]:
import os
import random
import librosa
import numpy as np
import soundfile as sf

In [35]:
def extract_mfcc_features(path, sr=None, max_pad_len=500):
    if isinstance(path, str):
        y, sr = librosa.load(path, sr=sr)
    else:
        y = path
    
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=40) # increased mfccs for richer features

    # padding to ensure consistent input size
    pad_width = max_pad_len - mfccs.shape[1]

    if pad_width > 0:
        mfccs = np.pad(mfccs, pad_width=((0,0), (0, pad_width)), mode='constant')
    
    return mfccs.T

In [36]:
def extract_features_from_directory(base_dir):
    features = {}
    
    for root, dirs, files in os.walk(base_dir):
        for file in files:
            if file.endswith('.wav'):
                file_path = os.path.join(root, file)
                print(f"Processing {file_path}...")
                mfcc_features = extract_mfcc_features(file_path)
                features[file_path] = mfcc_features
    
    return features

In [37]:
base_directory = '../archive'
features_dict = extract_features_from_directory(base_directory)

Processing ../archive\Actor_01\03-01-01-01-01-01-01.wav...
Processing ../archive\Actor_01\03-01-01-01-01-02-01.wav...
Processing ../archive\Actor_01\03-01-01-01-02-01-01.wav...
Processing ../archive\Actor_01\03-01-01-01-02-02-01.wav...
Processing ../archive\Actor_01\03-01-02-01-01-01-01.wav...
Processing ../archive\Actor_01\03-01-02-01-01-02-01.wav...
Processing ../archive\Actor_01\03-01-02-01-02-01-01.wav...
Processing ../archive\Actor_01\03-01-02-01-02-02-01.wav...
Processing ../archive\Actor_01\03-01-02-02-01-01-01.wav...
Processing ../archive\Actor_01\03-01-02-02-01-02-01.wav...
Processing ../archive\Actor_01\03-01-02-02-02-01-01.wav...
Processing ../archive\Actor_01\03-01-02-02-02-02-01.wav...
Processing ../archive\Actor_01\03-01-03-01-01-01-01.wav...
Processing ../archive\Actor_01\03-01-03-01-01-02-01.wav...
Processing ../archive\Actor_01\03-01-03-01-02-01-01.wav...
Processing ../archive\Actor_01\03-01-03-01-02-02-01.wav...
Processing ../archive\Actor_01\03-01-03-02-01-01-01.wav.

In [2]:
import os
import torch
import wandb
import librosa
import torchaudio

import numpy as np
import pandas as pd
import seaborn as sns
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
import torch.nn.functional as F

from sklearn.utils import class_weight
from torch.utils.data import Dataset, DataLoader
from torch.optim.lr_scheduler import ReduceLROnPlateau
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, StratifiedKFold

In [3]:
class Config:
    # Data paths
    DATA_DIR = "../archive"

    # Audio processing parameters
    SAMPLE_RATE = 22050  # Standard sample rate
    DURATION = 3  # seconds
    N_MFCC = 20

    # Model hyperparameters
    BATCH_SIZE = 32
    LEARNING_RATE = 0.001
    NUM_EPOCHS = 10

    # Feature extraction parameters
    FEATURES = [
        "mfcc",
        "spectral_centroid",
        "chroma",
        "spectral_contrast",
        "zero_crossing_rate",
        "spectral_rolloff",
    ]

    # Augmentation parameters
    AUGMENTATION = True
    NOISE_FACTOR = 0.005
    SCALE_RANGE = (0.9, 1.1)


In [4]:
def extract_advanced_features(file_path):
    # Load the audio file
    y, sr = librosa.load(file_path, duration=Config.DURATION, sr=Config.SAMPLE_RATE)

    # Feature extraction
    features = []

    # MFCC features (increased resolution)
    if "mfcc" in Config.FEATURES:
        mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=Config.N_MFCC)
        mfccs_processed = np.mean(mfccs.T, axis=0)
        features.append(mfccs_processed)

    # Spectral Centroid
    if "spectral_centroid" in Config.FEATURES:
        spectral_centroids = librosa.feature.spectral_centroid(y=y, sr=sr)
        spectral_centroids_processed = np.mean(spectral_centroids)
        features.append([spectral_centroids_processed])

    # Chroma Features
    if "chroma" in Config.FEATURES:
        chroma = librosa.feature.chroma_stft(y=y, sr=sr)
        chroma_processed = np.mean(chroma.T, axis=0)
        features.append(chroma_processed)

    # Spectral Contrast
    if "spectral_contrast" in Config.FEATURES:
        spectral_contrast = librosa.feature.spectral_contrast(y=y, sr=sr)
        spectral_contrast_processed = np.mean(spectral_contrast.T, axis=0)
        features.append(spectral_contrast_processed)

    # Zero Crossing Rate
    if "zero_crossing_rate" in Config.FEATURES:
        zcr = librosa.feature.zero_crossing_rate(y)
        zcr_processed = np.mean(zcr)
        features.append([zcr_processed])

    # Spectral Rolloff
    if "spectral_rolloff" in Config.FEATURES:
        spectral_rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr)
        spectral_rolloff_processed = np.mean(spectral_rolloff)
        features.append([spectral_rolloff_processed])

    # Concatenate all features
    return np.concatenate(features)

In [5]:
def augment_features(features, noise_factor=Config.NOISE_FACTOR, scale_range=Config.SCALE_RANGE):
    if not Config.AUGMENTATION:
        return features

    # Add Gaussian noise
    noise = np.random.normal(0, noise_factor, features.shape)
    augmented_features = features + noise

    # Random scaling
    scale_factor = np.random.uniform(scale_range[0], scale_range[1])
    augmented_features *= scale_factor

    return augmented_features

In [6]:
def prepare_dataset(data_dir):
    features = []
    labels = []

    # Emotion mapping with potential for expansion
    emotion_map = {
        "01": "neutral",
        "02": "calm",
        "03": "happy",
        "04": "sad",
        "05": "angry",
        "06": "fearful",
        "07": "disgust",
        "08": "surprised",
    }

    # Walk through all directories and subdirectories
    for root, dirs, files in os.walk(data_dir):
        for filename in files:
            if filename.endswith(".wav"):
                # Full file path
                file_path = os.path.join(root, filename)

                try:
                    # Extract emotion from filename
                    emotion_code = filename.split("-")[2]
                    emotion = emotion_map.get(emotion_code, "unknown")

                    # Extract original features
                    file_features = extract_advanced_features(file_path)
                    features.append(file_features)
                    labels.append(emotion)

                    # Optional augmentation
                    if Config.AUGMENTATION:
                        augmented_features = augment_features(file_features)
                        features.append(augmented_features)
                        labels.append(emotion)

                except Exception as e:
                    print(f"Error processing {filename}: {e}")

    # Informative print about dataset
    print(f"Dataset Summary:")
    print(f"Total files processed: {len(features)}")

    # Count of emotions
    from collections import Counter

    emotion_counts = Counter(labels)
    for emotion, count in emotion_counts.items():
        print(f"{emotion.capitalize()} emotion: {count} samples")

    return np.array(features), np.array(labels)

In [7]:
class EmotionDataset(Dataset):
    """Enhanced Custom PyTorch Dataset for Emotion Recognition"""

    def __init__(self, features, labels, scaler=None):
        # Standardize features
        if scaler is None:
            self.scaler = StandardScaler()
            features = self.scaler.fit_transform(features)
        else:
            features = scaler.transform(features)

        self.features = torch.FloatTensor(features)

        # Encode labels
        self.label_encoder = LabelEncoder()
        self.labels = torch.LongTensor(self.label_encoder.fit_transform(labels))

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.features[idx], self.labels[idx]

    def get_num_classes(self):
        return len(self.label_encoder.classes_)

    def get_class_names(self):
        return self.label_encoder.classes_

In [8]:
class HybridEmotionRecognitionModel(nn.Module):
    """Advanced Hybrid Neural Network for Emotion Recognition"""

    def __init__(self, input_dim, num_classes):
        super().__init__()

        # Enhanced input projection with residual connection
        self.input_projection = nn.Sequential(
            nn.Linear(input_dim, 512),
            nn.BatchNorm1d(512),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(512, 256),
            nn.ReLU(),
        )

        # More complex convolutional layers with residual connections
        self.conv_layers = nn.ModuleList(
            [
                nn.Sequential(
                    nn.Conv1d(1, 64, kernel_size=3, padding=1),
                    nn.BatchNorm1d(64),
                    nn.ReLU(),
                    nn.MaxPool1d(2),
                ),
                nn.Sequential(
                    nn.Conv1d(64, 128, kernel_size=3, padding=1),
                    nn.BatchNorm1d(128),
                    nn.ReLU(),
                    nn.MaxPool1d(2),
                ),
            ]
        )

        # Bidirectional LSTM with more layers
        self.lstm_layers = nn.LSTM(
            input_size=128,
            hidden_size=256,
            num_layers=3,
            batch_first=True,
            bidirectional=True,
            dropout=0.4,
        )

        # More complex fully connected layers
        self.fc_layers = nn.Sequential(
            nn.Linear(512, 256),  # Note the 512 due to bidirectional LSTM
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Dropout(0.4),
            nn.Linear(256, 128),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.Dropout(0.3),
        )

        self.output_layer = nn.Linear(128, num_classes)

    def forward(self, x):
        # Input projection
        x = self.input_projection(x)

        # Reshape for conv layers
        x = x.unsqueeze(1)

        # Convolutional layers with residual-like processing
        for conv_layer in self.conv_layers:
            x = conv_layer(x)

        # Prepare for LSTM
        x = x.permute(0, 2, 1)

        # LSTM processing
        lstm_out, _ = self.lstm_layers(x)
        x = lstm_out[:, -1, :]

        # Fully connected layers
        x = self.fc_layers(x)

        return self.output_layer(x)

In [9]:
def train_model(model, train_loader, val_loader, labels, num_epochs=Config.NUM_EPOCHS):
    # Compute class weights to handle class imbalance
    class_weights = class_weight.compute_class_weight(
        "balanced", classes=np.unique(labels), y=labels
    )
    class_weights = torch.FloatTensor(class_weights)

    # Loss with class weights
    criterion = nn.CrossEntropyLoss(weight=class_weights)

    # Adam with weight decay (L2 regularization)
    optimizer = optim.AdamW(
        model.parameters(), lr=Config.LEARNING_RATE, weight_decay=1e-5
    )

    # Learning rate scheduler
    scheduler = ReduceLROnPlateau(
        optimizer, mode="min", factor=0.5, patience=5, verbose=True
    )

    # Initialize wandb
    # wandb.init(
    #     project="SentimentSound",
    #     config={
    #         "learning_rate": Config.LEARNING_RATE,
    #         "batch_size": Config.BATCH_SIZE,
    #         "epochs": num_epochs,
    #         "augmentation": Config.AUGMENTATION,
    #     },
    # )

    # Training loop with more advanced techniques
    best_val_loss = float("inf")
    for epoch in range(num_epochs):
        model.train()
        train_loss = 0
        train_correct = 0
        train_total = 0

        for features, batch_labels in train_loader:
            optimizer.zero_grad()

            # Forward and backward pass
            outputs = model(features)
            loss = criterion(outputs, batch_labels)

            loss.backward()

            # Gradient clipping
            nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()

            train_loss += loss.item()
            _, predicted = torch.max(outputs.data, 1)
            train_total += batch_labels.size(0)
            train_correct += (predicted == batch_labels).sum().item()

        # Validation
        model.eval()
        val_loss = 0
        val_correct = 0
        val_total = 0

        with torch.no_grad():
            for features, batch_labels in val_loader:
                outputs = model(features)
                loss = criterion(outputs, batch_labels)

                val_loss += loss.item()
                _, predicted = torch.max(outputs.data, 1)
                val_total += batch_labels.size(0)
                val_correct += (predicted == batch_labels).sum().item()

        # Compute metrics
        train_accuracy = 100 * train_correct / train_total
        val_accuracy = 100 * val_correct / val_total

        # Learning rate scheduling
        scheduler.step(val_loss)

        # Logging to wandb
        # wandb.log(
        #     {
        #         "train_loss": train_loss / len(train_loader),
        #         "train_accuracy": train_accuracy,
        #         "val_loss": val_loss / len(val_loader),
        #         "val_accuracy": val_accuracy,
        #     }
        # )

        # Print epoch summary
        print(f"Epoch {epoch+1}/{num_epochs}")
        print(f"Train Loss: {train_loss / len(train_loader):.4f}")
        print(f"Train Accuracy: {train_accuracy:.2f}%")
        print(f"Val Loss: {val_loss / len(val_loader):.4f}")
        print(f"Val Accuracy: {val_accuracy:.2f}%")

        # Save best model
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), "best_emotion_model.pth")

    # Finish wandb run
    # wandb.finish()

    return model

In [10]:
def evaluate_model(model, test_loader, dataset):
    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for features, labels in test_loader:
            outputs = model(features)
            _, predicted = torch.max(outputs, 1)
            all_preds.extend(predicted.numpy())
            all_labels.extend(labels.numpy())

    # Classification Report
    class_names = dataset.get_class_names()
    print("\nClassification Report:")
    print(classification_report(all_labels, all_preds, target_names=class_names))

    # Confusion Matrix Visualization
    cm = confusion_matrix(all_labels, all_preds)
    plt.figure(figsize=(10, 8))
    sns.heatmap(
        cm, annot=True, fmt="d", xticklabels=class_names, yticklabels=class_names
    )
    plt.title("Confusion Matrix")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.tight_layout()
    plt.savefig("confusion_matrix.png")
    plt.close()

In [11]:
torch.manual_seed(42)
np.random.seed(42)

features, labels = prepare_dataset(Config.DATA_DIR)

Dataset Summary:
Total files processed: 5760
Neutral emotion: 384 samples
Calm emotion: 768 samples
Happy emotion: 768 samples
Sad emotion: 768 samples
Angry emotion: 768 samples
Fearful emotion: 768 samples
Disgust emotion: 768 samples
Surprised emotion: 768 samples


In [12]:
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [13]:
train_dataset = EmotionDataset(X_train, y_train)
val_dataset = EmotionDataset(X_val, y_val)
test_dataset = EmotionDataset(X_test, y_test)


train_loader = DataLoader(train_dataset, batch_size=Config.BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=Config.BATCH_SIZE)
test_loader = DataLoader(test_dataset, batch_size=Config.BATCH_SIZE)


In [14]:
model = HybridEmotionRecognitionModel(input_dim=len(X_train[0]), num_classes=train_dataset.get_num_classes())

In [15]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=Config.LEARNING_RATE)

In [16]:
train_model(
    model,
    train_loader,
    val_loader,
    labels,
    num_epochs=Config.NUM_EPOCHS
)



Epoch 1/10
Train Loss: 1.9264
Train Accuracy: 23.44%
Val Loss: 1.8741
Val Accuracy: 32.00%
Epoch 2/10
Train Loss: 1.6292
Train Accuracy: 35.92%
Val Loss: 1.3499
Val Accuracy: 46.75%
Epoch 3/10
Train Loss: 1.4448
Train Accuracy: 45.44%
Val Loss: 1.2561
Val Accuracy: 51.41%
Epoch 4/10
Train Loss: 1.3004
Train Accuracy: 50.71%
Val Loss: 1.3132
Val Accuracy: 47.40%
Epoch 5/10
Train Loss: 1.2306
Train Accuracy: 54.99%
Val Loss: 1.0980
Val Accuracy: 60.52%
Epoch 6/10
Train Loss: 1.1356
Train Accuracy: 57.41%
Val Loss: 0.9909
Val Accuracy: 63.34%
Epoch 7/10
Train Loss: 1.0722
Train Accuracy: 61.69%
Val Loss: 0.8876
Val Accuracy: 65.18%
Epoch 8/10
Train Loss: 1.0167
Train Accuracy: 62.94%
Val Loss: 0.8234
Val Accuracy: 70.39%
Epoch 9/10
Train Loss: 0.9540
Train Accuracy: 65.95%
Val Loss: 0.8440
Val Accuracy: 71.91%
Epoch 10/10
Train Loss: 0.8904
Train Accuracy: 68.77%
Val Loss: 0.7261
Val Accuracy: 75.70%


HybridEmotionRecognitionModel(
  (input_projection): Sequential(
    (0): Linear(in_features=42, out_features=512, bias=True)
    (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Dropout(p=0.3, inplace=False)
    (4): Linear(in_features=512, out_features=256, bias=True)
    (5): ReLU()
  )
  (conv_layers): ModuleList(
    (0): Sequential(
      (0): Conv1d(1, 64, kernel_size=(3,), stride=(1,), padding=(1,))
      (1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU()
      (3): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    )
    (1): Sequential(
      (0): Conv1d(64, 128, kernel_size=(3,), stride=(1,), padding=(1,))
      (1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU()
      (3): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    )
  )
  (lstm_layers): LSTM(128, 

In [17]:
evaluate_model(model, test_loader, train_dataset)


Classification Report:
              precision    recall  f1-score   support

       angry       0.99      0.73      0.84       158
        calm       0.86      0.87      0.86       161
     disgust       0.67      0.77      0.71       148
     fearful       0.74      0.76      0.75       174
       happy       0.59      0.60      0.60       146
     neutral       0.68      0.78      0.73        68
         sad       0.67      0.80      0.73       148
   surprised       0.90      0.74      0.81       149

    accuracy                           0.76      1152
   macro avg       0.76      0.76      0.75      1152
weighted avg       0.77      0.76      0.76      1152

