In [1]:
import numpy as np
import pandas as pd
import librosa
import os
import joblib

# For progress bars
from tqdm.notebook import tqdm

# For splitting data
from sklearn.model_selection import train_test_split

# For preprocessing
from sklearn.preprocessing import StandardScaler, LabelEncoder
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

In [2]:
# Load the train / test / validation metadata tables from CSV.
csv_paths = {
    'train': '../Data/train_augmented.csv',
    'test': '../Data/test_df.csv',
    'val': '../Data/val_df.csv',
}
train, test, val = (pd.read_csv(csv_paths[split]) for split in ('train', 'test', 'val'))

In [3]:
# Preview the training metadata table (pandas truncates the rendered rows).
train

Unnamed: 0,emotions,path,type
0,sad,augmented_audio/03-01-04-02-02-02-08_aug_0.wav,
1,calm,augmented_audio/03-02-02-01-01-02-12_aug_1.wav,
2,angry,augmented_audio/03-01-05-01-01-01-09_aug_1.wav,
3,calm,augmented_audio/03-02-02-02-01-02-01_aug_1.wav,
4,sad,augmented_audio/03-02-04-01-01-01-22_aug_1.wav,
...,...,...,...
4405,calm,augmented_audio/03-02-02-01-02-02-21_aug_0.wav,
4406,calm,augmented_audio/03-01-02-02-02-02-02_aug_0.wav,
4407,neutral,augmented_audio/03-02-01-01-01-02-22_aug_1.wav,
4408,surprise,augmented_audio/03-01-08-02-02-02-21_aug_0.wav,


# Features in Audio Data

## MFCC
- What they are: The absolute gold standard for audio classification. They represent the short-term power spectrum of a sound, based on a linear cosine transform of a log power spectrum on a nonlinear mel scale of frequency.
- Why they're great: They are excellent at capturing the timbre and textural qualities of a sound, which are highly correlated with emotion in speech (e.g., the "raspiness" in an angry voice vs. the "smoothness" of a sad voice). They are also designed to mimic human hearing perception.
- How many: Typically, you'll extract between 13 and 40 MFCCs per frame. A good starting point is 20.

## Chroma Features (Chromagram)
- What they are: A representation of the 12 distinct pitch classes (C, C#, D, etc.) of the musical octave.
- Why they're useful: While primarily for music, they can capture intonation and pitch contours in speech. A rising intonation might signal a question or surprise, while a falling one might indicate a statement or sadness. This is particularly useful for the RAVDESS dataset since it contains both speech and song.

## Mel Spectrogram
- What it is: A spectrogram where the frequencies are converted to the mel scale. It's essentially the step right before calculating MFCCs.
- Why it's useful: It's a rich representation of the sound. Some advanced models, especially CNNs, can work directly on Mel Spectrograms as if they were images, learning the features automatically. It's a great alternative or addition to MFCCs.

## Other Useful Features (Per-Frame)
- Spectral Centroid: Indicates the "center of mass" of the spectrum. It relates to the "brightness" of a sound. An angry or surprised voice is often "brighter."
- Zero-Crossing Rate: The rate at which the signal changes sign (from positive to negative or vice versa). It can help distinguish between voiced speech (low ZCR) and unvoiced/noisy sounds like 'sh' or static (high ZCR).
- RMS (Root Mean Square) Energy: Corresponds to the loudness or amplitude of the audio frame. Anger is often louder, while sadness can be quieter.

In [4]:
# MFCC extraction settings used by extract_features_sequential.
N_MFCC = 20          # number of MFCC coefficients requested per frame
N_FFT = 2048         # n_fft passed to librosa
HOP_LENGTH = 512     # hop_length passed to librosa
SAMPLE_RATE = 22050  # sample rate requested from librosa.load


def extract_features_sequential(file_path):
    """Load one audio file and return its MFCCs as a frame-major sequence.

    Args:
        file_path: Path of the audio file handed to ``librosa.load``.

    Returns:
        The MFCC matrix transposed to ``(time_steps, N_MFCC)``, or ``None``
        when loading or extraction raises (the error is printed, not re-raised,
        so one bad file does not abort a whole ``progress_apply`` run).
    """
    try:
        signal, rate = librosa.load(file_path, sr=SAMPLE_RATE)
        coefficients = librosa.feature.mfcc(
            y=signal,
            sr=rate,
            n_mfcc=N_MFCC,
            n_fft=N_FFT,
            hop_length=HOP_LENGTH,
        )
        # librosa returns (n_mfcc, time_steps); sequence models want time first.
        return coefficients.T
    except Exception as err:
        print(f"Error processing {file_path}: {err}")
        return None

In [5]:
# Register `progress_apply` on pandas objects so each split shows a progress bar.
tqdm.pandas()

# --- Step 3: Apply Feature Extraction to all DataFrames ---
print("\nExtracting features... (This may take a while)")
for split_df in (train, val, test):
    # Each 'features' cell holds one (time_steps, N_MFCC) array, or None if
    # extract_features_sequential failed for that file.
    split_df['features'] = split_df['path'].progress_apply(extract_features_sequential)


Extracting features... (This may take a while)


  0%|          | 0/4410 [00:00<?, ?it/s]

  0%|          | 0/491 [00:00<?, ?it/s]

  0%|          | 0/491 [00:00<?, ?it/s]

In [6]:
# Drop rows where feature extraction failed (the extractor returned None).
train = train.dropna(subset=['features'])
val = val.dropna(subset=['features'])
test = test.dropna(subset=['features'])

# Determine a fixed sequence length from the TRAINING split only (95th
# percentile of its frame counts). Validation and test data must not influence
# preprocessing decisions; this also matches how preprocess_data() picks its
# length further down in the notebook.
train_lengths = train['features'].apply(len)
FIXED_LENGTH = int(train_lengths.quantile(0.95))
print(f"\nUsing fixed sequence length: {FIXED_LENGTH}")


def _pad_to_fixed_length(frame):
    """Stack a split's per-clip feature arrays into one (n, FIXED_LENGTH, n_features) array.

    Shorter clips are zero-padded at the end and longer clips are cut at the
    end (padding='post', truncating='post').
    """
    return pad_sequences(
        frame['features'].tolist(),
        maxlen=FIXED_LENGTH,
        padding='post',
        truncating='post',
        dtype='float32',
    )


# Pad or truncate the features
X_train = _pad_to_fixed_length(train)
X_val = _pad_to_fixed_length(val)
X_test = _pad_to_fixed_length(test)

print(f"\nShape of padded training features: {X_train.shape}")
print(f"Shape of padded validation features: {X_val.shape}")
print(f"Shape of padded test features: {X_test.shape}")


Using fixed sequence length: 223

Shape of padded training features: (4410, 223, 20)
Shape of padded validation features: (491, 223, 20)
Shape of padded test features: (491, 223, 20)


In [7]:
# --- Step 5: Encode the Target Labels ---

# Get the corresponding labels (taken after the failed rows were dropped, so
# they stay aligned with the padded feature arrays).
y_train = train['emotions']
y_val = val['emotions']
y_test = test['emotions']

# Use LabelEncoder to convert emotion strings to integers.
# The encoder is fitted on the training labels only; transform() raises if a
# validation/test label was never seen during fitting.
le = LabelEncoder()
y_train_encoded = le.fit_transform(y_train)
y_val_encoded = le.transform(y_val)
y_test_encoded = le.transform(y_test)

# One-hot encode the integer labels for the model's output layer.
# num_classes is passed explicitly: without it to_categorical sizes the matrix
# from the largest index present, so a split that happens to lack the last
# class would get fewer columns than the training labels.
n_classes = len(le.classes_)
y_train_categorical = to_categorical(y_train_encoded, num_classes=n_classes)
y_val_categorical = to_categorical(y_val_encoded, num_classes=n_classes)
y_test_categorical = to_categorical(y_test_encoded, num_classes=n_classes)

print(f"\nShape of one-hot encoded training labels: {y_train_categorical.shape}")
# Show the class order; column i of the one-hot matrices corresponds to
# le.classes_[i], which is what prediction decoding needs later.
print(f"Label classes: {le.classes_}")



Shape of one-hot encoded training labels: (4410, 8)
Label classes: ['angry' 'calm' 'disgust' 'fear' 'happy' 'neutral' 'sad' 'surprise']


In [8]:
# --- Step 6: Scale the Features ---
# StandardScaler works on 2-D input, so every (samples, steps, features) batch
# is flattened to (samples * steps, features), scaled per feature column, and
# reshaped back to 3-D for the sequence model.
# NOTE(review): the arrays were already zero-padded, so padded frames take part
# in the fitted mean/std and are no longer exactly zero after scaling.

scaler = StandardScaler()


def _flatten_frames(batch):
    """Return a (samples * steps, features) view of a 3-D feature batch."""
    return batch.reshape(-1, batch.shape[-1])


# Fit the scaler ONLY on the training data
print("\nFitting scaler on training data...")
scaler.fit(_flatten_frames(X_train))

# Transform every split with the training statistics and restore its 3-D shape.
X_train_scaled, X_val_scaled, X_test_scaled = (
    scaler.transform(_flatten_frames(batch)).reshape(batch.shape)
    for batch in (X_train, X_val, X_test)
)

print(f"\nShape of final scaled training features: {X_train_scaled.shape}")



Fitting scaler on training data...

Shape of final scaled training features: (4410, 223, 20)


In [9]:
# Summarise the shapes of every array produced by the preprocessing cells.
summary_lines = [
    "\n--- Preprocessing Complete ---",
    "You now have the following model-ready variables:",
    f"X_train_scaled: {X_train_scaled.shape}",
    f"y_train_categorical: {y_train_categorical.shape}",
    f"X_val_scaled: {X_val_scaled.shape}",
    f"y_val_categorical: {y_val_categorical.shape}",
    f"X_test_scaled: {X_test_scaled.shape}",
    f"y_test_categorical: {y_test_categorical.shape}",
    "\nThese are ready to be fed into a sequence model like an LSTM or GRU.",
]
print("\n".join(summary_lines))


--- Preprocessing Complete ---
You now have the following model-ready variables:
X_train_scaled: (4410, 223, 20)
y_train_categorical: (4410, 8)
X_val_scaled: (491, 223, 20)
y_val_categorical: (491, 8)
X_test_scaled: (491, 223, 20)
y_test_categorical: (491, 8)

These are ready to be fed into a sequence model like an LSTM or GRU.


# Saving the files for Later use

In [10]:
# Directory that receives the processed arrays and the fitted preprocessors.
output_dir = 'processed_data'
# exist_ok=True creates the directory when missing and is a no-op when it is
# already there, so the cell can be re-run safely without a separate
# existence check (same pattern as the later save cells).
os.makedirs(output_dir, exist_ok=True)


In [11]:
# --- 1. Save the NumPy arrays ---
# All six arrays go into one compressed .npz archive; each keyword below
# becomes the key under which the array is stored.

data_arrays_path = os.path.join(output_dir, 'audio_data_processed.npz')
arrays_to_save = {
    'X_train': X_train_scaled,
    'X_val': X_val_scaled,
    'X_test': X_test_scaled,
    'y_train': y_train_categorical,
    'y_val': y_val_categorical,
    'y_test': y_test_categorical,
}
np.savez_compressed(data_arrays_path, **arrays_to_save)

In [12]:
# Persist the fitted scaler and label encoder next to the arrays so the exact
# same transforms can be re-applied at prediction time.
scaler_path = os.path.join(output_dir, 'scaler.joblib')
label_encoder_path = os.path.join(output_dir, 'label_encoder.joblib')

for artifact, destination in ((scaler, scaler_path), (le, label_encoder_path)):
    joblib.dump(artifact, destination)

report = [
    "--- Data saving complete! ---",
    f"Arrays saved to: {data_arrays_path}",
    f"Scaler saved to: {scaler_path}",
    f"Label encoder saved to: {label_encoder_path}",
    f"\nFind these files in the '{output_dir}' directory.",
]
print("\n".join(report))

--- Data saving complete! ---
Arrays saved to: processed_data/audio_data_processed.npz
Scaler saved to: processed_data/scaler.joblib
Label encoder saved to: processed_data/label_encoder.joblib

Find these files in the 'processed_data' directory.


# Feature Engineering After Modelling

In [13]:
# Re-inspect `train`: it now also carries the per-clip MFCC arrays in 'features'.
train

Unnamed: 0,emotions,path,type,features
0,sad,augmented_audio/03-01-04-02-02-02-08_aug_0.wav,,"[[-483.3294, 0.91852564, 8.517652, 5.6402187, ..."
1,calm,augmented_audio/03-02-02-01-01-02-12_aug_1.wav,,"[[-770.82745, 2.6873112, 2.682191, 2.6736703, ..."
2,angry,augmented_audio/03-01-05-01-01-01-09_aug_1.wav,,"[[-337.3017, -9.244927, -14.508394, 9.422054, ..."
3,calm,augmented_audio/03-02-02-02-01-02-01_aug_1.wav,,"[[-260.23245, 3.3638034, 6.661133, 0.99075234,..."
4,sad,augmented_audio/03-02-04-01-01-01-22_aug_1.wav,,"[[-751.0063, 1.6978652, 1.6965203, 1.6942803, ..."
...,...,...,...,...
4405,calm,augmented_audio/03-02-02-01-02-02-21_aug_0.wav,,"[[-275.2522, 32.63665, -30.75, 38.10554, -27.9..."
4406,calm,augmented_audio/03-01-02-02-02-02-02_aug_0.wav,,"[[-847.4977, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0..."
4407,neutral,augmented_audio/03-02-01-01-01-02-22_aug_1.wav,,"[[-395.50806, -2.6607165, -1.1190903, 1.626747..."
4408,surprise,augmented_audio/03-01-08-02-02-02-21_aug_0.wav,,"[[-701.22375, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0...."


In [14]:
# === CONFIGURATION ===
# NOTE: these rebind the module-level settings from the first extraction cell;
# in particular N_MFCC drops from 20 to 13 for the advanced feature set.
SAMPLE_RATE = 22050
N_FFT = 2048
HOP_LENGTH = 512
N_MFCC = 13


# === ADVANCED FEATURE EXTRACTION ===
def extract_advanced_features(file_path):
    """Build a per-frame feature matrix combining several librosa descriptors.

    The stacked rows are, in order: MFCCs, first- and second-order MFCC
    deltas, chroma, spectral contrast, tonnetz (computed on the harmonic
    component of the signal) and RMS energy. Every non-MFCC feature is cut or
    zero-padded along the frame axis to the MFCC frame count before stacking.

    Args:
        file_path: Path of the audio file handed to ``librosa.load``.

    Returns:
        Array of shape ``(time_steps, n_features)``, or ``None`` if any step
        raises (the error is printed instead of propagated).
    """
    try:
        signal, rate = librosa.load(file_path, sr=SAMPLE_RATE)
        stft_args = dict(sr=rate, n_fft=N_FFT, hop_length=HOP_LENGTH)

        # Static (per-frame) descriptors
        mfccs = librosa.feature.mfcc(y=signal, n_mfcc=N_MFCC, **stft_args)
        chroma = librosa.feature.chroma_stft(y=signal, **stft_args)
        contrast = librosa.feature.spectral_contrast(y=signal, **stft_args)
        tonnetz = librosa.feature.tonnetz(y=librosa.effects.harmonic(signal), sr=rate)
        rms = librosa.feature.rms(y=signal, frame_length=N_FFT, hop_length=HOP_LENGTH)

        # Dynamic descriptors: how the MFCCs change from frame to frame
        mfcc_delta = librosa.feature.delta(mfccs)
        mfcc_delta2 = librosa.feature.delta(mfccs, order=2)

        # The MFCC frame count is the reference every other feature is aligned to.
        n_frames = mfccs.shape[1]

        def fit_frames(matrix):
            """Clip or right-pad `matrix` with zeros to exactly `n_frames` columns."""
            clipped = matrix[:, :n_frames]
            missing = n_frames - clipped.shape[1]
            if missing > 0:
                clipped = np.pad(clipped, ((0, 0), (0, missing)), mode='constant')
            return clipped

        stacked = np.vstack([
            mfccs,
            mfcc_delta,
            mfcc_delta2,
            fit_frames(chroma),
            fit_frames(contrast),
            fit_frames(tonnetz),
            fit_frames(rms),
        ])

        # Frame-major layout: (time_steps, n_features)
        return stacked.T
    except Exception as err:
        print(f"Error processing {file_path}: {err}")
        return None

# === APPLY FEATURE EXTRACTION ===
def extract_features_to_df(df):
    """Attach advanced features to `df` and drop the rows that failed.

    The frame is modified in place: a 'features' column is written (replacing
    any existing one) from ``extract_advanced_features`` applied to 'path',
    then rows whose feature value is missing are removed.

    Args:
        df: DataFrame with a 'path' column of audio file paths.

    Returns:
        The same DataFrame object that was passed in.
    """
    # Registers `progress_apply` on pandas objects (progress-bar variant of apply).
    tqdm.pandas()
    feature_column = df['path'].progress_apply(extract_advanced_features)
    df['features'] = feature_column
    df.dropna(subset=['features'], inplace=True)
    return df

# === PAD SEQUENCES AND ENCODE LABELS ===
def preprocess_data(train_df, val_df, test_df):
    """Turn three metadata frames into padded, scaled arrays and encoded labels.

    Steps: extract advanced features for every split, choose a sequence length
    from the training split (95th percentile of its frame counts), pad/truncate
    all sequences to that length, integer-encode the 'emotions' labels and
    standardise the features with statistics fitted on the training split.

    Args:
        train_df, val_df, test_df: DataFrames with 'path' and 'emotions'
            columns. They are modified by ``extract_features_to_df``.

    Returns:
        Tuple ``(X_train_scaled, y_train, X_val_scaled, y_val, X_test_scaled,
        y_test, scaler, le)`` where labels are integer class indices and
        ``scaler`` / ``le`` are the fitted StandardScaler and LabelEncoder.
    """
    # Feature extraction, in train -> val -> test order
    train_df, val_df, test_df = [
        extract_features_to_df(part) for part in (train_df, val_df, test_df)
    ]

    # Sequence length is decided by the training split alone
    seq_len = int(train_df['features'].apply(len).quantile(0.95))
    print(f"Using fixed sequence length of: {seq_len}")

    def pad(frame):
        """Stack one split's feature arrays, zero-padding / cutting at the end."""
        return pad_sequences(
            frame['features'].tolist(),
            maxlen=seq_len,
            padding='post',
            truncating='post',
            dtype='float32',
        )

    X_train, X_val, X_test = pad(train_df), pad(val_df), pad(test_df)

    # Integer label encoding, fitted on the training labels
    le = LabelEncoder()
    y_train = le.fit_transform(train_df['emotions'])
    y_val = le.transform(val_df['emotions'])
    y_test = le.transform(test_df['emotions'])

    # Per-feature standardisation: flatten to 2-D for the scaler, then restore
    scaler = StandardScaler()
    n_features = X_train.shape[-1]

    def scale(batch, fit=False):
        """Scale a 3-D batch; with fit=True the scaler is fitted on it first."""
        flat = batch.reshape(-1, n_features)
        scaled = scaler.fit_transform(flat) if fit else scaler.transform(flat)
        return scaled.reshape(batch.shape)

    X_train_scaled = scale(X_train, fit=True)
    X_val_scaled = scale(X_val)
    X_test_scaled = scale(X_test)

    return X_train_scaled, y_train, X_val_scaled, y_val, X_test_scaled, y_test, scaler, le

# === SAVE ARTIFACTS ===
def save_processed_data(X_train, y_train, X_val, y_val, X_test, y_test, scaler, le, output_dir='processed_data_advanced'):
    """Write the processed arrays and fitted preprocessors to `output_dir`.

    Creates the directory if needed, stores the six arrays in
    'audio_data_advanced.npz' (keys X_train, X_val, X_test, y_train, y_val,
    y_test) and dumps `scaler` and `le` with joblib.

    Args:
        X_train, y_train, X_val, y_val, X_test, y_test: Arrays to archive.
        scaler: Fitted scaler object to persist.
        le: Fitted label encoder to persist.
        output_dir: Target directory for all three files.
    """
    os.makedirs(output_dir, exist_ok=True)

    def target(filename):
        """Path of `filename` inside the output directory."""
        return os.path.join(output_dir, filename)

    arrays = {
        'X_train': X_train, 'X_val': X_val, 'X_test': X_test,
        'y_train': y_train, 'y_val': y_val, 'y_test': y_test,
    }
    np.savez_compressed(target('audio_data_advanced.npz'), **arrays)

    artifacts = (
        (scaler, 'scaler_advanced.joblib'),
        (le, 'label_encoder_advanced.joblib'),
    )
    for artifact, filename in artifacts:
        joblib.dump(artifact, target(filename))

    print("\n--- ALL DONE! ---")
    print(f"Saved processed files to '{output_dir}'")

In [15]:
# Run the advanced-feature pipeline on the three splits, keep the results in
# the notebook namespace, and write them to disk with the default output_dir.
processed = preprocess_data(train, val, test)
X_train, y_train, X_val, y_val, X_test, y_test, scaler, le = processed
save_processed_data(*processed)

  0%|          | 0/4410 [00:00<?, ?it/s]



  0%|          | 0/491 [00:00<?, ?it/s]

  0%|          | 0/491 [00:00<?, ?it/s]

Using fixed sequence length of: 223

--- ALL DONE! ---
Saved processed files to 'processed_data_advanced'


# Mel Spectrogram

In [17]:
# --- Configuration for Mel Spectrograms ---
SAMPLE_RATE = 22050
N_FFT = 2048
HOP_LENGTH = 512
N_MELS = 128  # Height of the spectrogram 'image'
FIXED_TIME_STEPS = 174 # Width of the 'image' (~4 seconds)

def extract_mel_spectrogram(file_path):
    """Extract a log-scaled Mel spectrogram with a fixed number of time frames.

    The spectrogram is converted to decibels relative to its own maximum
    (``ref=np.max``), so the loudest bin is 0 dB and everything else is
    negative. Clips longer than FIXED_TIME_STEPS frames are cut at the end;
    shorter clips are right-padded with the spectrogram's own minimum value.
    Padding with 0 would be wrong here because 0 dB is the *loudest* level in
    this representation, which would make the padded region look like
    full-energy sound instead of silence.

    Args:
        file_path: Path of the audio file handed to ``librosa.load``.

    Returns:
        Array of shape ``(N_MELS, FIXED_TIME_STEPS)``, or ``None`` if loading
        or extraction raises (the error is printed instead of propagated).
    """
    try:
        y, sr = librosa.load(file_path, sr=SAMPLE_RATE)
        mel_spec = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=N_FFT, hop_length=HOP_LENGTH, n_mels=N_MELS)
        mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)

        current_width = mel_spec_db.shape[1]
        if current_width >= FIXED_TIME_STEPS:
            # Long enough: keep only the first FIXED_TIME_STEPS frames.
            return mel_spec_db[:, :FIXED_TIME_STEPS]

        # Too short: extend on the right with the quietest level of this clip.
        pad_width = FIXED_TIME_STEPS - current_width
        return np.pad(
            mel_spec_db,
            pad_width=((0, 0), (0, pad_width)),
            mode='constant',
            constant_values=mel_spec_db.min(),
        )
    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        return None
# 3. Extract features for every split (train, validation AND test, so the
#    saved archive can also be used for final evaluation).
tqdm.pandas()
print("Extracting Mel Spectrograms for training set...")
train['features'] = train['path'].progress_apply(extract_mel_spectrogram)
print("Extracting Mel Spectrograms for validation set...")
val['features'] = val['path'].progress_apply(extract_mel_spectrogram)
print("Extracting Mel Spectrograms for test set...")
test['features'] = test['path'].progress_apply(extract_mel_spectrogram)

# Drop clips whose extraction failed (extract_mel_spectrogram returned None).
train.dropna(subset=['features'], inplace=True)
val.dropna(subset=['features'], inplace=True)
test.dropna(subset=['features'], inplace=True)

# 4. Create X arrays (stacking the equally sized spectrograms into 3-D arrays)
X_train = np.array(train['features'].tolist())
X_val = np.array(val['features'].tolist())
X_test = np.array(test['features'].tolist())

# 5. Scale features
# Each array is flattened to rows of length `time_steps` (its last axis), so
# StandardScaler standardises every time-frame column using statistics pooled
# over all training clips and mel bins. The scaler is fitted on the training
# split only and then re-used for validation and test.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train.reshape(-1, X_train.shape[-1])).reshape(X_train.shape)
X_val = scaler.transform(X_val.reshape(-1, X_val.shape[-1])).reshape(X_val.shape)
X_test = scaler.transform(X_test.reshape(-1, X_test.shape[-1])).reshape(X_test.shape)

# 6. Encode labels (integer class indices; encoder fitted on training labels)
le = LabelEncoder()
y_train = le.fit_transform(train['emotions'])
y_val = le.transform(val['emotions'])
y_test = le.transform(test['emotions'])

# 7. Save everything
output_dir = 'processed_data_spectrogram'
os.makedirs(output_dir, exist_ok=True)
np.savez_compressed(
    os.path.join(output_dir, 'audio_data_spectrogram.npz'),
    X_train=X_train, X_val=X_val, X_test=X_test,
    y_train=y_train, y_val=y_val, y_test=y_test
)
joblib.dump(scaler, os.path.join(output_dir, 'scaler_spectrogram.joblib'))
joblib.dump(le, os.path.join(output_dir, 'label_encoder_spectrogram.joblib'))
print(f"\n--- DONE! Spectrogram data saved to '{output_dir}' ---")

Extracting Mel Spectrograms for training set...


  0%|          | 0/4410 [00:00<?, ?it/s]

Extracting Mel Spectrograms for validation set...


  0%|          | 0/491 [00:00<?, ?it/s]


--- DONE! Spectrogram data saved to 'processed_data_spectrogram' ---


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# The original cell only had a placeholder here ("same as before"), which is
# not valid Python. This is a self-contained dot-product (Luong "dot" score)
# attention that matches how the model below calls it:
#     context_vector, weights = attention(lstm_outputs, last_hidden)
# If you already have your own LuongAttention, it can replace this class as
# long as it keeps the same call signature and return values.
class LuongAttention(nn.Module):
    """Parameter-free dot-product attention over a sequence of encoder states."""

    def forward(self, encoder_outputs, query):
        """Compute an attention-weighted summary of `encoder_outputs`.

        Args:
            encoder_outputs: Tensor of shape (batch, time_steps, dim).
            query: Tensor of shape (batch, dim); must share `dim` with
                `encoder_outputs` because the score is a plain dot product.

        Returns:
            Tuple ``(context, weights)`` where ``context`` has shape
            (batch, dim) and ``weights`` has shape (batch, time_steps) and
            sums to 1 along the time axis.
        """
        # score[b, t] = <encoder_outputs[b, t, :], query[b, :]>
        scores = torch.bmm(encoder_outputs, query.unsqueeze(2)).squeeze(2)
        weights = F.softmax(scores, dim=1)
        # Weighted sum of the encoder states along the time axis.
        context = torch.bmm(weights.unsqueeze(1), encoder_outputs).squeeze(1)
        return context, weights

class Spec2D_CNN_LSTM_Attention_Model(nn.Module):
    """2-D CNN -> bidirectional LSTM -> attention classifier for Mel spectrograms.

    The CNN treats each spectrogram as a one-channel image and halves both the
    frequency and the time axis three times. The resulting feature map is read
    along the time axis by a bidirectional LSTM, an attention layer summarises
    the LSTM outputs using the final hidden state as the query, and a linear
    layer maps the concatenated (hidden, context) vector to class logits.

    Args:
        num_classes: Number of output classes (size of the logits vector).
        lstm_hidden_size: Hidden size of each LSTM direction.
        lstm_layers: Number of stacked LSTM layers.
        n_mels: Number of mel bins (height) of the input spectrograms. Defaults
            to 128, the value the LSTM input size was originally fixed for.

    Raises:
        ValueError: If ``n_mels`` is too small to survive three 2x poolings.
    """

    def __init__(self, num_classes, lstm_hidden_size=256, lstm_layers=2, n_mels=128):
        super().__init__()

        if n_mels < 8:
            raise ValueError(f"n_mels must be at least 8 for three 2x poolings, got {n_mels}")

        self.lstm_hidden_size = lstm_hidden_size

        # --- 2D CNN Feature Extractor ---
        # Shape comments assume the default input (B, 1, 128, 174).
        self.cnn = nn.Sequential(
            nn.Conv2d(1, 32, kernel_size=(3, 3), stride=1, padding=1),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2)), # (B, 32, 64, 87)

            nn.Conv2d(32, 64, kernel_size=(3, 3), stride=1, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2)), # (B, 64, 32, 43)

            nn.Conv2d(64, 128, kernel_size=(3, 3), stride=1, padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2))  # (B, 128, 16, 21)
        )

        # --- Prepare for LSTM ---
        # Each time step fed to the LSTM is the flattened (channels, frequency)
        # slice of the CNN output: 128 channels times the pooled frequency
        # height. Three floor-halvings of n_mels equal n_mels // 8, which is 16
        # (i.e. 2048 features) for the default n_mels=128.
        self.lstm_input_size = 128 * (n_mels // 8)

        self.lstm = nn.LSTM(
            input_size=self.lstm_input_size,
            hidden_size=self.lstm_hidden_size,
            num_layers=lstm_layers,
            batch_first=True,
            # Inter-layer dropout only exists between stacked layers; passing a
            # non-zero value with a single layer has no effect and makes
            # PyTorch emit a warning.
            dropout=0.5 if lstm_layers > 1 else 0.0,
            bidirectional=True
        )

        # --- Attention and Classifier ---
        self.attention = LuongAttention()
        # *4: the bidirectional hidden state (2 * hidden) concatenated with the
        # context vector (2 * hidden).
        self.fc = nn.Linear(self.lstm_hidden_size * 4, num_classes)
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        """Return class logits of shape (batch, num_classes).

        Args:
            x: Spectrogram batch of shape (batch, n_mels, time_steps).
        """
        # Add a channel dimension for the 2D CNN: (B, n_mels, T) -> (B, 1, n_mels, T)
        x = x.unsqueeze(1)

        # 1. Pass through CNN -> (B, 128, n_mels // 8, T // 8)
        x_cnn = self.cnn(x)

        # 2. Prepare for LSTM: bring the time axis forward, then merge the
        #    channel and frequency axes into one feature axis.
        x_cnn = x_cnn.permute(0, 3, 1, 2)
        batch_size, time_steps, C, H = x_cnn.shape
        x_lstm_in = x_cnn.reshape(batch_size, time_steps, C * H)

        # 3. Pass through LSTM
        lstm_outputs, (hidden, cell) = self.lstm(x_lstm_in)

        # 4. Apply Attention
        # hidden[-2] / hidden[-1] are the last layer's forward and backward
        # final states; together they form the attention query.
        last_hidden = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1)
        context_vector, _ = self.attention(lstm_outputs, last_hidden)

        # 5. Combine and Classify
        combined_vector = torch.cat((last_hidden, context_vector), dim=1)
        logits = self.fc(self.dropout(combined_vector))

        return logits

In [None]:
# In your Kaggle notebook after loading the NEW spectrogram arrays
# NOTE(review): `device`, `optim`, `ReduceLROnPlateau` and `EarlyStopping` are
# not defined or imported anywhere in this notebook; they have to exist in the
# session this cell is pasted into.

# --- Model Setup ---
# One output unit per class known to the fitted label encoder.
NUM_CLASSES = len(le.classes_)

# Instantiate the 2D CNN model, then move it to the target device.
model = Spec2D_CNN_LSTM_Attention_Model(
    num_classes=NUM_CLASSES,
    lstm_hidden_size=256,
)
model = model.to(device)

# --- Optimizer, Loss, etc. ---
LEARNING_RATE = 0.001
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(
    model.parameters(),
    lr=LEARNING_RATE,
    weight_decay=1e-5,
)
# The scheduler multiplies the learning rate by `factor` once the monitored
# value (mode='min') has not improved for `patience` epochs.
scheduler = ReduceLROnPlateau(
    optimizer,
    mode='min',
    factor=0.5,
    patience=3,
    verbose=True,
)
early_stopper = EarlyStopping(
    patience=7,
    verbose=True,
    path='checkpoint_2dcnn.pt',
)

print("--- Starting training with 2D-CNN -> LSTM -> Attention model ---")
# Your training loop remains the same.