In [1]:
import os
import numpy as np
import pandas as pd
import librosa
from glob import glob
from joblib import Parallel, delayed
import tensorflow as tf
import tensorflow.keras.layers as L
from tensorflow.keras.models import Model
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, f1_score
from keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau

2025-06-22 07:44:24.975172: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750578265.163741      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1750578265.219739      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
def noise(data, noise_factor=0.035):
    noise_amp = noise_factor * np.random.uniform() * np.amax(data)
    return data + noise_amp * np.random.normal(size=data.shape[0])

def stretch(data, rate_min=0.8, rate_max=1.2):
    rate = np.random.uniform(low=rate_min, high=rate_max)
    return librosa.effects.time_stretch(data, rate=rate)

def shift(data):
    shift_range = int(np.random.uniform(low=-5, high=5) * 1000)
    return np.roll(data, shift_range)

def pitch(data, sampling_rate, pitch_steps_min=-4, pitch_steps_max=4):
    pitch_steps = np.random.randint(low=pitch_steps_min, high=pitch_steps_max)
    return librosa.effects.pitch_shift(data, sr=sampling_rate, n_steps=pitch_steps)


# --- Feature Extraction Function ---
def extract_features(data, sampling_rate, n_mels=128, fmax=8000, n_mfcc=40):
    stft = np.abs(librosa.stft(data, n_fft=2048, hop_length=512))
    mel_spec = librosa.feature.melspectrogram(S=stft**2, sr=sampling_rate, n_mels=n_mels, fmax=fmax)
    log_mel_spec = librosa.power_to_db(mel_spec, ref=np.max)
    
    mfccs = librosa.feature.mfcc(y=data, sr=sampling_rate, n_mfcc=n_mfcc)
    
    chroma = librosa.feature.chroma_stft(S=stft, sr=sampling_rate)
    
    zcr = librosa.feature.zero_crossing_rate(y=data)
    
    rmse = librosa.feature.rms(y=data)
    
    # Concatenate features. Ensure they have compatible dimensions (frames).
    # We'll pad them to MAX_FRAMES later.
    # For now, we'll concatenate along the feature dimension (axis=0).
    
    # Ensure all features have the same number of frames for concatenation
    min_frames = min(log_mel_spec.shape[1], mfccs.shape[1], chroma.shape[1], zcr.shape[1], rmse.shape[1])
    
    combined_features = np.vstack([
        log_mel_spec[:, :min_frames],
        mfccs[:, :min_frames],
        chroma[:, :min_frames],
        zcr[:, :min_frames],
        rmse[:, :min_frames]
    ])
    
    return combined_features

In [3]:
# Combine all audio files from both folders
audio_dirs = [
    '/kaggle/input/audio/Audio_Song_Actors_01-24',
    '/kaggle/input/audio/Audio_Speech_Actors_01-24'
]
audio_files = []
for d in audio_dirs:
    audio_files.extend(glob(os.path.join(d, '**', '*.wav'), recursive=True))
print(f"Total audio files found: {len(audio_files)}")

# Load all_labels.csv for mapping file names to emotions
labels_df = pd.read_csv('/kaggle/input/audio/all_labels.csv')
labels_dict = {os.path.basename(row['filename']): row['emotion'] for _, row in labels_df.iterrows()}

Total audio files found: 2452


In [4]:
# --- Define parameters ---
N_MELS = 128 # For Mel Spectrograms
N_MFCC = 40 # For MFCCs
MAX_FRAMES = 175 # Adjusted based on typical audio duration and feature concatenation

# --- process_file function ---
def process_file(file_path):
    file_name = os.path.basename(file_path)
    emotion = labels_dict.get(file_name)
    if emotion is None:
        return []

    try:
        data, sr = librosa.load(file_path, duration=3, offset=0.5, sr=22050) 
        if len(data) < 3 * sr:
             return []
    except Exception as e:
        print(f"Error loading {file_path}: {e}")
        return []

    augmented_datas = [
        data,
        noise(data),
        stretch(data),
        pitch(data, sr)
    ]
    
    result = []
    for aug_data in augmented_datas:
        features = extract_features(aug_data, sr, n_mels=N_MELS, n_mfcc=N_MFCC)
        result.append((features, emotion))
    return result

results = Parallel(n_jobs=4)(delayed(process_file)(f) for f in audio_files)
features_and_labels = [item for sublist in results for item in sublist]

# Calculate total number of features from extract_features
# Get one sample to determine the total number of features (rows in the feature matrix)
if features_and_labels:
    sample_features = features_and_labels[0][0]
    TOTAL_FEATURES = sample_features.shape[0]
    print(f"Total number of combined features per frame: {TOTAL_FEATURES}")
else:
    TOTAL_FEATURES = N_MELS + N_MFCC + 12 + 1 + 1 # n_mels + n_mfcc + chroma (12) + zcr (1) + rmse (1)
    print(f"No features processed. Defaulting TOTAL_FEATURES to {TOTAL_FEATURES}")


def pad_features(f, max_frames=MAX_FRAMES, total_features=TOTAL_FEATURES):
    if f.shape[1] < max_frames:
        pad_width = max_frames - f.shape[1]
        return np.pad(f, ((0, 0), (0, pad_width)), mode='constant')
    else:
        return f[:, :max_frames]

X_padded = [pad_features(f, max_frames=MAX_FRAMES, total_features=TOTAL_FEATURES) for f, _ in features_and_labels]

X = np.array(X_padded)
X = X[..., np.newaxis]
Y = np.array([l for _, l in features_and_labels])

print(f"Final X shape: {X.shape}")

Total number of combined features per frame: 182
Final X shape: (8300, 182, 175, 1)


In [5]:
Y = np.array(Y).reshape(-1, 1) 

encoder = OneHotEncoder(sparse_output=False)
Y_encoded = encoder.fit_transform(Y)

print("Encoder classes:", encoder.categories_)
print("Shape of Y_encoded:", Y_encoded.shape)
print("Sample one-hot rows:", Y_encoded[:5])

Encoder classes: [array(['angry', 'calm', 'disgust', 'fearful', 'happy', 'neutral', 'sad',
       'surprised'], dtype='<U9')]
Shape of Y_encoded: (8300, 8)
Sample one-hot rows: [[1. 0. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0.]]


In [6]:
# Step 1: Split into 80% train+val and 20% test
X_temp, X_test, Y_temp, Y_test = train_test_split(
    X, Y_encoded, test_size=0.2, random_state=42, shuffle=True, stratify=Y
)

# Step 2: Split the 80% further into 90% train and 10% val
X_train, X_val, Y_train, Y_val = train_test_split(
    X_temp, Y_temp, test_size=0.10, random_state=42, shuffle=True, stratify=Y_temp
)

print(f"Train samples: {len(X_train)}")
print(f"Validation samples: {len(X_val)}")
print(f"Test samples: {len(X_test)}")

unique_emotions = len(np.unique(Y))
print(f"Number of unique emotions: {unique_emotions}")


Train samples: 5976
Validation samples: 664
Test samples: 1660
Number of unique emotions: 8


In [7]:
# callbacks
early_stopping = EarlyStopping(monitor='val_accuracy',mode='max',patience=15,restore_best_weights=True)
lr_reduction = ReduceLROnPlateau(monitor='val_accuracy',patience=5,verbose=1,factor=0.5,min_lr=0.00001)
model_checkpoint = ModelCheckpoint('best_model2_weights.keras', monitor='val_accuracy', save_best_only=True)

In [8]:
def cnn_rnn_model(input_shape, num_classes):
    inputs = tf.keras.Input(shape=input_shape)
    
    # First Block
    x = L.Conv2D(64, (3, 3), activation='relu', padding='same')(inputs)
    x = L.BatchNormalization()(x)
    x = L.Conv2D(64, (3, 3), activation='relu', padding='same')(x)
    x = L.BatchNormalization()(x)
    x = L.MaxPooling2D((2, 2))(x)
    x = L.Dropout(0.25)(x)

    # Second Block
    x = L.Conv2D(128, (3, 3), activation='relu', padding='same')(x)
    x = L.BatchNormalization()(x)
    x = L.Conv2D(128, (3, 3), activation='relu', padding='same')(x)
    x = L.BatchNormalization()(x)
    x = L.MaxPooling2D((2, 2))(x)
    x = L.Dropout(0.25)(x)

    # Third Block
    x = L.Conv2D(256, (3, 3), activation='relu', padding='same')(x)
    x = L.BatchNormalization()(x)
    x = L.Conv2D(256, (3, 3), activation='relu', padding='same')(x)
    x = L.BatchNormalization()(x)
    x = L.MaxPooling2D((2, 2))(x)
    x = L.Dropout(0.25)(x)

    # Bridge from CNN to RNN
    shape = tf.keras.backend.int_shape(x)
    x = L.Reshape((shape[1], shape[2] * shape[3]))(x)
    
    # RNN Back-End
    x = L.Bidirectional(L.LSTM(256, return_sequences=True))(x)
    x = L.Dropout(0.3)(x)
    x = L.Bidirectional(L.LSTM(128, return_sequences=False))(x) # Final LSTM layer
    x = L.BatchNormalization()(x)
    x = L.Dropout(0.4)(x)
    
    # Classifier Head
    x = L.Dense(256, activation='relu')(x)
    x = L.BatchNormalization()(x)
    x = L.Dropout(0.4)(x)
    outputs = L.Dense(num_classes, activation='softmax')(x)
    
    return tf.keras.Model(inputs, outputs)

model = cnn_rnn_model((TOTAL_FEATURES, MAX_FRAMES, 1), num_classes=8)
model.summary()

model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5), # Slightly lower learning rate
    loss='categorical_crossentropy',
    metrics=['accuracy']
)


I0000 00:00:1750578480.769529      19 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 15513 MB memory:  -> device: 0, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:04.0, compute capability: 6.0


In [9]:
callbacks = [early_stopping, lr_reduction, model_checkpoint]

history = model.fit(
    X_train, Y_train,
    validation_data=(X_val, Y_val),
    epochs=200, # Increased epochs to allow more training given early stopping
    batch_size=64, # Increased batch size
    callbacks=callbacks,
    verbose=1
)

Epoch 1/200


E0000 00:00:1750578496.024612      19 meta_optimizer.cc:966] layout failed: INVALID_ARGUMENT: Size of values 0 does not match size of permutation 4 @ fanin shape inStatefulPartitionedCall/functional_1/dropout_1/stateless_dropout/SelectV2-2-TransposeNHWCToNCHW-LayoutOptimizer
I0000 00:00:1750578497.167342      92 cuda_dnn.cc:529] Loaded cuDNN version 90300


[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m63s[0m 450ms/step - accuracy: 0.1341 - loss: 3.0971 - val_accuracy: 0.0723 - val_loss: 2.1537 - learning_rate: 5.0000e-05
Epoch 2/200
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 410ms/step - accuracy: 0.1843 - loss: 2.6690 - val_accuracy: 0.1672 - val_loss: 2.0984 - learning_rate: 5.0000e-05
Epoch 3/200
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 410ms/step - accuracy: 0.2398 - loss: 2.4502 - val_accuracy: 0.2681 - val_loss: 1.9490 - learning_rate: 5.0000e-05
Epoch 4/200
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 410ms/step - accuracy: 0.2698 - loss: 2.3419 - val_accuracy: 0.3208 - val_loss: 1.8331 - learning_rate: 5.0000e-05
Epoch 5/200
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 410ms/step - accuracy: 0.2975 - loss: 2.1632 - val_accuracy: 0.3268 - val_loss: 1.8063 - learning_rate: 5.0000e-05
Epoch 6/200
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━

In [10]:
y_pred_probs = model.predict(X_test)
y_pred = np.argmax(y_pred_probs, axis=1)
y_true = np.argmax(Y_test, axis=1)

cm = confusion_matrix(y_true, y_pred)
print("Confusion Matrix:\n", cm)

report = classification_report(y_true, y_pred, target_names=encoder.categories_[0])
print("Classification Report:\n", report)

overall_accuracy = accuracy_score(y_true, y_pred)
print(f"Overall Accuracy: {overall_accuracy*100:.2f}%")

macro_f1 = f1_score(y_true, y_pred, average='macro')
print(f"Macro F1 Score: {macro_f1*100:.2f}%")

per_class_accuracy = cm.diagonal() / cm.sum(axis=1)
for idx, acc in enumerate(per_class_accuracy):
    print(f"Accuracy for class {encoder.categories_[0][idx]}: {acc*100:.2f}%")

[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 82ms/step
Confusion Matrix:
 [[246   3   9  19   5   2   3   0]
 [  0 278   0   0   0   1   0   0]
 [  1   0 129   0   1   0   5   3]
 [  4   5   0 216   2   0  12   1]
 [  6  12   2  13 213   3   2   3]
 [  0   7   2   0   0 103   3   0]
 [  0  26   1  10   4   1 223   2]
 [  0   0   0   0   1   0   1  77]]
Classification Report:
               precision    recall  f1-score   support

       angry       0.96      0.86      0.90       287
        calm       0.84      1.00      0.91       279
     disgust       0.90      0.93      0.91       139
     fearful       0.84      0.90      0.87       240
       happy       0.94      0.84      0.89       254
     neutral       0.94      0.90      0.92       115
         sad       0.90      0.84      0.86       267
   surprised       0.90      0.97      0.93        79

    accuracy                           0.89      1660
   macro avg       0.90      0.90      0.90      1660
weight

In [11]:
model.save('best_model_improved.h5')