<a href="https://colab.research.google.com/github/Patelsandesh998/COA-LAB-9/blob/main/DeepLearningLab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# STEP 0 — Mount Google Drive (run once)

In [170]:
# Colab-only: mount Google Drive at /content/drive so the dataset paths under
# /content/drive/MyDrive/ used by the following cells resolve.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# STEP 1 — Imports

In [171]:
import os
import numpy as np
import pandas as pd
import librosa
import librosa.display
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

import tensorflow as tf
from tensorflow.keras import layers, models, callbacks, optimizers


In [172]:
# RAVDESS dataset folder on Drive.
# NOTE(review): none of the cells visible here reference this name (they read
# TESS_PATH instead) — confirm it is used further down, otherwise remove it.
dataset_path = "/content/drive/MyDrive/RAVDESS/"

# STEP 2 — Settings (change these if needed)

In [173]:
# Root folder of the TESS dataset; the loader in Step 3 scans each direct
# sub-folder of this path for .wav files.
TESS_PATH = "/content/drive/MyDrive/TESS Toronto emotional speech set data/"

N_MFCC = 40          # default number of MFCC coefficients in extract_mfcc_fixed
MAX_PAD_LEN = 200    # default number of MFCC frames kept per clip (zero-padded or truncated)
RANDOM_STATE = 42    # seed passed to every train_test_split call
# NOTE(review): the baseline fit cells hard-code their own epochs/batch_size;
# presumably these two are meant for the CNN+LSTM model — confirm.
BATCH_SIZE = 32
EPOCHS = 25

# Presumably the save location of the trained CNN+LSTM model — not used in the
# cells visible here; verify against the saving cell.
SAVED_MODEL_PATH = "/content/drive/MyDrive/ser_cnn_lstm_tess.h5"


# STEP 3 — Load TESS Dataset (file paths + labels)

In [174]:
# Scan TESS_PATH one level deep and collect every .wav file together with the
# emotion tag taken from the third "_"-separated field of its file name.
file_paths = []
labels = []

# os.listdir() returns entries in arbitrary, filesystem-dependent order.
# Sorting keeps file_paths/labels in the same order on every run, so the
# seeded train/val/test splits further down select the same files each time.
for folder in sorted(os.listdir(TESS_PATH)):
    folder_path = os.path.join(TESS_PATH, folder)

    # Only sub-folders are scanned; loose files directly under TESS_PATH are ignored.
    if not os.path.isdir(folder_path):
        continue

    for fname in sorted(os.listdir(folder_path)):
        if not fname.lower().endswith(".wav"):
            continue

        # "<a>_<b>_<emotion>.wav" -> ["<a>", "<b>", "<emotion>"]
        parts = fname.rsplit('.', 1)[0].split('_')

        # Names with fewer than three fields carry no emotion tag and are skipped.
        if len(parts) >= 3:
            emotion = parts[2].lower()
            # The short tag "ps" is relabelled as "surprise".
            if emotion == "ps":
                emotion = "surprise"

            file_paths.append(os.path.join(folder_path, fname))
            labels.append(emotion)

print("Files:", len(file_paths))
print(pd.Series(labels).value_counts())


Files: 2800
sad         400
fear        400
angry       400
neutral     400
surprise    400
disgust     400
happy       400
Name: count, dtype: int64



# STEP 4 — Encode labels

In [175]:
# Map the string emotion labels to integer class ids.
le = LabelEncoder()
y = le.fit_transform(labels)

# .tolist() converts the NumPy string scalars in le.classes_ to plain Python
# str, so the list prints as ['angry', ...] instead of [np.str_('angry'), ...]
# under NumPy 2. class_names[i] is the label encoded as class id i.
class_names = le.classes_.tolist()

print("Classes:", class_names)


Classes: [np.str_('angry'), np.str_('disgust'), np.str_('fear'), np.str_('happy'), np.str_('neutral'), np.str_('sad'), np.str_('surprise')]


# STEP 5 — MFCC extraction function (fixed-size by padding/truncating)

In [176]:
def extract_mfcc_fixed(path, n_mfcc=N_MFCC, max_pad=MAX_PAD_LEN, sr_target=22050):
    """Load an audio file and return its MFCCs as a fixed-size float32 array.

    Args:
        path: Audio file to load.
        n_mfcc: Number of MFCC coefficients (rows of the result).
        max_pad: Number of frames (columns) in the result. Shorter clips are
            zero-padded on the right, longer ones are truncated.
        sr_target: Sample rate passed to ``librosa.load``.

    Returns:
        ``np.ndarray`` of shape ``(n_mfcc, max_pad)`` and dtype ``float32``.
        If the file cannot be loaded, a warning naming the file is emitted and
        an all-zero array of that shape is returned so that a batch
        extraction can keep going.
    """
    import warnings

    try:
        audio, sr = librosa.load(path, sr=sr_target)
    except Exception as exc:
        # Catch Exception rather than using a bare `except:` so that
        # KeyboardInterrupt / SystemExit still stop a long extraction loop,
        # and report the failure instead of silently emitting zero features.
        warnings.warn(
            f"Could not load {path!r} ({exc!r}); using an all-zero MFCC array"
        )
        return np.zeros((n_mfcc, max_pad), dtype=np.float32)

    mfcc = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=n_mfcc)

    # Force the frame axis (axis 1) to exactly max_pad columns.
    n_frames = mfcc.shape[1]
    if n_frames < max_pad:
        mfcc = np.pad(mfcc, ((0, 0), (0, max_pad - n_frames)), mode='constant')
    else:
        mfcc = mfcc[:, :max_pad]

    return mfcc.astype(np.float32)


Step 6: Extract MFCC Features

In [177]:
# Run the extractor over every collected file (tqdm renders the progress bar),
# then stack the per-file arrays and append a trailing channel axis.
print("Extracting MFCCs...")
X_list = [extract_mfcc_fixed(wav_path) for wav_path in tqdm(file_paths)]

X = np.expand_dims(np.array(X_list), axis=-1)   # (N, n_mfcc, max_pad) -> (N, n_mfcc, max_pad, 1)
y = np.array(y)

print("X shape:", X.shape)
print("y shape:", y.shape)


Extracting MFCCs...


100%|██████████| 2800/2800 [19:52<00:00,  2.35it/s]

X shape: (2800, 40, 200, 1)
y shape: (2800,)





Step 7: Split Train, Validation, Test

In [179]:
# Stratified 75 / 12.5 / 12.5 % split: hold out 25 % first, then cut the
# hold-out in half to obtain the validation and test sets.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y,
    stratify=y,
    test_size=0.25,
    random_state=RANDOM_STATE,
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp,
    stratify=y_temp,
    test_size=0.5,
    random_state=RANDOM_STATE,
)

for split_label, split_X in (("Train:", X_train), ("Val:", X_val), ("Test:", X_test)):
    print(split_label, split_X.shape)


Train: (2100, 40, 200, 1)
Val: (350, 40, 200, 1)
Test: (350, 40, 200, 1)


Step 8: BASELINE MODEL BLOCK

In [181]:
DATA_PATH = "/content/drive/MyDrive/Emotions"   # point this at your own dataset folder

def extract_features(file_path):
    """Return the time-averaged MFCC vector for one audio file.

    Loads the file with ``duration=3`` and ``offset=0.5``, computes 40 MFCC
    coefficients, and averages each coefficient over all frames.
    """
    signal, sample_rate = librosa.load(file_path, duration=3, offset=0.5)
    coeffs = librosa.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=40)
    # Transpose to (frames, coefficients) and average over the frame axis.
    return np.mean(coeffs.T, axis=0)


Step 9: Prepare Data for Baseline (Flatten MFCCs)

In [183]:
# The MLP baseline takes one flat vector per clip, so collapse every axis
# after the sample axis into a single feature axis.
X_flat = X.reshape(len(X), -1)

# Same sizes, seed and stratification arguments as the split used for the
# 4-D arrays above.
X_train_b, X_temp_b, y_train_b, y_temp_b = train_test_split(
    X_flat, y,
    stratify=y,
    test_size=0.25,
    random_state=RANDOM_STATE,
)
X_val_b, X_test_b, y_val_b, y_test_b = train_test_split(
    X_temp_b, y_temp_b,
    stratify=y_temp_b,
    test_size=0.5,
    random_state=RANDOM_STATE,
)

print("Baseline shapes:", *(part.shape for part in (X_train_b, X_val_b, X_test_b)))


Baseline shapes: (2100, 8000) (350, 8000) (350, 8000)


# STEP 10 — Baseline Model (MLP)

In [184]:
# Seed the Python, NumPy and TensorFlow RNGs so that weight initialisation,
# dropout masks and batch shuffling are repeatable from one run to the next.
tf.keras.utils.set_random_seed(RANDOM_STATE)

# Baseline: a two-hidden-layer MLP over the flattened MFCC vectors.
baseline_model = models.Sequential([
    # Explicit Input layer; Keras 3 warns when `input_shape` is passed to the
    # first layer of a Sequential model.
    layers.Input(shape=(X_flat.shape[1],)),

    layers.Dense(256, activation='relu'),
    layers.Dropout(0.3),

    layers.Dense(128, activation='relu'),
    layers.Dropout(0.3),

    # One softmax output per emotion class.
    layers.Dense(len(class_names), activation='softmax')
])

# Sparse categorical cross-entropy because the targets are integer class ids
# (the LabelEncoder output), not one-hot vectors.
baseline_model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

baseline_model.summary()


Step 11: Train Baseline

In [195]:
# Baseline training run: 50 epochs, mini-batches of 64, one log line per epoch
# (verbose=2), with validation metrics computed on the validation split.
# NOTE(review): the saved output below starts at 0.9976 training accuracy in
# epoch 1, i.e. this cell was last run on an already-trained model. Re-run the
# model-building cell first to get a history that starts from fresh weights.
history_base = baseline_model.fit(
    x=X_train_b,
    y=y_train_b,
    batch_size=64,
    epochs=50,
    validation_data=(X_val_b, y_val_b),
    verbose=2,
)


Epoch 1/50
33/33 - 1s - 38ms/step - accuracy: 0.9976 - loss: 0.0060 - val_accuracy: 1.0000 - val_loss: 5.0037e-05
Epoch 2/50
33/33 - 1s - 37ms/step - accuracy: 0.9981 - loss: 0.0051 - val_accuracy: 0.9971 - val_loss: 0.0098
Epoch 3/50
33/33 - 1s - 36ms/step - accuracy: 0.9990 - loss: 0.0046 - val_accuracy: 0.9943 - val_loss: 0.0135
Epoch 4/50
33/33 - 1s - 37ms/step - accuracy: 0.9995 - loss: 0.0014 - val_accuracy: 0.9943 - val_loss: 0.0301
Epoch 5/50
33/33 - 2s - 59ms/step - accuracy: 0.9995 - loss: 0.0030 - val_accuracy: 0.9943 - val_loss: 0.0305
Epoch 6/50
33/33 - 2s - 58ms/step - accuracy: 0.9981 - loss: 0.0054 - val_accuracy: 0.9943 - val_loss: 0.0299
Epoch 7/50
33/33 - 2s - 63ms/step - accuracy: 0.9990 - loss: 0.0038 - val_accuracy: 0.9943 - val_loss: 0.0300
Epoch 8/50
33/33 - 1s - 38ms/step - accuracy: 0.9981 - loss: 0.0052 - val_accuracy: 0.9943 - val_loss: 0.0312
Epoch 9/50
33/33 - 1s - 39ms/step - accuracy: 1.0000 - loss: 7.3033e-04 - val_accuracy: 0.9943 - val_loss: 0.0315
Ep

In [194]:
# Second baseline training call: 20 epochs, mini-batches of 32.
# NOTE(review): this calls fit() on the same baseline_model as the preceding
# cell, so on a top-to-bottom run it continues training the already-fitted
# model and overwrites history_base. Its execution count (In [194]) is lower
# than the preceding cell's (In [195]), so it was run out of order. Consider
# keeping only one of the two training cells.
history_base = baseline_model.fit(
    x=X_train_b,
    y=y_train_b,
    batch_size=32,
    epochs=20,
    validation_data=(X_val_b, y_val_b),
    verbose=2,
)


Epoch 1/20
66/66 - 6s - 94ms/step - accuracy: 0.9995 - loss: 0.0023 - val_accuracy: 0.9971 - val_loss: 0.0349
Epoch 2/20
66/66 - 12s - 179ms/step - accuracy: 0.9990 - loss: 0.0030 - val_accuracy: 0.9971 - val_loss: 0.0435
Epoch 3/20
66/66 - 4s - 55ms/step - accuracy: 0.9990 - loss: 0.0064 - val_accuracy: 0.9943 - val_loss: 0.0500
Epoch 4/20
66/66 - 4s - 58ms/step - accuracy: 0.9986 - loss: 0.0030 - val_accuracy: 0.9943 - val_loss: 0.0176
Epoch 5/20
66/66 - 3s - 46ms/step - accuracy: 0.9986 - loss: 0.0053 - val_accuracy: 0.9943 - val_loss: 0.0180
Epoch 6/20
66/66 - 4s - 59ms/step - accuracy: 0.9990 - loss: 0.0034 - val_accuracy: 0.9943 - val_loss: 0.0189
Epoch 7/20
66/66 - 2s - 38ms/step - accuracy: 0.9990 - loss: 0.0047 - val_accuracy: 0.9971 - val_loss: 0.0153
Epoch 8/20
66/66 - 2s - 31ms/step - accuracy: 0.9976 - loss: 0.0047 - val_accuracy: 0.9971 - val_loss: 0.0143
Epoch 9/20
66/66 - 3s - 39ms/step - accuracy: 0.9986 - loss: 0.0042 - val_accuracy: 0.9971 - val_loss: 0.0144
Epoch 10

Step 11 (continued): Evaluate Baseline

In [196]:
# Score the baseline on the held-out test split. The model was compiled with
# a single metric, so evaluate() yields the loss followed by the accuracy.
loss_b, acc_b = baseline_model.evaluate(x=X_test_b, y=y_test_b)
print("Baseline Accuracy:", acc_b)


[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - accuracy: 0.9983 - loss: 0.0030
Baseline Accuracy: 0.9971428513526917


Step 12: Build CNN + LSTM Hybrid Model