In [2]:
# %load_ext cudf.pandas

In [3]:
# Importing required libraries
import librosa
import librosa.display
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
import IPython.display as ipd  # To play sound in the notebook

# Importing data and preprocessing
ref = pd.read_csv("Data_path.csv")

# Extracting features using MFCC
df = pd.DataFrame(columns=["feature"])

In [4]:
# Loop through each audio file
for index, path in enumerate(ref.path):
    X, sample_rate = librosa.load(
        path, res_type="kaiser_fast", duration=2.5, sr=44100, offset=0.5
    )
    # Extract MFCC features
    mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=13), axis=0)
    df.loc[index] = [mfccs]

In [5]:
# Concatenate extracted features with metadata
df = pd.concat([ref, pd.DataFrame(df["feature"].values.tolist())], axis=1)
df = df.fillna(0)

In [6]:
# Splitting data into train and test sets
from sklearn.model_selection import train_test_split

In [7]:
X = df.drop(["path", "labels", "source"], axis=1).values
y = df.labels.values

In [8]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

In [9]:
# Normalizing data
from sklearn.preprocessing import StandardScaler

In [10]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [11]:
# Reshaping data for Conv1D
X_train = np.expand_dims(X_train, axis=-1)
X_test = np.expand_dims(X_test, axis=-1)

In [12]:
from sklearn.preprocessing import LabelEncoder

In [13]:
lb = LabelEncoder()
y_train = lb.fit_transform(y_train)
y_test = lb.transform(y_test)
num_classes = len(lb.classes_)

In [16]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import (
    Conv1D,
    MaxPooling1D,
    Flatten,
    Dense,
    Dropout,
    BatchNormalization,
    Activation,
)
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

In [17]:
# Building the model
model = Sequential()
model.add(
    Conv1D(256, 8, padding="same", input_shape=(X_train.shape[1], X_train.shape[2]))
)
model.add(BatchNormalization())
model.add(Activation("relu"))
model.add(MaxPooling1D(pool_size=8))
model.add(Dropout(0.25))

model.add(Conv1D(128, 8, padding="same"))
model.add(BatchNormalization())
model.add(Activation("relu"))

model.add(Conv1D(128, 8, padding="same"))
model.add(BatchNormalization())
model.add(Activation("relu"))

model.add(Conv1D(128, 8, padding="same"))
model.add(BatchNormalization())
model.add(Activation("relu"))
model.add(MaxPooling1D(pool_size=8))
model.add(Dropout(0.25))

model.add(Conv1D(64, 8, padding="same"))
model.add(BatchNormalization())
model.add(Activation("relu"))

model.add(Conv1D(64, 8, padding="same"))
model.add(BatchNormalization())
model.add(Activation("relu"))

model.add(Flatten())
model.add(Dense(num_classes, activation="softmax"))

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [18]:
model.compile(
    loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"]
)

In [19]:
early_stopping = EarlyStopping(
    monitor="val_loss", patience=10, restore_best_weights=True
)
reduce_lr = ReduceLROnPlateau(monitor="val_loss", factor=0.2, patience=3, min_lr=1e-6)

In [20]:
# Training the model
history = model.fit(
    X_train,
    y_train,
    epochs=50,
    batch_size=32,
    validation_data=(X_test, y_test),
    callbacks=[early_stopping, reduce_lr],
)

Epoch 1/50


I0000 00:00:1720461955.967047   16076 service.cc:145] XLA service 0x7f53900199c0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1720461955.967101   16076 service.cc:153]   StreamExecutor device (0): NVIDIA GeForce RTX 3060 Laptop GPU, Compute Capability 8.6
2024-07-08 18:05:56.040118: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2024-07-08 18:05:56.370572: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:465] Loaded cuDNN version 8907


[1m 23/286[0m [32m━[0m[37m━━━━━━━━━━━━━━━━━━━[0m [1m1s[0m 7ms/step - accuracy: 0.1620 - loss: 2.6826

I0000 00:00:1720461961.900716   16076 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m286/286[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 23ms/step - accuracy: 0.2131 - loss: 2.2959 - val_accuracy: 0.0839 - val_loss: 3.4025 - learning_rate: 0.0010
Epoch 2/50
[1m286/286[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - accuracy: 0.2931 - loss: 1.9765 - val_accuracy: 0.3065 - val_loss: 1.9963 - learning_rate: 0.0010
Epoch 3/50
[1m286/286[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.3339 - loss: 1.8707 - val_accuracy: 0.3397 - val_loss: 1.8719 - learning_rate: 0.0010
Epoch 4/50
[1m286/286[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.3529 - loss: 1.8026 - val_accuracy: 0.3680 - val_loss: 1.8131 - learning_rate: 0.0010
Epoch 5/50
[1m286/286[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.3738 - loss: 1.7694 - val_accuracy: 0.3910 - val_loss: 1.7069 - learning_rate: 0.0010
Epoch 6/50
[1m286/286[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5m

In [21]:
# Evaluating the model
score = model.evaluate(X_test, y_test, verbose=0)
print("Test accuracy:", score[1])

Test accuracy: 0.4557711184024811


In [22]:
# Saving the model
model.save("emotion_model.h5")
print("Model saved as 'emotion_model.h5'")



Model saved as 'emotion_model.h5'
