In [1]:
import pandas as pd
import numpy as np
import librosa

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

import keras
from keras.callbacks import ReduceLROnPlateau
from keras.models import Sequential
from keras.layers import (
    Dense,
    Conv1D,
    MaxPooling1D,
    Flatten,
    Dropout,
    BatchNormalization,
)
from keras.utils import to_categorical
from keras.callbacks import ModelCheckpoint

### Read dataset

In [None]:
# train_dataset = pd.read_excel("./datasets/train_dataset.xlsx")
# test_dataset = pd.read_excel("./datasets/test_dataset.xlsx")

### Data Augmentation

In [3]:
def noise(data, noise_factor=0.035):
    """
    Add random Gaussian noise to the audio signal for data augmentation.

    The noise amplitude is a random fraction (at most ``noise_factor``) of the
    maximum value of the input signal, so the perturbation adapts to the
    volume of each recording and differs between calls.

    Parameters
    ----------
    data : numpy.ndarray
        Input audio signal time series. Any shape is accepted; the noise is
        drawn with the same shape as the input.
    noise_factor : float, optional
        Upper bound of the noise amplitude, expressed as a fraction of
        ``np.amax(data)``. Default is 0.035 (3.5%).

    Returns
    -------
    numpy.ndarray
        New array with the noise added, same shape as the input. The input
        array itself is not modified.

    Notes
    -----
    The noise generation process:
    1. Draw a uniform random number in [0, 1) and multiply it by
       ``noise_factor`` and by the maximum value of the input signal
    2. Draw standard Gaussian noise with the same shape as the input
    3. Scale the Gaussian noise by the amplitude from step 1
    4. Add the scaled noise to the original signal

    Randomness comes from NumPy's global random state; seed it with
    ``np.random.seed`` for reproducible augmentation.
    """
    # The uniform draw must happen before the normal draw so that a seeded
    # run produces the same values as before.
    noise_amp = noise_factor * np.random.uniform() * np.amax(data)
    # Use the full shape (not just shape[0]) so multi-dimensional input works.
    return data + noise_amp * np.random.normal(size=data.shape)


def stretch(data, rate=0.8):
    """
    Change the playback speed of an audio signal while keeping its pitch.

    Thin wrapper around ``librosa.effects.time_stretch``.

    Parameters
    ----------
    data : numpy.ndarray
        Input audio signal
    rate : float, optional
        Speed factor. A value above 1 shortens (speeds up) the audio, a value
        below 1 lengthens (slows down) it. Default is 0.8.

    Returns
    -------
    numpy.ndarray
        Time-stretched audio signal
    """
    stretched = librosa.effects.time_stretch(y=data, rate=rate)
    return stretched


def shift(data):
    """
    Circularly shift the audio signal by a random number of samples.

    Parameters
    ----------
    data : numpy.ndarray
        Input audio signal

    Returns
    -------
    numpy.ndarray
        Copy of the signal rolled by a random offset between -5000 and 5000
        samples. Samples pushed past one end wrap around to the other
        (``np.roll`` semantics), so the length is unchanged.
    """
    # Draw from [-5, 5) and scale to samples; int() truncates toward zero.
    offset = np.random.uniform(low=-5, high=5) * 1000
    offset = int(offset)
    rolled = np.roll(data, offset)
    return rolled


def pitch(data, sampling_rate, pitch_factor=0.7):
    """
    Shift the pitch of an audio signal.

    Thin wrapper around ``librosa.effects.pitch_shift``.

    Parameters
    ----------
    data : numpy.ndarray
        Input audio signal
    sampling_rate : int
        Sampling rate of the input audio
    pitch_factor : float, optional
        Number of (possibly fractional) steps to shift, forwarded to librosa
        as ``n_steps``. Positive values raise the pitch and negative values
        lower it. Default is 0.7, i.e. a shift upwards.

    Returns
    -------
    numpy.ndarray
        Pitch-shifted audio signal
    """
    shifted = librosa.effects.pitch_shift(
        y=data,
        sr=sampling_rate,
        n_steps=pitch_factor,
    )
    return shifted

We use only the noise and the stretch-followed-by-pitch augmentations (`shift` is defined but not used), following the steps from the Kaggle notebook.

In [4]:
def extract_audio_features(data, sample_rate):
    """
    Build a fixed-length feature vector from an audio signal.

    Five librosa features are computed, each averaged over time, and then
    concatenated:
    - Zero Crossing Rate (ZCR): how often the signal changes sign
    - Chroma STFT: spectral energy folded onto pitch classes, computed from
      the magnitude of the short-time Fourier transform
    - MFCC: Mel-frequency cepstral coefficients
    - RMS: root-mean-square energy (loudness)
    - Mel spectrogram: spectral energy on the mel scale

    Parameters
    ----------
    data : numpy.ndarray
        Audio time series, e.g. as returned by ``librosa.load``
    sample_rate : int
        Sampling rate of ``data``; passed to the chroma, MFCC and mel
        spectrogram extractors

    Returns
    -------
    numpy.ndarray
        1D array with the averaged features concatenated in this order:
        [ZCR, Chroma STFT, MFCC, RMS, Mel Spectrogram]

    Notes
    -----
    Averaging over time gives a vector whose length does not depend on the
    duration of the input audio.
    """

    def time_average(frames):
        # Transpose, then average over the first axis: one value per feature row.
        return np.mean(frames.T, axis=0)

    zcr = time_average(librosa.feature.zero_crossing_rate(y=data))

    magnitude = np.abs(librosa.stft(data))
    chroma = time_average(librosa.feature.chroma_stft(S=magnitude, sr=sample_rate))

    mfcc = time_average(librosa.feature.mfcc(y=data, sr=sample_rate))
    rms = time_average(librosa.feature.rms(y=data))
    mel = time_average(librosa.feature.melspectrogram(y=data, sr=sample_rate))

    # The leading empty float64 array keeps the float64 result dtype that the
    # step-by-step stacking onto an empty array produced.
    return np.hstack((np.array([]), zcr, chroma, mfcc, rms, mel))


def augment_and_get_features(path):
    """
    Load an audio file and extract features from it and two augmented copies.

    Steps, in order:
    1. Load the audio file with ``librosa.load``
    2. Extract features from the unmodified signal
    3. Add noise to the signal and extract features
    4. Time-stretch the signal, pitch-shift the result, and extract features

    Parameters
    ----------
    path : str
        Path to the audio file

    Returns
    -------
    numpy.ndarray
        2D array of shape (3, n_features):
        - Row 0: original audio
        - Row 1: noise-augmented audio
        - Row 2: stretched and pitch-shifted audio
    """
    # The whole file is loaded with librosa's defaults; no offset or duration
    # trimming is applied here.
    data, sample_rate = librosa.load(path)

    # One transform per output row; each is applied and featurised in turn so
    # the order of operations (and of random draws) is original, noise,
    # stretch + pitch.
    transforms = (
        lambda signal: signal,
        noise,
        lambda signal: pitch(stretch(signal), sample_rate),
    )
    rows = [
        extract_audio_features(transform(data), sample_rate)
        for transform in transforms
    ]
    return np.vstack(rows)


def get_features(path):
    """
    Load an audio file and extract its features, without any augmentation.

    Parameters
    ----------
    path : str
        Path to the audio file

    Returns
    -------
    numpy.ndarray
        1D array of shape (n_features,) holding the features of the original
        audio, as produced by ``extract_audio_features``.
    """
    signal, rate = librosa.load(path)
    return np.array(extract_audio_features(signal, rate))

In [None]:
# x_train = train_dataset["Path"]
# y_train = train_dataset["Emotion"]
# x_test = test_dataset["Path"]
# y_test = test_dataset["Emotion"]

In [None]:
# x_train_augmented, y_train_augmented = [], []
# for path, emotion in zip(x_train, y_train):
#     feature = augment_and_get_features(path)
#     for ele in feature:
#         x_train_augmented.append(ele)
#         # appending emotion 3 times as we have made 3 augmentation techniques on each audio file.
#         y_train_augmented.append(emotion)

  return pitch_tuning(


In [None]:
# len(x_train_augmented), len(y_train_augmented)

(27363, 27363)

In [None]:
# type(x_train_augmented), type(y_train_augmented)

(list, list)

Number of features per sample

In [None]:
# len(x_train_augmented[0])

162

In [None]:
# Features = pd.DataFrame(x_train_augmented)
# Features["Emotion"] = y_train_augmented
# Features.head(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,153,154,155,156,157,158,159,160,161,Emotion
0,0.069695,0.61271,0.648887,0.599935,0.5985,0.709648,0.713479,0.66304,0.670842,0.74925,...,3.074146e-10,2.838658e-10,2.670537e-10,2.544914e-10,2.451081e-10,2.383348e-10,2.333977e-10,2.301813e-10,2.281968e-10,happy
1,0.263717,0.750874,0.801533,0.780455,0.810607,0.888619,0.840166,0.741537,0.753543,0.808243,...,0.001514909,0.00144296,0.001469808,0.001439112,0.001536119,0.001364884,0.001444743,0.001415148,0.001463326,happy


In [18]:
# Location of the cached (augmented) training feature table that is read back below.
train_augmented_path = "./datasets/temp_data/train_augmented.xlsx"
# NOTE(review): the disabled export below refers to `train_features`, while the
# commented-out cell above builds the frame as `Features` — confirm the name before re-enabling.
# train_features.to_excel("./datasets/temp_data/train_augmented.xlsx", index=False)

### Data Preparation

In [19]:
# Load the cached training features from disk instead of recomputing them from the audio files.
train_features = pd.read_excel(train_augmented_path)

In [None]:
# x_test, y_test = [], []
# for path, emotion in zip(test_dataset.Path, test_dataset.Emotion):
#     features = get_features(path)
#     x_test.append(features)
#     y_test.append(emotion)

In [None]:
# x_test, y_test

([array([ 1.05957031e-01,  5.50422013e-01,  5.69365740e-01,  5.87658823e-01,
          5.34460306e-01,  5.24038911e-01,  5.86367667e-01,  6.52217567e-01,
          6.83909774e-01,  6.97024465e-01,  6.89759910e-01,  6.45590663e-01,
          5.48579574e-01, -3.79041351e+02,  1.32453537e+02,  9.80857372e+00,
          4.25753899e+01, -1.73352051e+01,  1.58216572e+01, -1.71710129e+01,
          9.08379078e+00, -9.73803711e+00,  1.99654663e+00, -6.91072106e-01,
         -2.81553316e+00,  8.30588245e+00, -5.86521673e+00,  4.72376299e+00,
         -1.02306986e+01,  3.81318831e+00, -5.91137743e+00, -3.81919122e+00,
         -3.48868966e+00,  1.95917282e-02,  7.83000216e-02,  2.83634625e-02,
          4.55273353e-02,  7.63707310e-02,  5.81243187e-02,  6.80552348e-02,
          3.91508877e-01,  6.49020553e-01,  6.46133602e-01,  2.29226038e-01,
          1.79059282e-01,  4.37338613e-02,  4.41719554e-02,  2.38719597e-01,
          6.88985705e-01,  9.44931269e-01,  6.43798172e-01,  4.45230156e-01,

In [None]:
# test_features = pd.DataFrame(x_test)
# test_features["Emotion"] = y_test
# test_features.head(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,153,154,155,156,157,158,159,160,161,Emotion
0,0.105957,0.550422,0.569366,0.587659,0.53446,0.524039,0.586368,0.652218,0.68391,0.697024,...,1.340393e-10,1.242501e-10,1.173462e-10,1.122502e-10,1.085056e-10,1.058445e-10,1.039424e-10,1.027299e-10,1.019779e-10,disgust
1,0.115606,0.644729,0.674718,0.625934,0.646124,0.67909,0.708628,0.74882,0.753075,0.766021,...,3.588771e-10,3.328298e-10,3.14258e-10,3.003943e-10,2.900659e-10,2.826407e-10,2.772607e-10,2.737894e-10,2.716342e-10,disgust


In [20]:
# Location of the cached test feature table that is read back in the next cell.
test_features_path = "./datasets/temp_data/test_features.xlsx"
# Disabled export that writes the table to the path above; re-enable to regenerate the cache.
# test_features.to_excel(test_features_path, index=False)

In [21]:
# Load the cached test features from disk instead of recomputing them from the audio files.
test_features = pd.read_excel(test_features_path)

In [22]:
# Split the training table into the feature matrix and a one-column label frame.
x_train = train_features.drop(columns="Emotion")
y_train = train_features.loc[:, ["Emotion"]].copy()
x_train.shape, y_train.shape

((27363, 162), (27363, 1))

In [None]:
# Split the test table into the feature matrix and a one-column label frame.
x_test = test_features.drop(columns="Emotion")
y_test = test_features.loc[:, ["Emotion"]].copy()
x_test.shape, y_test.shape

In [None]:


# # Separate features (X) and labels (Y)
# Y = train_dataset[["labels"]].copy()
# # Y.columns = ['Emotion']  # Rename the column to match our convention
# X = train_dataset.drop("labels", axis=1)
# X.shape, Y.shape

In [None]:
# test_dataset = pd.read_csv("./datasets/test_dataset.csv")

# y_test = test_dataset[["Emotion"]].copy()
# x_test = test_dataset.drop("Emotion", axis=1)
# x_test.shape, y_test.shape

One Hot Encoding

In [None]:
# Fit the one-hot encoder on the training labels. `Y` was previously encoded
# from itself, which only worked with a leftover kernel variable; it is now
# derived from `y_train` so the cell runs on a fresh kernel.
encoder = OneHotEncoder()
Y = encoder.fit_transform(np.array(y_train).reshape(-1, 1)).toarray()

In [None]:
# Encode the test labels with the already-fitted encoder. Calling `transform`
# (not `fit_transform`) keeps the column order identical to the training labels
# and does not overwrite `encoder.categories_` with the test-set categories.
y_test = encoder.transform(np.array(y_test).reshape(-1, 1)).toarray()

In [None]:
# Inspect the one-hot encoded labels `Y` next to the category order held by the encoder.
Y, encoder.categories_

In [None]:
# Inspect the encoded test labels next to the category order held by the encoder.
y_test, encoder.categories_

Scaler

In [None]:
# Fit the scaler on the training features. `X` was previously scaled from
# itself, which only worked with a leftover kernel variable; it is now derived
# from `x_train` so the cell runs on a fresh kernel.
scaler = StandardScaler()
X = scaler.fit_transform(x_train)
X

In [None]:
# Scale the test features with the statistics already stored in `scaler`.
# Calling `transform` (not `fit_transform`) avoids re-estimating the mean and
# variance on the test set, which would put it on a different scale than the
# data the scaler was fitted on.
x_test = scaler.transform(x_test)
x_test

In [None]:
# Shape of `X` before the trailing axis is added in the next cell.
X.shape

In [None]:
# Add a trailing axis of length 1 so each sample is (n_features, 1), matching
# the input shape the Conv1D model below is built with. The ndim guard makes
# the cell safe to re-run: without it every execution appended another axis.
if X.ndim == 2:
    X = np.expand_dims(X, axis=2)
X.shape

### Modelling

In [None]:
# 1-D convolutional classifier over the per-sample feature vector.
# Input and output sizes are read from the prepared arrays instead of being
# hard-coded, so the network always matches the data it is trained on.
num_features = X.shape[1]
num_classes = Y.shape[1]  # width of the one-hot label matrix

model = Sequential(
    [
        # Explicit Input object instead of the `input_shape=` layer argument.
        keras.Input(shape=(num_features, 1)),
        Conv1D(256, kernel_size=5, strides=1, padding="same", activation="relu"),
        MaxPooling1D(pool_size=5, strides=2, padding="same"),
        Conv1D(256, kernel_size=5, strides=1, padding="same", activation="relu"),
        MaxPooling1D(pool_size=5, strides=2, padding="same"),
        Conv1D(128, kernel_size=5, strides=1, padding="same", activation="relu"),
        MaxPooling1D(pool_size=5, strides=2, padding="same"),
        Dropout(0.2),
        Conv1D(64, kernel_size=5, strides=1, padding="same", activation="relu"),
        MaxPooling1D(pool_size=5, strides=2, padding="same"),
        Flatten(),
        Dense(units=32, activation="relu"),
        Dropout(0.3),
        # One softmax unit per label column (previously fixed at 8).
        Dense(units=num_classes, activation="softmax"),
    ]
)
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

model.summary()

In [None]:
# Multiply the learning rate by 0.4 whenever the training loss has not improved
# for 2 epochs, never going below 1e-7.
rlrp = ReduceLROnPlateau(
    monitor="loss",
    factor=0.4,
    patience=2,
    min_lr=1e-7,
    verbose=0,
)

# NOTE(review): the test split is passed as validation data here.
history = model.fit(
    X,
    Y,
    validation_data=(x_test, y_test),
    epochs=50,
    batch_size=64,
    callbacks=[rlrp],
)