## Emotion recognition

Building an emotion recognition model based on the CREMA-D dataset and providing our own recordings to test its ability to generalise.

In [None]:
import os
import librosa
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from keras.models import Sequential
from keras.layers import Embedding, Conv1D, MaxPooling1D, Flatten, Dense



### Convert wav to npy

In [None]:
def wav_to_npy(input_wav_path, sr=None):
    """Load an audio file and return its samples as a NumPy array.

    Args:
        input_wav_path: Path of the audio file to read.
        sr: Target sample rate passed to ``librosa.load``.  ``None`` (the
            default) keeps the file's native rate; pass a number to resample
            so that recordings from different sources share one rate.

    Returns:
        The audio samples as returned by ``librosa.load``; the sample rate
        that librosa also reports is discarded.
    """
    audio_data, _ = librosa.load(input_wav_path, sr=sr)

    return audio_data

def save_all_to_single_npy(directory, output_npy_path, target_length=None):
    """Stack every ``.wav`` file of a directory into one saved NumPy array.

    Args:
        directory: Folder that is scanned (non-recursively) for ``.wav`` files.
        output_npy_path: Destination handed to ``np.save``.
        target_length: When given, each recording is zero-padded at the end
            or cut off so that it holds exactly this many samples.  When
            ``None`` the recordings are stored with their own lengths.

    Rows are written in the order ``os.listdir`` reports the files.
    """
    clips = []

    for entry in os.listdir(directory):
        # Anything that is not a .wav file is ignored
        if not entry.endswith(".wav"):
            continue

        samples = wav_to_npy(os.path.join(directory, entry))

        if target_length is not None:
            missing = target_length - len(samples)
            if missing > 0:
                # Too short: append zeros up to the requested length
                samples = np.pad(samples, (0, missing))
            elif missing < 0:
                # Too long: keep only the leading part
                samples = samples[:target_length]

        clips.append(samples)

    # One array holding all recordings goes into a single .npy file
    np.save(output_npy_path, np.array(clips))

# Directory that holds the source .wav recordings
input_wav_directory = 'AudioWAV/'
# File that receives all recordings stacked into one array
all_data_path = 'all_data.npy'
# Number of samples every recording is padded or truncated to
target_length = 10000  

# Convert the whole directory; the result is read back from all_data_path with np.load
save_all_to_single_npy(input_wav_directory, all_data_path, target_length=target_length)


### Load CREMA-D

In [None]:
# Data load function courtesy of https://www.kaggle.com/code/shivamburnwal/speech-emotion-recognition

def load_and_process_data(dataset_path):
    """Collect the file paths and emotion labels of the recordings in a directory.

    The emotion is taken from the third underscore-separated field of each
    file name (e.g. ``1001_DFA_ANG_XX.wav`` -> ``'angry'``).  File names
    whose third field is missing or not a known code are labelled
    ``'Unknown'``.

    Only ``.wav`` files are considered, which is the same filter that
    ``save_all_to_single_npy`` applies, and rows keep the ``os.listdir``
    order, so the labels stay aligned with the audio array built from the
    same directory.

    Args:
        dataset_path: Directory containing the recordings; a trailing path
            separator is optional.

    Returns:
        A ``(path_df, emotion_df)`` tuple of single-column DataFrames
        (columns ``'Path'`` and ``'Emotions'``) with one row per recording.
    """
    # File-name code -> label
    emotion_by_code = {
        'SAD': 'sad',
        'ANG': 'angry',
        'DIS': 'disgust',
        'FEA': 'fear',
        'HAP': 'happy',
        'NEU': 'neutral',
    }

    file_emotion = []
    file_path = []

    for file in os.listdir(dataset_path):
        # Skip non-recordings (hidden files, notes, ...) so that they neither
        # crash the name parsing nor shift the labels against the audio rows
        if not file.endswith('.wav'):
            continue

        # storing file paths
        file_path.append(os.path.join(dataset_path, file))

        # storing file emotions
        part = file.split('_')
        code = part[2] if len(part) > 2 else None
        file_emotion.append(emotion_by_code.get(code, 'Unknown'))

    # Create a DataFrame for emotion of files
    emotion_df = pd.DataFrame(file_emotion, columns=['Emotions'])

    # Create a DataFrame for the path of files
    path_df = pd.DataFrame(file_path, columns=['Path'])

    return path_df, emotion_df


# Set the path to the CREMA-D dataset
# NOTE: this is the same directory that was converted into all_data.npy; the
# labels built here are later paired row-for-row with that array, so both
# must come from the same directory listing.
crema_path = "AudioWAV/"

# Load and process data: one DataFrame of file paths, one of emotion labels
recordings_df, labels_df = load_and_process_data(crema_path)


### Model Development

In [None]:
def encode_labels(emotion_labels):
    """Turn a sequence of labels into an array of integer class ids.

    Ids are assigned by the sorted order of the distinct labels (as produced
    by ``np.unique``), so the smallest label becomes 0, the next one 1, etc.
    """
    classes = np.unique(emotion_labels)
    # label -> position in the sorted list of distinct labels
    index_of = dict(zip(classes, range(len(classes))))
    return np.array([index_of[name] for name in emotion_labels])

# Load the single NumPy file (one row of audio samples per recording)
X = np.load(all_data_path)

# Encode emotion labels as integer class ids
Y = encode_labels(labels_df['Emotions'])

display(X.shape)
display(Y.shape)

x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=69)

# Number of discrete amplitude levels the waveform is quantised into; this is
# also the vocabulary size of the Embedding layer, so the two must agree.
VOCAB_SIZE = 10000

# Size the output layer from the labels actually present rather than assuming
# a fixed count (e.g. an 'Unknown' label adds a class).
num_classes = len(np.unique(Y))

# Shift and scale the samples so they become non-negative
# (assumes amplitudes lie roughly in [-1, 1]; outliers are clipped below)
x_train_non_negative = (x_train + 1) * (VOCAB_SIZE // 2)
x_test_non_negative = (x_test + 1) * (VOCAB_SIZE // 2)

# Convert to integers
x_train_indices = x_train_non_negative.astype(int)
x_test_indices = x_test_non_negative.astype(int)

# Ensure values are valid Embedding indices, i.e. within [0, VOCAB_SIZE - 1]
x_train_indices = np.clip(x_train_indices, 0, VOCAB_SIZE - 1)
x_test_indices = np.clip(x_test_indices, 0, VOCAB_SIZE - 1)

# Per-sample input shape; the data has shape (number_of_samples, sequence_length)
input_shape = x_train.shape[1:]

# Define the model
model = Sequential()

# Embedding layer: maps every quantised amplitude to a 32-dimensional vector
model.add(Embedding(input_dim=VOCAB_SIZE, output_dim=32, input_length=input_shape[0]))

# Convolutional layers
model.add(Conv1D(32, 3, activation='relu'))
model.add(MaxPooling1D(2))
model.add(Conv1D(64, 3, activation='relu'))
model.add(MaxPooling1D(2))

# Flatten layer
model.add(Flatten())

# Dense layers; the last one outputs one probability per emotion class
model.add(Dense(64, activation='relu'))
model.add(Dense(num_classes, activation='softmax'))

# Compile the model (sparse loss because the labels are integer class ids)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Print the model summary
model.summary()

# Fit the model
model.fit(x_train_indices, y_train, epochs=3, batch_size=32)

# Evaluate the model
test_loss, test_accuracy = model.evaluate(x_test_indices, y_test, verbose=1)
print(f"\nTest Accuracy: {test_accuracy * 100:.2f}%")

### Confusion matrix

In [None]:
# Predicted class = index of the highest softmax output
y_pred = model.predict(x_test_indices)
y_pred_classes = np.argmax(y_pred, axis=1)

# Display names, listed in the order of the integer class ids
# (assumes the ids follow the alphabetical order of the six emotion labels)
labels = ['Anger', 'Disgust', 'Fear', 'Happy', 'Neutral', 'Sad']

# Generate confusion matrix.  The class ids are passed explicitly so the
# matrix always has one row/column per display label, even when a class is
# absent from both the test labels and the predictions.
conf_matrix = confusion_matrix(y_test, y_pred_classes, labels=np.arange(len(labels)))

# Plot confusion matrix
disp = ConfusionMatrixDisplay(confusion_matrix=conf_matrix, display_labels=labels)
disp.plot(cmap=plt.cm.Blues)
plt.title('Confusion Matrix')
plt.show()