In [128]:
import librosa
import soundfile
import os, glob, pickle
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

In [129]:
# speech emotion recognition: recognises the emotion of a speech

# steps:

# cleaning and pre-processing DONE
# *feature engineering* DONE
# loading and splitting data (training, validating, and testing) DONE
# selecting model DONE
# train model DONE
# validate model
# predict model on test data DONE
# evaluate model on test set

In [161]:
# DataFlair - Extract features (mfcc, chroma, mel) from a sound file
def extract_feature(file_name, mfcc=True, chroma=True, mel=True):
    with soundfile.SoundFile(file_name) as sound_file:
        X = sound_file.read(dtype="float32")
        sample_rate = sound_file.samplerate
        if chroma:
            stft = np.abs(librosa.stft(X))
        result = np.array([])
        if mfcc:
            mfccs = np.mean(
                librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T, axis=0
            )
            result = np.hstack((result, mfccs))
        if chroma:
            chroma = np.mean(
                librosa.feature.chroma_stft(S=stft, sr=sample_rate).T, axis=0
            )
            result = np.hstack((result, chroma))
        if mel:
            mel = np.mean(librosa.feature.melspectrogram(y=X, sr=sample_rate).T, axis=0)
            result = np.hstack((result, mel))
    return result

In [131]:
emotions = {
    "01": "neutral",
    "02": "calm",
    "03": "happy",
    "04": "sad",
    "05": "angry",
    "06": "fearful",
    "07": "disgust",
    "08": "surprised",
}

observed_emotions=['calm', 'happy', 'fearful', 'disgust']


In [132]:
# DataFlair - Load the data and extract features for each sound file
def load_data(folder, test_size=0.2):
    # X: array containing features of each recording
    # y: array containing labels (emotion)
    X, y = [], []
    for file in glob.glob(folder):
        file_name = os.path.basename(file)
        emotion = emotions[file_name.split("-")[2]]
        # if emotion not in observed_emotions:
        #     continue
        feature = extract_feature(file, mfcc=True, chroma=True, mel=True)
        X.append(feature)
        y.append(emotion)
        print(f"Processed {file_name}")
    return train_test_split(np.array(X), y, test_size=test_size, random_state=9)

In [133]:
X_train, X_test, y_train, y_test = load_data("./data/Actor_*/*.wav", test_size=0.25)

Processed 03-01-01-01-01-01-01.wav
Processed 03-01-01-01-01-02-01.wav
Processed 03-01-01-01-02-01-01.wav
Processed 03-01-01-01-02-02-01.wav
Processed 03-01-02-01-01-01-01.wav
Processed 03-01-02-01-01-02-01.wav
Processed 03-01-02-01-02-01-01.wav
Processed 03-01-02-01-02-02-01.wav
Processed 03-01-02-02-01-01-01.wav
Processed 03-01-02-02-01-02-01.wav
Processed 03-01-02-02-02-01-01.wav
Processed 03-01-02-02-02-02-01.wav
Processed 03-01-03-01-01-01-01.wav
Processed 03-01-03-01-01-02-01.wav
Processed 03-01-03-01-02-01-01.wav
Processed 03-01-03-01-02-02-01.wav
Processed 03-01-03-02-01-01-01.wav
Processed 03-01-03-02-01-02-01.wav
Processed 03-01-03-02-02-01-01.wav
Processed 03-01-03-02-02-02-01.wav
Processed 03-01-04-01-01-01-01.wav
Processed 03-01-04-01-01-02-01.wav
Processed 03-01-04-01-02-01-01.wav
Processed 03-01-04-01-02-02-01.wav
Processed 03-01-04-02-01-01-01.wav
Processed 03-01-04-02-01-02-01.wav
Processed 03-01-04-02-02-01-01.wav
Processed 03-01-04-02-02-02-01.wav
Processed 03-01-05-0

In [158]:
print(f'Features extracted: {X_train.shape}')

Features extracted: (360, 180)


In [142]:
# Initialise model
model = MLPClassifier(
    alpha=0.01,
    batch_size=256,
    epsilon=1e-08,
    hidden_layer_sizes=(300,),
    learning_rate="adaptive",
    max_iter=500,
)

In [152]:
# Train model
model.fit(X_train, y_train)

In [153]:
# Predict model
y_pred = model.predict(X_test)

In [154]:
print(y_pred)

['disgust' 'fearful' 'neutral' 'sad' 'neutral' 'angry' 'disgust' 'sad'
 'happy' 'angry' 'surprised' 'neutral' 'sad' 'sad' 'sad' 'neutral' 'sad'
 'surprised' 'sad' 'happy' 'disgust' 'surprised' 'fearful' 'sad' 'sad'
 'neutral' 'neutral' 'sad' 'sad' 'neutral' 'neutral' 'calm' 'happy'
 'fearful' 'sad' 'disgust' 'neutral' 'happy' 'sad' 'fearful' 'neutral'
 'neutral' 'fearful' 'disgust' 'angry' 'neutral' 'happy' 'angry' 'happy'
 'neutral' 'fearful' 'angry' 'fearful' 'sad' 'happy' 'neutral' 'angry'
 'surprised' 'happy' 'angry' 'fearful' 'sad' 'angry' 'disgust' 'fearful'
 'happy' 'happy' 'neutral' 'happy' 'neutral' 'surprised' 'angry' 'angry'
 'neutral' 'happy' 'sad' 'neutral' 'fearful' 'calm' 'surprised' 'angry'
 'neutral' 'surprised' 'angry' 'angry' 'calm' 'surprised' 'sad' 'neutral'
 'neutral' 'surprised' 'happy' 'neutral' 'angry' 'neutral' 'neutral' 'sad'
 'calm' 'calm' 'angry' 'calm' 'sad' 'neutral' 'neutral' 'sad' 'angry'
 'angry' 'neutral' 'neutral' 'happy' 'sad' 'neutral' 'neutral' 'n

In [155]:
# Calculate accuracy of model
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

In [156]:
print(accuracy*100)

46.111111111111114


In [183]:
# single file testing
filename = "data/Actor_18/03-01-05-01-02-02-18.wav"
inp = [extract_feature(filename)]
pred_out = model.predict(inp)

real_out = emotions[filename.split("-")[2]]
print(f"Predicted: {pred_out[0].upper()}, Actual: {real_out.upper()}")

Predicted: ANGRY, Actual: ANGRY
