In [1]:
import soundfile
import numpy as np
import librosa
import glob
import os
from sklearn.model_selection import train_test_split

# RAVDESS file names carry a two-digit emotion code ("01" .. "08");
# this maps each code to its emotion label, in code order.
int2emotion = {
    f"{code:02d}": label
    for code, label in enumerate(
        (
            "neutral",
            "calm",
            "happy",
            "sad",
            "angry",
            "fearful",
            "disgust",
            "surprised",
        ),
        start=1,
    )
}

# Only samples labelled with one of these emotions are kept
AVAILABLE_EMOTIONS = {"neutral", "happy", "sad", "angry"}

In [2]:
def extract_feature(file_name, **kwargs):
    """
    Extract a 1-D feature vector from the audio file `file_name`.

    Every requested feature is computed with librosa, averaged over the
    frame axis, and concatenated in this fixed order:
    mfcc, chroma, mel, contrast, tonnetz.

        Features supported (keyword flags, all off unless passed truthy):
            - MFCC (mfcc), 40 coefficients
            - Chroma (chroma)
            - MEL Spectrogram Frequency (mel)
            - Contrast (contrast)
            - Tonnetz (tonnetz)
        e.g:
        `features = extract_feature(path, mel=True, mfcc=True)`

    Multi-channel recordings are averaged down to a single channel before
    any feature is computed, because soundfile returns them as a 2-D
    (frames, channels) array, which is not the layout librosa expects.

    Returns:
        np.ndarray: the concatenated feature vector; an empty array when
        no feature flag is set.
    """
    want_mfcc = kwargs.get("mfcc")
    want_chroma = kwargs.get("chroma")
    want_mel = kwargs.get("mel")
    want_contrast = kwargs.get("contrast")
    want_tonnetz = kwargs.get("tonnetz")

    # Only the read needs the file handle; release it before the heavy work.
    with soundfile.SoundFile(file_name) as sound_file:
        X = sound_file.read(dtype="float32")
        sample_rate = sound_file.samplerate

    # Downmix (frames, channels) -> (frames,) so every feature sees mono audio.
    if X.ndim > 1:
        X = X.mean(axis=1)

    # The magnitude spectrogram is shared by the chroma and contrast features.
    if want_chroma or want_contrast:
        stft = np.abs(librosa.stft(X))

    # Start from an empty float array so the result is well-defined even
    # when no feature is requested.
    parts = [np.array([])]
    if want_mfcc:
        parts.append(np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T, axis=0))
    if want_chroma:
        parts.append(np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T, axis=0))
    if want_mel:
        parts.append(np.mean(librosa.feature.melspectrogram(y=X, sr=sample_rate).T, axis=0))
    if want_contrast:
        parts.append(np.mean(librosa.feature.spectral_contrast(S=stft, sr=sample_rate).T, axis=0))
    if want_tonnetz:
        harmonic = librosa.effects.harmonic(X)
        parts.append(np.mean(librosa.feature.tonnetz(y=harmonic, sr=sample_rate).T, axis=0))
    return np.hstack(parts)

In [3]:
# Inline (scratch) version of `load_data`: build the feature matrix and
# display the resulting train/test split as the cell output.
test_size = 0.2
X, y = [], []
# glob's ordering depends on the file system; sorting keeps the sample order,
# and therefore the seeded split below, the same on every machine.
for file in sorted(glob.glob("data/Actor_*/*.wav")):
    # get the base name of the audio file
    basename = os.path.basename(file)
    # the third dash-separated field of the file name is the emotion code
    emotion = int2emotion[basename.split("-")[2]]
    # we allow only AVAILABLE_EMOTIONS we set
    if emotion not in AVAILABLE_EMOTIONS:
        continue
    # extract speech features
    features = extract_feature(file, mfcc=True, chroma=True, mel=True)
    # add to data
    X.append(features)
    y.append(emotion)
# split the data to training and testing; as the last expression of the cell
# the result is only displayed, not bound to a name
train_test_split(np.array(X), y, test_size=test_size, random_state=7)

[array([[-4.56013794e+02,  4.94241371e+01, -2.04614887e+01, ...,
          1.95963599e-04,  1.31591820e-04,  1.37439391e-04],
        [-4.22311615e+02,  1.79562702e+01, -1.75061035e+01, ...,
          6.64979918e-04,  3.57129029e-04,  2.87930830e-04],
        [-4.02408173e+02,  1.06497669e+01, -1.86298046e+01, ...,
          5.05983923e-03,  3.38716642e-03,  1.73404091e-03],
        ...,
        [-6.21472900e+02,  2.93360786e+01, -1.64034863e+01, ...,
          1.90901155e-05,  1.05907347e-05,  1.02430777e-05],
        [-6.76129761e+02,  6.07729759e+01,  8.59902096e+00, ...,
          1.32644539e-06,  1.24813118e-06,  8.28826273e-07],
        [-5.95663818e+02,  6.33558578e+01, -9.15794277e+00, ...,
          6.76750278e-05,  3.17838239e-05,  1.83957854e-05]]),
 array([[-4.84102509e+02,  3.35011826e+01, -4.24036026e+00, ...,
          3.55753629e-03,  1.79984665e-03,  1.16297312e-03],
        [-4.35551697e+02,  3.20093689e+01,  5.28842092e-01, ...,
          1.10966375e-03,  1.26860233e

In [4]:
def load_data(test_size=0.25):
    """
    Load the audio files under `data/Actor_*/`, extract features and split them.

    The emotion label is taken from the third dash-separated field of each
    file name and translated through `int2emotion`; files whose emotion is
    not in `AVAILABLE_EMOTIONS` are skipped. Each kept file is turned into
    an MFCC + chroma + mel feature vector by `extract_feature`.

    Args:
        test_size: passed straight to `train_test_split` to size the test split.

    Returns:
        The result of `train_test_split(features, labels, ...)`, i.e.
        `X_train, X_test, y_train, y_test`, using `random_state=7`.

    Raises:
        ValueError: if no usable audio file is found.
        KeyError: if a file name carries an emotion code missing from `int2emotion`.
    """
    X, y = [], []
    # glob's ordering depends on the file system; sorting makes the sample
    # order stable so that `random_state=7` yields the same split everywhere.
    for file in sorted(glob.glob("data/Actor_*/*.wav")):
        # get the base name of the audio file
        basename = os.path.basename(file)
        # get the emotion label
        emotion = int2emotion[basename.split("-")[2]]
        # we allow only AVAILABLE_EMOTIONS we set
        if emotion not in AVAILABLE_EMOTIONS:
            continue
        # extract speech features and add them to the data
        X.append(extract_feature(file, mfcc=True, chroma=True, mel=True))
        y.append(emotion)
    if not X:
        raise ValueError(
            "no usable audio files matched 'data/Actor_*/*.wav'"
        )
    # split the data to training and testing and return it
    return train_test_split(np.array(X), y, test_size=test_size, random_state=7)

In [5]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from utils import load_data

import os
import pickle

In [6]:
# load RAVDESS dataset, holding out 30% of the samples for testing
X_train, X_test, y_train, y_test = load_data(test_size=0.30)

# print some details
# number of samples in training data
print("[+] Number of training samples:", X_train.shape[0])
# number of samples in testing data
print("[+] Number of testing samples:", X_test.shape[0])

# number of features used
# this is the length of the feature vector produced
# by extract_feature() for each audio file
print("[+] Number of features:", X_train.shape[1])
# best model, determined by a grid search
# NOTE(review): no `random_state` is set here, so the trained weights and the
# reported accuracy can differ from run to run.
model_params = {
    'alpha': 0.01,
    'batch_size': 256,
    'epsilon': 1e-08,
    'hidden_layer_sizes': (300,),
    'learning_rate': 'adaptive',
    'max_iter': 500,
}


# Initialize Multi Layer Perceptron classifier with Best Parameters
model = MLPClassifier(**model_params)

# train the model
print("[*] Training the model...")
model.fit(X_train, y_train)

# predict the held-out test split to measure how good we are
y_pred = model.predict(X_test)

# calculate the accuracy
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

print("Accuracy: {:.2f}%".format(accuracy*100))

# now we save the model
# make result directory if doesn't exist yet
if not os.path.isdir("result"):
    os.mkdir("result")

# use a context manager so the file is flushed and closed once the dump is done
with open("result/mlp_classifier2.model", "wb") as model_file:
    pickle.dump(model, model_file)

[+] Number of training samples: 470
[+] Number of testing samples: 202
[+] Number of features: 180
[*] Training the model...
Accuracy: 76.73%


## Splitting a File Name to Get Its Emotion Label

In [7]:
# Example RAVDESS file name; its third dash-separated field is the emotion code
baseline = "03-01-07-02-02-01-18.wav"
em = baseline.split("-")[2]
# translate the code into its emotion label
emotion = int2emotion[em]

# show the raw code and the decoded label, one per line
print(em, emotion, sep="\n")

07
disgust


In [28]:
# presumably the sample rate the recordings are meant to have; kept as a
# named constant, but no longer trusted over the file's own header
RATE = 16000
from scipy.io import wavfile
import noisereduce as nr
# load data: `rate` is the sample rate stored in the file, `data` the samples
rate, data = wavfile.read("result/test.wav")
# perform noise reduction using the file's real sample rate, so the call stays
# correct for recordings that are not 16 kHz and agrees with the rate
# written back out below
reduced_noise = nr.reduce_noise(y=data, sr=rate)
wavfile.write("result/myred_noise.wav", rate, reduced_noise)