In [1]:
import librosa
import soundfile,time
import os, glob, pickle
import numpy as np
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

In [2]:
def extract_feature(file_name, mfcc, chroma, mel):
    with soundfile.SoundFile(file_name) as sound_file:
        X = sound_file.read(dtype="float32")
        sample_rate=sound_file.samplerate
        if chroma:
            stft=np.abs(librosa.stft(X))
        result=np.array([])
        if mfcc:
            mfccs=np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T, axis=0)
        result=np.hstack((result, mfccs))
        if chroma:
            chroma=np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T,axis=0)
        result=np.hstack((result, chroma))
        if mel:
            mel=np.mean(librosa.feature.melspectrogram(X, sr=sample_rate).T,axis=0)
        result=np.hstack((result, mel))
    return result

In [3]:
emotions={
'01':'neutral',
'02':'calm',
'03':'happy',
'04':'sad',
'05':'angry',
'06':'fearful',
'07':'disgust',
'08':'surprised'
}
# Emotions to observe
observed_emotions=['calm', 'happy', 'angry', 'disgust']

In [4]:
#Load the data and extract features for each sound file
from glob import glob
import os
import glob
def load_data(test_size=0.33):
    x,y=[],[]
    for file in glob.glob(r'C:\Users\sunka\Project folder\SER-ravdess-data\Actor_*\\*.wav'):
        file_name=os.path.basename(file)
        emotion=emotions[file_name.split("-")[2]]
        if emotion not in observed_emotions:
            continue
        feature=extract_feature(file, mfcc=True, chroma=True, mel=True)
        x.append(feature)
        y.append([emotion,file_name])
    return train_test_split(np.array(x), y, test_size=test_size, random_state=9)

In [5]:
# Split the dataset
x_train,x_test,y_train,y_test=load_data(test_size=0.25)

In [6]:
print(np.shape(x_train),np.shape(x_test), np.shape(y_train),np.shape(y_test))
y_test_map = np.array(y_test).T
y_test = y_test_map[0]
test_filename = y_test_map[1]
y_train_map = np.array(y_train).T
y_train = y_train_map[0]
train_filename = y_train_map[1]
print(np.shape(y_train),np.shape(y_test))
print(*test_filename,sep="\n")

(576, 180) (192, 180) (576, 2) (192, 2)
(576,) (192,)
03-01-03-01-01-01-15.wav
03-01-02-02-01-01-09.wav
03-01-03-01-02-02-21.wav
03-01-03-02-02-02-05.wav
03-01-07-01-02-01-21.wav
03-01-02-02-02-01-13.wav
03-01-03-02-02-02-19.wav
03-01-03-02-01-02-03.wav
03-01-07-02-02-01-13.wav
03-01-03-02-02-01-16.wav
03-01-03-02-02-01-06.wav
03-01-07-01-02-01-15.wav
03-01-03-02-01-01-18.wav
03-01-03-02-01-02-13.wav
03-01-07-02-02-01-14.wav
03-01-05-02-01-02-11.wav
03-01-02-02-02-02-02.wav
03-01-03-02-01-01-19.wav
03-01-07-02-02-02-12.wav
03-01-07-01-02-01-03.wav
03-01-02-01-01-01-05.wav
03-01-07-02-02-01-01.wav
03-01-07-02-01-01-08.wav
03-01-02-01-01-02-03.wav
03-01-03-01-02-01-06.wav
03-01-03-01-02-01-04.wav
03-01-07-01-02-02-09.wav
03-01-07-02-01-02-10.wav
03-01-02-01-02-01-14.wav
03-01-02-01-01-02-09.wav
03-01-03-02-02-01-13.wav
03-01-07-02-02-02-22.wav
03-01-03-02-02-01-23.wav
03-01-05-01-01-02-09.wav
03-01-05-02-02-01-13.wav
03-01-02-01-01-02-24.wav
03-01-02-02-01-02-18.wav
03-01-05-02-01-01-22.

In [7]:
# Get the shape of the training and testing datasets
print((x_train[0], x_test[0]))

# Get the number of features extracted
print(f'Features extracted: {x_train.shape[1]}')

(array([-5.22061890e+02,  3.50668907e+01,  3.75342965e+00,  8.25096607e+00,
       -6.82484770e+00, -2.17128828e-01, -1.53533993e+01, -8.40860748e+00,
       -1.37559566e+01,  9.54228580e-01, -8.50486183e+00, -2.91087580e+00,
       -6.54408514e-01, -2.16127944e+00,  2.59967774e-01, -3.94956857e-01,
        3.93508315e+00,  3.49161673e+00,  1.61637747e+00, -7.00250864e-01,
       -2.44239020e+00, -5.55101514e-01,  7.47600019e-01, -5.85062504e-01,
       -8.76787230e-02,  5.93730545e+00,  5.41902065e+00,  5.92085218e+00,
        1.71201110e+00,  2.79930902e+00,  4.34409046e+00,  6.64095545e+00,
        8.33745575e+00,  4.66863585e+00, -8.62458497e-02,  7.11737514e-01,
        3.79981446e+00,  5.90181828e+00,  1.02495813e+00, -1.52339302e-02,
        5.53609192e-01,  5.79114377e-01,  6.16039157e-01,  6.00282311e-01,
        6.41938984e-01,  7.51618505e-01,  7.65470445e-01,  7.59810686e-01,
        7.16218531e-01,  6.45714700e-01,  6.92020059e-01,  6.72237158e-01,
        1.72313396e-03, 

In [8]:
# Initialize the Multi Layer Perceptron Classifier
model=MLPClassifier(alpha=0.01, batch_size=256, epsilon=1e-08, hidden_layer_sizes=(300,), learning_rate='adaptive', max_iter=500)

In [9]:
#Train the model
model.fit(x_train,y_train)

MLPClassifier(alpha=0.01, batch_size=256, hidden_layer_sizes=(300,),
              learning_rate='adaptive', max_iter=500)

In [10]:
y_pred=model.predict(x_test)
accuracy=accuracy_score(y_true=y_test,y_pred=y_pred)
print("Accuracy: {:.2f}%".format(accuracy*100))

Accuracy: 70.31%


In [11]:
from sklearn.metrics import classification_report
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

       angry       0.67      0.59      0.63        37
        calm       0.77      0.95      0.85        57
     disgust       0.70      0.54      0.61        48
       happy       0.63      0.66      0.65        50

    accuracy                           0.70       192
   macro avg       0.69      0.69      0.68       192
weighted avg       0.70      0.70      0.70       192



In [12]:
# Split the dataset
x_train,x_test,y_train,y_test=load_data(test_size=0.25)
print(x_train)
print(x_test)
print(y_train)
print(y_test)

[[-5.22061890e+02  3.50668907e+01  3.75342965e+00 ...  1.65243153e-04
   1.04321596e-04  6.55571566e-05]
 [-5.78170044e+02  5.42816353e+01  4.19424200e+00 ...  4.20206983e-04
   1.69682709e-04  5.22450318e-05]
 [-6.50705750e+02  5.30211639e+01 -4.92040443e+00 ...  4.75216802e-05
   3.46632514e-05  1.62844426e-05]
 ...
 [-5.50096191e+02  1.70297680e+01 -1.14575634e+01 ...  1.51764631e-04
   1.16828531e-04  8.47479314e-05]
 [-5.55357605e+02  4.71569710e+01  1.10750742e+01 ...  1.61086457e-04
   1.04962470e-04  6.52811723e-05]
 [-5.04816345e+02  3.53618660e+01 -1.43495789e+01 ...  6.08151488e-04
   5.55269769e-04  4.47782222e-04]]
[[-6.01385986e+02  6.33781929e+01 -1.15760441e+01 ...  7.30556349e-06
   4.64438199e-06  1.78243295e-06]
 [-7.86435913e+02  5.99123268e+01  2.08771191e+01 ...  4.30930726e-07
   4.01358051e-07  4.06331111e-07]
 [-5.60618164e+02  5.78945961e+01 -8.87167645e+00 ...  6.46989676e-04
   3.31015413e-04  1.85833647e-04]
 ...
 [-4.44762360e+02  2.36132107e+01 -1.4835466