In [1]:
# Importing required libraries 
# Keras
from tensorflow import keras
from tensorflow.keras import regularizers
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, Model, model_from_json
from tensorflow.keras.layers import Dense, Embedding, LSTM
from tensorflow.keras.layers import Input, Flatten, Dropout, Activation, BatchNormalization
from tensorflow.keras.layers import Conv1D, MaxPooling1D, AveragePooling1D
import tensorflow.keras.utils as utils
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import ModelCheckpoint

# sklearn
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Other  
import librosa
import librosa.display
import json
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from matplotlib.pyplot import specgram
import pandas as pd
import seaborn as sns
import glob 
import os
import pickle
import IPython.display as ipd  # To play sound in the notebook

import time

# start_time = time.time()
# print(f"--- {time.time() - start_time} seconds ---")

In [2]:
# Load the RAVDESS metadata table; the preview shows gender, emotion, the
# combined label and the path of each audio file.
ref = pd.read_csv(filepath_or_buffer='RAVDESS_dataframe.csv')
ref.head(5)

Unnamed: 0,gender,emotion,labels,path
0,male,angry,male_angry,dataset/RAVDESS_Audio_Speech/Actor_01/03-01-05...
1,male,neutral,male_neutral,dataset/RAVDESS_Audio_Speech/Actor_01/03-01-02...
2,male,disgust,male_disgust,dataset/RAVDESS_Audio_Speech/Actor_01/03-01-07...
3,male,neutral,male_neutral,dataset/RAVDESS_Audio_Speech/Actor_01/03-01-02...
4,male,surprise,male_surprise,dataset/RAVDESS_Audio_Speech/Actor_01/03-01-08...


In [3]:
start_time = time.time()

# Extract one feature vector per audio file listed in `ref.path`.
# This is slow (the recorded run took a little over two minutes) because every
# file has to be decoded and resampled by librosa.
features = []
for path in ref.path:
    # Load a fixed 2.5 s window starting 0.5 s into the clip, resampled to
    # 44.1 kHz with the 'kaiser_fast' resampler.
    X, sample_rate = librosa.load(path,
                                  res_type='kaiser_fast',
                                  duration=2.5,
                                  sr=44100,
                                  offset=0.5)

    # Mean as the feature: 13 MFCCs are computed and then averaged along
    # axis 0, leaving a single 1-D vector per file (216 values per file in the
    # recorded run). Could do min and max etc. as well.
    mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=13), axis=0)
    features.append(mfccs)

# Build the frame once from the collected vectors. Appending row by row with
# `df.loc[i] = ...` re-allocates the frame on every iteration.
df = pd.DataFrame({'feature': features})

# Check the record count to make sure every file was processed
print(len(df))
print(f"--- {time.time() - start_time} seconds ---")

df.head()

1440
--- 134.5748724937439 seconds ---


Unnamed: 0,feature
0,"[-55.663086, -56.170334, -56.157845, -54.66347..."
1,"[-70.74651, -70.74651, -70.02528, -69.13127, -..."
2,"[-69.40937, -69.40937, -69.40937, -69.40937, -..."
3,"[-69.67329, -69.693306, -69.693306, -69.693306..."
4,"[-64.51157, -64.51157, -64.51157, -64.51157, -..."


In [9]:
# Spread each per-file feature vector over its own numbered columns, then
# attach those columns side by side to the metadata in `ref`.
feature_columns = pd.DataFrame(df['feature'].values.tolist())
df_processed = pd.concat([ref, feature_columns], axis=1)
df_processed.head(5)

Unnamed: 0,gender,emotion,labels,path,0,1,2,3,4,5,...,206,207,208,209,210,211,212,213,214,215
0,male,angry,male_angry,dataset/RAVDESS_Audio_Speech/Actor_01/03-01-05...,-55.663086,-56.170334,-56.157845,-54.663475,-55.166775,-56.156944,...,-55.155067,-56.156944,-56.156944,-56.156944,-55.844219,-56.156944,-56.156944,-56.156944,-56.156944,-56.156944
1,male,neutral,male_neutral,dataset/RAVDESS_Audio_Speech/Actor_01/03-01-02...,-70.746513,-70.746513,-70.025284,-69.131271,-70.746513,-70.746513,...,-30.967125,-32.430031,-32.474728,-34.334457,-38.280952,-39.110352,-41.010277,-40.282722,-41.454048,-44.383205
2,male,disgust,male_disgust,dataset/RAVDESS_Audio_Speech/Actor_01/03-01-07...,-69.40937,-69.40937,-69.40937,-69.40937,-69.40937,-69.40937,...,-33.514477,-34.081306,-33.27766,-35.757454,-37.576988,-37.910069,-38.851265,-40.609707,-42.126114,-44.842896
3,male,neutral,male_neutral,dataset/RAVDESS_Audio_Speech/Actor_01/03-01-02...,-69.673286,-69.693306,-69.693306,-69.693306,-69.693306,-69.693306,...,-61.967167,-61.018559,-60.341145,-63.465332,-64.500137,-61.646843,-58.001488,-58.848484,-62.603935,-61.121773
4,male,surprise,male_surprise,dataset/RAVDESS_Audio_Speech/Actor_01/03-01-08...,-64.511574,-64.511574,-64.511574,-64.511574,-64.511574,-64.511574,...,-61.820129,-63.520226,-64.511574,-64.511574,-64.511574,-64.463341,-64.511574,-64.511574,-64.511574,-64.511574


In [16]:
# Preview only the numeric feature columns (metadata columns removed).
# The result is displayed, not assigned, so `df_processed` is left unchanged.
df_processed.drop(columns=['path', 'labels', 'gender', 'emotion'])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,206,207,208,209,210,211,212,213,214,215
0,-55.663086,-56.170334,-56.157845,-54.663475,-55.166775,-56.156944,-56.156944,-56.156944,-56.156944,-56.156944,...,-55.155067,-56.156944,-56.156944,-56.156944,-55.844219,-56.156944,-56.156944,-56.156944,-56.156944,-56.156944
1,-70.746513,-70.746513,-70.025284,-69.131271,-70.746513,-70.746513,-70.746513,-70.746513,-70.746513,-70.746513,...,-30.967125,-32.430031,-32.474728,-34.334457,-38.280952,-39.110352,-41.010277,-40.282722,-41.454048,-44.383205
2,-69.409370,-69.409370,-69.409370,-69.409370,-69.409370,-69.409370,-69.409370,-69.409370,-69.409370,-69.409370,...,-33.514477,-34.081306,-33.277660,-35.757454,-37.576988,-37.910069,-38.851265,-40.609707,-42.126114,-44.842896
3,-69.673286,-69.693306,-69.693306,-69.693306,-69.693306,-69.693306,-69.693306,-69.620773,-69.693306,-68.906570,...,-61.967167,-61.018559,-60.341145,-63.465332,-64.500137,-61.646843,-58.001488,-58.848484,-62.603935,-61.121773
4,-64.511574,-64.511574,-64.511574,-64.511574,-64.511574,-64.511574,-64.511574,-64.511574,-64.511574,-64.511574,...,-61.820129,-63.520226,-64.511574,-64.511574,-64.511574,-64.463341,-64.511574,-64.511574,-64.511574,-64.511574
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1435,-64.968552,-64.968552,-64.968552,-64.968552,-64.968552,-64.968552,-64.968552,-64.968552,-64.968552,-67.252640,...,-64.832550,-64.678078,-64.360786,-64.642807,-62.274776,-62.417873,-63.560123,-63.105118,-63.359077,-63.284668
1436,-64.793808,-64.793808,-64.793808,-65.559990,-65.851753,-65.446579,-66.626091,-66.020622,-66.533432,-64.211411,...,-58.620567,-60.016014,-61.026798,-61.306980,-61.006649,-60.669014,-62.031731,-64.399406,-64.815628,-64.799431
1437,-45.345825,-45.345825,-45.345825,-45.345825,-45.345825,-45.345825,-45.345825,-45.345825,-45.345825,-45.345825,...,-43.515678,-43.144165,-45.263775,-45.334873,-44.143791,-44.305386,-43.153782,-43.479736,-44.287987,-44.375999
1438,-56.940815,-56.940815,-56.940815,-56.940815,-56.940815,-56.943363,-57.077030,-56.940815,-56.940815,-57.228634,...,-43.485580,-42.686459,-44.226131,-44.101894,-43.651344,-45.493496,-47.759689,-50.536503,-51.336391,-50.690861


## Splitting dataset: Training and validation:

In [20]:
# Hold out 25% of the rows for evaluation. The split is shuffled with a fixed
# seed so it is reproducible. Only the numeric feature columns go into X; the
# `labels` column is the target.
X_train, X_test, y_train, y_test = train_test_split(
    df_processed.drop(columns=['path', 'labels', 'gender', 'emotion']),
    df_processed['labels'],
    random_state=42,
    shuffle=True,
    test_size=0.25,
)

# Shapes of both partitions prior to normalisation
print(f'Train data shape: {X_train.shape}\nValid data shape: {X_test.shape}')

Train data shape: (1080, 216)
Valid data shape: (360, 216)


## Normalising data:

In [21]:
# Per-column statistics computed from the training rows only.
# ddof=0 is spelled out because np.std(frame, axis=0) forwards ddof=0 to the
# frame's own std(), whereas a bare DataFrame.std() would default to ddof=1.
mean = X_train.mean(axis=0)
std = X_train.std(axis=0, ddof=0)

In [22]:
# Sanity check: `mean` was reduced over axis 0, so it should hold one value
# per feature column.
mean.shape

(216,)

In [23]:
# Standardise both partitions with the statistics taken from the training set.
# Both names are rebound in place, so running this cell a second time would
# normalise already-normalised data.
X_train, X_test = (X_train - mean) / std, (X_test - mean) / std

In [27]:
# Inspect the first training row (by position) after normalisation.
X_train.iloc[0]

0      2.222088
1      2.190244
2      1.728756
3      1.503745
4      1.552858
         ...   
211   -0.036250
212   -0.073344
213   -0.027081
214   -0.067113
215   -0.120260
Name: 296, Length: 216, dtype: float64

In [29]:
# Raw (un-normalised) feature values for index label 296, the label reported
# for the first training row in the recorded run, for comparison.
df_processed.drop(columns=['path', 'labels', 'gender', 'emotion']).loc[296]

0     -37.781998
1     -37.955757
2     -41.867718
3     -43.668159
4     -43.195824
         ...    
211   -47.582119
212   -48.289742
213   -48.056358
214   -48.014282
215   -48.463047
Name: 296, Length: 216, dtype: float64

In [33]:
# Confirm the dimensions of the training features.
X_train.shape

(1080, 216)

In [31]:
# A few preparation steps to get the data into plain NumPy arrays for Keras.
# NOTE: the names are rebound in place, so re-running this cell without
# re-running the split would feed already one-hot labels to the encoder.
X_train = np.array(X_train)
y_train = np.array(y_train)
X_test = np.array(X_test)
y_test = np.array(y_test)

# One-hot encode the target.
# The encoder is fitted on the training labels only and then reused for the
# test labels, so both partitions share one label -> index mapping. Refitting
# on the test labels could silently produce a different mapping (and one-hot
# width) if a class were absent from the test split. `transform` raises
# ValueError if the test split contains a label not seen in training.
lb = LabelEncoder()
y_train = utils.to_categorical(lb.fit_transform(y_train))
y_test = utils.to_categorical(lb.transform(y_test),
                              num_classes=len(lb.classes_))

print(X_train.shape)
print(lb.classes_)

# Pickle the fitted encoder for future use; the context manager closes the
# file even if the dump fails.
filename = 'labels'
with open(filename, 'wb') as outfile:
    pickle.dump(lb, outfile)

(1080, 216)
['female_angry' 'female_disgust' 'female_fear' 'female_happy'
 'female_neutral' 'female_sad' 'female_surprise' 'male_angry'
 'male_disgust' 'male_fear' 'male_happy' 'male_neutral' 'male_sad'
 'male_surprise']
