In [2]:
import numpy as np
import os
import random
import sys
import glob 
import librosa
import librosa.display
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.utils.multiclass import unique_labels



from tqdm import tqdm

In [3]:
# Emotion lookup: the two-digit code found in a clip's file name -> emotion name.
emotions = dict(
    zip(
        ('01', '02', '03', '04', '05', '06', '07', '08'),
        ('neutral', 'calm', 'happy', 'sad', 'angry', 'fearful', 'disgust', 'surprised'),
    )
)

def gender(g):
    """Return the gender label encoded by the leading number of ``g``.

    The first two characters of ``g`` are parsed as an integer: an even
    number yields ``'female'``, an odd number yields ``'male'``.
    """
    leading_number = int(g[:2])
    return 'male' if leading_number % 2 else 'female'

In [4]:
   #x,y=[],[]
# Extract one time-averaged 40-coefficient MFCC vector and one
# "<emotion>_<gender>" label per clip.
# NOTE: the column names are kept as-is because later cells read them:
# 'emotion' holds the feature vector and 'gender' holds the combined label.
# NOTE(review): the glob pattern is a machine-specific absolute path.
feature_rows = []
label_rows = []

# sorted() makes the row order independent of the directory listing order.
for file in tqdm(sorted(glob.glob(r'C:\Users\KIIT\Downloads\speech-emotion-recognition-ravdess-data\Actor_*\*.wav'))):
    file_name = os.path.basename(file)
    X, sample_rate = librosa.load(file)
    # The sample rate is passed on exactly as librosa returned it; wrapping
    # it in a NumPy array served no purpose.
    mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T, axis=0)
    name_fields = file_name.split("-")
    feature_rows.append(mfccs)
    label_rows.append(emotions[name_fields[2]] + '_' + gender(name_fields[-1]))

# Build the frame once instead of growing it row by row with .loc.
data_df = pd.DataFrame(
    {'emotion': feature_rows, 'gender': label_rows},
    columns=['emotion', 'gender'],
)

100%|██████████████████████████████████████████████████████████████████████████████| 1440/1440 [10:08<00:00,  2.37it/s]


In [4]:
# The 'gender' column carries the label; the 'emotion' column carries the
# per-clip feature vectors, which are expanded into one column per coefficient.
labels = data_df['gender']
df = pd.DataFrame(data_df['emotion'].tolist())

In [5]:
# Put the label column to the right of the feature columns (aligned on the row index).
newdf = pd.concat(objs=[df, labels], axis='columns')

In [6]:
# Convert the row index to strings and apply the column mapping below.
# NOTE(review): the mapping key is the string "0"; the preview printed after
# this cell still shows column 0 and 'gender', so the mapping appears to
# match nothing -- confirm the intent.
column_mapping = {"0": "label"}
newdf = newdf.rename(index=str, columns=column_mapping)
len(newdf)

1440

In [7]:
# Preview the first rows: feature columns followed by the label column.
newdf.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,31,32,33,34,35,36,37,38,39,gender
0,-704.991064,65.552982,-9.380876,21.504936,-0.276333,6.958925,-8.179528,0.057792,-12.863832,-2.448066,...,-3.357799,-1.452551,-3.102374,-2.007415,-2.600937,-0.14204,-2.568272,-2.855393,-1.964762,neutral_male
1,-701.11437,67.235451,-12.94483,26.17365,-0.208568,6.976875,-6.947678,-1.823795,-13.554232,-1.115351,...,-3.712457,-1.077917,-3.538705,-1.986742,-1.754591,-0.547054,-3.607596,-2.770094,-2.511671,neutral_male
2,-698.758275,68.301303,-9.659004,22.483389,-2.427078,7.409661,-6.918469,-2.170394,-12.12491,-3.208555,...,-3.065188,-1.410419,-2.798651,-1.90788,-2.614495,-0.576813,-3.077169,-3.281919,-2.324095,neutral_male
3,-692.78496,67.000889,-7.761935,22.749996,-1.31028,9.602793,-6.667698,-3.617485,-12.181872,-2.933774,...,-3.332885,-1.728847,-3.82006,-2.2443,-2.215645,-0.340175,-2.781173,-3.174051,-3.409281,neutral_male
4,-735.084608,74.199983,-8.838192,25.473145,-1.49922,9.268571,-9.208855,-0.166669,-11.989154,-4.528281,...,-2.708373,-0.498136,-3.852988,-0.499538,-1.548224,-1.062739,-2.359443,-3.298791,-3.043729,calm_male


In [8]:
def noise(data):
    """
    Add white (Gaussian) noise to a signal.

    The noise amplitude is a random fraction (at most 0.5%) of the largest
    sample value in ``data``. A new float64 array is returned.
    """
    # Any distribution from https://docs.scipy.org/doc/numpy-1.13.0/reference/routines.random.html
    # could be substituted here.
    amplitude = 0.005 * np.random.uniform() * np.amax(data)
    perturbation = np.random.normal(size=data.shape[0])
    return data.astype('float64') + amplitude * perturbation
    
def shift(data):
    """
    Circularly shift the signal by a random number of samples.

    The offset is a uniform draw from [-5, 5) scaled by 500 and truncated
    to an integer; samples rolled off one end re-enter at the other.
    """
    offset = int(500 * np.random.uniform(low=-5, high=5))
    return np.roll(data, offset)
    
def stretch(data, rate=0.8):
    """
    Stretching the Sound.

    Parameters
    ----------
    data : audio time series handed to ``librosa.effects.time_stretch``.
    rate : stretch factor forwarded to librosa (default 0.8).

    Returns
    -------
    The time-stretched signal returned by librosa.
    """
    # ``rate`` is keyword-only in current librosa releases; passing it by
    # name also works on older releases that accepted it positionally.
    return librosa.effects.time_stretch(data, rate=rate)
    
def pitch(data, sample_rate):
    """
    Pitch Tuning.

    Shifts the pitch of ``data`` by a random number of steps, with 12 steps
    per octave.

    Parameters
    ----------
    data : NumPy array holding the audio samples (cast to float64).
    sample_rate : sampling rate of ``data``, forwarded to librosa as ``sr``.

    Returns
    -------
    The pitch-shifted signal returned by ``librosa.effects.pitch_shift``.
    """
    bins_per_octave = 12
    pitch_pm = 2
    # NOTE(review): uniform() lies in [0, 1), so the shift is always upward,
    # in [0, 4) steps. If a symmetric +/- range was intended, this would need
    # (uniform() - 0.5) -- confirm before changing.
    pitch_change = pitch_pm * 2 * (np.random.uniform())
    # ``sr`` is keyword-only in current librosa releases; passing it by name
    # also works on older releases that accepted it positionally.
    data = librosa.effects.pitch_shift(data.astype('float64'),
                                       sr=sample_rate,
                                       n_steps=pitch_change,
                                       bins_per_octave=bins_per_octave)
    return data
    
def dyn_change(data):
    """
    Scale the signal by a random gain drawn uniformly from [1.5, 3).
    """
    gain = np.random.uniform(low=1.5, high=3)
    scaled = data * gain
    return scaled
    
def speedNpitch(data):
    """
    Speed and Pitch Tuning.

    Resamples ``data`` by linear interpolation at a random step of
    ``1 / length_change`` (``length_change`` drawn uniformly from [0.8, 1)),
    which shortens the signal. The result is zero-padded back to the
    original length.

    Parameters
    ----------
    data : 1-D NumPy array of audio samples.

    Returns
    -------
    A new array with the same shape and dtype as ``data``. The input array
    is left untouched.
    """
    # Nothing to resample; np.interp would raise on an empty signal.
    if len(data) == 0:
        return data.copy()

    # you can change low and high here
    length_change = np.random.uniform(low=0.8, high=1)
    speed_fac = 1.0 / length_change
    tmp = np.interp(np.arange(0, len(data), speed_fac), np.arange(0, len(data)), data)
    minlen = min(data.shape[0], tmp.shape[0])

    # Write into a fresh buffer instead of zeroing and overwriting the
    # caller's array in place.
    result = np.zeros_like(data)
    result[0:minlen] = tmp[0:minlen]
    return result

In [9]:
# Same extraction as for the clean clips, but white noise is added to each
# waveform before the MFCCs are computed.
# NOTE: the column names are kept as-is because later cells read them:
# 'emotion' holds the feature vector and 'gender' holds the combined label.
noisy_feature_rows = []
noisy_label_rows = []

# sorted() makes the row order independent of the directory listing order.
for file in tqdm(sorted(glob.glob(r'C:\Users\KIIT\Downloads\speech-emotion-recognition-ravdess-data\Actor_*\*.wav'))):
    file_name = os.path.basename(file)
    X, sample_rate = librosa.load(file)
    X = noise(X)
    mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T, axis=0)
    name_fields = file_name.split("-")
    noisy_feature_rows.append(mfccs)
    noisy_label_rows.append(emotions[name_fields[2]] + '_' + gender(name_fields[-1]))

# Build the frame once instead of growing it row by row with .loc.
data_df1 = pd.DataFrame(
    {'emotion': noisy_feature_rows, 'gender': noisy_label_rows},
    columns=['emotion', 'gender'],
)

100%|██████████████████████████████████████████████████████████████████████████████| 1440/1440 [06:59<00:00,  3.43it/s]


In [10]:
# Expand the noise-augmented feature vectors into one column per coefficient,
# re-attach the label column, and replace any missing values with 0.
labels1 = data_df1['gender']
df1 = pd.DataFrame(data_df1['emotion'].tolist())
syndf1 = (
    pd.concat([df1, labels1], axis=1)
    .rename(index=str, columns={"0": "label"})
    .fillna(0)
)
syndf1.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,31,32,33,34,35,36,37,38,39,gender
0,-656.062849,54.715947,-5.743136,17.267533,2.179606,4.282772,-6.103129,-1.994194,-11.756605,-3.272871,...,-3.252345,-1.563655,-2.949473,-2.308516,-2.859863,-0.640899,-2.297152,-2.74309,-1.832065,neutral_male
1,-562.923043,38.399558,-0.022564,13.51052,3.905905,-0.723945,-4.124307,-6.548708,-10.491046,-3.505689,...,-2.797904,-1.913211,-2.940464,-2.911551,-1.999019,-1.32752,-2.914354,-2.679934,-2.653478,neutral_male
2,-593.861821,46.224529,-1.802237,13.599634,1.158377,2.068886,-4.659675,-6.007241,-9.855551,-4.380712,...,-1.997681,-1.597688,-2.452388,-2.448987,-2.539611,-1.292639,-2.511299,-2.976704,-2.001421,neutral_male
3,-614.62983,48.076173,-1.456146,16.544416,0.473376,4.771367,-5.078395,-6.339201,-10.371577,-4.518591,...,-3.330508,-2.221484,-3.618277,-2.427304,-1.693009,-0.806968,-2.675643,-3.326947,-3.570014,neutral_male
4,-638.456424,50.492438,0.918036,15.839533,3.649894,2.881943,-6.162561,-3.918966,-9.300149,-4.811132,...,-2.310058,-0.843362,-2.979387,-1.321568,-1.396936,-1.322536,-1.54679,-3.10818,-2.949897,calm_male


In [11]:
# Same extraction as for the clean clips, but each waveform is pitch-shifted
# before the MFCCs are computed.
# NOTE: the column names are kept as-is because later cells read them:
# 'emotion' holds the feature vector and 'gender' holds the combined label.
pitched_feature_rows = []
pitched_label_rows = []

# sorted() makes the row order independent of the directory listing order.
for file in tqdm(sorted(glob.glob(r'C:\Users\KIIT\Downloads\speech-emotion-recognition-ravdess-data\Actor_*\*.wav'))):
    file_name = os.path.basename(file)
    X, sample_rate = librosa.load(file)
    X = pitch(X, sample_rate)
    mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T, axis=0)
    name_fields = file_name.split("-")
    pitched_feature_rows.append(mfccs)
    pitched_label_rows.append(emotions[name_fields[2]] + '_' + gender(name_fields[-1]))

# Build the frame once instead of growing it row by row with .loc.
data_df2 = pd.DataFrame(
    {'emotion': pitched_feature_rows, 'gender': pitched_label_rows},
    columns=['emotion', 'gender'],
)

100%|██████████████████████████████████████████████████████████████████████████████| 1440/1440 [17:37<00:00,  1.36it/s]


In [12]:
# Expand the pitch-augmented feature vectors into one column per coefficient,
# re-attach the label column, and replace any missing values with 0.
labels2 = data_df2['gender']
df2 = pd.DataFrame(data_df2['emotion'].tolist())
syndf2 = (
    pd.concat([df2, labels2], axis=1)
    .rename(index=str, columns={"0": "label"})
    .fillna(0)
)
syndf2.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,31,32,33,34,35,36,37,38,39,gender
0,-735.216265,55.724076,-5.977094,17.668855,-4.198805,-1.511455,-12.658381,-5.559117,-10.377854,8.340349,...,-4.151514,-2.001226,-4.041617,-2.897908,-3.781026,-0.407662,-4.219977,-2.220474,-2.651851,neutral_male
1,-723.236539,61.549799,-14.95051,23.52166,-2.408447,3.643548,-8.5861,-4.246263,-14.794387,-0.807452,...,-3.506613,-1.013812,-3.826184,-2.389493,-1.779196,-1.65033,-4.687337,-1.880009,-2.373854,neutral_male
2,-713.713653,62.456502,-8.483185,19.221893,-5.241186,3.840153,-10.806706,-5.129085,-13.877272,2.721254,...,-0.836615,-2.793481,-2.100003,-2.929281,-2.161687,-4.603761,-1.709972,-1.560325,-2.372128,neutral_male
3,-710.946749,61.714634,-6.703565,20.994669,-2.891452,7.122429,-11.037637,-6.338575,-13.531384,2.545414,...,-1.636259,-3.436058,-1.255951,-2.388142,-2.271846,-4.368859,-3.162007,-2.098035,-2.078004,neutral_male
4,-767.915415,69.386903,-7.726097,25.24216,-3.945032,6.925332,-14.083567,-1.433074,-15.740353,3.999215,...,-0.683692,-3.074233,-0.238004,-5.100559,-3.836748,-4.810905,-1.800329,-1.688533,-1.178533,calm_male


In [13]:
# Stack the clean and the two augmented feature tables under a fresh 0..n-1
# index, replacing any missing values with 0.
stacked_frames = [newdf, syndf1, syndf2]
combined_df = pd.concat(stacked_frames, ignore_index=True).fillna(0)
combined_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,31,32,33,34,35,36,37,38,39,gender
0,-704.991064,65.552982,-9.380876,21.504936,-0.276333,6.958925,-8.179528,0.057792,-12.863832,-2.448066,...,-3.357799,-1.452551,-3.102374,-2.007415,-2.600937,-0.14204,-2.568272,-2.855393,-1.964762,neutral_male
1,-701.11437,67.235451,-12.94483,26.17365,-0.208568,6.976875,-6.947678,-1.823795,-13.554232,-1.115351,...,-3.712457,-1.077917,-3.538705,-1.986742,-1.754591,-0.547054,-3.607596,-2.770094,-2.511671,neutral_male
2,-698.758275,68.301303,-9.659004,22.483389,-2.427078,7.409661,-6.918469,-2.170394,-12.12491,-3.208555,...,-3.065188,-1.410419,-2.798651,-1.90788,-2.614495,-0.576813,-3.077169,-3.281919,-2.324095,neutral_male
3,-692.78496,67.000889,-7.761935,22.749996,-1.31028,9.602793,-6.667698,-3.617485,-12.181872,-2.933774,...,-3.332885,-1.728847,-3.82006,-2.2443,-2.215645,-0.340175,-2.781173,-3.174051,-3.409281,neutral_male
4,-735.084608,74.199983,-8.838192,25.473145,-1.49922,9.268571,-9.208855,-0.166669,-11.989154,-4.528281,...,-2.708373,-0.498136,-3.852988,-0.499538,-1.548224,-1.062739,-2.359443,-3.298791,-3.043729,calm_male


In [14]:
# StratifiedShuffleSplit lives in sklearn.model_selection; it is not exported
# from the top-level sklearn package.
from sklearn.model_selection import StratifiedShuffleSplit

In [15]:
# Features are every column except 'gender'; 'gender' is the target.
y = combined_df['gender']
X = combined_df.drop(columns=['gender'])

# One stratified shuffle split with 20% of the rows held out for testing.
splitter = StratifiedShuffleSplit(1, test_size=0.2, random_state=12)
train_index, test_index = next(splitter.split(X, y))

X_train, y_train = X.iloc[train_index], y.iloc[train_index]
X_test, y_test = X.iloc[test_index], y.iloc[test_index]

In [28]:
# Sanity check: (rows, feature columns) of the training split.
X_train.shape

(3456, 40)

In [16]:
from keras.utils import np_utils
from sklearn.preprocessing import LabelEncoder

Using TensorFlow backend.


In [17]:
# Fit the encoder on the training labels only and reuse that mapping for the
# test labels, so both one-hot matrices share the same class order. Fixing
# num_classes keeps both matrices the same width even if a class is absent
# from one split.
lb = LabelEncoder()
lb.fit(y_train)
num_classes = len(lb.classes_)

y_train = np_utils.to_categorical(lb.transform(y_train), num_classes=num_classes)
y_test = np_utils.to_categorical(lb.transform(y_test), num_classes=num_classes)

In [18]:
# Append a trailing axis of size 1 to the 2-D feature tables, giving
# (rows, columns, 1) arrays for the Conv1D input.
x_traincnn = np.asarray(X_train)[..., np.newaxis]
x_testcnn = np.asarray(X_test)[..., np.newaxis]

In [29]:
# Sanity check: the training array now has a trailing axis of size 1.
x_traincnn.shape

(3456, 40, 1)

In [23]:
import keras
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Embedding
from keras.layers import LSTM
from keras.layers import BatchNormalization
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Input, Flatten, Dropout, Activation
from keras.layers import Conv1D, MaxPooling1D, AveragePooling1D
from keras.models import Model
from keras import optimizers
from keras.callbacks import ModelCheckpoint

In [42]:
model = Sequential()

# Four Conv1D -> BatchNorm -> ELU -> MaxPool stages. Only the filter count
# varies between stages; the first stage additionally declares the input shape.
input_kwargs = {'data_format': 'channels_last', 'input_shape': (X_train.shape[1], 1)}
for stage, n_filters in enumerate((64, 64, 128, 128)):
    stage_kwargs = input_kwargs if stage == 0 else {}
    model.add(Conv1D(filters=n_filters, kernel_size=3, strides=1, padding='same', **stage_kwargs))
    model.add(BatchNormalization())
    model.add(Activation('elu'))
    model.add(MaxPooling1D(pool_size=4, strides=1))

# Recurrent layer over the convolutional feature sequence.
model.add(LSTM(units=64))

# NOTE(review): 16 output units presumably match the number of distinct
# "<emotion>_<gender>" labels -- verify against the width of y_train.
model.add(Dense(units=16, activation='softmax'))




In [43]:
# Print the layer-by-layer output shapes and parameter counts.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_23 (Conv1D)           (None, 40, 64)            256       
_________________________________________________________________
batch_normalization_23 (Batc (None, 40, 64)            256       
_________________________________________________________________
activation_23 (Activation)   (None, 40, 64)            0         
_________________________________________________________________
max_pooling1d_23 (MaxPooling (None, 37, 64)            0         
_________________________________________________________________
conv1d_24 (Conv1D)           (None, 37, 64)            12352     
_________________________________________________________________
batch_normalization_24 (Batc (None, 37, 64)            256       
_________________________________________________________________
activation_24 (Activation)   (None, 37, 64)            0         
__________

In [44]:

# Categorical cross-entropy to go with the one-hot targets and softmax output.
model.compile(
    loss='categorical_crossentropy',
    optimizer='Adam',
    metrics=['accuracy'],
)

In [None]:
# NOTE(review): the held-out test split doubles as validation data here, so
# the validation metrics are not an independent estimate -- confirm that this
# is intended.
cnnhistory = model.fit(
    x_traincnn,
    y_train,
    validation_data=(x_testcnn, y_test),
    epochs=700,
    batch_size=16,
)

Train on 3456 samples, validate on 864 samples
Epoch 1/700
Epoch 2/700
Epoch 3/700
Epoch 4/700
Epoch 5/700
Epoch 6/700
Epoch 7/700
Epoch 8/700
Epoch 9/700
Epoch 10/700
Epoch 11/700
Epoch 12/700