## Importing the libraries

In [105]:
import os
import numpy as np
np.random.seed(1969)
import tensorflow as tf
tf.set_random_seed(1969)

from scipy import signal
from glob import glob
import re
import pandas as pd
import gc
from scipy.io import wavfile

from keras import optimizers, losses, activations, models
from keras.layers import GRU, Convolution2D, Dense, Input, Flatten, Dropout, MaxPooling2D, BatchNormalization, Conv3D, ConvLSTM2D,Bidirectional,LSTM
from keras.callbacks import TensorBoard
from keras.models import Sequential
from tqdm import tqdm
from sklearn.model_selection import GroupKFold
from python_speech_features import mfcc
from python_speech_features import delta
from python_speech_features import logfbank
import os
import pandas as pd
import librosa
import glob 
from keras import *
from keras.layers import *
from keras.callbacks import *
from keras.utils import np_utils
from sklearn.preprocessing import LabelEncoder

## Defining Utility Functions

In [107]:
L=16000
def custom_fft(y, fs):
    T = 1.0 / fs
    N = y.shape[0]
    yf = fft(y)
    xf = np.linspace(0.0, 1.0/(2.0*T), N//2)
    # FFT is simmetrical, so we take just the first half
    # FFT is also complex, to we take just the real part (abs)
    vals = 2.0/N * np.abs(yf[0:N//2])
    return xf, vals

# Function for getting log-spectrogram from raw wave files
def log_specgram(audio, sample_rate, window_size=20,
                 step_size=10, eps=1e-10):
    nperseg = int(round(window_size * sample_rate / 1e3))
    noverlap = int(round(step_size * sample_rate / 1e3))
    freqs, times, spec = signal.spectrogram(audio,
                                    fs=sample_rate,
                                    window='hann',
                                    nperseg=nperseg,
                                    noverlap=noverlap,
                                    detrend=False)
    return freqs, times, np.log(spec.T.astype(np.float32) + eps)

# Function for padding audio files
def pad_audio(samples):
    if len(samples) >= L: return samples
    else: return np.pad(samples, pad_width=(L - len(samples), 0), mode='constant', constant_values=(0, 0))
    
# Function for chopping audio files
def chop_audio(samples, L=16000, num=20):
    for i in range(num):
        beg = np.random.randint(0, len(samples) - L)
        yield samples[beg: beg + L]
        
def label_transform(labels):
    nlabels = []
    for label in labels:
        if label == '_background_noise_':
            nlabels.append('silence')
        elif label not in legal_labels:
            nlabels.append('unknown')
        else:
            nlabels.append(label)
    return pd.get_dummies(pd.Series(nlabels))

# Function for getting spectrogram from raw wave files
def get_spectrogram(wav):
    D = librosa.stft(wav, n_fft=480, hop_length=160,
                     win_length=480, window='hamming')
    spect, phase = librosa.magphase(D)
    return spect

## Creating Training Set

In [113]:
new_sample_rate=16000
y_train = []
x_train = np.zeros((7354,99,26),np.float32)
#x_train = np.zeros((4368,99,26),np.float32)
ix = 0
mylist1=os.listdir('meld/train/')
for file in mylist1:
    mylist= os.listdir('meld/train/'+file+"/")
    for index,y in enumerate(mylist):
        samples, sample_rate = librosa.load('meld/train/'+file+"/"+y,mono=True,sr=16000)
        samples=np.array(samples*32768,dtype = "int16")

        samples = pad_audio(samples)

        if len(samples) > 16000:
            n_samples = chop_audio(samples)
        else: n_samples = [samples]
        for samples in n_samples:
            resampled = signal.resample(samples, int(new_sample_rate / sample_rate * samples.shape[0]))
            _, _, specgram = log_specgram(resampled, sample_rate=new_sample_rate)
            filter_banks = logfbank(samples)
            filter_banks -= (np.mean(filter_banks, axis=0) + 1e-8)
            x_train[ix,:,:] = filter_banks
        y_train.append(file)
        ix += 1

x_train = np.array(x_train)
lb = LabelEncoder()
y_train = np_utils.to_categorical(lb.fit_transform(y_train))

## Creating Validation Set

In [114]:
new_sample_rate = 16000
y_test = []
x_test = np.zeros((830,99,26),np.float32)
ix = 0
mylist1=os.listdir('meld/val/')
for file in mylist1:
    mylist= os.listdir('meld/val/'+file+"/")
    for index,y in enumerate(mylist):
        #Loading the sound file
        samples, sample_rate = librosa.load('meld/val/'+file+"/"+y,mono=True,sr=16000)
        samples=np.array(samples*32768,dtype = "int16")
        #Padding the smaller sound files
        samples = pad_audio(samples)
        # Chopping the larger sound files
        if len(samples) > 16000:
            n_samples = chop_audio(samples)
        else: n_samples = [samples]
        for samples in n_samples:
            resampled = signal.resample(samples, int(new_sample_rate / sample_rate * samples.shape[0]))
            _, _, specgram = log_specgram(resampled, sample_rate=new_sample_rate)
            
            filter_banks = logfbank(samples)
            filter_banks -= (np.mean(filter_banks, axis=0) + 1e-8)
            x_test[ix,:,:] = filter_banks
        y_test.append(file)
        ix += 1
# Converting test and train to numpy array
x_test = np.array(x_test)
y_test = np_utils.to_categorical(lb.transform(y_test))

In [116]:
#Clearing Keras Session
from keras import backend as K
K.clear_session()

## Defining Attention Mechanism

In [68]:
class Attention(Layer):
    def __init__(self, step_dim,
                 W_regularizer=None, b_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 bias=True, **kwargs):
        self.supports_masking = True
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias
        self.step_dim = step_dim
        self.features_dim = 0
        super(Attention, self).__init__(**kwargs)

    def build(self, input_shape):
        assert len(input_shape) == 3

        self.W = self.add_weight((input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        self.features_dim = input_shape[-1]

        if self.bias:
            self.b = self.add_weight((input_shape[1],),
                                     initializer='zero',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
        else:
            self.b = None

        self.built = True

    def compute_mask(self, input, input_mask=None):
        return None

    def call(self, x, mask=None):
        features_dim = self.features_dim
        step_dim = self.step_dim

        eij = K.reshape(K.dot(K.reshape(x, (-1, features_dim)),
                        K.reshape(self.W, (features_dim, 1))), (-1, step_dim))

        if self.bias:
            eij += self.b

        eij = K.tanh(eij)

        a = K.exp(eij)

        if mask is not None:
            a *= K.cast(mask, K.floatx())

        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        a = K.expand_dims(a)
        weighted_input = x * a
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        return input_shape[0],  self.features_dim

## Defining Cyclic Learning Rate

In [83]:
from keras.callbacks import *

class CyclicLR(Callback):
    """This callback implements a cyclical learning rate policy (CLR).
    The method cycles the learning rate between two boundaries with
    some constant frequency, as detailed in this paper (https://arxiv.org/abs/1506.01186).
    The amplitude of the cycle can be scaled on a per-iteration or 
    per-cycle basis.
    This class has three built-in policies, as put forth in the paper.
    "triangular":
        A basic triangular cycle w/ no amplitude scaling.
    "triangular2":
        A basic triangular cycle that scales initial amplitude by half each cycle.
    "exp_range":
        A cycle that scales initial amplitude by gamma**(cycle iterations) at each 
        cycle iteration.
    For more detail, please see paper.
    
    # Example
        ```python
            clr = CyclicLR(base_lr=0.001, max_lr=0.006,
                                step_size=2000., mode='triangular')
            model.fit(X_train, Y_train, callbacks=[clr])
        ```
    
    Class also supports custom scaling functions:
        ```python
            clr_fn = lambda x: 0.5*(1+np.sin(x*np.pi/2.))
            clr = CyclicLR(base_lr=0.001, max_lr=0.006,
                                step_size=2000., scale_fn=clr_fn,
                                scale_mode='cycle')
            model.fit(X_train, Y_train, callbacks=[clr])
        ```    
    # Arguments
        base_lr: initial learning rate which is the
            lower boundary in the cycle.
        max_lr: upper boundary in the cycle. Functionally,
            it defines the cycle amplitude (max_lr - base_lr).
            The lr at any cycle is the sum of base_lr
            and some scaling of the amplitude; therefore 
            max_lr may not actually be reached depending on
            scaling function.
        step_size: number of training iterations per
            half cycle. Authors suggest setting step_size
            2-8 x training iterations in epoch.
        mode: one of {triangular, triangular2, exp_range}.
            Default 'triangular'.
            Values correspond to policies detailed above.
            If scale_fn is not None, this argument is ignored.
        gamma: constant in 'exp_range' scaling function:
            gamma**(cycle iterations)
        scale_fn: Custom scaling policy defined by a single
            argument lambda function, where 
            0 <= scale_fn(x) <= 1 for all x >= 0.
            mode paramater is ignored 
        scale_mode: {'cycle', 'iterations'}.
            Defines whether scale_fn is evaluated on 
            cycle number or cycle iterations (training
            iterations since start of cycle). Default is 'cycle'.
    """

    def __init__(self, base_lr=0.001, max_lr=0.006, step_size=2000., mode='triangular',
                 gamma=1., scale_fn=None, scale_mode='cycle'):
        super(CyclicLR, self).__init__()

        self.base_lr = base_lr
        self.max_lr = max_lr
        self.step_size = step_size
        self.mode = mode
        self.gamma = gamma
        if scale_fn == None:
            if self.mode == 'triangular':
                self.scale_fn = lambda x: 1.
                self.scale_mode = 'cycle'
            elif self.mode == 'triangular2':
                self.scale_fn = lambda x: 1/(2.**(x-1))
                self.scale_mode = 'cycle'
            elif self.mode == 'exp_range':
                self.scale_fn = lambda x: gamma**(x)
                self.scale_mode = 'iterations'
        else:
            self.scale_fn = scale_fn
            self.scale_mode = scale_mode
        self.clr_iterations = 0.
        self.trn_iterations = 0.
        self.history = {}

        self._reset()

    def _reset(self, new_base_lr=None, new_max_lr=None,
               new_step_size=None):
        """Resets cycle iterations.
        Optional boundary/step size adjustment.
        """
        if new_base_lr != None:
            self.base_lr = new_base_lr
        if new_max_lr != None:
            self.max_lr = new_max_lr
        if new_step_size != None:
            self.step_size = new_step_size
        self.clr_iterations = 0.
        
    def clr(self):
        cycle = np.floor(1+self.clr_iterations/(2*self.step_size))
        x = np.abs(self.clr_iterations/self.step_size - 2*cycle + 1)
        if self.scale_mode == 'cycle':
            return self.base_lr + (self.max_lr-self.base_lr)*np.maximum(0, (1-x))*self.scale_fn(cycle)
        else:
            return self.base_lr + (self.max_lr-self.base_lr)*np.maximum(0, (1-x))*self.scale_fn(self.clr_iterations)
        
    def on_train_begin(self, logs={}):
        logs = logs or {}

        if self.clr_iterations == 0:
            K.set_value(self.model.optimizer.lr, self.base_lr)
        else:
            K.set_value(self.model.optimizer.lr, self.clr())        
            
    def on_batch_end(self, epoch, logs=None):
        
        logs = logs or {}
        self.trn_iterations += 1
        self.clr_iterations += 1

        self.history.setdefault('lr', []).append(K.get_value(self.model.optimizer.lr))
        self.history.setdefault('iterations', []).append(self.trn_iterations)

        for k, v in logs.items():
            self.history.setdefault(k, []).append(v)
        
        K.set_value(self.model.optimizer.lr, self.clr())
        
# Defining Cyclic Learning Rate
clr = CyclicLR(base_lr=0.001, max_lr=0.002,
              step_size=300., mode='exp_range',
              gamma=0.99994)

## Defining Basic LSTM

In [51]:
model = Sequential()
model.add(GRU(256,input_shape=(99,26)))
model.add(Dropout(0.5))
model.add(Dense(12, activation='softmax'))
model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['categorical_accuracy'])
model.summary()
weights = model.get_weights()
history = model.fit(x_train, y_train, batch_size=256, validation_data=(x_train, y_train), epochs=20, shuffle=True, verbose=1)

## Defining the Final Model

In [117]:
from keras.optimizers import Adam
adam = Adam(lr=1e-4, clipnorm=1.0)
K.clear_session()
def model_lstm(input_shape):
    inp = Input(shape=(input_shape[1], input_shape[2],))
    lstm = Bidirectional(LSTM(128, return_sequences=True))(inp)
    gru = Bidirectional(LSTM(64, return_sequences=True))(lstm)
    attention1 = Attention(input_shape[1])(lstm)
    attention2 = Attention(input_shape[1])(gru)
    avg_pool = GlobalAveragePooling1D()(gru)
    max_pool = GlobalMaxPooling1D()(gru)
    x = concatenate([attention1, attention2,avg_pool,max_pool], axis=1)
    x = BatchNormalization()(x)
    x = Dense(16, activation="relu")(x)
    x = Dropout(0.2)(x)
    x = Dense(5, activation="softmax")(x)
    model = Model(inputs=inp, outputs=x)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=["accuracy",'categorical_accuracy'])
    
    return model

In [118]:
model = model_lstm(x_train.shape)

In [119]:
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 99, 26)       0                                            
__________________________________________________________________________________________________
bidirectional_1 (Bidirectional) (None, 99, 256)      158720      input_1[0][0]                    
__________________________________________________________________________________________________
bidirectional_2 (Bidirectional) (None, 99, 128)      164352      bidirectional_1[0][0]            
__________________________________________________________________________________________________
attention_1 (Attention)         (None, 256)          355         bidirectional_1[0][0]            
____________________________________________________________________________________________

In [None]:
# Training the model
model.fit(x_train, y_train, batch_size=128, epochs=20, validation_data=[x_test, y_test],shuffle=True,callbacks=[clr])

In [24]:
import matplotlib.pyplot as plt
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()

<Figure size 640x480 with 1 Axes>

## Saving and loading the model for predictions

In [25]:
model.save('Emotion_Voice_Detection_Model_RNN.h5')

In [26]:
import json
model_json = model.to_json()
with open("model.json", "w") as json_file:
    json_file.write(model_json)

In [27]:
# loading json and creating model
from keras.models import model_from_json
json_file = open('model.json', 'r')
loaded_model_json = json_file.read()
json_file.close()
loaded_model = model_from_json(loaded_model_json)
# load weights into new model
loaded_model.load_weights("Emotion_Voice_Detection_Model_RNN.h5")
print("Loaded model from disk")

Loaded model from disk


In [28]:
# Compiling the model
loaded_model.compile(loss='binary_crossentropy', optimizer="adam", metrics=["categorical_accuracy"])
score = loaded_model.evaluate(x_test, y_test, verbose=0)

In [29]:
score

[0.37828371251922055, 0.5951807228915663]

In [30]:
# Predicting with the loaded model
preds = loaded_model.predict(x_test, 
                         batch_size=32, 
                         verbose=1)
preds1=preds.argmax(axis=1)
abc = preds1.astype(int).flatten()
predictions = (lb.inverse_transform((abc)))



In [39]:
preddf = pd.DataFrame({'predictedvalues': predictions})
preddf["predictedvalues"].value_counts()

neutral    740
happy       89
disgust      1
Name: predictedvalues, dtype: int64