## Loading libraries

In [180]:
import os
import numpy as np
np.random.seed(1969)
import tensorflow as tf
tf.set_random_seed(1969)

from scipy import signal
from glob import glob
import re
import pandas as pd
import gc
from scipy.io import wavfile

from keras import optimizers, losses, activations, models
from keras.layers import GRU, Convolution2D, Dense, Input, Flatten, Dropout, MaxPooling2D, BatchNormalization, Conv3D, ConvLSTM2D,Conv1D,Activation,LSTM
from keras.callbacks import TensorBoard
from keras.models import Sequential
from tqdm import tqdm
from sklearn.model_selection import GroupKFold
from python_speech_features import mfcc
from python_speech_features import delta
from python_speech_features import logfbank

import os
import pandas as pd
import librosa
import glob

from keras.utils import np_utils
from sklearn.preprocessing import LabelEncoder

from keras import backend as K
from keras.layers import Dense, SimpleRNN, LSTM, Bidirectional, TimeDistributed, Conv1D, ZeroPadding1D, GRU
from keras.layers import Lambda, Input, Dropout, Masking, BatchNormalization, Activation
from keras.models import Model
import json

## Defining utility functions

In [182]:
L=16000
def custom_fft(y, fs):
    T = 1.0 / fs
    N = y.shape[0]
    yf = fft(y)
    xf = np.linspace(0.0, 1.0/(2.0*T), N//2)
    # FFT is simmetrical, so we take just the first half
    # FFT is also complex, to we take just the real part (abs)
    vals = 2.0/N * np.abs(yf[0:N//2])
    return xf, vals

def log_specgram(audio, sample_rate, window_size=20,
                 step_size=10, eps=1e-10):
    nperseg = int(round(window_size * sample_rate / 1e3))
    noverlap = int(round(step_size * sample_rate / 1e3))
    freqs, times, spec = signal.spectrogram(audio,
                                    fs=sample_rate,
                                    window='hann',
                                    nperseg=nperseg,
                                    noverlap=noverlap,
                                    detrend=False)
    return freqs, times, np.log(spec.T.astype(np.float32) + eps)

# Function for padding smaller wave files
def pad_audio(samples):
    if len(samples) >= L: return samples
    else: return np.pad(samples, pad_width=(L - len(samples), 0), mode='constant', constant_values=(0, 0))
    
# Function for choppipng larger wave files
def chop_audio(samples, L=16000, num=20):
    for i in range(num):
        beg = np.random.randint(0, len(samples) - L)
        yield samples[beg: beg + L]

def get_spectrogram(wav):
    D = librosa.stft(wav, n_fft=480, hop_length=160,
                     win_length=480, window='hamming')
    spect, phase = librosa.magphase(D)
    return spect

## Loading the training file

In [184]:
new_sample_rate = 16000
y_train = []
x_train = []
mylist1=os.listdir('meld/train/')
for file in mylist1:
    mylist= os.listdir('meld/train/'+file+"/")
    for index,y in enumerate(mylist):
        # Loadingthe wave files
        samples, sample_rate = librosa.load('meld/train/'+file+"/"+y,mono=True,sr=16000)
        samples=np.array(samples*32768,dtype = "int16")
        samples = pad_audio(samples)
        if len(samples) > 16000:
            n_samples = chop_audio(samples)
        else: n_samples = [samples]
        for samples in n_samples:
            # Resampling the wave files
            resampled = signal.resample(samples, int(new_sample_rate / sample_rate * samples.shape[0]))
            # Log specgrams of wave files
            _, _, specgram = log_specgram(resampled, sample_rate=new_sample_rate)
            x_train.append(specgram)
            y_train.append(file)
# Converting training files to numpy array
x_train = np.array(x_train)
lb = LabelEncoder()
y_train = np_utils.to_categorical(lb.fit_transform(y_train))

## Loading the validation file

In [185]:
new_sample_rate = 16000
y_test = []
x_test = []
mylist1=os.listdir('meld/val/')
for file in mylist1:
    mylist= os.listdir('meld/val/'+file+"/")
    for index,y in enumerate(mylist):
        samples, sample_rate = librosa.load('meld/val/'+file+"/"+y,mono=True,sr=16000)
        samples=np.array(samples*32768,dtype = "int16")
        samples = pad_audio(samples)
        if len(samples) > 16000:
            n_samples = chop_audio(samples)
        else: n_samples = [samples]
        for samples in n_samples:
            # Resampling wave files
            resampled = signal.resample(samples, int(new_sample_rate / sample_rate * samples.shape[0]))
            # Log specgram of wave files
            _, _, specgram = log_specgram(resampled, sample_rate=new_sample_rate)
            x_test.append(specgram)
            y_test.append(file)

x_test = np.array(x_test)
y_test = np_utils.to_categorical(lb.transform(y_test))

In [190]:
# Adding extra dimension for training 2D CNN
x_train = x_train.reshape(tuple(list(x_train.shape) + [1]))
x_test = x_test.reshape(tuple(list(x_test.shape) + [1]))

In [206]:
K.clear_session()

## Defining and training 2D CNN

In [207]:
input_shape = (99, 161, 1)
nclass = 5
inp = Input(shape=input_shape)
norm_inp = BatchNormalization()(inp)
img_1 = Convolution2D(8, kernel_size=2, activation=activations.relu)(norm_inp)
img_1 = Convolution2D(8, kernel_size=2, activation=activations.relu)(img_1)
img_1 = MaxPooling2D(pool_size=(2, 2))(img_1)
img_1 = Dropout(rate=0.2)(img_1)
img_1 = Convolution2D(16, kernel_size=3, activation=activations.relu)(img_1)
img_1 = Convolution2D(16, kernel_size=3, activation=activations.relu)(img_1)
img_1 = MaxPooling2D(pool_size=(2, 2))(img_1)
img_1 = Dropout(rate=0.2)(img_1)
img_1 = Convolution2D(32, kernel_size=3, activation=activations.relu)(img_1)
img_1 = MaxPooling2D(pool_size=(2, 2))(img_1)
img_1 = Dropout(rate=0.2)(img_1)
img_1 = Flatten()(img_1)

dense_1 = BatchNormalization()(Dense(128, activation=activations.relu)(img_1))
dense_1 = BatchNormalization()(Dense(128, activation=activations.relu)(dense_1))
dense_1 = Dense(nclass, activation=activations.softmax)(dense_1)

model = models.Model(inputs=inp, outputs=dense_1)
opt = optimizers.Adam()

model.compile(optimizer=opt, loss=losses.categorical_crossentropy,metrics=["accuracy","categorical_accuracy"])
model.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 101, 40, 1)        0         
_________________________________________________________________
batch_normalization_1 (Batch (None, 101, 40, 1)        4         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 100, 39, 8)        40        
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 99, 38, 8)         264       
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 49, 19, 8)         0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 49, 19, 8)         0         
_________________________________________________________________
conv2d_3 (Conv2D)            (None, 47, 17, 16)        1168

In [None]:
# Training the model
cnn_history=model.fit(x_train, y_train, batch_size=64,  epochs=10, validation_data=(x_test, y_test),shuffle=True)

## Defining and training 1D CNN

In [50]:
def cnn_lstm(input_dim, output_dim, dropout=0.2, n_layers=1):
    dtype = 'float32'
    input_data = Input(name='the_input', shape=input_dim, dtype=dtype)
    x = Conv1D(filters=256, kernel_size=10, strides=4, name='conv_1')(input_data)
    x = BatchNormalization()(x)
    x = Activation('relu')(x)
    x = Dropout(dropout, name='dropout_1')(x)
    x = LSTM(128, activation='relu', return_sequences=True,
             dropout=dropout, name='lstm_1')(x)
    x = LSTM(128, activation='relu', return_sequences=False,
             dropout=dropout, name='lstm_2')(x)
    x = Dense(units=64, activation='relu', name='fc')(x)
    x = Dropout(dropout, name='dropout_2')(x)
    y_pred = Dense(units=output_dim, activation='softmax', name='softmax')(x)
    network_model = Model(inputs=input_data, outputs=y_pred)
    return network_model

input_dim = (99, 81)
K.clear_session()
model = cnn_lstm(input_dim, 5)
model.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
the_input (InputLayer)       (None, 99, 81)            0         
_________________________________________________________________
conv_1 (Conv1D)              (None, 23, 256)           207616    
_________________________________________________________________
batch_normalization_1 (Batch (None, 23, 256)           1024      
_________________________________________________________________
activation_1 (Activation)    (None, 23, 256)           0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 23, 256)           0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 23, 128)           197120    
_________________________________________________________________
lstm_2 (LSTM)                (None, 128)               1315

In [None]:
# Training the model
from keras.optimizers import Adam, SGD
sgd = SGD(lr=0.00001, clipnorm=1.0)
adam = Adam(lr=1e-4, clipnorm=1.0)
model.compile(loss='categorical_crossentropy',
              optimizer=adam,
              metrics=['accuracy','categorical_accuracy'])
history = model.fit(x_train, y_train,
                    batch_size=128, epochs=10,
                    validation_data=(x_test, y_test),shuffle=True,verbose=2)

## Saving and loading the model for predictions

In [165]:
# Saving the model
model.save('Emotion_Voice_Detection_Model_Filters.h5')

In [166]:
model_json = model.to_json()
with open("model.json", "w") as json_file:
    json_file.write(model_json)

In [167]:
# loading json and creating model
from keras.models import model_from_json
json_file = open('model.json', 'r')
loaded_model_json = json_file.read()
json_file.close()
loaded_model = model_from_json(loaded_model_json)
# load weights into new model
loaded_model.load_weights("Emotion_Voice_Detection_Model_Filters.h5")
print("Loaded model from disk")

Loaded model from disk


In [169]:
# Compiling the loaded model
#loaded_model.compile(loss='binary_crossentropy', optimizer=opt, metrics=["accuracy","categorical_accuracy"])
loaded_model.compile(optimizer=opt, loss="categorical_crossentropy",metrics=["accuracy","categorical_accuracy"])
score = loaded_model.evaluate(x_test, y_test, verbose=0)

In [170]:
score

[1.0788483412869005, 0.6228915662650603, 0.6228915662650603]

In [171]:
# Predicting and checking predictions
preds = loaded_model.predict(x_test, 
                         batch_size=32, 
                         verbose=1)
preds1=preds.argmax(axis=1)
abc = preds1.astype(int).flatten()
predictions = (lb.inverse_transform((abc)))



In [179]:
preddf = pd.DataFrame({'predictedvalues': predictions})
preddf["predictedvalues"].value_counts()

neutral    830
Name: predictedvalues, dtype: int64