In [1]:
# Importing required libraries 
# Keras
import keras
from keras import regularizers
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, Model, model_from_json
from keras.layers import Dense, Embedding, LSTM
from keras.layers import Input, Flatten, Dropout, Activation, BatchNormalization
from keras.layers import Conv1D, MaxPooling1D, AveragePooling1D
from keras.utils import np_utils
from tensorflow.keras.utils import to_categorical
from keras.callbacks import ModelCheckpoint

# sklearn
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

import torch

# Other  
import librosa
import librosa.display
import json
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from matplotlib.pyplot import specgram
import pandas as pd
import seaborn as sns
import glob 
import os
from tqdm import tqdm
import pickle
import IPython.display as ipd  # To play sound in the notebook

In [None]:
devices = [d for d in range(torch.cuda.device_count())]
print(devices)

[0]


###### Let us create functions for data augmentation.
###### There are multiple data augmentation methods. But I've mentioned only 6 methods. Among these 6 methods we gonna use only 2 methods that we'll discuss later.

###### For sake of notebook I've written all 6 methods in below cell.

In [3]:
#  Augmentation Methods

def noise(data):
    """
    Adding White Noise
    ------------------
    White Noises are random sample distribution at
    regular intervals with mean of standard deviation of 1.
    
    """
    noise_amp = 0.05 * np.random.uniform() * np.random.normal(size = data.shape[0])
    # more noise reduce the value to 0.5
    data = data.astype('float64') + noise_amp * np.random.normal(size = data.shape[0])
    return data


def shift(data):
    """
    Random shifting
    ----------------
    The pixels of the image can be shifted horizontally or vertically.
    Here Image is mfcc spectrum
    """
    s_range = int(np.random.uniform(low = -5, high = 5)*1000)
    return np.roll(data, s_range)

def stretch(data, rate = 0.8):
    """
    Streching the sound. Note that this expands the dataset slightly
    """
    data = librosa.effects.time_stretch(data, rate)
    return data

def pitch(data, sample_rate):
    """
    Pitch Tuning
    """
    bins_per_octave = 12
    pitch_pm = 2
    pitch_change = pitch_pm * 2 * (np.random.uniform())
    data = librosa.effects.pitch_shift(data.astype('float64'),
                                      sample_rate,
                                      n_steps = pitch_change,
                                      bins_per_octave = bins_per_octave
                                      )
    return data

def dyn_change(data):
    """
    Random value change
    """
    dyn_change = np.random.uniform(low = -0.5, high = 7)
    return data * dyn_change

def speedNpitch(data):
    """
    speed and Pitch Tuning
    """
    length_change = np.random.uniform(low = 0.8, high = 1)
    speed_fac = 1.2 / length_change # try changing 1.0 to 2.0...
    tmp = np.interp(np.arange(0, len(data), speed_fac), np.arange(0, len(data)), data)
    minlen = min(data.shape[0], tmp.shape[0])
    data *=0
    data[0:minlen] = tmp[0:minlen]
    return data


# Confusion matrix heat map plot

def print_confusion_matrix(confusion_matrix, class_names, figsize = (10, 7), fontsize = 14):
    """Prints a confusion matrix, as returned by sklearn.metrics.confusion_matrix, as a heatmap.
    
    Arguments
    ---------
    confusion_matrix: numpy.ndarray
        The numpy.ndarray object returned from a call to sklearn.metrics.confusion_matrix. 
        Similarly constructed ndarrays can also be used.
    class_names: list
        An ordered list of class names, in the order they index the given confusion matrix.
    figsize: tuple
        A 2-long tuple, the first value determining the horizontal size of the ouputted figure,
        the second determining the vertical size. Defaults to (10,7).
    fontsize: int
        Font size for axes labels. Defaults to 14.
        
    Returns
    -------
    matplotlib.figure.Figure
        The resulting confusion matrix figure
    """
    df_cm = pd.DataFrame(confusion_matrix, index = class_names, columns = class_names)
    fig = plt.figure(figsize = figsize)
    try:
        heatmap = sns.heatmap(df_cm, annot = True, fmt = 'd')
    except ValueError:
        raise ValueError('Confusion matrix must be integers.')
    
    heatmap.yaxis.set_ticklabels(heatmap.yaxis.get_ticklabels(), rotation=0, ha='right', fontsize=fontsize)
    heatmap.xaxis.set_ticklabels(heatmap.xaxis.get_ticklabels(), rotation=45, ha='right', fontsize=fontsize)
                                 
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

Explore Augmentation method

###### Now its time to see how augmentation affects the original examples.

###### Initially we'll apply all steps on single audio example to get the idea how exactly augmentation process works.

In [None]:
# Use one audio file in previous parts again
fname = 'E:\Important\Code\MinorProject\\final datasets\SAVEE\ALL\DC_a01.wav'
data, sampling_rate = librosa.load(fname)
librosa.display.waveshow(data, sr = sampling_rate)

# Audio
ipd.Audio(data, rate = sampling_rate)

adding static white noise in the bckg

In [None]:
x = noise(data)
plt.figure(figsize = (15, 5))
librosa.display.waveshow(x, sr = sampling_rate)

ipd.Audio(x, rate = sampling_rate)

Shift

In [None]:
x = shift(data)
plt.figure(figsize = (15, 5))
# orginial plot
librosa.display.waveshow(data, sr = sampling_rate)
plt.title('Original Audio Plot', size = 24)
plt.show()
plt.figure(figsize = (15, 5))
librosa.display.waveshow(x, sr = sampling_rate)
plt.title('Augmented Audio Plot', size = 24)
plt.show()
ipd.Audio(x, rate = sampling_rate)

###### So its not very noticable but what I've done there is move the audio randomly to either the left or right direction, within the fix audio duration. So if you compare this to the original plot, you can see the same audio wave pattern, except there's a tiny bit of delay before the speaker starts speaking.

# stretch

In [None]:
x = stretch(data)
plt.figure(figsize = (15, 5))
# orginial plot
librosa.display.waveshow(data, sr = sampling_rate)
plt.title('Original Audio Plot', size = 24)
plt.show()
plt.figure(figsize = (15, 5))
librosa.display.waveshow(x, sr = sampling_rate)
plt.title('Augmented Audio Plot', size = 24)
plt.show()
ipd.Audio(x, rate = sampling_rate)

###### This one is one of the more dramatic augmentation methods. The method literally stretches the audio. So the duration is longer, but the audio wave gets strecthed too. Thus introducing and effect that sounds like a slow motion sound. If you look at the audio wave itself, you'll notice that compared to the orginal audio, the strected audio seems to hit a higher frequency note. Thus creating a more diverse data for augmentation. It does introduce a bit of a challenge in the data prep stage cause it lengthens the audio duration. Something to consider especially when doing a 2D CNN.

## changing pitch 

In [None]:
x = pitch(data, sampling_rate)
plt.figure(figsize = (15, 5))
# orginial plot
librosa.display.waveshow(data, sr = sampling_rate)
plt.title('Original Audio Plot', size = 24)
plt.show()
plt.figure(figsize = (15, 5))
librosa.display.waveshow(x, sr = sampling_rate)
plt.title('Augmented Audio Plot', size = 24)
plt.show()
ipd.Audio(x, rate = sampling_rate)

Dynamic Change

In [None]:
x = dyn_change(data)
plt.figure(figsize = (15, 5))
# orginial plot
librosa.display.waveshow(data, sr = sampling_rate)
plt.title('Original Audio Plot', size = 24)
plt.show()
plt.figure(figsize = (15, 5))
librosa.display.waveshow(x, sr = sampling_rate)
plt.title('Augmented Audio Plot', size = 24)
plt.show()
ipd.Audio(x, rate = sampling_rate)

Speed and Pitch

In [None]:
x = speedNpitch(data)
plt.figure(figsize = (15, 5))
# orginial plot
librosa.display.waveshow(data, sr = sampling_rate)
plt.title('Original Audio Plot', size = 24)
plt.show()
plt.figure(figsize = (15, 5))
librosa.display.waveshow(x, sr = sampling_rate)
plt.title('Augmented Audio Plot', size = 24)
plt.show()
ipd.Audio(x, rate = sampling_rate)

###### I really like this augmentation method. It dramatically alters the audio in many ways. It compresses the audio wave but keeping the audio duration the same. If you listen to it, the effect is opposite of the stretch augmentation method. An angry person when applied this augmentation method, to the human ear, will really alter the emotion interpretation of this audio. Not sure if this is counter productive to the algorithm, but lets try it. Another potential, downside is that there will be silence in the later part of the audio.

# Data Preparation and Preprocessing

We'll use only two augmentation methods but can be used many more as per convenience.

White Noise
Speed and Pitch

In [4]:
# let's pick up the meta-data that we got from our first part of the kernel
ref = pd.read_csv('E:\Important\Code\MinorProject\Data_path.csv')
ref.head()

Unnamed: 0,labels,source,path
0,male_angry,SAVEE,E:\Important\Code\MinorProject\final datasets\...
1,male_angry,SAVEE,E:\Important\Code\MinorProject\final datasets\...
2,male_angry,SAVEE,E:\Important\Code\MinorProject\final datasets\...
3,male_angry,SAVEE,E:\Important\Code\MinorProject\final datasets\...
4,male_angry,SAVEE,E:\Important\Code\MinorProject\final datasets\...


In [7]:
# this will take time (~16 mins) as we are iterating over 4 datasets and with augmentation

df = pd.DataFrame(columns = ['feature'])
df_noise = pd.DataFrame(columns = ['feature'])
df_speedNpitch = pd.DataFrame(columns = ['feature'])
count = 0

# loop feature extraction over the entire dataset
for i in tqdm(ref.path):
    # first load the audio
    X, sample_rate = librosa.load(i,
                                 res_type = 'kaiser_fast',
                                 duration = 2.5,
                                 sr = 44100,
                                 offset = 0.5
                                 )
    # take mfcc and mean as a feature. we can also apply min and max, so on 
    mfccs = np.mean(librosa.feature.mfcc(y = X,
                                        sr = np.array(sample_rate),
                                        n_mfcc = 13), axis = 0)
    df.loc[count] = [mfccs]
    
    # random shifting  ----
            # *****
    # stretch ----
            # *****
    # pitch ----
            # *****
    # noise
    aug = noise(X)
    aug = np.mean(librosa.feature.mfcc(y = aug,
                                      sr = np.array(sample_rate),
                                      n_mfcc = 13), axis = 0)
    df_noise.loc[count] = [aug]
    
    # speed and pitch
    aug = speedNpitch(X)
    aug = np.mean(librosa.feature.mfcc(y = aug,
                                      sr = np.array(sample_rate),
                                      n_mfcc = 13), axis = 0)
    df_speedNpitch.loc[count] = [aug]
    
    count += 1

df.head()

100%|██████████| 12162/12162 [14:53<00:00, 13.61it/s]


Unnamed: 0,feature
0,"[-4.6414213, -3.860898, -6.21919, -5.9265423, ..."
1,"[-8.690716, -12.522837, -22.928043, -23.243807..."
2,"[-8.814859, -12.819055, -24.178183, -23.84745,..."
3,"[-2.2684252, -4.317077, -12.285238, -13.083024..."
4,"[-13.485307, -16.26042, -25.884357, -27.827044..."


###### We'll need to stack them on top of each other to make a larger dataset. But before we do so, we need to make sure the number of columns for the 3 datasets are the same. So lets check it out before we stack them

###### NOTE: If using the stretch augmentation, the audio duration becomes artifically longer and thus the number of columns will be different from original. So some padding will need to be done to the original dataset

In [8]:
# combine
df = pd.concat([ref, pd.DataFrame(df['feature'].values.tolist())], axis = 1)
df_noise = pd.concat([ref, pd.DataFrame(df_noise['feature'].values.tolist())], axis = 1)
df_speedNpitch = pd.concat([ref, pd.DataFrame(df_speedNpitch['feature'].values.tolist())], axis = 1)
print(df.shape, df_noise.shape, df_speedNpitch.shape)

(12162, 219) (12162, 219) (12162, 219)


In [9]:
df = pd.concat([df, df_noise, df_speedNpitch], axis = 0, sort = False)
df = df.fillna(0)
del df_noise, df_speedNpitch
df.head()

Unnamed: 0,labels,source,path,0,1,2,3,4,5,6,...,206,207,208,209,210,211,212,213,214,215
0,male_angry,SAVEE,E:\Important\Code\MinorProject\final datasets\...,-4.641421,-3.860898,-6.21919,-5.926542,-5.850419,-4.80896,-2.513003,...,-4.088852,-5.023864,-5.254714,-5.234095,-5.310307,-5.621666,-6.072197,-6.611348,-3.999875,1.390506
1,male_angry,SAVEE,E:\Important\Code\MinorProject\final datasets\...,-8.690716,-12.522837,-22.928043,-23.243807,-22.926605,-23.432241,-14.830004,...,-22.627258,-22.633406,-22.511597,-24.300154,-24.496809,-22.895985,-23.511503,-24.342152,-16.465857,-8.936035
2,male_angry,SAVEE,E:\Important\Code\MinorProject\final datasets\...,-8.814859,-12.819055,-24.178183,-23.84745,-15.182783,-10.732485,-8.681472,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,male_angry,SAVEE,E:\Important\Code\MinorProject\final datasets\...,-2.268425,-4.317077,-12.285238,-13.083024,-12.041327,-11.819768,-9.414148,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,male_angry,SAVEE,E:\Important\Code\MinorProject\final datasets\...,-13.485307,-16.26042,-25.884357,-27.827044,-27.593534,-26.666508,-18.659023,...,-25.291666,-25.854906,-26.821354,-25.436455,-24.179941,-23.281618,-24.167494,-25.228062,-20.599659,-15.929615


In [10]:
df.shape

(36486, 219)

In [11]:
# split the data into train and test
x_train, x_test, y_train, y_test = train_test_split(df.drop(['path', 'labels', 'source'], axis = 1), 
                                                   df.labels, 
                                                   test_size = 0.25,
                                                   shuffle = True,
                                                   random_state = 0)

# displaying data
x_train[40:50]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,206,207,208,209,210,211,212,213,214,215
5987,-22.07966,-16.708488,-15.398371,-13.871598,-11.843664,-12.924116,-13.052355,-13.427746,-13.055657,-12.775974,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3958,-3.653722,-7.661983,-26.530659,-27.909025,-27.70042,-26.315804,-27.802151,-29.928383,-29.454929,-29.48716,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1839,-28.083273,-24.595102,-24.012786,-23.211279,-25.260085,-23.876274,-23.39841,-22.840234,-21.840656,-23.988655,...,-23.809757,-22.581333,-21.7823,-22.448495,-23.435837,-24.129177,-25.341792,-25.317694,-25.808319,-24.724925
5858,-13.882131,-17.262764,-15.676321,-15.626604,-14.589589,-16.99189,-16.74522,-15.996406,-15.752399,-17.419657,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7726,-19.437696,-15.434696,-14.510869,-15.494093,-15.506308,-17.594122,-19.24325,-18.499249,-16.805786,-17.297819,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4477,-23.883625,-25.533552,-28.410902,-28.562437,-29.18375,-29.930553,-27.535269,-30.217377,-36.815121,-41.187675,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9252,-16.816902,-12.784061,-11.61111,-11.699586,-11.94774,-11.783565,-12.503565,-12.104957,-13.62177,-17.221586,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4455,-29.376492,-31.519032,-29.201942,-27.746609,-27.722698,-28.101051,-27.932087,-27.813494,-30.226576,-34.526054,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7013,-9.242125,-6.113397,-7.197112,-8.741385,-7.697558,-7.44977,-8.596224,-8.702652,-7.000876,-6.63398,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1871,-34.430426,-28.712948,-26.137941,-23.711412,-21.214732,-22.722746,-23.059275,-25.183112,-26.203176,-24.932066,...,-23.301339,-19.78802,-20.100005,-20.359791,-19.383941,-21.89243,-18.865014,-20.083867,-21.729085,-21.497134


#### lets normalise our dataset

In [12]:
mean = np.mean(x_train, axis = 0)
std = np.std(x_train, axis = 0)

x_train = (x_train - mean) / std
x_test = (x_test - mean) / std

# dataset after normalization
x_train[40:50]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,206,207,208,209,210,211,212,213,214,215
5987,-0.176529,0.199646,0.432959,0.546989,0.701484,0.61153,0.595854,0.563301,0.58812,0.606993,...,0.507877,0.495638,0.495427,0.496005,0.483019,0.483133,0.483699,0.473083,0.469982,0.470913
3958,1.320582,0.939085,-0.447011,-0.560179,-0.548131,-0.442364,-0.563682,-0.733493,-0.701313,-0.708452,...,0.507877,0.495638,0.495427,0.496005,0.483019,0.483133,0.483699,0.473083,0.469982,0.470913
1839,-0.664324,-0.444986,-0.247981,-0.189656,-0.355817,-0.250379,-0.217488,-0.176431,-0.102622,-0.27563,...,-0.780821,-0.738657,-0.694376,-0.728448,-0.810334,-0.847582,-0.910996,-0.931754,-0.965791,-0.898324
5858,0.489522,0.154341,0.410988,0.408567,0.485087,0.291406,0.305544,0.361429,0.376082,0.241458,...,0.507877,0.495638,0.495427,0.496005,0.483019,0.483133,0.483699,0.473083,0.469982,0.470913
7726,0.038131,0.303763,0.503113,0.419019,0.412844,0.244012,0.109165,0.164729,0.293257,0.251049,...,0.507877,0.495638,0.495427,0.496005,0.483019,0.483133,0.483699,0.473083,0.469982,0.470913
4477,-0.323101,-0.521693,-0.595638,-0.611715,-0.665027,-0.726836,-0.542702,-0.756205,-1.280027,-1.629475,...,0.507877,0.495638,0.495427,0.496005,0.483019,0.483133,0.483699,0.473083,0.469982,0.470913
9252,0.251071,0.520419,0.732329,0.718301,0.693282,0.701288,0.638997,0.66726,0.543608,0.25705,...,0.507877,0.495638,0.495427,0.496005,0.483019,0.483133,0.483699,0.473083,0.469982,0.470913
4455,-0.769398,-1.010931,-0.658167,-0.547369,-0.549887,-0.582859,-0.573897,-0.567282,-0.761986,-1.105096,...,0.507877,0.495638,0.495427,0.496005,0.483019,0.483133,0.483699,0.473083,0.469982,0.470913
7013,0.866524,1.065663,1.081241,0.951622,1.028224,1.042348,0.946167,0.934649,1.064193,1.090468,...,0.507877,0.495638,0.495427,0.496005,0.483019,0.483133,0.483699,0.473083,0.469982,0.470913
1871,-1.180031,-0.781569,-0.415968,-0.229102,-0.037017,-0.159599,-0.190827,-0.360559,-0.445636,-0.349892,...,-0.753303,-0.585974,-0.602485,-0.614519,-0.586722,-0.724226,-0.554544,-0.641338,-0.738854,-0.719573


Now we have to make our dataset keras compatible for computation.

In [13]:
# Lets few preparation steps to get it into the correct format for Keras 
X_train = np.array(x_train)
y_train = np.array(y_train)
X_test = np.array(x_test)
y_test = np.array(y_test)

# one hot encoding to labels
lb = LabelEncoder()
y_train = np_utils.to_categorical(lb.fit_transform(y_train))
y_test = np_utils.to_categorical(lb.fit_transform(y_test))
print('X_train Shape :: ', x_train.shape)
print('Label Classes :: ',lb.classes_)

X_train Shape ::  (27364, 216)
Label Classes ::  ['female_angry' 'female_disgust' 'female_fear' 'female_happy'
 'female_neutral' 'female_sad' 'female_surprise' 'male_angry'
 'male_disgust' 'male_fear' 'male_happy' 'male_neutral' 'male_sad'
 'male_surprise']


In [14]:
# Pickel the lb object for future use
filename = 'labels'
outfile = open(filename, 'wb')
pickle.dump(lb, outfile)
outfile.close()

In [15]:
x_train = np.expand_dims(x_train, axis = 2)
x_test = np.expand_dims(x_test, axis = 2)
x_train.shape

(27364, 216, 1)

We are going to use same model as we have used previously(without augmentation) so that we can compare new model with previous model.

In [19]:
# New model
model = Sequential()
model.add(Conv1D(256, 8, padding='same',input_shape=(x_train.shape[1],1)))  # X_train.shape[1] = No. of Columns
model.add(Activation('relu'))
model.add(Conv1D(256, 8, padding='same'))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(Dropout(0.25))
model.add(MaxPooling1D(pool_size=(8)))
model.add(Conv1D(128, 8, padding='same'))
model.add(Activation('relu'))
model.add(Conv1D(128, 8, padding='same'))
model.add(Activation('relu'))
model.add(Conv1D(128, 8, padding='same'))
model.add(Activation('relu'))
model.add(Conv1D(128, 8, padding='same'))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(Dropout(0.25))
model.add(MaxPooling1D(pool_size=(8)))
model.add(Conv1D(64, 8, padding='same'))
model.add(Activation('relu'))
model.add(Conv1D(64, 8, padding='same'))
model.add(Activation('relu'))
model.add(Flatten())
model.add(Dense(14, activation = 'Softmax'))  # target class number
opt = tf.keras.optimizers.legacy.RMSprop(learning_rate = 0.00001, decay = 1e-6)
model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv1d_24 (Conv1D)          (None, 216, 256)          2304      
                                                                 
 activation_24 (Activation)  (None, 216, 256)          0         
                                                                 
 conv1d_25 (Conv1D)          (None, 216, 256)          524544    
                                                                 
 batch_normalization_6 (Batc  (None, 216, 256)         1024      
 hNormalization)                                                 
                                                                 
 activation_25 (Activation)  (None, 216, 256)          0         
                                                                 
 dropout_6 (Dropout)         (None, 216, 256)          0         
                                                      

In [3]:
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 16748867737807859481
xla_global_id: -1
]


In [4]:
import sys
print (sys.version)

print(tf.__version__)
# my output was => 1.13.1

3.10.2 (tags/v3.10.2:a58ebcc, Jan 17 2022, 14:12:15) [MSC v.1929 64 bit (AMD64)]
2.10.1


In [5]:
sess = tf.compat.v1.Session(config=tf.compat.v1.ConfigProto(log_device_placement=True))

Device mapping: no known devices.


In [6]:
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

Num GPUs Available:  0


In [28]:
model.compile(loss='categorical_crossentropy', optimizer=opt,metrics=['accuracy'])
model_history=model.fit(x_train, y_train, batch_size=16, epochs=100, validation_data=(x_test, y_test),verbose=2)

Epoch 1/100


KeyboardInterrupt: 