In [5]:
import numpy as np
import os
import time
from resnet50 import ResNet50
from keras.preprocessing import image
from keras.layers import GlobalAveragePooling2D, Dense, Dropout,Activation,Flatten

from imagenet_utils import preprocess_input
from keras.layers import Input
from keras.models import Model
from keras.utils import np_utils
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

import pandas as pd
import random
from scipy.io import wavfile
from sklearn.preprocessing import scale
import librosa.display
import librosa
import matplotlib.pyplot as plt

# Why Resnet
The main benefit of a very deep network is that it can represent very complex functions. It can also learn features at many different levels of abstraction, from edges (at the lower layers) to very complex features (at the deeper layers). However, using a deeper network doesn't always help. A huge barrier to training them is vanishing gradients: very deep networks often have a gradient signal that goes to zero quickly, thus making gradient descent unbearably slow. More specifically, during gradient descent, as we backprop from the final layer back to the first layer, we are multiplying by the weight matrix on each step, and thus the gradient can decrease exponentially quickly to zero (or, in rare cases, grow exponentially quickly and "explode" to take very large values). The following graphs show the training and test error as the number of iterations increases. With the increase in layers the error also increases. It is common to assume that overfitting is the reason for this problem but that is not the case.

# Iteration Vs Loss
<img src="IterVsloss.png">

ResNet uses residual blocks with  skip connections.  By stacking these ResNet blocks on top of each other, you can form a very deep network.
Having ResNet blocks with the shortcut also makes it very easy for one of the blocks to learn an identity function. This means that many such blocks can be stack on additional ResNet blocks with little risk of harming training set performance.



In [80]:
PATH = os.getcwd()
# Define data path
data_path = PATH + '/melspectrograms/training'
data_dir_list = os.listdir(data_path)

In [81]:
img_data_list=[]

for dataset in data_dir_list:
    img_list=os.listdir(data_path+'/'+ dataset)
    #print ('Loaded the images of dataset-'+'{}\n'.format(dataset))
    for img in img_list:
        img_path = data_path + '/'+ dataset + '/'+ img 
        img = image.load_img(img_path, target_size=(224, 224))
        x = image.img_to_array(img)
        x = np.expand_dims(x, axis=0)
        x = preprocess_input(x)
        #print('Input image shape:', x.shape)
        
        img_data_list.append(x)

In [47]:
img_data = np.array(img_data_list)
#img_data = img_data.astype('float32')
print (img_data.shape)
img_data=np.rollaxis(img_data,1,0)
print (img_data.shape)
img_data=img_data[0]
print (img_data.shape)

(120, 1, 224, 224, 3)
(1, 120, 224, 224, 3)
(120, 224, 224, 3)


In [48]:
num_classes = 2
num_of_samples = img_data.shape[0]
labels = np.ones((num_of_samples,),dtype='int64')

In [49]:
labels[0:40]=0
labels[40:80]=1

In [74]:
names=['coughing','NotCoughing']

In [52]:
Y = np_utils.to_categorical(labels, num_classes)

#Shuffle the dataset
x,y = shuffle(img_data,Y, random_state=2)
# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=2)

<img src="cnn.png">


The pixel values from the images are split into several feature maps.Only the dominant features i.e the highest value in a feature map is identified and a smaller feature map is created. This is done in order to reduce computation.This process of reducing the dimensions of an image by taking the maximum pixel value of a grid is called max-pooling.This also helps reduce overfitting and makes the model more generic.Again several such max-pooled feature maps are created which contain information of only dominant features.Later these f.maps are flattened and fed into the neural network.

In this case the .wav files are  converted into melspectrograms and these images are used to train the network.

# Fine Tuning the ResNet model
Instead of training a neural network from scratch(which needs a lot of computation) it can be retrained to classify into labels we want. This can be done by 'freezing' earlier layers and adding layers at the end. It can be seen that the total number of trainable parameters has reduced after freezing the layers. By freezing a layer we dont retrain the weights of that layer. We stack new layers and train them.
for layer in custom_resnet_model2.layers[:-6],from this line we freeze the network except for the last 6 layers.Two Dense and Dropout layers are added.Dropout is randomly deleting certain neurons in the layer so that they dont develop codependency and lead to overfitting.

The final network along with frozen and newly added layers is trained for 15 epochs.One epoch goes through the entire dataset each time altering the weigths of each layer.
The compile function takes three arguments. They are:

loss: Here categorical cross entropy loss is used as this is a multi class problem as opposed to binary cross entropy loss.

optimizer:Optimizer is used to update network weights iterative based in training data. Here adam optimizer is used.Adam is the improved version of stochastic gradient descent as this method computes individual adaptive learning rates for different parameters from estimates of first and second moments of the gradients.

metric:Accuracy after each epoch can be monitored.

In [60]:
model = ResNet50(weights='imagenet',include_top=False)
model.summary()
last_layer = model.output
x = GlobalAveragePooling2D()(last_layer)
x = Dense(512, activation='relu',name='fc-1')(x)
x = Dropout(0.5)(x)
x = Dense(256, activation='relu',name='fc-2')(x)
x = Dropout(0.5)(x)
out = Dense(num_classes, activation='softmax',name='output_layer')(x)

custom_resnet_model2 = Model(inputs=model.input, outputs=out)

custom_resnet_model2.summary()

for layer in custom_resnet_model2.layers[:-6]:
	layer.trainable = False

custom_resnet_model2.layers[-1].trainable

custom_resnet_model2.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])

t=time.time()
hist = custom_resnet_model2.fit(X_train, y_train, batch_size=8, epochs=15, verbose=1, validation_data=(X_test, y_test))
print('Training time: %s' % (t - time.time()))
(loss, accuracy) = custom_resnet_model2.evaluate(X_test, y_test, batch_size=10, verbose=1)

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_8 (InputLayer)             (None, None, None, 3) 0                                            
____________________________________________________________________________________________________
zero_padding2d_8 (ZeroPadding2D) (None, None, None, 3) 0           input_8[0][0]                    
____________________________________________________________________________________________________
conv1 (Conv2D)                   (None, None, None, 64 9472        zero_padding2d_8[0][0]           
____________________________________________________________________________________________________
bn_conv1 (BatchNormalization)    (None, None, None, 64 256         conv1[0][0]                      
___________________________________________________________________________________________

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_8 (InputLayer)             (None, None, None, 3) 0                                            
____________________________________________________________________________________________________
zero_padding2d_8 (ZeroPadding2D) (None, None, None, 3) 0           input_8[0][0]                    
____________________________________________________________________________________________________
conv1 (Conv2D)                   (None, None, None, 64 9472        zero_padding2d_8[0][0]           
____________________________________________________________________________________________________
bn_conv1 (BatchNormalization)    (None, None, None, 64 256         conv1[0][0]                      
___________________________________________________________________________________________

Train on 96 samples, validate on 24 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Training time: -927.270450592041


The wav files are converted into spectrograms and tested for accuracy. please find the wav files and corresponding pngs in the repo.

In [76]:
path_to_file =r'C:\Users\Sarthak\Desktop\ESC-50-master\cough.wav'
data, sr = librosa.load(path_to_file, sr=44100, mono=True)
data = scale(data)
    
melspec = librosa.feature.melspectrogram(y=data, sr=sr, n_mels=128)
log_melspec = librosa.power_to_db(melspec, ref=np.max)  
librosa.display.specshow(log_melspec, sr=sr)
      
plt.savefig( 'cough' + '.png')

from keras.preprocessing import image
from keras.applications.imagenet_utils import decode_predictions

img_path = r'C:\Users\Sarthak\Desktop\ESC-50-master\cough1.png'
img = image.load_img(img_path, target_size=(224, 224))
x = image.img_to_array(img)
x = np.expand_dims(x, axis=0)
x = preprocess_input(x)
preds = custom_resnet_model2.predict(x)
print(names[np.argmax(preds)])
print("[INFO] loss={:.4f}, accuracy: {:.4f}%".format(loss,accuracy * 100))

coughing
[INFO] loss=0.5345, accuracy: 83.3333%
