# Convolutional Neural Network - Model Training

#### Retrieve previously stored variables from Part 1 of CNN notebook

In [1]:
# Retrieve previously stored variables
%store -r x_train
%store -r x_test
%store -r y_train
%store -r y_test
%store -r yy
%store -r le
%store -r max_pad_length
print(x_train.shape)

(954, 40, 182)


#### Convolutional Neural Network Model Architecture

In [2]:
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Convolution2D, Conv2D, MaxPooling2D, GlobalAveragePooling2D
from keras.optimizers import Adam
from keras.utils import np_utils
from sklearn import metrics

num_rows = 40
num_columns = max_pad_length
num_channels = 1

x_train = x_train.reshape(x_train.shape[0], num_rows, num_columns, num_channels)
x_test = x_test.reshape(x_test.shape[0], num_rows, num_columns, num_channels)

num_labels = yy.shape[1]

# Construct CNN model
model = Sequential()

model.add(Conv2D(filters=16, kernel_size=2, input_shape=(num_rows, num_columns, num_channels), activation='relu'))
model.add(MaxPooling2D(pool_size=2))
model.add(Dropout(0.2))

# Increasing nodes from 16, 32, 64, 128
model.add(Conv2D(filters=32, kernel_size=2, activation='relu'))
model.add(MaxPooling2D(pool_size=2))
model.add(Dropout(0.2))

model.add(Conv2D(filters=64, kernel_size=2, activation='relu'))
model.add(MaxPooling2D(pool_size=2))
model.add(Dropout(0.2))

model.add(Conv2D(filters=128, kernel_size=2, activation='relu'))
model.add(MaxPooling2D(pool_size=2))
model.add(Dropout(0.2))
model.add(GlobalAveragePooling2D())

# Output based on number of category labels
# softmax is used as the output layer to make the output sum up to 1 -> can be used to interpret as probabilities
model.add(Dense(num_labels, activation='softmax'))

Using TensorFlow backend.


### Complilation of Archituecture

In [3]:
# Compile the model
# Optimizer "adam" is a typical optimizer used- variation SGD (stochastic gradient descent)
# Optimizer controls the learning rate
# SGD utilizes the gradient of the loss function with respects to the weight
# loss -> typical loss function 
# metrics is output to be displayed (accuracy is the output of the loss function (?))
model.compile(loss='categorical_crossentropy', metrics=['accuracy'], optimizer='adam')

### Display Model Architecture 
##### Test accuracy of model with no training - random weights

In [4]:
# Display model architecture summary 
model.summary()

# Calculate pre-training accuracy 
# Verbose - displays info if desired (verbose = 0 means silent, just print accuracy value)
# evaluate returns loss value and score value
score = model.evaluate(x_test, y_test, verbose=0)
accuracy = 100*score[1]

# Accuracy - the metrics value evaluated based on loss function
print("Pre-training accuracy: %.4f%%" % accuracy)

display(score)

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_1 (Conv2D)            (None, 39, 181, 16)       80        
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 19, 90, 16)        0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 19, 90, 16)        0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 18, 89, 32)        2080      
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 9, 44, 32)         0         
_________________________________________________________________
dropout_2 (Dropout)          (None, 9, 44, 32)         0         
_________________________________________________________________
conv2d_3 (Conv2D)            (None, 8, 43, 64)        

[11.38174973493852, 0.3553459048271179]

### Model Training

In [5]:
from keras.callbacks import ModelCheckpoint 
from datetime import datetime 

num_epochs = 150
num_batch_size = 8 # Arbitrarily chose the value 8

checkpointer = ModelCheckpoint(filepath='saved_models/weights.best.basic_mlp.hdf5', 
                               verbose=1, save_best_only=True)

start = datetime.now()

# Train the model for a fix number of epochs
# validation_data - data to evaluate the loss at the end of each epoch
# callbacks - display ModelCheckpoint
model.fit(x_train, y_train, batch_size=num_batch_size, epochs=num_epochs, validation_data=(x_test, y_test), callbacks=[checkpointer], verbose=1)


duration = datetime.now() - start
print("Training completed in time: ", duration)

Train on 954 samples, validate on 318 samples
Epoch 1/150

Epoch 00001: val_loss improved from inf to 0.48827, saving model to saved_models/weights.best.basic_mlp.hdf5
Epoch 2/150

Epoch 00002: val_loss did not improve from 0.48827
Epoch 3/150

Epoch 00003: val_loss improved from 0.48827 to 0.30107, saving model to saved_models/weights.best.basic_mlp.hdf5
Epoch 4/150

Epoch 00004: val_loss improved from 0.30107 to 0.27209, saving model to saved_models/weights.best.basic_mlp.hdf5
Epoch 5/150

Epoch 00005: val_loss did not improve from 0.27209
Epoch 6/150

Epoch 00006: val_loss improved from 0.27209 to 0.25341, saving model to saved_models/weights.best.basic_mlp.hdf5
Epoch 7/150

Epoch 00007: val_loss improved from 0.25341 to 0.22256, saving model to saved_models/weights.best.basic_mlp.hdf5
Epoch 8/150

Epoch 00008: val_loss did not improve from 0.22256
Epoch 9/150

Epoch 00009: val_loss improved from 0.22256 to 0.20151, saving model to saved_models/weights.best.basic_mlp.hdf5
Epoch 10/1

### Test Model using training and testing data sets

In [6]:
# Test the Model

# Evaluating the model on the training and testing set
score = model.evaluate(x_train, y_train, verbose=0)
print("Training Accuracy: ", score[1]*100, "%")

score = model.evaluate(x_test, y_test, verbose=0)
print("Testing Accuracy: ", score[1]*100, "%")

Training Accuracy:  99.58071112632751 %
Testing Accuracy:  98.11320900917053 %


### Test model using various audio files

In [7]:
# Test MFCC values of Longer Vs. Shorter Samples
# Creating a function that extracts the MFCC features of an audio file
def extract_features(file_name, max_pad_len):
    
    try:
        
        # Librosa extraction of audio array and sampling rate
        audio, sample_rate = librosa.load(file_name, res_type='kaiser_fast') # resampling at a "faster rate as opposed to higher quality"
        
        display(audio)
        # MFCC feature extraction of audio - mfccs is mfcc sequence (array), n_mfcc is number of MFCCs to return
        mfccs = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40)
        # If the number of frames is less than the max_pad_len, zero-pad up to max_pad_len
        pad_width = max_pad_len - mfccs.shape[1]
        mfccs = np.pad(mfccs, pad_width=((0, 0), (0, pad_width)), mode='constant')

    except Exception as e:
        print("Error encountered while parsing file ", file_name)
        return None
    
    return mfccs

In [17]:

def print_prediction(file_name, max_pad_length):
    prediction_feature = extract_features(file_name, max_pad_length)
    prediction_feature = prediction_feature.reshape(1, num_rows, num_columns, num_channels)

    predicted_vector = model.predict_classes(prediction_feature)
    predicted_class = le.inverse_transform(predicted_vector) 
    print("The predicted class is:", predicted_class[0], '\n') 

    predicted_proba_vector = model.predict_proba(prediction_feature) 
    predicted_proba = predicted_proba_vector[0]
    for i in range(len(predicted_proba)): 
        category = le.inverse_transform(np.array([i]))
        print(category[0], "\t\t : ", format(predicted_proba[i], '.32f') )

#### Validating a few UrbanSound8k datasets

In [9]:
# Validation 
import os
import librosa
from pathlib import Path

root_path = Path(os.getcwd()).parent.parent # Software Folder

# Random gun shot file file
filename = root_path / "Training_Dataset" / "audio" / "fold6" /  "135544-6-4-0.wav"
print_prediction(filename, max_pad_length)

# Siren file
filename = root_path / "Training_Dataset" / "audio" / "fold3" /  "184623-8-0-1.wav"
print_prediction(filename, max_pad_length)

array([-0.00010893, -0.00017632, -0.00017818, ..., -0.04449912,
       -0.04784342, -0.04944142], dtype=float32)

The predicted class is: gun_shot 

car_horn 		 :  0.00010090511204907670617103576660
gun_shot 		 :  0.92070323228836059570312500000000
siren 		 :  0.07919595390558242797851562500000


array([ 0.01967856, -0.00505069, -0.06804643, ..., -0.03580907,
        0.09565555,  0.19201374], dtype=float32)

The predicted class is: siren 

car_horn 		 :  0.00000000000000062305191913272729
gun_shot 		 :  0.00000000000019068333880194882068
siren 		 :  1.00000000000000000000000000000000


#### Tesing using microphone array recordings
###### **Envionment: EPH 425**

In [11]:
root_path = Path(os.getcwd()).parent.parent # Software Folder

gun_mic = root_path / "Testing_Dataset" / "Different_env" / "gunshot_micarray.wav"
car_mic = root_path / "Testing_Dataset" / "Different_env" / "carhorn_micarray.wav"
siren_mic = root_path / "Testing_Dataset" / "Different_env" / "siren_micarray.wav"

print("Below is the prediction for the gunshot\n")
print_prediction(gun_mic, max_pad_length)

print("Below is the prediction for the carhorn\n") 
print_prediction(car_mic, max_pad_length)

print("Below is the prediction for the siren\n") 
print_prediction(siren_mic, max_pad_length)

Below is the prediction for the gunshot



array([-0.00398104,  0.0070504 , -0.01208133, ..., -0.09954793,
       -0.10348408, -0.10246389], dtype=float32)

The predicted class is: gun_shot 

car_horn 		 :  0.00224094279110431671142578125000
gun_shot 		 :  0.92459446191787719726562500000000
siren 		 :  0.07316459715366363525390625000000
Below is the prediction for the carhorn



array([-0.00395943,  0.0063127 , -0.00976675, ...,  0.04595372,
        0.04855135,  0.07447524], dtype=float32)

The predicted class is: car_horn 

car_horn 		 :  0.99265444278717041015625000000000
gun_shot 		 :  0.00685489876195788383483886718750
siren 		 :  0.00049061636673286557197570800781
Below is the prediction for the siren



array([-0.72825736, -1.072797  , -0.9774418 , ...,  0.81695753,
        1.075544  ,  0.9845603 ], dtype=float32)

The predicted class is: siren 

car_horn 		 :  0.00000000037067890423791993725899
gun_shot 		 :  0.00000000000000000000037864351574
siren 		 :  1.00000000000000000000000000000000


#### iPhone Microphone testing for comparison

In [20]:

root_path = Path(os.getcwd()).parent.parent # Software Folder

gunshot = root_path / "Testing_Dataset" / "iPhone_data" / "gunShot.wav"

siren = root_path / "Testing_Dataset" / "iPhone_data" / "siren.wav"

carhorn = root_path / "Testing_Dataset" / "iPhone_data" / "carHorn.wav"

print("\nBelow is the prediction for the GUNSHOT\n")
print_prediction(gunshot, max_pad_length)

print("\nBelow is the prediction for the SIREN\n")
print_prediction(siren, max_pad_length)

print("\nBelow is the prediction for the CARHORN\n")
print_prediction(carhorn, max_pad_length)

Below is the prediction for the GUNSHOT



array([0.        , 0.        , 0.        , ..., 0.00044385, 0.00059537,
       0.        ], dtype=float32)

The predicted class is: car_horn 

car_horn 		 :  1.00000000000000000000000000000000
gun_shot 		 :  0.00000000000007144749337517977694
siren 		 :  0.00000000000000000000000006288511
Below is the prediction for the SIREN



array([-0.15548964, -0.09063674,  0.12174813, ..., -0.19627422,
       -0.19866323,  0.        ], dtype=float32)

The predicted class is: siren 

car_horn 		 :  0.00000000000000001190630786973211
gun_shot 		 :  0.00000000000005240817477799968038
siren 		 :  1.00000000000000000000000000000000
Below is the prediction for the CARHORN



array([ 0.        ,  0.        ,  0.        , ..., -0.00020375,
       -0.00019858,  0.        ], dtype=float32)

The predicted class is: car_horn 

car_horn 		 :  0.99999964237213134765625000000000
gun_shot 		 :  0.00000037998506741132587194442749
siren 		 :  0.00000000000000000000000000030553


### More testing conducted from recordings from the microphone array

In [22]:
# Victoria's testing for comparison

root_path = Path(os.getcwd()).parent.parent # Software Folder

gunshot = root_path / "Testing_Dataset" / "Same_Env" / "gunshot.wav"

siren = root_path / "Testing_Dataset" / "Same_Env" / "siren.wav"

carhorn = root_path / "Testing_Dataset" / "Same_Env" / "carhorn.wav"

# Testing on another car horn sound recorded from microphone array from UrbanSound8k dataset
videosound = root_path / "Testing_Dataset" / "Same_Env" / "videoSound.wav"

# Tesing on random siren sound found on youtube
youtube_sound = root_path / "Testing_Dataset" / "Foreign_Data" / "siren_youtube.wav"

print("Below is the prediction for the GUNSHOT\n")
print_prediction(gunshot, max_pad_length)

print("Below is the prediction for the SIREN\n")
print_prediction(siren, max_pad_length)

print("Below is the prediction for the CARHORN\n")
print_prediction(carhorn, max_pad_length)

print("THE PREDICTION BELOW IS THE FOREIGN SOUND RECORDED WITH THE MICROPHONE ARRAY\n")
print("We know the sound should be categorized to car horn\n")
print_prediction(videosound, max_pad_length)

print("The prediction below is a foreign sound found on an online source\n")
print("We know the soudn should be categorized to siren\n")
print_prediction(youtube_sound, max_pad_length)

Below is the prediction for the GUNSHOT



array([0.00132929, 0.01123005, 0.01600517, ..., 0.0115947 , 0.0059101 ,
       0.00452592], dtype=float32)

The predicted class is: gun_shot 

car_horn 		 :  0.01004302036017179489135742187500
gun_shot 		 :  0.98989516496658325195312500000000
siren 		 :  0.00006183744699228554964065551758
Below is the prediction for the SIREN



array([ 0.00168128, -0.00123748, -0.00409833, ..., -0.00118579,
        0.00104005,  0.00092989], dtype=float32)

The predicted class is: siren 

car_horn 		 :  0.00000000000000296005806637558624
gun_shot 		 :  0.00000000000000769082199184381587
siren 		 :  1.00000000000000000000000000000000
Below is the prediction for the CARHORN



array([ 0.00198785,  0.00100049,  0.00456577, ..., -0.00214608,
        0.00849659,  0.00240523], dtype=float32)

The predicted class is: car_horn 

car_horn 		 :  1.00000000000000000000000000000000
gun_shot 		 :  0.00000004647453621942077006679028
siren 		 :  0.00000000017003706775930993444490
THE PREDICTION BELOW IS THE FOREIGN SOUND RECORDED WITH THE MICROPHONE ARRAY

We know the sound should be categorized to car horn



array([0.01046848, 0.01890162, 0.02154594, ..., 0.0111305 , 0.00932156,
       0.00921465], dtype=float32)

The predicted class is: car_horn 

car_horn 		 :  1.00000000000000000000000000000000
gun_shot 		 :  0.00000000000000000000000000000000
siren 		 :  0.00000000000000000000000000000000
The prediction below is a foreign sound found on an online source

We know the soudn should be categorized to siren



array([-0.12262643, -0.13969955, -0.06495816, ...,  0.1658367 ,
        0.11608469,  0.        ], dtype=float32)

The predicted class is: siren 

car_horn 		 :  0.00000000000000000000000000000000
gun_shot 		 :  0.00000000000000000000000000000000
siren 		 :  1.00000000000000000000000000000000
