<b>Load preprocessed dataset</b>

In [4]:
# retrieve the preprocessed data from previous notebook
# `%store -r NAME` restores a variable that was persisted earlier with `%store NAME`,
# so the preprocessing notebook must have been run (and have stored these names) first.
# x_train/x_test and y_train/y_test are the feature/label splits used for fitting and
# evaluation below; yy supplies the class count and le decodes class indices to labels.

%store -r x_train 
%store -r x_test 
%store -r y_train 
%store -r y_test 
%store -r yy 
%store -r le

In [5]:
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Convolution2D, MaxPooling2D, Conv2D, GlobalAveragePooling2D
from keras.optimizers import Adam
from keras.utils import np_utils
from sklearn import metrics

# Input geometry of one MFCC "image" fed to the CNN (used by the reshape calls below).
# num_rows: MFCC coefficients per frame (extract_mfccs requests n_mfcc=40).
num_rows = 40
# num_columns: MFCC frames per clip - presumably 1 + 32000 // 512 = 63 for a 2 s clip
# at 16 kHz with librosa's default hop length of 512; TODO confirm.
num_columns = 63
# num_channels: size of the trailing channel axis added for Conv2D.
num_channels = 1
# Sample rate in Hz that audio is loaded at (passed as sr= to librosa.load).
SAMPLE_RATE = 16000

**Construct the Model**

In [6]:
# Add the trailing channel axis so each sample is (num_rows, num_columns, num_channels),
# matching the input_shape declared on the first Conv2D layer.
x_train = x_train.reshape(x_train.shape[0], num_rows, num_columns, num_channels)
x_test = x_test.reshape(x_test.shape[0], num_rows, num_columns, num_channels)

# Number of output classes, read from the second dimension of yy
# (presumably the one-hot encoded label matrix - TODO confirm).
num_labels = yy.shape[1]
# NOTE(review): filter_size is not referenced in the visible cells; the Conv2D
# layers below hard-code kernel_size=2 instead.
filter_size = 2

In [7]:
# Build the CNN in one declaration: four Conv2D -> MaxPooling2D -> Dropout stages
# whose filter count doubles each time (16, 32, 64, 128), followed by global
# average pooling and a softmax layer with one unit per class.
model = Sequential([
    # Stage 1 - also declares the shape of a single input sample.
    Conv2D(filters=16, kernel_size=2, input_shape=(num_rows, num_columns, num_channels), activation='relu'),
    MaxPooling2D(pool_size=2),
    Dropout(0.2),

    # Stage 2
    Conv2D(filters=32, kernel_size=2, activation='relu'),
    MaxPooling2D(pool_size=2),
    Dropout(0.2),

    # Stage 3
    Conv2D(filters=64, kernel_size=2, activation='relu'),
    MaxPooling2D(pool_size=2),
    Dropout(0.2),

    # Stage 4, then collapse the spatial axes to one value per feature map.
    Conv2D(filters=128, kernel_size=2, activation='relu'),
    MaxPooling2D(pool_size=2),
    Dropout(0.2),
    GlobalAveragePooling2D(),

    # Classifier head: class probabilities.
    Dense(num_labels, activation='softmax'),
])

**Compile the Model**
<p>Loss function - we will use categorical_crossentropy, the standard choice for multi-class classification with one-hot encoded labels. A lower loss indicates that the model is performing better.</p>
<p>Metrics - we will use the accuracy metric which will allow us to view the accuracy score on the validation data when we train the model.</p>
<p>Optimizer - here we will use adam which is a generally good optimizer for many use cases.</p>

In [8]:
# Compile the model
# Uses categorical cross-entropy as the loss, reports accuracy as the metric, and
# selects the Adam optimizer by name.
model.compile(loss='categorical_crossentropy', metrics=['accuracy'], optimizer='adam') 

In [9]:
# Display model architecture summary 
model.summary()

# Calculate pre-training accuracy 
# The model has not been fitted yet, so this is a baseline for the untrained network.
# score is [loss, accuracy] because compile() registered 'accuracy' as the only metric.
score = model.evaluate(x_test, y_test, verbose=1)
accuracy = 100*score[1]

print("Pre-training accuracy: %.4f%%" % accuracy)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d (Conv2D)             (None, 39, 62, 16)        80        
                                                                 
 max_pooling2d (MaxPooling2D  (None, 19, 31, 16)       0         
 )                                                               
                                                                 
 dropout (Dropout)           (None, 19, 31, 16)        0         
                                                                 
 conv2d_1 (Conv2D)           (None, 18, 30, 32)        2080      
                                                                 
 max_pooling2d_1 (MaxPooling  (None, 9, 15, 32)        0         
 2D)                                                             
                                                                 
 dropout_1 (Dropout)         (None, 9, 15, 32)         0

**Training**

In [10]:
from keras.callbacks import ModelCheckpoint 
from datetime import datetime 

# Training hyperparameters.
num_epochs = 72
num_batch_size = 32

# Write the model to disk only when the monitored validation loss improves
# (save_best_only=True), so the file holds the best epoch seen so far.
checkpointer = ModelCheckpoint(filepath='saved_models/mfcc_weights.best.basic_cnn.hdf5', 
                               verbose=1, save_best_only=True)
# Wall-clock timer for the whole fit() call.
start = datetime.now()

# NOTE(review): the test split doubles as validation data, so checkpoint selection
# sees the test set and the "Testing Accuracy" reported later is optimistic.
# NOTE(review): the checkpoint file is not reloaded in the visible cells; later
# cells evaluate and save the in-memory model as it stands after the final epoch.
model.fit(x_train, y_train, batch_size=num_batch_size, epochs=num_epochs, validation_data=(x_test, y_test), callbacks=[checkpointer], verbose=1)


duration = datetime.now() - start
print("Training completed in time: ", duration)

Epoch 1/72
Epoch 1: val_loss improved from inf to 1.56936, saving model to saved_models\mfcc_weights.best.basic_cnn.hdf5
Epoch 2/72
Epoch 2: val_loss improved from 1.56936 to 1.34589, saving model to saved_models\mfcc_weights.best.basic_cnn.hdf5
Epoch 3/72
Epoch 3: val_loss improved from 1.34589 to 1.21468, saving model to saved_models\mfcc_weights.best.basic_cnn.hdf5
Epoch 4/72
Epoch 4: val_loss improved from 1.21468 to 1.12248, saving model to saved_models\mfcc_weights.best.basic_cnn.hdf5
Epoch 5/72
Epoch 5: val_loss improved from 1.12248 to 1.07650, saving model to saved_models\mfcc_weights.best.basic_cnn.hdf5
Epoch 6/72
Epoch 6: val_loss improved from 1.07650 to 0.94161, saving model to saved_models\mfcc_weights.best.basic_cnn.hdf5
Epoch 7/72
Epoch 7: val_loss improved from 0.94161 to 0.87230, saving model to saved_models\mfcc_weights.best.basic_cnn.hdf5
Epoch 8/72
Epoch 8: val_loss improved from 0.87230 to 0.83875, saving model to saved_models\mfcc_weights.best.basic_cnn.hdf5
Epoc

**Test the model**

In [11]:
# Evaluating the model on the training and testing set
# score is [loss, accuracy]; index 1 is the accuracy as a fraction in [0, 1].
# In the recorded run the training accuracy (~0.97) is well above the testing
# accuracy (~0.87), which suggests some overfitting.
score = model.evaluate(x_train, y_train, verbose=0)
print("Training Accuracy: ", score[1])

score = model.evaluate(x_test, y_test, verbose=0)
print("Testing Accuracy: ", score[1])

Training Accuracy:  0.970365047454834
Testing Accuracy:  0.8746422529220581


**Save the Model**

In [12]:
# Persist the in-memory model in the native .keras format; the Validation section
# reloads it from this path.
# NOTE(review): this is the model as left by fit(), not the best-val_loss checkpoint
# written by ModelCheckpoint during training.
model.save('saved_models/MFCC_Classification_Model.keras')

**Prediction**

In [17]:
# Slice (or zero-pad) audio to a common length of 2 seconds
def slice_audio(librosa_audio, librosa_sample_rate = SAMPLE_RATE, duration_seconds = 2):
    """Return the audio truncated or zero-padded to a fixed duration.

    Parameters
    ----------
    librosa_audio : np.ndarray
        1-D array of audio samples (as returned by ``librosa.load`` in mono mode).
    librosa_sample_rate : int
        Sample rate of ``librosa_audio`` in Hz.
    duration_seconds : int or float
        Target clip length in seconds. Defaults to 2, the length the rest of
        the notebook assumes.

    Returns
    -------
    np.ndarray
        Array of exactly ``int(duration_seconds * librosa_sample_rate)`` samples:
        the leading samples of the input, followed by trailing zeros when the
        input is shorter than the target length.
    """
    sample_length = int(duration_seconds * librosa_sample_rate)

    # Truncation is a no-op for clips that are already short enough.
    librosa_audio_sliced = librosa_audio[:sample_length]

    missing_samples = sample_length - len(librosa_audio_sliced)
    if missing_samples > 0:
        # Pad only at the end: (0, missing_samples) means no zeros before the
        # signal and `missing_samples` zeros after it.
        librosa_audio_sliced = np.pad(librosa_audio_sliced, (0, missing_samples), constant_values=0)
    return librosa_audio_sliced

# print(f"Librosa audio before: {librosa_audio.shape} and after: {slice_audio(librosa_audio).shape}")

In [18]:
import numpy as np
import librosa
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical
import os

# NOTE(review): max_pad_len is not referenced in any of the visible cells; clips are
# brought to a fixed length by slice_audio instead. Presumably a leftover - confirm
# before removing.
max_pad_len = 174

def extract_mfccs(audio_path):
    """Load an audio file and return its MFCC matrix.

    The file is loaded at SAMPLE_RATE with the 'kaiser_fast' resampler, brought
    to a fixed length by slice_audio, and converted to 40 MFCC coefficients
    per frame.

    Parameters
    ----------
    audio_path : str
        Path of the audio file to load.

    Returns
    -------
    The array produced by ``librosa.feature.mfcc`` for the fixed-length clip.
    """
    samples, loaded_rate = librosa.load(audio_path, sr=SAMPLE_RATE, res_type='kaiser_fast')
    fixed_length_samples = slice_audio(samples, loaded_rate)
    return librosa.feature.mfcc(y=fixed_length_samples, sr=SAMPLE_RATE, n_mfcc=40)

In [19]:
def print_prediction(file_name):
    """Classify one audio file and print the result.

    Prints the predicted class label, then one line per class with its
    predicted probability (32 decimal places), in label-encoder index order.

    Parameters
    ----------
    file_name : str
        Path of the audio file to classify.

    Notes
    -----
    Relies on the notebook globals ``model`` (the classifier), ``le`` (the
    label encoder used to decode class indices) and the input geometry
    constants ``num_rows``, ``num_columns`` and ``num_channels``.
    """
    prediction_feature = extract_mfccs(file_name)
    # Add the batch axis and the channel axis expected by the first Conv2D layer.
    prediction_feature = prediction_feature.reshape(1, num_rows, num_columns, num_channels)

    # Run the network once and reuse the probabilities for both the top label
    # and the per-class listing.
    predicted_proba = model.predict(prediction_feature)[0]

    predicted_class = le.inverse_transform(np.array([np.argmax(predicted_proba)]))
    print("The predicted class is:", predicted_class[0], '\n')

    # Decode every class index in a single call rather than once per class.
    categories = le.inverse_transform(np.arange(len(predicted_proba)))
    for category, probability in zip(categories, predicted_proba):
        print(category, "\t\t : ", format(probability, '.32f') )

**Validation**
<p>On new samples</p>

In [20]:
from tensorflow.keras.models import load_model
# Reload the classifier from the file written in the "Save the Model" section, so
# the validation cells run against the persisted model.
model = load_model('saved_models/MFCC_Classification_Model.keras')

# Folder holding the hand-picked validation clips. The path is machine-specific, so
# it can be overridden with the URBANSOUND_VAL_DIR environment variable; when the
# variable is unset the original location is used unchanged.
VAL_DIR = os.environ.get(
    "URBANSOUND_VAL_DIR",
    "D:\\Code\\ProjectsPython\\ML_TrainingGround\\ML_Audio\\data\\UrbanSound8K\\validation",
)

In [21]:
# Class: Air Conditioner
# Expected label: air_conditioner. In the recorded run the model predicted
# jackhammer (~0.72) with air_conditioner second (~0.25), i.e. a misclassification.
filename = os.path.join(VAL_DIR, "air_conditioner.mp3")
print_prediction(filename)

The predicted class is: jackhammer 

air_conditioner 		 :  0.24512082338333129882812500000000
car_horn 		 :  0.00003541887053870595991611480713
children_playing 		 :  0.00031827684142626821994781494141
dog_bark 		 :  0.00039201395702548325061798095703
drilling 		 :  0.00069823028752580285072326660156
engine_idling 		 :  0.03670024871826171875000000000000
gun_shot 		 :  0.00001047563091560732573270797729
jackhammer 		 :  0.71592324972152709960937500000000
siren 		 :  0.00001376378622808260843157768250
street_music 		 :  0.00078757031587883830070495605469


In [22]:
# Class: Car idle
# Expected label: engine_idling. Predicted correctly in the recorded run (~0.96).
filename = os.path.join(VAL_DIR, "car_idle.mp3")
print_prediction(filename) 

The predicted class is: engine_idling 

air_conditioner 		 :  0.00286324089393019676208496093750
car_horn 		 :  0.00130091654136776924133300781250
children_playing 		 :  0.01832863874733448028564453125000
dog_bark 		 :  0.00916135776787996292114257812500
drilling 		 :  0.00254237023182213306427001953125
engine_idling 		 :  0.96144711971282958984375000000000
gun_shot 		 :  0.00018570467364042997360229492188
jackhammer 		 :  0.00011460556561360135674476623535
siren 		 :  0.00045504764420911669731140136719
street_music 		 :  0.00360105535946786403656005859375


In [23]:
# Class: dog bark
# Expected label: dog_bark. Predicted correctly in the recorded run (~0.999).
filename = os.path.join(VAL_DIR, "dog_barking.mp3")
print_prediction(filename) 

The predicted class is: dog_bark 

air_conditioner 		 :  0.00000012250313830008963122963905
car_horn 		 :  0.00000000019495487169063352439480
children_playing 		 :  0.00010973527241731062531471252441
dog_bark 		 :  0.99856144189834594726562500000000
drilling 		 :  0.00002193625732616055756807327271
engine_idling 		 :  0.00000723267748981015756726264954
gun_shot 		 :  0.00000198247585103672463446855545
jackhammer 		 :  0.00000000284088241819802078680368
siren 		 :  0.00000902459214557893574237823486
street_music 		 :  0.00128845463041216135025024414062


In [24]:
# Class: drill
# Expected label: drilling. Predicted correctly in the recorded run (~0.996).
filename = os.path.join(VAL_DIR, "drill.mp3")
print_prediction(filename) 

The predicted class is: drilling 

air_conditioner 		 :  0.00000003343551924217535997740924
car_horn 		 :  0.00000099133046660426771268248558
children_playing 		 :  0.00000083997070987607003189623356
dog_bark 		 :  0.00000586846954320208169519901276
drilling 		 :  0.99572229385375976562500000000000
engine_idling 		 :  0.00000006921179362961993319913745
gun_shot 		 :  0.00000000029437630111317503178725
jackhammer 		 :  0.00424809427931904792785644531250
siren 		 :  0.00000003137820669962820829823613
street_music 		 :  0.00002177190617658197879791259766


In [25]:
# Class: jackhammer
# Expected label: jackhammer. In the recorded run the model predicted drilling
# (~0.90) with jackhammer at only ~0.08, i.e. a misclassification.
filename = os.path.join(VAL_DIR, "jackhammer.mp3")
print_prediction(filename) 

The predicted class is: drilling 

air_conditioner 		 :  0.00000009252584476371339405886829
car_horn 		 :  0.01980255171656608581542968750000
children_playing 		 :  0.00029384542722254991531372070312
dog_bark 		 :  0.00001114642145694233477115631104
drilling 		 :  0.89985805749893188476562500000000
engine_idling 		 :  0.00000005315106577086226025130600
gun_shot 		 :  0.00000000738547267786771044484340
jackhammer 		 :  0.08002285659313201904296875000000
siren 		 :  0.00000501551585330162197351455688
street_music 		 :  0.00000646574881102424114942550659


In [26]:
# Class: kids playing
# Expected label: children_playing. Predicted correctly in the recorded run (~1.0).
filename = os.path.join(VAL_DIR, "kids_playing.mp3")
print_prediction(filename) 

The predicted class is: children_playing 

air_conditioner 		 :  0.00000000000004040043242981537774
car_horn 		 :  0.00000000000601772451314097622799
children_playing 		 :  0.99999940395355224609375000000000
dog_bark 		 :  0.00000003114811519822069385554641
drilling 		 :  0.00000000000016285878082309551695
engine_idling 		 :  0.00000000000001992895221948360468
gun_shot 		 :  0.00000000000096059521014629778435
jackhammer 		 :  0.00000000000000000007123822253209
siren 		 :  0.00000030755370516999391838908195
street_music 		 :  0.00000032211161737905058544129133


In [27]:
# Class: siren
# Expected label: siren. Predicted correctly in the recorded run (~1.0).
filename = os.path.join(VAL_DIR, "siren.mp3")
print_prediction(filename) 

The predicted class is: siren 

air_conditioner 		 :  0.00000000000000000000000001140671
car_horn 		 :  0.00000000000000002112585039183073
children_playing 		 :  0.00000001112683900572619677404873
dog_bark 		 :  0.00000003421703453909685777034611
drilling 		 :  0.00000000309306646961715614452260
engine_idling 		 :  0.00000000000000000280541034443381
gun_shot 		 :  0.00000000000000000000000000131681
jackhammer 		 :  0.00000000000000000000003406316535
siren 		 :  1.00000000000000000000000000000000
street_music 		 :  0.00000003555681615807770867832005


In [28]:
# Class: street music
# Expected label: street_music. Predicted correctly in the recorded run, but with
# low confidence (~0.57); air_conditioner, car_horn and jackhammer each got > 0.10.
filename = os.path.join(VAL_DIR, "street_music.mp3")
print_prediction(filename) 

The predicted class is: street_music 

air_conditioner 		 :  0.16020715236663818359375000000000
car_horn 		 :  0.12585353851318359375000000000000
children_playing 		 :  0.00166083616204559803009033203125
dog_bark 		 :  0.00955508183687925338745117187500
drilling 		 :  0.00012191841233288869261741638184
engine_idling 		 :  0.02299744077026844024658203125000
gun_shot 		 :  0.00000187804187135043321177363396
jackhammer 		 :  0.10391051322221755981445312500000
siren 		 :  0.00975450128316879272460937500000
street_music 		 :  0.56593722105026245117187500000000
