<b>Load preprocessed dataset

In [42]:
# retrieve the preprocessed data from previous notebook

%store -r x_train 
%store -r x_test 
%store -r y_train 
%store -r y_test 
%store -r yy 
%store -r le

In [45]:
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Convolution2D, MaxPooling2D, Conv2D, GlobalAveragePooling2D
from keras.optimizers import Adam
from keras.utils import np_utils
from sklearn import metrics

num_rows = 128
num_columns = 44
num_channels = 1

**Construct the Model**

In [46]:
x_train = x_train.reshape(x_train.shape[0], num_rows, num_columns, num_channels)
x_test = x_test.reshape(x_test.shape[0], num_rows, num_columns, num_channels)

num_labels = yy.shape[1]

In [47]:
# Construct model 
model = Sequential()
model.add(Conv2D(filters=16, kernel_size=2, input_shape=(num_rows, num_columns, num_channels), activation='relu'))
model.add(MaxPooling2D(pool_size=2))
model.add(Dropout(0.2))

model.add(Conv2D(filters=32, kernel_size=2, activation='relu'))
model.add(MaxPooling2D(pool_size=2))
model.add(Dropout(0.2))

model.add(Conv2D(filters=64, kernel_size=2, activation='relu'))
model.add(MaxPooling2D(pool_size=2))
model.add(Dropout(0.2))

model.add(Conv2D(filters=128, kernel_size=2, activation='relu'))
model.add(MaxPooling2D(pool_size=2))
model.add(Dropout(0.2))
model.add(GlobalAveragePooling2D())

model.add(Dense(num_labels, activation='softmax')) 

**Compile the Model**
<p>Loss function - we will use categorical_crossentropy. This is the most common choice for classification. A lower score indicates that the model is performing better.</p>
<p>Metrics - we will use the accuracy metric which will allow us to view the accuracy score on the validation data when we train the model.</p>
<p>Optimizer - here we will use adam which is a generally good optimizer for many use cases.</p>

In [49]:
# Compile the model
model.compile(loss='categorical_crossentropy', metrics=['accuracy'], optimizer='adam') 

In [50]:
# Display model architecture summary 
model.summary()

# Calculate pre-training accuracy 
score = model.evaluate(x_test, y_test, verbose=1)
accuracy = 100*score[1]

print("Pre-training accuracy: %.4f%%" % accuracy)

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_12 (Conv2D)          (None, 127, 43, 16)       80        
                                                                 
 max_pooling2d_12 (MaxPoolin  (None, 63, 21, 16)       0         
 g2D)                                                            
                                                                 
 dropout_12 (Dropout)        (None, 63, 21, 16)        0         
                                                                 
 conv2d_13 (Conv2D)          (None, 62, 20, 32)        2080      
                                                                 
 max_pooling2d_13 (MaxPoolin  (None, 31, 10, 32)       0         
 g2D)                                                            
                                                                 
 dropout_13 (Dropout)        (None, 31, 10, 32)       

**Training**

In [33]:
from keras.callbacks import ModelCheckpoint 
from datetime import datetime 

num_epochs = 100
num_batch_size = 256

checkpointer = ModelCheckpoint(filepath='saved_models/.mels_spectrogram_checkboint.hdf5', 
                               verbose=1, save_best_only=True)
start = datetime.now()

model.fit(x_train, y_train, batch_size=num_batch_size, epochs=num_epochs, validation_data=(x_test, y_test), callbacks=[checkpointer], verbose=1)


duration = datetime.now() - start
print("Training completed in time: ", duration)

Epoch 1/100
Epoch 1: val_loss improved from inf to 2.31221, saving model to saved_models\.mels_spectrogram_checkboint.hdf5
Epoch 2/100
Epoch 2: val_loss improved from 2.31221 to 2.28986, saving model to saved_models\.mels_spectrogram_checkboint.hdf5
Epoch 3/100
Epoch 3: val_loss improved from 2.28986 to 2.23215, saving model to saved_models\.mels_spectrogram_checkboint.hdf5
Epoch 4/100
Epoch 4: val_loss improved from 2.23215 to 2.14457, saving model to saved_models\.mels_spectrogram_checkboint.hdf5
Epoch 5/100
Epoch 5: val_loss improved from 2.14457 to 2.04509, saving model to saved_models\.mels_spectrogram_checkboint.hdf5
Epoch 6/100
Epoch 6: val_loss improved from 2.04509 to 1.96227, saving model to saved_models\.mels_spectrogram_checkboint.hdf5
Epoch 7/100
Epoch 7: val_loss improved from 1.96227 to 1.88664, saving model to saved_models\.mels_spectrogram_checkboint.hdf5
Epoch 8/100
Epoch 8: val_loss improved from 1.88664 to 1.84817, saving model to saved_models\.mels_spectrogram_chec

**Test the model**

In [51]:
# Evaluating the model on the training and testing set
score = model.evaluate(x_train, y_train, verbose=0)
print("Training Accuracy: ", score[1])

score = model.evaluate(x_test, y_test, verbose=0)
print("Testing Accuracy: ", score[1])

Training Accuracy:  0.11438797414302826
Testing Accuracy:  0.11505437642335892


**Save the Model**

In [35]:
model.save('saved_models/MelSpectrogram_Classification_Model_2.keras')

**Prediction**

In [36]:
import numpy as np
import librosa
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical
import os


# Slice for common length of 2 seconds
def slice_audio(librosa_audio, librosa_sample_rate = 22050):
    SAMPLE_LENGTH = 2 * librosa_sample_rate

    librosa_audio_sliced = librosa_audio[:SAMPLE_LENGTH]
    if len(librosa_audio) < SAMPLE_LENGTH:
        # print(f"Audio length {len(librosa_audio)} is less than 2 seconds. Padding with zeros.")
        # np.pad specifies the number of values to add at the beginning and the end of the librosa_audio array.
        # 0 -> no padding in the beginning.
        # SAMPLE_LENGTH - len(librosa_audio) -> number of zeros to end, ensuring the total length is 2 seconds.
        librosa_audio_sliced = np.pad(librosa_audio, (0, SAMPLE_LENGTH - len(librosa_audio)), constant_values=0)
    return librosa_audio_sliced


def extract_spectrogram(audio_path):
    
    audio_file, librosa_sample_rate = librosa.load(audio_path, res_type='kaiser_fast')
    audio_file = slice_audio(audio_file, librosa_sample_rate)

    spectrogram = librosa.stft(audio_file, n_fft=512, win_length=512, dtype=np.float32)
    spectrogram = librosa.amplitude_to_db(abs(spectrogram), ref=np.max)
    #librosa.display.specshow(spectrogram, sr=librosa_sample_rate, x_axis='time')

    # spectrogram = tf.expand_dims(spectrogram, axis = 2)

    return spectrogram

In [37]:
def print_prediction(file_name):
    prediction_feature = extract_spectrogram(file_name)
    prediction_feature = prediction_feature.reshape(1, num_rows, num_columns, num_channels)

    predicted_vector = np.argmax(model.predict(prediction_feature), axis=-1)
    predicted_class = le.inverse_transform(predicted_vector) 
    print("The predicted class is:", predicted_class[0], '\n') 

    predicted_proba_vector = model.predict(prediction_feature) 
    predicted_proba = predicted_proba_vector[0]
    for i in range(len(predicted_proba)): 
        category = le.inverse_transform(np.array([i]))
        print(category[0], "\t\t : ", format(predicted_proba[i], '.32f') )

**Validation**
<p>On new samples<p>

In [38]:
from tensorflow.keras.models import load_model
model = load_model('saved_models/Spectrogram_Classification_Model.keras')

VAL_DIR = "D:\\Code\\ProjectsPython\\ML_TrainingGround\\ML_Audio\\data\\UrbanSound8K\\validation"

In [39]:
# Class: Air Conditioner
filename = os.path.join(VAL_DIR, "air_conditioner.mp3")
print_prediction(filename)

ValueError: cannot reshape array of size 88665 into shape (1,128,44,1)

In [None]:
# Class: Car idle
filename = os.path.join(VAL_DIR, "car_idle.mp3")
print_prediction(filename) 

The predicted class is: engine_idling 

air_conditioner 		 :  0.19074751436710357666015625000000
car_horn 		 :  0.00494202831760048866271972656250
children_playing 		 :  0.01620393991470336914062500000000
dog_bark 		 :  0.00649807881563901901245117187500
drilling 		 :  0.01107691135257482528686523437500
engine_idling 		 :  0.62436318397521972656250000000000
gun_shot 		 :  0.01037050131708383560180664062500
jackhammer 		 :  0.05050666257739067077636718750000
siren 		 :  0.06733405590057373046875000000000
street_music 		 :  0.01795705407857894897460937500000


  stft_matrix[..., :off_start] = fft.rfft(fft_window * y_frames_pre, axis=-2)
  stft_matrix[..., -off_end:] = fft.rfft(fft_window * y_frames_post, axis=-2)
  stft_matrix[..., bl_s + off_start : bl_t + off_start] = fft.rfft(


In [None]:
# Class: dog bark
filename = os.path.join(VAL_DIR, "dog_barking.mp3")
print_prediction(filename) 

The predicted class is: siren 

air_conditioner 		 :  0.04661423712968826293945312500000
car_horn 		 :  0.08865276724100112915039062500000
children_playing 		 :  0.11344639956951141357421875000000
dog_bark 		 :  0.13278207182884216308593750000000
drilling 		 :  0.08890788257122039794921875000000
engine_idling 		 :  0.04933480918407440185546875000000
gun_shot 		 :  0.03988853842020034790039062500000
jackhammer 		 :  0.06437175720930099487304687500000
siren 		 :  0.24432623386383056640625000000000
street_music 		 :  0.13167531788349151611328125000000


  stft_matrix[..., :off_start] = fft.rfft(fft_window * y_frames_pre, axis=-2)
  stft_matrix[..., -off_end:] = fft.rfft(fft_window * y_frames_post, axis=-2)
  stft_matrix[..., bl_s + off_start : bl_t + off_start] = fft.rfft(


In [None]:
# Class: drill
filename = os.path.join(VAL_DIR, "drill.mp3")
print_prediction(filename) 

The predicted class is: siren 

air_conditioner 		 :  0.00457365065813064575195312500000
car_horn 		 :  0.01387429982423782348632812500000
children_playing 		 :  0.00199420424178242683410644531250
dog_bark 		 :  0.00151245389133691787719726562500
drilling 		 :  0.41210639476776123046875000000000
engine_idling 		 :  0.00198868243023753166198730468750
gun_shot 		 :  0.00001093672017304925248026847839
jackhammer 		 :  0.03502830117940902709960937500000
siren 		 :  0.52517592906951904296875000000000
street_music 		 :  0.00373518071137368679046630859375


  stft_matrix[..., :off_start] = fft.rfft(fft_window * y_frames_pre, axis=-2)
  stft_matrix[..., -off_end:] = fft.rfft(fft_window * y_frames_post, axis=-2)
  stft_matrix[..., bl_s + off_start : bl_t + off_start] = fft.rfft(


In [None]:
# Class: jackhammer
filename = os.path.join(VAL_DIR, "jackhammer.mp3")
print_prediction(filename) 

The predicted class is: siren 

air_conditioner 		 :  0.02664410322904586791992187500000
car_horn 		 :  0.02038235776126384735107421875000
children_playing 		 :  0.06551940739154815673828125000000
dog_bark 		 :  0.04880893230438232421875000000000
drilling 		 :  0.16975186765193939208984375000000
engine_idling 		 :  0.04078878834843635559082031250000
gun_shot 		 :  0.00293349893763661384582519531250
jackhammer 		 :  0.23104561865329742431640625000000
siren 		 :  0.36358994245529174804687500000000
street_music 		 :  0.03053554147481918334960937500000


  stft_matrix[..., :off_start] = fft.rfft(fft_window * y_frames_pre, axis=-2)
  stft_matrix[..., -off_end:] = fft.rfft(fft_window * y_frames_post, axis=-2)
  stft_matrix[..., bl_s + off_start : bl_t + off_start] = fft.rfft(


In [None]:
# Class: kids playing
filename = os.path.join(VAL_DIR, "kids_playing.mp3")
print_prediction(filename) 

The predicted class is: siren 



  stft_matrix[..., :off_start] = fft.rfft(fft_window * y_frames_pre, axis=-2)
  stft_matrix[..., -off_end:] = fft.rfft(fft_window * y_frames_post, axis=-2)
  stft_matrix[..., bl_s + off_start : bl_t + off_start] = fft.rfft(


air_conditioner 		 :  0.00301683344878256320953369140625
car_horn 		 :  0.10142213106155395507812500000000
children_playing 		 :  0.11878542602062225341796875000000
dog_bark 		 :  0.04199254512786865234375000000000
drilling 		 :  0.03343436121940612792968750000000
engine_idling 		 :  0.00188155565410852432250976562500
gun_shot 		 :  0.00051945296581834554672241210938
jackhammer 		 :  0.01036619301885366439819335937500
siren 		 :  0.54443687200546264648437500000000
street_music 		 :  0.14414460957050323486328125000000


In [None]:
# Class: siren
filename = os.path.join(VAL_DIR, "siren.mp3")
print_prediction(filename) 

The predicted class is: siren 

air_conditioner 		 :  0.00001835819057305343449115753174
car_horn 		 :  0.02222104184329509735107421875000
children_playing 		 :  0.00399238662794232368469238281250
dog_bark 		 :  0.00138486456125974655151367187500
drilling 		 :  0.00384465279057621955871582031250
engine_idling 		 :  0.00001672653343121055513620376587
gun_shot 		 :  0.00000098241787327424390241503716
jackhammer 		 :  0.00066541740670800209045410156250
siren 		 :  0.94724416732788085937500000000000
street_music 		 :  0.02061129733920097351074218750000


  stft_matrix[..., :off_start] = fft.rfft(fft_window * y_frames_pre, axis=-2)
  stft_matrix[..., -off_end:] = fft.rfft(fft_window * y_frames_post, axis=-2)
  stft_matrix[..., bl_s + off_start : bl_t + off_start] = fft.rfft(


In [None]:
# Class: street music
filename = os.path.join(VAL_DIR, "street_music.mp3")
print_prediction(filename) 

The predicted class is: street_music 

air_conditioner 		 :  0.04146852344274520874023437500000
car_horn 		 :  0.09178592264652252197265625000000
children_playing 		 :  0.04445578157901763916015625000000
dog_bark 		 :  0.02995055913925170898437500000000
drilling 		 :  0.04206641390919685363769531250000
engine_idling 		 :  0.03835495933890342712402343750000
gun_shot 		 :  0.00504108518362045288085937500000
jackhammer 		 :  0.05032153427600860595703125000000
siren 		 :  0.11103385686874389648437500000000
street_music 		 :  0.54552137851715087890625000000000


  stft_matrix[..., :off_start] = fft.rfft(fft_window * y_frames_pre, axis=-2)
  stft_matrix[..., -off_end:] = fft.rfft(fft_window * y_frames_post, axis=-2)
  stft_matrix[..., bl_s + off_start : bl_t + off_start] = fft.rfft(
