<b>Load preprocessed dataset

In [2]:
# retrieve the preprocessed data from previous notebook

%store -r x_train 
%store -r x_test 
%store -r y_train 
%store -r y_test 
%store -r yy 
%store -r le

In [3]:
x_train.shape

(6985, 257, 126)

In [4]:
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Convolution2D, MaxPooling2D, Conv2D, GlobalAveragePooling2D
from keras.optimizers import Adam
from keras.utils import np_utils
from sklearn import metrics

num_rows = 257
num_columns = 126
num_channels = 1
SAMPLE_RATE = 16000

**Construct the Model**

In [5]:
x_train = x_train.reshape(x_train.shape[0], num_rows, num_columns, num_channels)
x_test = x_test.reshape(x_test.shape[0], num_rows, num_columns, num_channels)

num_labels = yy.shape[1]
filter_size = 2

In [6]:
# Construct model 
model = Sequential()
model.add(Conv2D(filters=16, kernel_size=2, input_shape=(num_rows, num_columns, num_channels), activation='relu'))
model.add(MaxPooling2D(pool_size=2))
model.add(Dropout(0.2))

model.add(Conv2D(filters=32, kernel_size=2, activation='relu'))
model.add(MaxPooling2D(pool_size=2))
model.add(Dropout(0.2))

model.add(Conv2D(filters=64, kernel_size=2, activation='relu'))
model.add(MaxPooling2D(pool_size=2))
model.add(Dropout(0.2))

model.add(Conv2D(filters=128, kernel_size=2, activation='relu'))
model.add(MaxPooling2D(pool_size=2))
model.add(Dropout(0.2))
model.add(GlobalAveragePooling2D())

model.add(Dense(num_labels, activation='softmax')) 

**Compile the Model**
<p>Loss function - we will use categorical_crossentropy. This is the most common choice for classification. A lower score indicates that the model is performing better.</p>
<p>Metrics - we will use the accuracy metric which will allow us to view the accuracy score on the validation data when we train the model.</p>
<p>Optimizer - here we will use adam which is a generally good optimizer for many use cases.</p>

In [7]:
# Compile the model
model.compile(loss='categorical_crossentropy', metrics=['accuracy'], optimizer='adam') 

In [8]:
# Display model architecture summary 
model.summary()

# Calculate pre-training accuracy 
score = model.evaluate(x_test, y_test, verbose=1)
accuracy = 100*score[1]

print("Pre-training accuracy: %.4f%%" % accuracy)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d (Conv2D)             (None, 256, 125, 16)      80        
                                                                 
 max_pooling2d (MaxPooling2D  (None, 128, 62, 16)      0         
 )                                                               
                                                                 
 dropout (Dropout)           (None, 128, 62, 16)       0         
                                                                 
 conv2d_1 (Conv2D)           (None, 127, 61, 32)       2080      
                                                                 
 max_pooling2d_1 (MaxPooling  (None, 63, 30, 32)       0         
 2D)                                                             
                                                                 
 dropout_1 (Dropout)         (None, 63, 30, 32)        0

**Training**

In [9]:
from keras.callbacks import CSVLogger, ModelCheckpoint
from datetime import datetime 
import os

num_epochs = 72
num_batch_size = 256

checkpointer = ModelCheckpoint(filepath='saved_models/spectrogramweights.best.basic_cnn.hdf5', 
                               verbose=1, save_best_only=True)
start = datetime.now()

model.fit(x_train, y_train, batch_size=num_batch_size, epochs=num_epochs, validation_data=(x_test, y_test), callbacks=[checkpointer], verbose=1)


duration = datetime.now() - start
print("Training completed in time: ", duration)

Epoch 1/72
Epoch 1: val_loss improved from inf to 2.39909, saving model to saved_models\spectrogramweights.best.basic_cnn.hdf5
Epoch 2/72
Epoch 2: val_loss improved from 2.39909 to 2.31486, saving model to saved_models\spectrogramweights.best.basic_cnn.hdf5
Epoch 3/72
Epoch 3: val_loss improved from 2.31486 to 2.19259, saving model to saved_models\spectrogramweights.best.basic_cnn.hdf5
Epoch 4/72
Epoch 4: val_loss improved from 2.19259 to 2.11973, saving model to saved_models\spectrogramweights.best.basic_cnn.hdf5
Epoch 5/72
Epoch 5: val_loss improved from 2.11973 to 2.08838, saving model to saved_models\spectrogramweights.best.basic_cnn.hdf5
Epoch 6/72
Epoch 6: val_loss improved from 2.08838 to 2.07683, saving model to saved_models\spectrogramweights.best.basic_cnn.hdf5
Epoch 7/72
Epoch 7: val_loss improved from 2.07683 to 2.06254, saving model to saved_models\spectrogramweights.best.basic_cnn.hdf5
Epoch 8/72
Epoch 8: val_loss did not improve from 2.06254
Epoch 9/72
Epoch 9: val_loss 

**Test the model**

In [10]:
# Evaluating the model on the training and testing set
score = model.evaluate(x_train, y_train, verbose=0)
print("Training Accuracy: ", score[1])

score = model.evaluate(x_test, y_test, verbose=0)
print("Testing Accuracy: ", score[1])

Training Accuracy:  0.53214031457901
Testing Accuracy:  0.5231825709342957


**Save the Model**

In [11]:
model.save('saved_models/Spectrogram_Classification_Model.keras')

**Prediction**

In [12]:
import numpy as np
import librosa
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical
import os


# Slice for common length of 2 seconds
def slice_audio(librosa_audio, librosa_sample_rate = 22050):
    SAMPLE_LENGTH = 2 * librosa_sample_rate

    librosa_audio_sliced = librosa_audio[:SAMPLE_LENGTH]
    if len(librosa_audio) < SAMPLE_LENGTH:
        # print(f"Audio length {len(librosa_audio)} is less than 2 seconds. Padding with zeros.")
        # np.pad specifies the number of values to add at the beginning and the end of the librosa_audio array.
        # 0 -> no padding in the beginning.
        # SAMPLE_LENGTH - len(librosa_audio) -> number of zeros to end, ensuring the total length is 2 seconds.
        librosa_audio_sliced = np.pad(librosa_audio, (0, SAMPLE_LENGTH - len(librosa_audio)), constant_values=0)
    return librosa_audio_sliced


def extract_spectrogram(audio_path):
    
    audio_file, librosa_sample_rate = librosa.load(audio_path, sr=SAMPLE_RATE, res_type='kaiser_fast')
    audio_file = slice_audio(audio_file, librosa_sample_rate)

    spectrogram = librosa.stft(audio_file, n_fft=512, win_length=256, dtype=np.float32)
    spectrogram = librosa.amplitude_to_db(abs(spectrogram), ref=np.max)
    #librosa.display.specshow(spectrogram, sr=librosa_sample_rate, x_axis='time')

    # spectrogram = tf.expand_dims(spectrogram, axis = 2)

    return spectrogram

In [13]:
def print_prediction(file_name):
    prediction_feature = extract_spectrogram(file_name)
    prediction_feature = prediction_feature.reshape(1, num_rows, num_columns, num_channels)

    predicted_vector = np.argmax(model.predict(prediction_feature), axis=-1)
    predicted_class = le.inverse_transform(predicted_vector) 
    print("The predicted class is:", predicted_class[0], '\n') 

    predicted_proba_vector = model.predict(prediction_feature) 
    predicted_proba = predicted_proba_vector[0]
    for i in range(len(predicted_proba)): 
        category = le.inverse_transform(np.array([i]))
        print(category[0], "\t\t : ", format(predicted_proba[i], '.32f') )

**Validation**
<p>On new samples<p>

In [14]:
from tensorflow.keras.models import load_model
model = load_model('saved_models/Spectrogram_Classification_Model.keras')

VAL_DIR = "D:\\Code\\ProjectsPython\\ML_TrainingGround\\ML_Audio\\data\\UrbanSound8K\\validation"

In [15]:
# Class: Air Conditioner
filename = os.path.join(VAL_DIR, "air_conditioner.mp3")
print_prediction(filename)

  stft_matrix[..., :off_start] = fft.rfft(fft_window * y_frames_pre, axis=-2)
  stft_matrix[..., -off_end:] = fft.rfft(fft_window * y_frames_post, axis=-2)
  stft_matrix[..., bl_s + off_start : bl_t + off_start] = fft.rfft(


ValueError: cannot reshape array of size 128757 into shape (1,257,126,1)

In [None]:
# Class: Car idle
filename = os.path.join(VAL_DIR, "car_idle.mp3")
print_prediction(filename) 

In [None]:
# Class: dog bark
filename = os.path.join(VAL_DIR, "dog_barking.mp3")
print_prediction(filename) 

In [None]:
# Class: drill
filename = os.path.join(VAL_DIR, "drill.mp3")
print_prediction(filename) 

In [None]:
# Class: jackhammer
filename = os.path.join(VAL_DIR, "jackhammer.mp3")
print_prediction(filename) 

In [None]:
# Class: kids playing
filename = os.path.join(VAL_DIR, "kids_playing.mp3")
print_prediction(filename) 

In [None]:
# Class: siren
filename = os.path.join(VAL_DIR, "siren.mp3")
print_prediction(filename) 

In [None]:
# Class: street music
filename = os.path.join(VAL_DIR, "street_music.mp3")
print_prediction(filename) 