In [1]:
import math
import pickle
import random
from datetime import datetime

import IPython
import keras.backend as K
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from keras import backend as keras_backend
from keras.callbacks import ModelCheckpoint
from keras.layers import Dense, SpatialDropout2D, Activation, Conv2D, MaxPooling2D, BatchNormalization, GlobalAveragePooling2D, LeakyReLU, Input
from keras.models import Sequential, load_model
from keras.optimizers import Adam
from keras.regularizers import l2
from keras.utils import to_categorical
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder

In [None]:
# Read the training metadata CSV and preview its first rows.
train_csv_path = '/content/drive/MyDrive/Gen AI/SoundClassification/EmergencyvsNonEmergencyVehicleSoundClassification-200414-193707/train_fuSp8nd.csv'
labels = pd.read_csv(train_csv_path)
labels.head()

Unnamed: 0,ID,Class
0,0,siren
1,1,street_music
2,2,drilling
3,3,siren
4,4,dog_bark


In [None]:
# Tell Keras that image tensors are laid out as (rows, columns, channels).
keras_backend.set_image_data_format('channels_last')

# Human-readable class names kept for later use.
# NOTE: this rebinds `labels`, which previously held the metadata DataFrame.
labels = [
    'Air Conditioner', 'Car Horn', 'Children Playing', 'Dog bark', 'Drilling',
    'Engine Idling', 'Gun Shot', 'Jackhammer', 'Siren', 'Street Music',
]

In [None]:
# Load the pre-computed feature array from Drive (file name suggests MFCCs — TODO confirm).
X = np.load("/content/drive/MyDrive/Gen AI/SoundClassification/EmergencyvsNonEmergencyVehicleSoundClassification-200414-193707/X-mfcc_ass.npy")

In [2]:
# Load the label array that accompanies X.
# NOTE(review): allow_pickle=True unpickles objects stored in the file, which can
# execute arbitrary code — only load .npy files from a trusted source.
y = np.load("/content/drive/MyDrive/Gen AI/SoundClassification/EmergencyvsNonEmergencyVehicleSoundClassification-200414-193707/y-mfcc_ass.npy", allow_pickle=True)

In [3]:
# Keep only column 1 of the loaded label array, which holds the class name
# (column 0 is presumably the sample ID — TODO confirm against the .npy writer).
# The ndim guard makes the cell safe to re-run: once `y` is 1-D, indexing it
# with [:, 1] again would raise IndexError.
if y.ndim == 2:
    y = y[:, 1]

In [4]:
# Display the label vector (class-name strings) after the column selection.
y

array(['siren', 'street_music', 'drilling', ..., 'engine_idling',
       'engine_idling', 'air_conditioner'], dtype=object)

In [5]:
# Preview the first five labels.
y[:5]

array(['siren', 'street_music', 'drilling', 'siren', 'dog_bark'],
      dtype=object)

In [None]:
# Show the shapes of the feature array and the label vector, one per line.
print(X.shape, y.shape, sep="\n")

(5435, 40, 751)
(5435,)


In [6]:
# Learn the class-name -> integer-id mapping from the label vector.
le = LabelEncoder()
le.fit(y)

# Convert the integer ids into one-hot rows for the softmax output.
y_encoded = to_categorical(le.transform(y))

In [8]:
# Class names known to the encoder; the position in this array is the integer id.
le.classes_

array(['air_conditioner', 'car_horn', 'children_playing', 'dog_bark',
       'drilling', 'engine_idling', 'gun_shot', 'jackhammer', 'siren',
       'street_music'], dtype=object)

In [9]:
# Check the integer id the encoder assigns to each class name.
le.transform(['air_conditioner', 'car_horn', 'children_playing', 'dog_bark',
       'drilling', 'engine_idling', 'gun_shot', 'jackhammer', 'siren',
       'street_music'])

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [7]:
# Display the one-hot encoded label matrix.
y_encoded

array([[0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.]])

In [None]:
# Display the one-hot encoded label matrix.
# NOTE(review): this repeats the previous display cell and can be removed.
y_encoded

array([[0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.]])

In [None]:
# Shape of the one-hot matrix: (number of samples, number of classes).
y_encoded.shape

(5435, 10)

In [None]:
# Layout of a single sample as fed to the network: (rows, columns, channels).
num_rows, num_columns, num_channels = 40, 751, 1

# Add the trailing channel axis so X matches the channels-last network input.
X = np.reshape(X, (X.shape[0], num_rows, num_columns, num_channels))

In [None]:
# Total number of labels to predict (equal to the network output nodes).
# Taken from the width of the one-hot label matrix, i.e. one column per class.
num_labels = y_encoded.shape[1]

In [None]:
# Show the final shapes of the network input and the one-hot targets, one per line.
print(X.shape, y_encoded.shape, sep="\n")

(5435, 40, 751, 1)
(5435, 10)


In [None]:
def _add_conv_block(model, filters, l2_rate, dropout_rate=None):
    """Append [SpatialDropout2D ->] Conv2D(3x3) -> LeakyReLU -> BatchNormalization to `model`.

    The dropout layer is added whenever `dropout_rate` is not None, including a
    rate of 0, so the layer stack is the same regardless of the rate value.
    """
    if dropout_rate is not None:
        model.add(SpatialDropout2D(dropout_rate))
    model.add(Conv2D(filters=filters,
                     kernel_size=(3, 3),
                     kernel_regularizer=l2(l2_rate)))
    model.add(LeakyReLU(alpha=0.1))
    model.add(BatchNormalization())


def create_model(spatial_dropout_rate_1=0, spatial_dropout_rate_2=0, l2_rate=0,
                 input_shape=None, num_classes=None):
    """Build the (uncompiled) convolutional classifier.

    Architecture: two 32-filter conv blocks, 2x2 max pooling, two 64-filter conv
    blocks, global average pooling and a softmax output layer.

    Args:
        spatial_dropout_rate_1: SpatialDropout2D rate applied before the 2nd and
            3rd convolutions.
        spatial_dropout_rate_2: SpatialDropout2D rate applied before the 4th
            convolution.
        l2_rate: L2 kernel-regularization factor used by every convolution.
        input_shape: (rows, columns, channels) of one sample. Defaults to the
            notebook globals (num_rows, num_columns, num_channels).
        num_classes: Number of output units. Defaults to the notebook global
            num_labels.

    Returns:
        A keras Sequential model; the caller is responsible for compiling it.
    """
    # Fall back to the notebook-level globals so existing calls keep working.
    if input_shape is None:
        input_shape = (num_rows, num_columns, num_channels)
    if num_classes is None:
        num_classes = num_labels

    # Create a sequential object
    model = Sequential()

    # Declare the input with an explicit Input layer instead of passing
    # `input_shape=` to the first Conv2D, which makes Keras emit a warning.
    model.add(Input(shape=input_shape))

    # Conv 1 and Conv 2 (32 filters each); no dropout before the very first conv
    _add_conv_block(model, 32, l2_rate)
    _add_conv_block(model, 32, l2_rate, dropout_rate=spatial_dropout_rate_1)

    # Max Pooling #1
    model.add(MaxPooling2D(pool_size=(2, 2)))

    # Conv 3 and Conv 4 (64 filters each)
    _add_conv_block(model, 64, l2_rate, dropout_rate=spatial_dropout_rate_1)
    _add_conv_block(model, 64, l2_rate, dropout_rate=spatial_dropout_rate_2)

    # Reduces each h×w feature map to a single number by taking the average of all h,w values.
    model.add(GlobalAveragePooling2D())

    # Softmax output
    model.add(Dense(num_classes, activation='softmax'))

    return model

# Regularization hyper-parameters used to build the network
l2_rate = 0.0005
spatial_dropout_rate_1 = 0.07
spatial_dropout_rate_2 = 0.14

# Build the (still uncompiled) network with those rates.
model = create_model(
    spatial_dropout_rate_1=spatial_dropout_rate_1,
    spatial_dropout_rate_2=spatial_dropout_rate_2,
    l2_rate=l2_rate,
)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [None]:
# Optimizer configuration
adam = Adam(beta_2=0.999, beta_1=0.99, learning_rate=1e-4)

# One-hot targets with a softmax output -> categorical cross-entropy; track accuracy.
model.compile(optimizer=adam,
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# Print the layer-by-layer architecture summary
model.summary()

In [None]:
# Training hyper-parameters
num_epochs = 250
num_batch_size = 128

# Checkpoint destination on Drive (an earlier revision used the name 'simple-train-nb3.hdf5').
model_file = 'simple-train-nb3.keras'
model_dir = '/content/drive/MyDrive/Gen AI/SoundClassification/EmergencyvsNonEmergencyVehicleSoundClassification-200414-193707/'
model_path = model_dir + model_file

# Write a checkpoint only when the monitored metric improves
# (the training log shows this is val_loss).
checkpointer = ModelCheckpoint(filepath=model_path, verbose=1, save_best_only=True)

# NOTE(review): per the Keras docs, validation_split holds out the *last* fraction
# of X / y_encoded before shuffling — confirm the sample order is not class-sorted.
fit_options = dict(
    batch_size=num_batch_size,
    epochs=num_epochs,
    validation_split=1/12.,
    callbacks=[checkpointer],
    verbose=1,
)

# Train and report the wall-clock duration.
start = datetime.now()
history = model.fit(X, y_encoded, **fit_options)
duration = datetime.now() - start
print("Training completed in time: ", duration)

Epoch 1/250
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step - accuracy: 0.2772 - loss: 2.1768
Epoch 1: val_loss improved from inf to 2.23128, saving model to /content/drive/MyDrive/Gen AI/SoundClassification/EmergencyvsNonEmergencyVehicleSoundClassification-200414-193707/simple-train-nb3.keras
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 41ms/step - accuracy: 0.2777 - loss: 2.1758 - val_accuracy: 0.2826 - val_loss: 2.2313
Epoch 2/250
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step - accuracy: 0.3343 - loss: 2.0684
Epoch 2: val_loss improved from 2.23128 to 2.14742, saving model to /content/drive/MyDrive/Gen AI/SoundClassification/EmergencyvsNonEmergencyVehicleSoundClassification-200414-193707/simple-train-nb3.keras
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 38ms/step - accuracy: 0.3345 - loss: 2.0679 - val_accuracy: 0.3223 - val_loss: 2.1474
Epoch 3/250
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0

In [None]:
# Reload the checkpoint written by ModelCheckpoint (save_best_only=True), replacing
# the in-memory model left over from the final training epoch.
model = load_model(model_path)

In [None]:
# Class-probability predictions for every sample in X.
# NOTE(review): X is the same array that was passed to model.fit, so these
# predictions (and the report / confusion matrix built from them) include the
# samples the model was trained on — they are not a held-out estimate.
y_pred = model.predict(X)

[1m170/170[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step


In [None]:
# Display the predicted per-class probabilities (one row per sample).
y_pred

array([[1.19017646e-01, 2.00404599e-02, 6.06138725e-03, ...,
        2.32620370e-02, 5.29978991e-01, 5.05872490e-03],
       [6.90617934e-02, 7.86446966e-03, 3.74338171e-03, ...,
        2.15449911e-02, 1.69014471e-04, 8.84960949e-01],
       [3.54755201e-07, 6.37670382e-05, 3.68608698e-06, ...,
        4.05140469e-07, 1.82220745e-07, 1.42566481e-04],
       ...,
       [1.20468661e-02, 1.24869912e-04, 1.88351125e-02, ...,
        6.73542451e-03, 3.23784025e-03, 9.26393457e-03],
       [3.79765220e-02, 1.28288905e-03, 9.05860681e-03, ...,
        2.63150781e-03, 1.47274346e-03, 3.43145244e-02],
       [9.66671824e-01, 8.41160363e-04, 1.37274736e-03, ...,
        5.96020727e-05, 2.36969930e-03, 2.26401016e-02]], dtype=float32)

In [None]:
# Turn each row of class probabilities into the index of the most likely class.
# `y_pred` is overwritten in place, so the ndim guard keeps the cell safe to
# re-run: applying argmax(axis=1) to the already 1-D result would raise an error.
if y_pred.ndim == 2:
    y_pred = np.argmax(y_pred, axis=1)

In [None]:
# Display the predicted integer class ids.
y_pred

array([8, 9, 4, ..., 5, 5, 0])

In [None]:
# Collapse the one-hot label rows back to integer class ids for the metrics below.
y_encoded_ = y_encoded.argmax(axis=1)

In [None]:
# Per-class precision / recall / F1, keyed by integer class id.
report = classification_report(y_encoded_, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.95      0.98      0.97       600
           1       0.86      0.93      0.89       306
           2       0.90      0.90      0.90       600
           3       0.85      0.89      0.87       600
           4       0.96      0.92      0.94       600
           5       0.96      0.97      0.96       624
           6       0.93      0.94      0.93       230
           7       0.99      0.99      0.99       668
           8       0.95      0.94      0.95       607
           9       0.92      0.83      0.87       600

    accuracy                           0.93      5435
   macro avg       0.93      0.93      0.93      5435
weighted avg       0.93      0.93      0.93      5435



In [None]:
# Confusion matrix: first argument (true ids) indexes rows, second (predicted ids) columns.
cm = confusion_matrix(y_encoded_, y_pred)
print(cm)

[[588   0   1   1   0   1   1   1   3   4]
 [  0 285   0  10   5   0   0   0   1   5]
 [  3   1 538  25   3   3   0   0   4  23]
 [  4  14  20 535   8   7   4   0   3   5]
 [  1  17   1  16 552   0   7   3   0   3]
 [  2   2   0   0   0 604   4   1   8   3]
 [  0   0   0  12   2   0 216   0   0   0]
 [  0   3   0   0   0   0   1 664   0   0]
 [  2   5   0  23   2   2   0   0 570   3]
 [ 16   4  41   8   5  13   0   4   8 501]]
