# Urban sound classification on UrbanSound8K with MFCC features and CNNs

## Import libraries

In [1]:
import numpy as np
import tensorflow as tf
import keras
from keras.preprocessing.image import ImageDataGenerator

print("Tensorflow version %s" %tf.__version__)
print("Keras version %s" %keras.__version__)

Tensorflow version 2.4.1
Keras version 2.4.3


## Load data and extract MFCC features from audio

In [2]:
# Load various imports 
import pandas as pd
import os
import librosa

# Number of time frames every MFCC matrix is padded or truncated to
max_pad_len = 174

def extract_features(file_name):
    """Load an audio file and return its MFCCs with a fixed number of frames.

    The file is decoded with ``librosa.load`` (``res_type='kaiser_fast'``) and
    40 MFCCs are computed per frame. The frame axis is then truncated and/or
    zero-padded on the right so the result always has ``max_pad_len`` columns.

    Args:
        file_name: Path of the audio file to read.

    Returns:
        The MFCC matrix with 40 rows and ``max_pad_len`` columns, or ``None``
        if the file could not be loaded or processed (the error is printed).
    """
    try:
        audio, sample_rate = librosa.load(file_name, res_type='kaiser_fast')
        mfccs = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40)
        # Truncate first: a clip with more than max_pad_len frames would
        # otherwise yield a negative pad width, which np.pad rejects.
        mfccs = mfccs[:, :max_pad_len]
        pad_width = max_pad_len - mfccs.shape[1]
        mfccs = np.pad(mfccs, pad_width=((0, 0), (0, pad_width)), mode='constant')

    except Exception as e:
        # Best-effort extraction: report which file failed and why, then let
        # the caller decide what to do with the missing sample.
        print("Error encountered while parsing file: ", file_name)
        print("  reason: ", repr(e))
        return None

    return mfccs
    
# Set the path to the full UrbanSound dataset
# NOTE(review): this is an absolute path while the metadata CSV below is read
# from a relative one; confirm both point at the same dataset copy.
fulldatasetpath = '/Xception/UrbanSound8K/audio/'

metadata = pd.read_csv('UrbanSound8K/metadata/UrbanSound8K.csv')

features = []
failed_files = []

# Resolve the dataset root once instead of on every iteration
dataset_root = os.path.abspath(fulldatasetpath)

# Iterate through each sound file and extract the features
for index, row in metadata.iterrows():

    file_name = os.path.join(dataset_root, 'fold'+str(row["fold"]), str(row["slice_file_name"]))

    class_label = row["class"]
    data = extract_features(file_name)

    # extract_features returns None on failure; keeping such rows would make
    # the later np.array(...) conversion produce a ragged object array.
    if data is None:
        failed_files.append(file_name)
        continue

    features.append([data, class_label])

# Convert into a pandas DataFrame
featuresdf = pd.DataFrame(features, columns=['feature','class_label'])

print('Finished feature extraction from ', len(featuresdf), ' files')
if failed_files:
    print('Skipped ', len(failed_files), ' files that could not be parsed')



Finished feature extraction from  8732  files


## Preprocess data

In [3]:
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical

# Stack the per-file feature matrices and their class names into numpy arrays
X = np.array(featuresdf['feature'].tolist())
y = np.array(featuresdf['class_label'].tolist())

# Map class names to integer ids, then one-hot encode those ids
le = LabelEncoder()
le.fit(y)
yy = to_categorical(le.transform(y))

# Hold out 20% of the samples as a test set (fixed seed for repeatability)
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    X,
    yy,
    test_size=0.2,
    random_state=42,
)

In [10]:
# Inspect the first training sample. Only the last expression of a cell is
# displayed, so this is the single expression kept in the cell.
x_train[0]


array([[-8.3752213e+01, -7.1893913e+01, -7.4429947e+01, ...,
        -9.7842186e+01, -9.4238541e+01,  0.0000000e+00],
       [ 1.1466684e+02,  1.1912703e+02,  1.2508240e+02, ...,
         1.1245096e+02,  1.0646953e+02,  0.0000000e+00],
       [-7.3170403e+01, -8.1585869e+01, -8.7429298e+01, ...,
        -1.0017877e+02, -1.0056201e+02,  0.0000000e+00],
       ...,
       [ 6.2128234e-01, -1.4668137e-02,  9.3862867e-01, ...,
         5.4531813e-01,  3.0852802e+00,  0.0000000e+00],
       [ 3.8533199e+00,  1.1004624e+00,  1.5143282e+00, ...,
         4.1922727e+00,  4.5371199e+00,  0.0000000e+00],
       [ 3.2343304e+00, -2.4159760e+00, -1.5217065e+00, ...,
        -1.3523768e+01, -1.0443431e+01,  0.0000000e+00]], dtype=float32)

In [4]:
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Convolution2D, Conv2D, MaxPooling2D, GlobalAveragePooling2D
from keras.optimizers import Adam
from keras.utils import np_utils
from sklearn import metrics 

num_rows = 40
num_columns = 174
num_channels = 1

# Reshape both splits to (samples, rows, columns, channels) for Conv2D
x_train = x_train.reshape(x_train.shape[0], num_rows, num_columns, num_channels)
x_test = x_test.reshape(x_test.shape[0], num_rows, num_columns, num_channels)

num_labels = yy.shape[1]
filter_size = 2

# Construct model: four Conv2D -> MaxPooling2D -> Dropout stages whose only
# difference is the number of filters.
model = Sequential()
for stage, n_filters in enumerate((16, 32, 64, 128)):
    # Only the first layer of the Sequential model declares the input shape
    first_layer_kwargs = (
        {'input_shape': (num_rows, num_columns, num_channels)} if stage == 0 else {}
    )
    model.add(Conv2D(filters=n_filters, kernel_size=filter_size, activation='relu',
                     **first_layer_kwargs))
    model.add(MaxPooling2D(pool_size=2))
    model.add(Dropout(0.2))
model.add(GlobalAveragePooling2D())

model.add(Dense(num_labels, activation='softmax'))

# Compile the model
model.compile(loss='categorical_crossentropy', metrics=['accuracy'], optimizer='adam')

# Display model architecture summary
model.summary()

# Calculate pre-training accuracy (score[1] is the 'accuracy' metric)
score = model.evaluate(x_test, y_test, verbose=1)
accuracy = 100*score[1]

print("Pre-training accuracy: %.4f%%" % accuracy)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d (Conv2D)              (None, 39, 173, 16)       80        
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 19, 86, 16)        0         
_________________________________________________________________
dropout (Dropout)            (None, 19, 86, 16)        0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 18, 85, 32)        2080      
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 9, 42, 32)         0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 9, 42, 32)         0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 8, 41, 64)         8

## Model

In [4]:
from keras.models import Sequential
from keras.layers import SeparableConv2D, ZeroPadding2D, Activation, Dropout, Dense, \
                            Conv2D, MaxPooling2D, Flatten, GlobalAveragePooling2D
from keras.layers.normalization import BatchNormalization
from keras import Input, optimizers

# Input geometry of one sample: rows x columns x channels
num_rows, num_columns, num_channels = 40, 174, 1

# Reshape both splits to (samples, rows, columns, channels)
x_train = x_train.reshape(len(x_train), num_rows, num_columns, num_channels)
x_test = x_test.reshape(len(x_test), num_rows, num_columns, num_channels)

# Number of classes is the width of the one-hot label matrix
num_labels = yy.shape[1]

def Net(input_shape, num_classes, dilated_kernel, dilation, dilated_padding):
    """Build and compile the DWS-CNN + dilated-CNN classifier.

    The network stacks three depthwise-separable convolution stages
    (zero-pad -> SeparableConv2D -> ReLU -> BatchNorm -> MaxPool -> Dropout)
    that differ only in their pooling width, followed by one dilated
    convolution and a softmax classifier.

    Args:
        input_shape: Shape of a single sample, without the batch axis.
        num_classes: Number of units in the final classification layer.
        dilated_kernel: Kernel size of the dilated convolution.
        dilation: Dilation rate applied along the second spatial axis; the
            first spatial axis is not dilated.
        dilated_padding: Zero-padding added on the second spatial axis before
            the dilated convolution, expressed in multiples of ``dilation``.

    Returns:
        A Sequential model compiled with categorical cross-entropy, the Adam
        optimizer and the accuracy metric.
    """
    model = Sequential()

    # input layer
    model.add(Input(shape=input_shape))

    # DWS-CNN layers 1-3: identical except for the pooling width
    for pool_width in (5, 4, 2):
        # Explicit padding of 2 followed by a 5x5 'valid' convolution keeps
        # the spatial size unchanged.
        model.add(ZeroPadding2D(padding=(2, 2)))
        model.add(SeparableConv2D(256, kernel_size=(5, 5), strides=(1, 1), padding='valid'))
        model.add(Activation('relu'))
        # Batch normalisation before passing it to the next layer
        model.add(BatchNormalization())
        # Pool along the second spatial axis only
        model.add(MaxPooling2D(pool_size=(1, pool_width), strides=(1, pool_width), padding='valid'))
        model.add(Dropout(0.25))

    # DIL-CNN
    model.add(ZeroPadding2D(padding=(0, dilated_padding*dilation)))
    model.add(Conv2D(256, kernel_size=dilated_kernel, dilation_rate=(1, dilation)))
    model.add(BatchNormalization())
    model.add(Activation('relu'))

    # classifier layer
    model.add(Flatten())
    # categorical_crossentropy expects class probabilities by default, so the
    # output layer needs a softmax rather than raw linear activations.
    model.add(Dense(num_classes, activation='softmax'))

    # model compilation for training
    adam = optimizers.Adam()
    model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])

    return model


# create the model
# Pass the real data geometry and class count; Net now honours both arguments.
input_shape = (num_rows, num_columns, num_channels)
num_classes = num_labels
dilated_kernel = (3,3)
dilation = 10
dilated_padding = 2
model = Net(input_shape,num_classes,dilated_kernel,dilation,dilated_padding)
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
zero_padding2d (ZeroPadding2 (None, 44, 178, 1)        0         
_________________________________________________________________
separable_conv2d (SeparableC (None, 40, 174, 256)      537       
_________________________________________________________________
activation (Activation)      (None, 40, 174, 256)      0         
_________________________________________________________________
batch_normalization (BatchNo (None, 40, 174, 256)      1024      
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 40, 34, 256)       0         
_________________________________________________________________
dropout (Dropout)            (None, 40, 34, 256)       0         
_________________________________________________________________
zero_padding2d_1 (ZeroPaddin (None, 44, 38, 256)       0

## Train the model

In [5]:
#from keras.callbacks import ModelCheckpoint 
from datetime import datetime 

# Training schedule
num_epochs = 72
num_batch_size = 256

# Checkpointing is currently disabled:
#checkpointer = ModelCheckpoint(filepath='saved_models/weights.best.basic_cnn.hdf5',
#                               verbose=1, save_best_only=True)

# Time the whole fit; the held-out split is used as validation data
start = datetime.now()

model.fit(
    x_train,
    y_train,
    batch_size=num_batch_size,
    epochs=num_epochs,
    validation_data=(x_test, y_test),
    verbose=1,
)

duration = datetime.now() - start
print("Training completed in time: ", duration)

Epoch 1/72
Epoch 2/72
Epoch 3/72
Epoch 4/72
Epoch 5/72
Epoch 6/72
Epoch 7/72
Epoch 8/72
Epoch 9/72
Epoch 10/72
Epoch 11/72
Epoch 12/72
Epoch 13/72
Epoch 14/72
Epoch 15/72
Epoch 16/72
Epoch 17/72
Epoch 18/72
Epoch 19/72
Epoch 20/72
Epoch 21/72
Epoch 22/72
Epoch 23/72
Epoch 24/72
Epoch 25/72
Epoch 26/72
Epoch 27/72
Epoch 28/72
Epoch 29/72
Epoch 30/72
Epoch 31/72
Epoch 32/72
Epoch 33/72
Epoch 34/72
Epoch 35/72
Epoch 36/72
Epoch 37/72
Epoch 38/72
Epoch 39/72
Epoch 40/72
Epoch 41/72
Epoch 42/72
Epoch 43/72
Epoch 44/72
Epoch 45/72
Epoch 46/72
Epoch 47/72
Epoch 48/72
Epoch 49/72
Epoch 50/72
Epoch 51/72

KeyboardInterrupt: 

In [None]:
# NOTE(review): train_generator and test_generator are not defined in any cell
# visible here; this cell fails on a fresh kernel unless they are created first.

# Floor division: a trailing partial training batch is not counted
steps_per_epoch = train_generator.n // train_generator.batch_size
# Ceiling division so every validation sample is covered exactly once;
# `n // batch_size + 1` ran one extra step whenever n was an exact multiple.
val_steps = -(-test_generator.n // test_generator.batch_size)
epochs = 1
history = model.fit(train_generator, epochs=epochs, verbose=1,
                    steps_per_epoch=steps_per_epoch,
                    validation_data=test_generator,
                    validation_steps=val_steps)