 # Model Generation
 ## Imports

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, \
        GradientBoostingClassifier, StackingClassifier, BaggingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC, SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import make_scorer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import label_binarize
from sklearn.metrics import precision_score, recall_score, roc_auc_score
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_validate
from mlxtend.plotting import plot_learning_curves
import matplotlib.pyplot as plt
plt.style.use('ggplot')
# Hyperparameter tuning functions using keras-tuner
import kerastuner as kt
from tensorboard.plugins.hparams import api as hp
from sklearn.metrics import accuracy_score
import librosa
import sonicboom


 ## Read in the features

In [2]:
filedata = pd.read_pickle('./output/intermediate-data/filedata-mfcc-100perclass.pkl')


 ### Define convolutional network architecture

In [3]:
def extract_features(filedata):
    features, labels = [], []
    for fn in filedata['path']:
        sound_clip,sr = librosa.load(fn)
        melspec = librosa.feature.mfcc(sound_clip, n_mfcc=40)
        deltas = librosa.feature.delta(melspec)
        combi = np.dstack((melspec, deltas))
        features.append(combi)
    return features


In [4]:
# filedata = filedata.groupby(
#     'class', 
#     as_index=False, 
#     group_keys=False
# ).apply(lambda x: x.sample(10))


In [5]:
fff = extract_features(filedata)
filedata['extrafeatures'] = fff


In [6]:
testdata = pd.read_pickle('./output/intermediate-data/filedata-librosaFeatures-test.pkl')
ttt = extract_features(testdata)
testdata['extrafeatures'] = ttt
x_test = np.array(testdata['extrafeatures'].tolist())
y_test = np.array(testdata['classID'].tolist())


In [7]:
filedata

Unnamed: 0,path,slice_file_name,fsID,start,end,salience,fold,classID,class,mfccs,extrafeatures
6715,data\UrbanSound8K\audio\fold7\189981-0-0-2.wav,189981-0-0-2.wav,189981,1.000000,5.000000,1,7,0,air_conditioner,"[-221.69457018127355, 177.7204434107985, -32.1...","[[[-164.30281265752072, 0.6735632795582637], [..."
5235,data\UrbanSound8K\audio\fold5\60846-0-0-1.wav,60846-0-0-1.wav,60846,0.500000,4.500000,1,5,0,air_conditioner,"[-335.0034071263695, 160.9618582589945, 6.4540...","[[[-288.49031302668186, 0.45364415151113835], ..."
246,data\UrbanSound8K\audio\fold1\151977-0-0-8.wav,151977-0-0-8.wav,151977,4.466587,8.466587,1,1,0,air_conditioner,"[-370.6915022563572, 173.24041472360636, -25.9...","[[[-305.97639503465007, -1.8421214362065246], ..."
1551,data\UrbanSound8K\audio\fold10\73524-0-0-52.wav,73524-0-0-52.wav,73524,26.000000,30.000000,1,10,0,air_conditioner,"[-199.02758337069096, 160.82377276416108, -8.4...","[[[-170.40577657739087, 0.6839470223927894], [..."
1215,data\UrbanSound8K\audio\fold10\167464-0-0-9.wav,167464-0-0-9.wav,167464,4.500000,8.500000,2,10,0,air_conditioner,"[-216.4193757592353, 179.36352864634716, -23.3...","[[[-159.747947492767, -0.030002530384386928], ..."
...,...,...,...,...,...,...,...,...,...,...,...
8136,data\UrbanSound8K\audio\fold9\155044-9-0-11.wav,155044-9-0-11.wav,155044,5.500000,9.500000,1,9,9,street_music,"[-360.65340902191724, 214.81542779529252, -13....","[[[-326.04099228759515, 1.936926268160445], [-..."
6544,data\UrbanSound8K\audio\fold7\165192-9-0-118.wav,165192-9-0-118.wav,165192,59.000000,63.000000,1,7,9,street_music,"[-364.07334434665046, 182.97183888225194, -13....","[[[-292.4954425624062, -2.329705812629982], [-..."
2717,data\UrbanSound8K\audio\fold3\138473-9-0-12.wav,138473-9-0-12.wav,138473,7.523034,11.523034,2,3,9,street_music,"[-200.61808762694494, 175.69696781920197, -30....","[[[-114.4648747308639, -3.3344862234656705], [..."
5519,data\UrbanSound8K\audio\fold6\115243-9-0-46.wav,115243-9-0-46.wav,115243,23.000000,27.000000,1,6,9,street_music,"[-349.4178728133258, 203.74903899707104, -18.6...","[[[-279.50714538321296, -9.16680988511779], [-..."


In [8]:
testdata

Unnamed: 0,path,slice_file_name,fsID,start,end,salience,fold,classID,class,In Tony's DT TEST?,mfccs,extrafeatures
4955,data\UrbanSound8K\audio\fold5\178686-0-0-11.wav,178686-0-0-11.wav,178686.0,5.500000,9.500000,2.0,5.0,0.0,air_conditioner,,"[-423.9715845154162, 184.13612486090355, -9.60...","[[[-428.65454426698557, 1.4474459559736133], [..."
1370,data\UrbanSound8K\audio\fold10\189985-0-0-4.wav,189985-0-0-4.wav,189985.0,2.000000,6.000000,1.0,10.0,0.0,air_conditioner,,"[-120.9571014126731, 125.17268374330818, -10.8...","[[[-130.67388391211114, 0.32848223966734746], ..."
4049,data\UrbanSound8K\audio\fold4\195969-0-0-12.wav,195969-0-0-12.wav,195969.0,289.485610,293.485610,2.0,4.0,0.0,air_conditioner,,"[-326.16156175366854, 79.07585555374375, -5.50...","[[[-311.62007751538266, -1.1440849872996277], ..."
240,data\UrbanSound8K\audio\fold1\151977-0-0-2.wav,151977-0-0-2.wav,151977.0,1.466587,5.466587,1.0,1.0,0.0,air_conditioner,,"[-304.13305248825577, 132.52725156657107, -5.5...","[[[-307.102640719154, -0.1464304062439627], [-..."
5814,data\UrbanSound8K\audio\fold6\184805-0-0-63.wav,184805-0-0-63.wav,184805.0,31.500000,35.500000,2.0,6.0,0.0,air_conditioner,,"[-345.35570405495974, 161.09149121337347, -6.2...","[[[-352.98776073157416, 2.2544893399611787], [..."
...,...,...,...,...,...,...,...,...,...,...,...,...
5980,data\UrbanSound8K\audio\fold6\35548-9-0-21.wav,35548-9-0-21.wav,35548.0,16.592612,20.592612,2.0,6.0,9.0,street_music,,"[-368.1196419119443, 185.17806699902027, -61.6...","[[[-286.6318586019854, -8.787184276278587], [-..."
8434,data\UrbanSound8K\audio\fold9\194310-9-0-7.wav,194310-9-0-7.wav,194310.0,3.500000,7.500000,2.0,9.0,9.0,street_music,,"[-244.10612435155025, 190.15638006421534, -23....","[[[-28.88304099842488, -34.15634201922355], [3..."
4313,data\UrbanSound8K\audio\fold4\39968-9-0-173.wav,39968-9-0-173.wav,39968.0,111.116485,115.116485,1.0,4.0,9.0,street_music,,"[-269.1458304295694, 92.10816565871349, 9.7335...","[[[-219.2142015685867, -17.026257314767687], [..."
4324,data\UrbanSound8K\audio\fold4\42954-9-0-22.wav,42954-9-0-22.wav,42954.0,11.000000,15.000000,1.0,4.0,9.0,street_music,,"[-297.87756411062173, 79.88204910047659, -42.7...","[[[-166.317254626894, -3.9334265552549783], [-..."


In [9]:

def get_cnn():
    num_filters = [24,32,64,128] 
    pool_size = (2, 2) 
    kernel_size = (3, 3)  
    input_shape = (40, 173, 2)
    num_classes = 10
    keras.backend.clear_session()
    
    model = keras.models.Sequential()
    model.add(keras.layers.Conv2D(24, kernel_size, input_shape=input_shape,
                padding="same"))
    model.add(keras.layers.BatchNormalization())
    model.add(keras.layers.Activation("relu"))
    model.add(keras.layers.MaxPooling2D(pool_size=pool_size))

    model.add(keras.layers.Conv2D(32, kernel_size,
                                  padding="same"))
    model.add(keras.layers.BatchNormalization())
    model.add(keras.layers.Activation("relu"))  
    model.add(keras.layers.MaxPooling2D(pool_size=pool_size))
    
    model.add(keras.layers.Conv2D(64, kernel_size,
                                  padding="same"))
    model.add(keras.layers.BatchNormalization())
    model.add(keras.layers.Activation("relu"))  
    model.add(keras.layers.MaxPooling2D(pool_size=pool_size))
    
    model.add(keras.layers.Conv2D(128, kernel_size,
                                  padding="same"))
    model.add(keras.layers.BatchNormalization())
    model.add(keras.layers.Activation("relu"))  

    model.add(keras.layers.GlobalMaxPooling2D())
    model.add(keras.layers.Dense(256, activation="relu"))
    model.add(keras.layers.Dense(num_classes, activation="softmax"))

    model.compile(optimizer=keras.optimizers.Adam(1e-3), 
        loss=keras.losses.SparseCategoricalCrossentropy(), 
        metrics=["accuracy"])
    return model


 ### Train and evaluate via 10-Folds cross-validation

In [10]:
accuracies = []
folds = np.array(list(range(1,11)))
kf = KFold(n_splits=10)
trainaccuracies = []
valaccuracies = []
testaccuracies = []
i = 0
logdir = './logs/2DMFCCwDelta/'
num_epochs = 1000
num_waits = 50
verbosity = 0

for train_index, test_index in kf.split(folds):
    traindata = filedata[filedata['fold'].isin(list(folds[train_index]))]
    x_train = np.array(traindata['extrafeatures'].tolist())
    y_train = np.array(traindata['classID'].tolist())

    testdata = filedata[filedata['fold'] == folds[test_index][0]]
    x_val = np.array(traindata['extrafeatures'].tolist())
    y_val = np.array(traindata['classID'].tolist())

    # Checkpoint to continue models, early stopping and tensorboard
    checkpoint = keras.callbacks.ModelCheckpoint(
        logdir + 'best_%d.h5'%i, 
        monitor='val_loss',
        verbose=verbosity, 
        save_weights_only=True, 
        save_best_only=True
    )
    early = keras.callbacks.EarlyStopping(
        monitor='val_loss', 
        mode='min', 
        patience=num_waits
    )
    tb = keras.callbacks.TensorBoard(log_dir=logdir)
    # callbacks_list = [checkpoint, early, tb]
    callbacks_list = [checkpoint, early]

    model = get_cnn()
    # model.fit(x_train, y_train, epochs = 10, batch_size = 24, verbose = 0)
    history = model.fit(
        x_train, 
        y_train, 
        epochs=num_epochs,
        use_multiprocessing=True, 
        verbose=0,
        callbacks=callbacks_list,
        validation_data=(x_val, y_val)
    )
    trainloss, trainacc = model.evaluate(x_train, y_train, verbose=0)
    valloss, valacc = model.evaluate(x_val, y_val, verbose=0)
    testloss, testacc = model.evaluate(x_test, y_test, verbose=0)
    trainaccuracies.append(trainacc)
    valaccuracies.append(valacc)
    testaccuracies.append(testacc)
    print(f"Fold: {i}")
    print("Train Loss: {0} | Accuracy: {1}".format(trainloss, trainacc))
    print("Val Loss: {0} | Accuracy: {1}".format(valloss, valacc))
    print("Test Loss: {0} | Accuracy: {1}".format(testloss, testacc))
    i += 1

# Out of loop, print average of the results
print("===============================================")
print("FINISHED!")
print(f"Number of Epochs per fold: {num_epochs}")
print("Average Train 10 Folds Accuracy: {0}".format(np.mean(trainaccuracies)))
print("Average Val 10 Folds Accuracy: {0}".format(np.mean(valaccuracies)))
print("Average Test 10 Folds Accuracy: {0}".format(np.mean(testaccuracies)))


Fold: 0
Train Loss: 0.00037671165773645043 | Accuracy: 1.0
Val Loss: 0.00037671165773645043 | Accuracy: 1.0
Test Loss: 0.9508028626441956 | Accuracy: 0.8500000238418579
Fold: 1
Train Loss: 0.09548583626747131 | Accuracy: 0.971238911151886
Val Loss: 0.09548583626747131 | Accuracy: 0.971238911151886
Test Loss: 1.2680329084396362 | Accuracy: 0.7900000214576721
Fold: 2
Train Loss: 0.0005306784296408296 | Accuracy: 1.0
Val Loss: 0.0005306784296408296 | Accuracy: 1.0
Test Loss: 0.7559277415275574 | Accuracy: 0.8700000047683716
Fold: 3
Train Loss: 0.00029274821281433105 | Accuracy: 1.0
Val Loss: 0.00029274821281433105 | Accuracy: 1.0
Test Loss: 0.811155378818512 | Accuracy: 0.8399999737739563
Fold: 4
Train Loss: 0.0005015098140574992 | Accuracy: 1.0
Val Loss: 0.0005015098140574992 | Accuracy: 1.0
Test Loss: 1.0037517547607422 | Accuracy: 0.8199999928474426
Fold: 5
Train Loss: 0.0003871484368573874 | Accuracy: 1.0
Val Loss: 0.0003871484368573874 | Accuracy: 1.0
Test Loss: 0.9479972124099731 | 