UrbanSound8K


In [14]:

import sys
import os
import IPython as IP
import pandas as pd
import numpy as np
import math
import librosa
import librosa.display
import pickle
from include import helpers

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from keras.utils import to_categorical


import matplotlib.pyplot as plt
%matplotlib inline

import random
from datetime import datetime
from include import helpers

from keras import backend as keras_backend
from keras.models import Sequential, load_model
from keras.layers import Dense, SpatialDropout2D, Activation, Conv2D, MaxPooling2D, BatchNormalization, GlobalAveragePooling2D, LeakyReLU
from keras.optimizers import Adam
from keras.callbacks import ModelCheckpoint 
from keras.regularizers import l2

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix

In [5]:
# Locations of the UrbanSound8K dataset, resolved to absolute paths from the
# current working directory.
us8k_path = os.path.abspath('./UrbanSound8K')
audio_path = os.path.join(us8k_path, 'audio')
# Pass each path component separately so os.path.join chooses the separator
# (the original embedded a literal '/' inside one component).
metadata_path = os.path.join(us8k_path, 'metadata', 'UrbanSound8K.csv')

In [6]:
# Load the metadata from the generated CSV
# The cells below read the "fold", "slice_file_name" and "class" columns of this
# frame to locate each audio file and to obtain its label.
metadata = pd.read_csv(metadata_path)
# Preview the first rows as the cell output
metadata.head()

Unnamed: 0,slice_file_name,fsID,start,end,salience,fold,classID,class
0,100032-3-0-0.wav,100032,0.0,0.317551,1,5,3,dog_bark
1,100263-2-0-117.wav,100263,58.5,62.5,1,5,2,children_playing
2,100263-2-0-121.wav,100263,60.5,64.5,1,5,2,children_playing
3,100263-2-0-126.wav,100263,63.0,67.0,1,5,2,children_playing
4,100263-2-0-137.wav,100263,68.5,72.5,1,5,2,children_playing


In [7]:
# Iterate through all audio files and extract MFCC
#
# Produces:
#   features   - list with one MFCC matrix per audio file (not padded yet)
#   labels     - list with the class name of each file, in the same order
#   frames_max - largest frame count (size of axis 1) seen in any matrix
features = []
labels = []
frames_max = 0
counter = 0  # number of files processed so far
total_samples = len(metadata)
n_mfcc = 40
status_every = 500  # print a progress line every N processed files

for index, row in metadata.iterrows():
    file_path = os.path.join(os.path.abspath(audio_path), 'fold' + str(row["fold"]), str(row["slice_file_name"]))
    class_label = row["class"]

    # Extract MFCCs (do not add padding)
    # NOTE(review): the second argument (0) is presumably the padding length
    # expected by helpers.get_mfcc -- confirm against the helper.
    mfccs = helpers.get_mfcc(file_path, 0, n_mfcc)

    # Frame count of this clip; the maximum is needed later to pad all clips
    num_frames = mfccs.shape[1]

    # Add row (feature / label)
    features.append(mfccs)
    labels.append(class_label)

    # Update frames maximum
    frames_max = max(frames_max, num_frames)

    # Count processed files explicitly instead of relying on the DataFrame
    # index: the progress and final lines then report the true number of files
    # (the original final line printed the last index, i.e. one too few, and
    # raised NameError when metadata was empty).
    counter += 1
    if counter % status_every == 0:
        print("Status: {}/{}".format(counter, total_samples))

print("Finished: {}/{}".format(counter, total_samples))

Status: 501/8732
Status: 1001/8732
Status: 1501/8732
Status: 2001/8732
Status: 2501/8732
Status: 3001/8732
Status: 3501/8732
Status: 4001/8732
Status: 4501/8732
Status: 5001/8732
Status: 5501/8732
Status: 6001/8732
Status: 6501/8732
Status: 7001/8732
Status: 7501/8732
Status: 8001/8732
Status: 8501/8732
Finished: 8731/8732


In [8]:
# Add padding to features that have fewer frames than frames_max
# NOTE(review): helpers.add_padding presumably pads along the frame axis so every
# entry ends up with frames_max frames -- confirm against the helper.
padded_features = helpers.add_padding(features, frames_max)

In [10]:
# Convert features (X) and labels (y) to Numpy arrays
X = np.array(padded_features)
y = np.array(labels)

# Optionally save the features to disk
# np.save does not create missing directories, so make sure "data/" exists
# first; otherwise the save fails after the whole extraction has already run.
os.makedirs("data", exist_ok=True)
np.save("data/X-mfcc", X)
np.save("data/y-mfcc", y)

# Iterate through all audio files and extract Log-Mel Spectrograms
#
# NOTE: this rebinds features, labels, frames_max, padded_features, X and y, so
# after this point X / y hold the mel-spectrogram data, not the MFCC data that
# was computed above.
features = []
labels = []
frames_max = 0
counter = 0  # number of files processed so far
total_samples = len(metadata)
n_mels=40
status_every = 500  # print a progress line every N processed files

for index, row in metadata.iterrows():
    file_path = os.path.join(os.path.abspath(audio_path), 'fold' + str(row["fold"]), str(row["slice_file_name"]))
    class_label = row["class"]

    # Extract Log-Mel Spectrograms (do not add padding)
    # NOTE(review): the second argument (0) is presumably the padding length
    # expected by helpers.get_mel_spectrogram -- confirm against the helper.
    mels = helpers.get_mel_spectrogram(file_path, 0, n_mels=n_mels)

    # Frame count of this clip; the maximum is needed below to pad all clips
    num_frames = mels.shape[1]

    # Add row (feature / label)
    features.append(mels)
    labels.append(class_label)

    # Update frames maximum
    frames_max = max(frames_max, num_frames)

    # Count processed files explicitly instead of relying on the DataFrame
    # index, so the final line reports the true number of files (the original
    # printed the last index, i.e. one too few, and raised NameError when
    # metadata was empty).
    counter += 1
    if counter % status_every == 0:
        print("Status: {}/{}".format(counter, total_samples))

print("Finished: {}/{}".format(counter, total_samples))

# Add padding to features that have fewer frames than frames_max
padded_features = helpers.add_padding(features, frames_max)

# Convert features (X) and labels (y) to Numpy arrays
X = np.array(padded_features)
y = np.array(labels)

# np.save does not create missing directories, so make sure "data/" exists
os.makedirs("data", exist_ok=True)
np.save("data/X-mel_spec", X)
np.save("data/y-mel_spec", y)

In [11]:
# Define general variables

# Absolute locations of the cached feature arrays and of the saved models
data_path = os.path.abspath('./data')
models_path = os.path.abspath('./models')

# Keras must use the "channels last" image layout
keras_backend.set_image_data_format('channels_last')

# Human-readable class names for future use.
# NOTE: this rebinds `labels`, which until here held the per-clip class names
# collected during feature extraction.
labels = [
    'Air Conditioner', 'Car Horn', 'Children Playing', 'Dog bark', 'Drilling',
    'Engine Idling', 'Gun Shot', 'Jackhammer', 'Siren', 'Street Music',
]

In [12]:

# Metadata
# Re-reads the same CSV (metadata_path) that was loaded at the top of the notebook.
metadata = pd.read_csv(metadata_path)

In [15]:
# Random train/test split of the features, labels and metadata rows.
total = len(metadata)

# X / y are indexed by metadata row position below, so they must line up.
assert len(X) == len(y) == total, "features/labels do not match the metadata length"

indexes = list(range(total))

# Randomize indexes with a dedicated, seeded generator so the split is the same
# on every run (Restart & Run All) and the global `random` state is untouched.
split_seed = 42
random.Random(split_seed).shuffle(indexes)

# Divide the indexes into Train and Test
test_split_pct = 20
split_offset = math.floor(test_split_pct * total / 100)

# Split the metadata
test_split_idx = indexes[0:split_offset]
train_split_idx = indexes[split_offset:total]


# Split the features with the same indexes
X_test = np.take(X, test_split_idx, axis=0)
y_test = np.take(y, test_split_idx, axis=0)
X_train = np.take(X, train_split_idx, axis=0)
y_train = np.take(y, train_split_idx, axis=0)

# Also split metadata (explicit copies, so columns can be added to the subsets
# later without pandas' SettingWithCopyWarning)
test_meta = metadata.iloc[test_split_idx].copy()
train_meta = metadata.iloc[train_split_idx].copy()

# Print status
print("Test split: {} \t\t Train split: {}".format(len(test_meta), len(train_meta)))
print("X test shape: {} \t X train shape: {}".format(X_test.shape, X_train.shape))
print("y test shape: {} \t\t y train shape: {}".format(y_test.shape, y_train.shape))

Test split: 1746 		 Train split: 6986
X test shape: (1746, 40, 174) 	 X train shape: (6986, 40, 174)
y test shape: (1746,) 		 y train shape: (6986,)


In [16]:
# One-hot encode the class names.
# Fit the encoder ONCE, on every label present in either split, and only
# transform afterwards. Re-fitting separately on test and train (as before)
# yields different integer codes / one-hot widths whenever a class is missing
# from one of the splits.
le = LabelEncoder()
le.fit(np.concatenate([y_train, y_test]))

# Pin the one-hot width to the number of known classes for both matrices.
y_test_encoded = to_categorical(le.transform(y_test), num_classes=len(le.classes_))
y_train_encoded = to_categorical(le.transform(y_train), num_classes=len(le.classes_))

In [17]:
# How data should be structured
# Read the feature-map size from the data instead of hard-coding 40 x 174, so
# the cell keeps working if the extraction parameters (and therefore
# frames_max) change. Axes 1 and 2 are rows / columns both before and after
# the reshape below, which keeps this cell safe to re-run.
num_rows, num_columns = X_train.shape[1:3]
num_channels = 1

assert X_test.shape[1:3] == (num_rows, num_columns), "train/test feature shapes differ"

# Reshape to fit the network input (channel last)
X_train = X_train.reshape(X_train.shape[0], num_rows, num_columns, num_channels)
X_test = X_test.reshape(X_test.shape[0], num_rows, num_columns, num_channels)

# Total number of labels to predict (equal to the network output nodes)
num_labels = y_train_encoded.shape[1]

In [18]:
def create_model(spatial_dropout_rate_1=0, spatial_dropout_rate_2=0, l2_rate=0):
    """Assemble the (uncompiled) convolutional classifier.

    Layer order: conv unit (32) -> dropout_1 -> conv unit (32) -> 2x2 max
    pooling -> dropout_1 -> conv unit (64) -> dropout_2 -> conv unit (64) ->
    global average pooling -> softmax. A "conv unit" is a 3x3 Conv2D with an
    L2 kernel penalty, followed by LeakyReLU(alpha=0.1) and batch normalization.

    Reads the module-level num_rows, num_columns and num_channels for the
    input shape and num_labels for the size of the output layer.

    Args:
        spatial_dropout_rate_1: rate of the SpatialDropout2D layers placed
            before the second and third convolutions.
        spatial_dropout_rate_2: rate of the SpatialDropout2D layer placed
            before the fourth convolution.
        l2_rate: factor passed to l2() for every convolution kernel.

    Returns:
        The Sequential model, not yet compiled.
    """
    net = Sequential()

    def add_conv_unit(n_filters, **extra_conv_args):
        # Conv2D (3x3, L2-regularized kernel) -> LeakyReLU -> BatchNormalization
        net.add(Conv2D(filters=n_filters,
                       kernel_size=(3, 3),
                       kernel_regularizer=l2(l2_rate),
                       **extra_conv_args))
        net.add(LeakyReLU(alpha=0.1))
        net.add(BatchNormalization())

    # Two 32-filter units; only the first layer declares the input shape
    add_conv_unit(32, input_shape=(num_rows, num_columns, num_channels))
    net.add(SpatialDropout2D(spatial_dropout_rate_1))
    add_conv_unit(32)

    # Max Pooling #1
    net.add(MaxPooling2D(pool_size=(2, 2)))

    # Two 64-filter units
    net.add(SpatialDropout2D(spatial_dropout_rate_1))
    add_conv_unit(64)
    net.add(SpatialDropout2D(spatial_dropout_rate_2))
    add_conv_unit(64)

    # Collapse each h x w feature map to its mean value
    net.add(GlobalAveragePooling2D())

    # Softmax output, one node per label
    net.add(Dense(num_labels, activation='softmax'))

    return net

# Regularization settings handed to create_model
l2_rate = 0.0005
spatial_dropout_rate_1 = 0.07
spatial_dropout_rate_2 = 0.14

# Build the network with those settings
model = create_model(
    spatial_dropout_rate_1=spatial_dropout_rate_1,
    spatial_dropout_rate_2=spatial_dropout_rate_2,
    l2_rate=l2_rate,
)

In [19]:
# Optimizer settings
adam = Adam(lr=1e-4, beta_1=0.99, beta_2=0.999)

# Categorical cross-entropy matches the one-hot targets; accuracy is tracked as the metric
model.compile(optimizer=adam, loss='categorical_crossentropy', metrics=['accuracy'])

# Show the layer-by-layer architecture
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_1 (Conv2D)            (None, 38, 172, 32)       320       
_________________________________________________________________
leaky_re_lu_1 (LeakyReLU)    (None, 38, 172, 32)       0         
_________________________________________________________________
batch_normalization_1 (Batch (None, 38, 172, 32)       128       
_________________________________________________________________
spatial_dropout2d_1 (Spatial (None, 38, 172, 32)       0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 36, 170, 32)       9248      
_________________________________________________________________
leaky_re_lu_2 (LeakyReLU)    (None, 36, 170, 32)       0         
_________________________________________________________________
batch_normalization_2 (Batch (None, 36, 170, 32)      

In [None]:
# Training configuration
num_epochs = 5
num_batch_size = 128
model_file = 'simple-train-nb3.hdf5'
model_path = os.path.join(models_path, model_file)

# Make sure the checkpoint directory exists before training starts; otherwise
# the first checkpoint write fails only after a full epoch has been computed.
os.makedirs(models_path, exist_ok=True)

# Save checkpoints (save_best_only keeps only the best model seen so far)
checkpointer = ModelCheckpoint(filepath=model_path, 
                               verbose=1, 
                               save_best_only=True)
start = datetime.now()
# 1/12 of the training data is held out for validation
history = model.fit(X_train, 
                    y_train_encoded, 
                    batch_size=num_batch_size, 
                    epochs=num_epochs, 
                    validation_split=1/12.,
                    callbacks=[checkpointer], 
                    verbose=1)

duration = datetime.now() - start
print("Training completed in time: ", duration)

In [None]:
# Plot the training history returned by model.fit, using the project helper
# from include/helpers.
helpers.plot_train_history(history, x_ticks_vertical=True)

In [None]:
# Predict probabilities for test set
y_probs = model.predict(X_test, verbose=0)

# Get predicted labels (despite its name, yhat_probs holds class indexes:
# the argmax over the predicted probabilities)
yhat_probs = np.argmax(y_probs, axis=1)
y_trues = np.argmax(y_test_encoded, axis=1)

# Add "pred" column
# test_meta is a slice of `metadata`; assigning a column to it in place can
# raise pandas' SettingWithCopyWarning. assign() returns a new frame instead,
# and re-running the cell simply replaces the column.
test_meta = test_meta.assign(pred=yhat_probs)

# go through 45 combinations of experiments
#
# For every unordered class pair (class1 < class2), measure the mean accuracy
# of six algorithms at eight training-set fractions and store the resulting
# table in res["<class1>_vs_<class2>"].
#
# NOTE(review): RandomForestClassifier, ConvRF, cifar_train_images,
# cifar_train_labels, cifar_test_images, cifar_test_labels, SimpleCNN32Filter,
# SimpleCNN32Filter2Layers, SimpleCNNOneFilter, trainset and testset are not
# defined or imported anywhere in the visible part of this notebook -- confirm
# where they come from before running this cell.

n_repeats = 5  # runs averaged per (algorithm, fraction) point

# Training-set fractions; identical for every pair, so computed once
fraction_of_train_samples_space = np.geomspace(0.01, 1, num=8)


def _rf_accuracy_curve(make_classifier, class1, class2):
    """Mean ConvRF.run_rf result per training fraction for one class pair.

    make_classifier is called once per fraction; the classifier it returns is
    passed to all n_repeats runs at that fraction (as in the original loops).
    Returns a list with one mean value per entry of
    fraction_of_train_samples_space.
    """
    curve = []
    for fraction_of_train_samples in fraction_of_train_samples_space:
        classifier = make_classifier()
        runs = [
            ConvRF.run_rf(
                classifier,
                cifar_train_images,
                cifar_train_labels,
                cifar_test_images,
                cifar_test_labels,
                fraction_of_train_samples,
                class1,
                class2,
            )
            for _ in range(n_repeats)
        ]
        curve.append(np.mean(runs))
    return curve


def _cnn_accuracy_curve(cnn_class, class1, class2):
    """Mean ConvRF.run_cnn result per training fraction for one class pair.

    Returns a list with one mean value (over n_repeats runs) per entry of
    fraction_of_train_samples_space.
    """
    curve = []
    for fraction_of_train_samples in fraction_of_train_samples_space:
        runs = [
            ConvRF.run_cnn(
                cnn_class,
                cifar_train_images,
                cifar_train_labels,
                cifar_test_images,
                cifar_test_labels,
                fraction_of_train_samples,
                class1,
                class2,
                trainset,
                testset,
            )
            for _ in range(n_repeats)
        ]
        curve.append(np.mean(runs))
    return curve


# Row labels of every result table, in row order
algos = [
    "naiveRF",
    "convrf",
    "convrf2layer",
    "simplecnn",
    "cnn32",
    "cnn32_2layer",
]

res = {}
for class1 in range(10):
    for class2 in range(class1 + 1, 10):

        # accuracy vs num training samples (naive_rf)
        naive_rf_acc_vs_n = _rf_accuracy_curve(
            lambda: RandomForestClassifier(n_estimators=100, n_jobs=-1),
            class1,
            class2,
        )

        # accuracy vs num training samples (two layer conv rf)
        conv_rf_2_layer = _rf_accuracy_curve(
            lambda: ConvRF.ConvRFClassifier(
                layers=2, kernel_size=(10, 5), stride=(2, 1)
            ),
            class1,
            class2,
        )

        # accuracy vs num training samples (one layer conv rf)
        conv_rf_apply = _rf_accuracy_curve(
            lambda: ConvRF.ConvRFClassifier(
                layers=1, kernel_size=(10,), stride=(2,)
            ),
            class1,
            class2,
        )

        # accuracy vs num training samples (one layer cnn (32 filters))
        cnn32_acc_vs_n = _cnn_accuracy_curve(SimpleCNN32Filter, class1, class2)

        # accuracy vs num training samples (two layer cnn (32 filters))
        cnn32_two_layer_acc_vs_n = _cnn_accuracy_curve(
            SimpleCNN32Filter2Layers, class1, class2
        )

        # accuracy vs num training samples (one layer cnn)
        cnn_acc_vs_n = _cnn_accuracy_curve(SimpleCNNOneFilter, class1, class2)

        # One row per algorithm (same order as `algos`), one column per fraction
        table = pd.DataFrame(
            [
                naive_rf_acc_vs_n,
                conv_rf_apply,
                conv_rf_2_layer,
                cnn_acc_vs_n,
                cnn32_acc_vs_n,
                cnn32_two_layer_acc_vs_n,
            ]
        )
        # Put the algorithm names in the first column
        table.insert(0, "algos", algos)
        res[str(class1) + "_vs_" + str(class2)] = table