# UrbanSound8K Challenge at Kaggle
**Challenge:** Classification of audio clips into 10 different ambient noise classes.
8732 audio files split into 10 folds by the author. Author specifically requests 10-fold cross-validation to be performed. 

**Summary of Progress:**
1.	Input data is Mel-frequency cepstrum coefficients of the audio data, produced by librosa package.
2.	Tensorflow and Keras packages are used to construct, compile, train and test model.
3.	Latest model layer construction: two 2D convolutional layers, both with an L2 regularizer and ReLU activation; max pooling with a 2x2 kernel and 0.5 dropout after every conv layer; one fully connected layer before the final classification layer with softmax activation.
4.	Single split performed (1:9 training-validation ratio). No 10-fold cross-validation performed yet.

**Current Findings:**
1.	Validation accuracy of around 60% is reached, fluctuating by about 5% from epoch to epoch. Extracting the MFCCs takes about 9 minutes.
2.	Changing the n_mfcc parameter in librosa did not lead to a significant improvement in accuracy.
3.	The use of SGD optimizer instead of adam led to a decrease in accuracy.

**Next Steps:**
1.	Use of deep ensemble model to boost accuracy as suggested by Ricky
2.  Use of Tensorflow to calculate MFCC, exploiting GPU acceleration



In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
pip install librosa

In [None]:
# Load Imports
import IPython.display as ipd
import librosa
import librosa.display
import matplotlib.pyplot as plt
import torch
import tensorflow as tf
from torch.utils.data import TensorDataset, DataLoader
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical, np_utils
from keras.models import Sequential
from keras.layers import Conv2D, MaxPool2D, Dense, Dropout, Activation, Flatten, BatchNormalization
from keras.optimizers import Adam
from sklearn import metrics
from scipy.io import wavfile

In [None]:
def feature_extraction(data, max_pad=174):
    """Load an audio file and return its MFCC matrix with exactly ``max_pad`` frames.

    The audio is decoded with ``librosa.load`` (which the original comment
    relies on for sample-rate conversion, bit depth and channel handling) and
    turned into MFCCs. The time axis (axis 1) is then zero-padded on the right,
    or truncated, so every file yields the same number of frames.

    Args:
        data: Path of the audio file to load.
        max_pad: Number of frames in the returned matrix. Defaults to 174,
            the value the rest of the notebook is built around.

    Returns:
        A 2-D numpy array with ``max_pad`` columns, or ``None`` when the file
        could not be loaded or analysed (the error is printed).
    """
    # Only the librosa calls are guarded: they are the part that can fail on
    # unreadable or unsupported files. This stays a best-effort step on purpose.
    try:
        audio, sample_rate = librosa.load(data, res_type='kaiser_fast')
        # NOTE(review): n_mfcc (64) exceeds n_mels (40). A 40-band mel
        # spectrogram presumably yields at most 40 coefficients, which matches
        # the downstream reshape to (80, 87) since 40 * 174 == 80 * 87.
        # Confirm before changing either value.
        mfccs = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=64, n_mels=40)
    except Exception as e:
        print("Error encountered while parsing file: ", data, e)
        return None

    n_frames = mfccs.shape[1]
    if n_frames >= max_pad:
        # Previously a clip longer than max_pad produced a negative pad width,
        # made np.pad raise, and the whole file was dropped. Truncate instead.
        return mfccs[:, :max_pad]

    return np.pad(mfccs, pad_width=((0, 0), (0, max_pad - n_frames)), mode='constant')

In [None]:
def feature_extraction1(data):
    """Compute MFCCs for a WAV file using TensorFlow signal ops only.

    Pipeline: read file -> decode WAV (forced to one channel) -> STFT
    magnitude -> mel filterbank -> log -> MFCC, keeping the first
    ``num_mfccs`` coefficients.

    Unlike ``feature_extraction`` the result is NOT padded to a fixed number
    of frames, and it keeps the leading batch axis of size 1 introduced by the
    reshape below, i.e. the result is laid out as (1, frames, num_mfccs).

    Args:
        data: Path of the WAV file.

    Returns:
        A float32 tensor of MFCCs.
    """
    # Analysis settings, gathered in one place.
    frame_length, frame_step = 1024, 512
    lower_edge_hertz, upper_edge_hertz, num_mel_bins = 20, 20000, 64
    log_offset = 1e-6  # avoids log(0) on silent bins
    num_mfccs = 30

    audio_binary = tf.io.read_file(data)
    # decode_wav also returns the file's own sample rate, which is what the
    # mel filterbank below is built from.
    audio, sample_rate = tf.audio.decode_wav(audio_binary, desired_channels=1)

    # Flatten to a single row so the STFT sees one signal with a batch axis of 1.
    signals = tf.cast(tf.reshape(audio, [1, -1]), tf.float32)
    stfts = tf.signal.stft(
        signals,
        frame_length=frame_length,
        frame_step=frame_step,
        fft_length=frame_length,
        pad_end=True,
    )
    magnitude_spectrograms = tf.abs(stfts)

    # NOTE(review): upper_edge_hertz is fixed at 20 kHz; for files whose
    # sample rate is below 40 kHz this lies above Nyquist - confirm that is
    # acceptable for the dataset.
    num_spectrogram_bins = magnitude_spectrograms.shape[-1]
    linear_to_mel_weight_matrix = tf.signal.linear_to_mel_weight_matrix(
        num_mel_bins,
        num_spectrogram_bins,
        sample_rate,
        lower_edge_hertz,
        upper_edge_hertz,
    )
    mel_spectrograms = tf.tensordot(magnitude_spectrograms, linear_to_mel_weight_matrix, 1)
    log_mel_spectrograms = tf.math.log(mel_spectrograms + log_offset)

    mfccs = tf.signal.mfccs_from_log_mel_spectrograms(log_mel_spectrograms)[..., :num_mfccs]
    return mfccs

In [None]:
# Root of the UrbanSound8K dataset and its metadata table (one row per clip).
datasetpath = "../input/urbansound8k"
metadata_csv = datasetpath + "/UrbanSound8K.csv"
metadata = pd.read_csv(metadata_csv)

# Quick look at the table, then the class balance by name and by numeric id.
print(metadata.head())
for label_column in ('class', 'classID'):
    print(metadata[label_column].value_counts())

Next, we extract the class and the data in the form of dataframes. Data needs preprocessing in terms of audio channels, sample rate and bit depth. Mel-frequency cepstrum coefficients are also calculated.

In [None]:

# Extract one MFCC matrix per clip listed in the metadata table and collect it
# together with its class id and fold number.
audiodata = []
skipped = 0  # clips whose features could not be extracted
dataset_root = os.path.abspath(datasetpath)  # loop-invariant, resolve once
for index, row in metadata.iterrows():
    fold = row['fold']
    class_id = row['classID']
    # Clips live in <dataset>/fold<N>/<slice_file_name>.
    file_name = os.path.join(dataset_root, 'fold' + str(fold), str(row['slice_file_name']))
    audio = feature_extraction(file_name)
    if audio is None:
        # feature_extraction returns None on failure; storing it would break
        # the later conversion of the feature column into one dense array.
        skipped += 1
        continue
    audiodata.append([audio, class_id, fold])
features = pd.DataFrame(audiodata, columns=['feature', 'class_label', 'fold'])
print('Finished extraction from', len(features), 'files')
if skipped:
    print('Skipped', skipped, 'files that could not be parsed')

#encoder for classification labels into model-understandable numerical data (onehotencoder)






Afterwards, we split the dataset into training and testing sets. The documentation proposes a test set size of 20%. However, since the data is already divided into folds, we can implement 10-fold cross-validation instead.

In [None]:
def loader(fold, num_classes=10):
    """Build the train/test tensors for one cross-validation fold.

    Rows of the global ``features`` frame whose ``fold`` equals ``fold`` form
    the test split; all other rows form the training split.

    Args:
        fold: Fold number to hold out as the test split.
        num_classes: Width of the one-hot label vectors. Defaults to 10, the
            size of the model's softmax layer.

    Returns:
        ``(train_x, train_y, test_x, test_y)`` as TensorFlow tensors. Inputs
        are shaped ``(n, 80, -1, 1)``; labels are float32 one-hot rows of
        width ``num_classes``.

    Raises:
        ValueError: If a split has no usable rows or a label falls outside
            ``[0, num_classes)``.
    """
    is_test = features['fold'] == fold

    train_x, train_y = _split_to_tensors(features[~is_test], num_classes)
    print(train_x.shape)
    print(train_y.shape)

    test_x, test_y = _split_to_tensors(features[is_test], num_classes)
    print(test_x.shape)
    print(test_y.shape)

    return train_x, train_y, test_x, test_y


def _split_to_tensors(split_df, num_classes):
    """Turn one split's rows into an (inputs, one-hot labels) tensor pair."""
    # Rows whose feature extraction failed carry None and cannot be stacked.
    split_df = split_df[split_df['feature'].notna()]
    if split_df.empty:
        raise ValueError("split contains no rows with extracted features")

    # The 80-row layout matches ConvNet's input_shape of (80, 87, 1); with
    # 40 x 174 feature matrices the inferred axis comes out as 87.
    x = np.array(split_df['feature'].tolist()).reshape(len(split_df), 80, -1, 1)

    labels = split_df['class_label'].to_numpy().astype(int)
    if labels.min() < 0 or labels.max() >= num_classes:
        raise ValueError("class_label values must lie in [0, num_classes)")
    # Fixed-width one-hot encoding. The previous per-split LabelEncoder +
    # to_categorical gave the same result only while every class was present
    # in the split; otherwise label ids shifted and the width shrank.
    y = np.eye(num_classes, dtype=np.float32)[labels]

    return tf.convert_to_tensor(x), tf.convert_to_tensor(y)

In [None]:
# Use the first CUDA GPU when PyTorch can see one; otherwise fall back to CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [None]:
def ConvNet():
    """Build the (uncompiled) CNN used for 10-class classification.

    Layout: three 12-filter 3x3 conv stages, each followed by batch
    normalisation and 2x2 max pooling; two 64-filter 3x3 conv stages, each
    followed by 2x2 max pooling; dropout of 0.5; then a 1024-unit dense layer
    and a 10-way softmax output. The first layer expects inputs of shape
    (80, 87, 1).

    Returns:
        A Keras ``Sequential`` model.
    """
    model = Sequential()

    # Narrow stages with batch norm; only the very first layer declares the
    # input shape.
    for stage in range(3):
        conv_options = {'padding': "same", 'activation': 'relu'}
        if stage == 0:
            conv_options['input_shape'] = (80, 87, 1)
        model.add(Conv2D(12, (3, 3), **conv_options))
        model.add(BatchNormalization(axis=3))
        model.add(MaxPool2D(pool_size=(2, 2)))

    # Wider stages without batch norm.
    for _ in range(2):
        model.add(Conv2D(64, (3, 3), padding="same", activation='relu'))
        model.add(MaxPool2D(pool_size=(2, 2)))

    # Classifier head.
    model.add(Dropout(0.5))
    model.add(Flatten())
    model.add(Dense(1024, activation="relu"))
    model.add(Dense(10, activation='softmax'))

    return model

In [None]:
# 10-fold cross-validation: each fold is held out once as the test split while
# a freshly initialised network is trained on the remaining nine.
BATCH_S = 5
MAX_EPOCHS = 20
acc = []  # held-out accuracy of each fold
for fold in range(1, 11):
    # Drop Keras state left over from the previous fold's model so ten models
    # do not pile up in memory over the run.
    tf.keras.backend.clear_session()

    train_x, train_y, test_x, test_y = loader(fold)
    model = ConvNet()
    # A new optimizer per fold: one shared instance would carry its momentum
    # state and step count over to the next model, and recent Keras versions
    # refuse to apply an optimizer to variables it was not built for.
    opt = tf.keras.optimizers.SGD(
        learning_rate=0.01, momentum=0.5, nesterov=False, name='SGD')
    model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])

    # The held-out fold is passed as validation data for per-epoch monitoring.
    model.fit(train_x, train_y, epochs=MAX_EPOCHS, batch_size=BATCH_S,
              validation_data=(test_x, test_y))
    # Overwritten every iteration: after the loop this holds the last fold only.
    predictions = model.predict(test_x)
    score = model.evaluate(test_x, test_y)  # [loss, accuracy]
    print(score)
    acc.append(score[1])

print("10-fold crossvalidation accuracy score: {}".format(np.mean(acc)))


In [None]:
# Convert the class probabilities into predicted class indices and write them
# to CSV. NOTE: `predictions` is reassigned on every pass of the
# cross-validation loop, so this exports the last evaluated fold only.
preds = predictions.argmax(axis=1)
result = pd.DataFrame(data=preds)
result.to_csv("UrbanSound8kResults.csv")