# Sound Classification with CNNs

This notebook walks through the process of building a simple CNN-based network for classifying 10 different urban sounds. 

We'll be using the UrbanSound8k dataset. 


***
## Obtaining the Dataset

In [None]:
import urllib.request
import tarfile
import os

# Download and unpack the dataset only when the 'UrbanSound8K' directory is not
# already present in the current working directory.
if not os.path.exists(os.path.join(os.getcwd(), 'UrbanSound8K')):
    urllib.request.urlretrieve("https://zenodo.org/record/1203745/files/UrbanSound8K.tar.gz","a.tar.gz")
    # The context manager closes the archive even if extraction fails part-way.
    with tarfile.open("a.tar.gz") as tar:
        tar.extractall()

***
## Feature Exploration

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from tqdm import tqdm
from librosa import display
import librosa
import IPython.display


### Importing the Metadata File

In [None]:
# forming a pandas dataframe from the metadata file
# (later cells read its "class", "fold", "slice_file_name" and "classID" columns)
data = pd.read_csv("UrbanSound8K/metadata/UrbanSound8K.csv")

# display the first 5 rows of this dataframe
data.head(5)

In [None]:
# number of files in each class (occurrences of each distinct "class" value)
data["class"].value_counts()

In [None]:
# number of files in each fold (occurrences of each distinct "fold" value)
data["fold"].value_counts()

### Import an Audio Clip

In [None]:
# this file is of a dog bark
# librosa.load returns the audio samples (y) and the sampling rate (sr)
y, sr = librosa.load("UrbanSound8K/audio/fold5/100032-3-0-0.wav")

# display the waveform
# NOTE(review): newer librosa releases appear to provide `waveshow` in place of
# `waveplot` — confirm against the installed librosa version.
librosa.display.waveplot(y=y, sr=sr)

# play it back (as the last expression of the cell, the audio widget is displayed)
IPython.display.Audio(data=y, rate=sr)


In [None]:
# let's extract some audio features from the clip loaded above...
# NUM_MEL_BINS sets both the number of MFCC coefficients and the number of mel bands.
NUM_MEL_BINS = 40

# Audio and sample rate are passed by keyword: this works on older librosa
# releases and is required by newer ones, which reject positional y / sr.
mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=NUM_MEL_BINS)
melspectrogram = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=NUM_MEL_BINS, fmax=8000)
# chroma_stft = librosa.feature.chroma_stft(y=y, sr=sr, n_chroma=NUM_FEATURE_BINS)
# chroma_cq = librosa.feature.chroma_cqt(y=y, sr=sr, n_chroma=NUM_FEATURE_BINS)
# chroma_cens = librosa.feature.chroma_cens(y=y, sr=sr, n_chroma=NUM_FEATURE_BINS)

# display dimensions: axis 0 indexes the mel bands, axis 1 the time frames
print('Number of mel filter banks:', melspectrogram.shape[0])
print('Number of analysis windows (time frames):', melspectrogram.shape[1])

In [None]:
import matplotlib.pyplot as plt

# mel-frequency cepstral coefficients (mfccs)
# Heat-map of the `mfccs` array from the previous cell with a time x-axis and a colour bar.
plt.figure(figsize=(10,4))
librosa.display.specshow(mfccs, x_axis='time')
plt.colorbar()
plt.title('MFCC')
plt.tight_layout()

# Mel spectrogram
# The spectrogram is passed through librosa.power_to_db (reference = array maximum)
# before plotting, and the colour bar is labelled in dB.
plt.figure(figsize=(10,4))
librosa.display.specshow(librosa.power_to_db(melspectrogram,ref=np.max),y_axis='mel', fmax=8000,x_axis='time')
plt.colorbar(format='%+2.0f dB')
plt.title('Mel spectrogram')
plt.tight_layout()


***
## Data Preprocessing

In [None]:
# Extract one feature matrix per audio file listed in the metadata and split
# the results into training data (folds 1-9) and test data (fold 10).
NUM_MEL_BINS = 40
FEATURE = 'mfccs'  # 'melspect' or 'mfccs'
PATH = "UrbanSound8K/audio/fold"

# Fail fast on a bad setting instead of after the first file has been decoded.
if FEATURE not in ('mfccs', 'melspect'):
    raise ValueError('FEATURE must be specified as "mfccs" or "melspect"')

X_train = []
X_test = []
y_train = []
y_test = []

for i in tqdm(range(len(data))):

    # look the metadata row up once and read the three fields needed from it
    row = data.iloc[i]
    fold_no = str(row["fold"])
    file = row["slice_file_name"]
    label = row["classID"]

    filepath = PATH + fold_no + "/" + file

    y, sr = librosa.load(filepath)

    # Strings are compared by value (==), and each FEATURE name selects the
    # feature it actually names.
    if FEATURE == 'mfccs':
        feature_array = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=NUM_MEL_BINS)
    else:  # 'melspect'
        feature_array = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=NUM_MEL_BINS, fmax=8000)

    # fold 10 is held out as the test set; every other fold is training data
    if fold_no != '10':
        X_train.append(feature_array)
        y_train.append(label)
    else:
        X_test.append(feature_array)
        y_test.append(label)

### Save Extracted Features to Disk

In [None]:
import pickle 

In [None]:
# Persist the extracted features under ./features/<FEATURE>/ so the slow
# extraction loop does not have to be re-run.
FEATURES_DIR = os.path.join(os.getcwd(), 'features')
save_dir = os.path.join(FEATURES_DIR, FEATURE)

# creates ./features and ./features/<FEATURE> as needed; no error if they exist
os.makedirs(save_dir, exist_ok=True)

# one pickle file per array, named after the variable it stores
for _name, _payload in (
        ('X_train', X_train),
        ('X_test', X_test),
        ('y_train', y_train),
        ('y_test', y_test)):
    with open(os.path.join(save_dir, _name + '.pkl'), 'wb') as f:
        pickle.dump(_payload, f)

### Load Extracted Features from Disk

In [None]:
# Choose which cached feature set to load; the name must match a sub-directory
# of ./features that already contains the four pickle files read below.
# NOTE(review): the extraction cell in this notebook sets FEATURE = 'mfccs', so
# on a fresh run ./features/melspect only exists if extraction and saving were
# also run with FEATURE = 'melspect' — confirm before relying on "Run All".
FEATURE = 'melspect'  # 'melspect' or 'mfccs'

load_dir = os.path.join(os.getcwd(), 'features', FEATURE)

# NOTE: unpickling can execute arbitrary code; only load files you created yourself.
with open(os.path.join(load_dir, 'X_train.pkl'), 'rb') as f:
    X_train = pickle.load(f)
with open(os.path.join(load_dir, 'X_test.pkl'), 'rb') as f:
    X_test = pickle.load(f)
with open(os.path.join(load_dir, 'y_train.pkl'), 'rb') as f:
    y_train = pickle.load(f)
with open(os.path.join(load_dir, 'y_test.pkl'), 'rb') as f:
    y_test = pickle.load(f)

### Enforce Common Window Size on Feature Data

To train a CNN, we need to have an equal number of time steps from each sample. That's not the case with this dataset, but most samples (83%) have 173 time steps. Rather than pad samples that have too few, or trim samples that have too many, as a shortcut here I'll only use samples with 173 time steps.

In [None]:
# examining the distribution of time steps per sample...
# (axis 1 of each feature matrix is read as its number of time steps)
sizes_train = [sample.shape[1] for sample in X_train]
sizes_test = [sample.shape[1] for sample in X_test]

# histogram of the training-set time-step counts
plt.hist(np.array(sizes_train), bins=20)
plt.ylabel('No of times')
plt.show()

# share of samples whose length is exactly 173 time steps, per split
print('Fraction of samples in X_train having 173 time steps:', sizes_train.count(173)/len(sizes_train))
print('Fraction of samples in X_test having 173 time steps:', sizes_test.count(173)/len(sizes_test))


In [None]:
# restricting the samples
def restrict_samples(X_data, y_data, n_steps=173):
    """
    Keep only the samples whose feature matrix has exactly `n_steps` time steps.

    Parameters
    ----------
    X_data : sequence of objects with a `.shape` attribute; `shape[1]` is read
        as the number of time steps of the sample.
    y_data : sequence of labels, aligned index-by-index with `X_data`.
    n_steps : required number of time steps (defaults to 173, the value used
        throughout this notebook).

    Returns
    -------
    (X, y) : two lists holding the retained samples and their labels, in the
        original order.

    Raises
    ------
    ValueError : if `X_data` and `y_data` do not have the same length.
    """
    if len(X_data) != len(y_data):
        raise ValueError('X_data and y_data must have the same length')

    X = []
    y = []

    # walk samples and labels in lock-step so they cannot get out of sync
    for sample, label in zip(X_data, y_data):
        if sample.shape[1] == n_steps:
            X.append(sample)
            y.append(label)

    return X, y

# Filter both splits down to the 173-time-step samples. The names are rebound,
# so re-running this cell operates on the already-filtered lists.
X_train, y_train = restrict_samples(X_train, y_train)
X_test, y_test = restrict_samples(X_test, y_test)


We should double-check that this hasn't significantly changed the balance of samples among our classes:

In [None]:
# class names listed in numerical-label order: position in the tuple == class label
_CLASS_NAMES = (
    'air_conditioner',
    'car_horn',
    'children_playing',
    'dog_bark',
    'drilling',
    'engine_idling',
    'gun_shot',
    'jackhammer',
    'siren',
    'street_music',
)

# dictionary mapping each numerical class label (0-9) to its class name
class_label_to_name = dict(enumerate(_CLASS_NAMES))


def map_labels_to_names(labels):
    """Return a list with the class name of every numerical label in `labels`.

    Each label is looked up in `class_label_to_name`; a label that is not a key
    of that dictionary raises KeyError.
    """
    return list(map(class_label_to_name.__getitem__, labels))

# translate the numerical labels of each split into class names
y_train_names = map_labels_to_names(y_train)
y_test_names = map_labels_to_names(y_test)

# wrap the names in single-column dataframes so value_counts() can be used on them
df_train_names = pd.DataFrame({'class_names': y_train_names}) 
df_test_names = pd.DataFrame({'class_names': y_test_names}) 


In [None]:
# number of remaining training samples per class name
print('Class count within restricted training dataset:')
df_train_names["class_names"].value_counts()

In [None]:
# number of remaining test samples per class name
print('Class count within restricted test dataset:')
df_test_names["class_names"].value_counts()

### One-Hot Encoding of Labels

In [None]:
# Use the Keras bundled with TensorFlow, like the model cells of this notebook,
# so the notebook depends on a single Keras implementation.
from tensorflow.keras.utils import to_categorical

# convert from lists to arrays
y_train = np.array(y_train)
y_test = np.array(y_test)

# apply one-hot encoding (one column per class, 10 classes)
y_train = to_categorical(y_train, num_classes=10)
y_test = to_categorical(y_test, num_classes=10)

print('Shape of y_train:', y_train.shape)
print('Shape of y_test:', y_test.shape)


### Reshape Feature Data
The CNN expects an additional "channels" dimension. 

In [None]:
# convert to arrays (fails unless you enforced equal number of time steps)
X_train = np.array(X_train)
X_test = np.array(X_test)
print('Shape of X_train (before):', X_train.shape)

# add an additional dimension (note: we only have 1 channel here)
# axis=-1 appends the new axis as the last one
X_train = np.expand_dims(X_train, axis=-1)
X_test= np.expand_dims(X_test, axis=-1)
print('Shape of X_train (after):', X_train.shape)
print('(num samples, num mel bins, num time steps, num channels)')


### Standardize the Data

In [None]:
# Global (scalar) mean and standard deviation over every value in the training set.
mean = np.mean(X_train.flatten(), axis=0) 
std = np.std(X_train.flatten(), axis=0)

# Both splits are scaled with the training-set statistics only.
# The arrays are rebound in place, so re-running this cell standardizes the
# already-standardized data again.
X_train = (X_train - mean)/std
X_test = (X_test - mean)/std

***
## ML Model

### Imports

In [None]:
import tensorflow as tf
import tensorflow.keras as Keras
from tensorflow.keras import Sequential
from tensorflow.keras.layers import (
    Conv2D, MaxPooling2D, BatchNormalization,
    Dropout, Dense, Activation, Input, concatenate,
    Flatten)
from tensorflow.keras.models import Model
from tensorflow.keras.regularizers import l1, l2

# Report the library versions and Keras backend settings in use.
print('TensorFlow Version:', tf.__version__)
print('Keras Version:', Keras.__version__)
print('Keras Backend:', Keras.backend.backend())
print('Keras Conv Data Format:', Keras.backend.image_data_format())

# Report whether TensorFlow can see a GPU, and its device name.
print("Is GPU available?:", tf.test.is_gpu_available())
print("GPU name:", tf.test.gpu_device_name())

# Only log TensorFlow messages of severity ERROR.
# NOTE(review): `tf.logging` (and `tf.ConfigProto` / `tf.Session` below) look like
# TensorFlow 1.x APIs — confirm the installed TensorFlow version provides them.
tf.logging.set_verbosity(tf.logging.ERROR)

# config = tf.ConfigProto()
# config.gpu_options.allow_growth = True
# sess = tf.Session(config=config)

### Construct CNN

In [None]:
# Build the classifier: two convolution + max-pooling stacks followed by two
# dense layers and a 10-way softmax output.
# Clear Keras' global state first so re-running this cell starts from a fresh model.
tf.keras.backend.clear_session()

model = Sequential()

# first conv stack
# input_shape is taken from the training data with the leading sample axis dropped.
# NOTE(review): `activity_regularizer` penalizes the layer's outputs; if a penalty
# on the weights was intended, `kernel_regularizer` would be the argument — confirm.
model.add(Conv2D(
    filters=64, 
    kernel_size=5, 
    strides=1, 
    padding="same",
    activation="relu",
    activity_regularizer=l2(0.002),
    kernel_initializer='VarianceScaling',
    input_shape=X_train.shape[1:]
))      
model.add(MaxPooling2D(pool_size=2, padding="same"))

# second conv stack (twice as many filters as the first, followed by dropout)
model.add(Conv2D(
    filters=128, 
    kernel_size=5, 
    strides=1, 
    padding="same",
    activation="relu",
    activity_regularizer=l2(0.002),
    kernel_initializer='VarianceScaling'
))     
model.add(MaxPooling2D(pool_size=2, padding="same"))  
model.add(Dropout(0.3))

# dense layers
model.add(Flatten())
model.add(Dense(64, activation="relu"))
model.add(Dropout(0.3))

model.add(Dense(128, activation="relu"))
model.add(Dropout(0.3))

# output layer: one softmax unit per class
model.add(Dense(10, activation="softmax"))

# print the layer-by-layer summary of the model
model.summary()

### Select Loss Function, Optimizer, and Metrics

In [None]:
# Categorical cross-entropy matches the one-hot encoded labels and the softmax
# output layer; accuracy is tracked as the reported metric.
model.compile(
    optimizer="adam",
    loss="categorical_crossentropy",
    metrics=["accuracy"])

### Model Training

In [None]:
# Train for 10 epochs in mini-batches of 50 samples.
# Note: the held-out test split (fold 10) is passed as validation data, so the
# per-epoch "val_*" values recorded in `history` are computed on the test set.
history = model.fit(
    X_train,
    y_train,
    batch_size=50,
    epochs=10,
    validation_data=(X_test,y_test))

In [None]:
def plot_histories(history):
    """
    Plot the per-epoch accuracy and loss curves recorded during training.

    Parameters
    ----------
    history : object whose `.history` attribute is a dict of per-epoch metric
        lists (as returned by `model.fit`). It must contain 'loss' and
        'val_loss', plus the accuracy metric under either the 'acc'/'val_acc'
        or the 'accuracy'/'val_accuracy' key pair.

    Returns
    -------
    None. Two figures are shown: accuracy first, then loss.
    """
    metrics = history.history

    # The accuracy metric is recorded as 'acc' by some Keras versions and as
    # 'accuracy' by others; use whichever key is present.
    acc_key = 'acc' if 'acc' in metrics else 'accuracy'
    val_acc_key = 'val_' + acc_key

    plt.plot(metrics[acc_key], label='train')
    plt.plot(metrics[val_acc_key], label='test')
    plt.ylabel('Accuracy')
    plt.xlabel('Epoch')
    plt.title('Learning Curves')
    plt.legend()
    plt.show()

    plt.plot(metrics['loss'], label='train')
    plt.plot(metrics['val_loss'], label='test')
    plt.ylabel('Loss')
    plt.xlabel('Epoch')
    plt.title('Loss Function')
    plt.legend()
    plt.show()

    return None

# draw the learning curves for the training run recorded in `history`
plot_histories(history)

### Model Evaluation

In [None]:
#train and test loss and scores respectively
# Element 0 of each result is read as the loss and element 1 as the accuracy
# (the single metric requested when the model was compiled).
train_scores = model.evaluate(X_train, y_train)
test_scores = model.evaluate(X_test, y_test)

print('\nTraining Loss:', train_scores[0])
print('Training Accuracy:', train_scores[1])
print('\nTest Loss:', test_scores[0])
print('Test Accuracy:', test_scores[1])


**NOTES**

The best-reported cross-validated test accuracy I found was 83% (from [this paper](https://arxiv.org/pdf/1808.08405.pdf)), although my search was non-exhaustive. They used data augmentation.

An even better data augmentation technique to try may be the one described [here](https://ai.googleblog.com/2019/04/specaugment-new-data-augmentation.html).

### Plot Confusion Matrix

In [None]:
import numpy as np
import matplotlib.pyplot as plt

from sklearn import svm, datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.utils.multiclass import unique_labels


def plot_confusion_matrix(
        y_true,
        y_pred,
        labels=None,
        normalize=False,
        title=None,
        cmap=plt.cm.Blues):
    """
    Print and plot the confusion matrix.
    Normalization can be applied by setting `normalize=True`.

    Parameters
    ----------
    y_true : 2-D array with one row per sample; the column index of each row's
        maximum is taken as the true class (e.g. one-hot encoded labels).
    y_pred : 2-D array with one row per sample; the column index of each row's
        maximum is taken as the predicted class (e.g. softmax outputs).
    labels : optional mapping or sequence indexed by class id that supplies the
        tick text for each class. When None, the class ids themselves are used.
    normalize : if True, every row is divided by its sum so each cell shows the
        fraction of that true class; rows without any true sample stay at zero.
    title : figure title; a default depending on `normalize` is used when empty.
    cmap : matplotlib colormap used for the image.

    Returns
    -------
    The matplotlib Axes the matrix was drawn on.
    """

    if not title:
        if normalize:
            title = 'Normalized confusion matrix'
        else:
            title = 'Confusion matrix, without normalization'

    # Collapse the per-class scores into class ids
    y_true = y_true.argmax(axis=1)
    y_pred = y_pred.argmax(axis=1)

    # Only use the labels that appear in the data, and hand the same ordered
    # list to confusion_matrix so rows/columns and tick labels always agree.
    present = unique_labels(y_true, y_pred).tolist()
    cm = confusion_matrix(y_true, y_pred, labels=present)

    if labels is None:
        # no names supplied: fall back to the numerical class ids
        classes = [str(label) for label in present]
    else:
        classes = [labels[label] for label in present]

    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # A class that was predicted but never occurs as a true label has an
        # all-zero row; divide it by 1 instead of 0 so it stays 0 rather than NaN.
        safe_totals = np.where(row_totals == 0, 1, row_totals)
        cm = cm.astype('float') / safe_totals
    else:
        print('Confusion matrix, without normalization')

    fig, ax = plt.subplots(figsize=(8, 8))
    im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    # ax.figure.colorbar(im, ax=ax)

    # We want to show all ticks...
    ax.set(xticks=np.arange(cm.shape[1]),
           yticks=np.arange(cm.shape[0]),
           # ... and label them with the respective list entries
           xticklabels=classes, yticklabels=classes,
           title=title,
           ylabel='True label',
           xlabel='Predicted label')

    # Rotate the tick labels and set their alignment.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")

    # Loop over data dimensions and create text annotations.
    # Cells darker than half the maximum get white text so it stays readable.
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], fmt),
                    ha="center", va="center",
                    color="white" if cm[i, j] > thresh else "black")
    fig.tight_layout()

    plt.show()

    return ax


# print NumPy floats with two digits of precision
np.set_printoptions(precision=2)

# obtain predictions
y_pred = model.predict(X_test)

# Plot normalized confusion matrix
# (class_label_to_name is indexed by class id to produce the tick labels)
ax = plot_confusion_matrix(y_test, y_pred, labels=class_label_to_name, normalize=True,
                      title='Normalized confusion matrix')
