## Milestone 3 Example Solution: Training a Convolutional Neural Network for Audio Classification



_Build: Define a working convolutional neural network architecture_

**Featurize data and determine input and output tensor shapes:**

In [1]:
#Import required libraries and define the same variables used in the setup notebook
import numpy
from pathlib import Path
from IPython.display import Audio
import pandas
import librosa
import matplotlib.pyplot as plt

current_dir = Path('.')
data_path = current_dir / 'google_speech'
train_audio_path = data_path / 'train' / 'audio'
train_data = pandas.read_csv(data_path / "train" / "train.csv")
example_audio_path = train_audio_path / 'zero' / 'ffd2ba2f_nohash_1.wav'

In [2]:
def mel_spectrogram(
    audio: numpy.ndarray,
    sample_rate: int,
    threshold: int=None,
):
    melspec = librosa.feature.melspectrogram(y=audio, sr=sample_rate)
    melspec = librosa.power_to_db(melspec, ref=numpy.max)
    if threshold:
        melspec[melspec < threshold] = -80
    return melspec

In [3]:
def pad_audio_with_silence(audio: numpy.ndarray, sample_rate: int, duration: float):
    target_n_samples = int(duration * sample_rate)
    assert audio.shape[0] < target_n_samples
    padded_audio = numpy.zeros(target_n_samples)
    padded_audio[:audio.shape[0]] = audio
    return padded_audio

In [4]:
def featurize_clip(file_path):
    audio, sample_rate = librosa.load(f"{train_audio_path}/{file_path}", sr=None)
    duration = 1
    if (audio.shape[0] < int(duration * sample_rate)):
        audio = pad_audio_with_silence(audio, sample_rate, duration)
    spectrogram = mel_spectrogram(audio, sample_rate)
    return spectrogram

In [5]:
%%time
train_data["spectrogram"] = train_data["file_path"].apply(featurize_clip)

CPU times: total: 1h 2min 2s
Wall time: 16min 35s


In [6]:
spectrogram_shape = train_data["spectrogram"].iloc[1].shape
spectrogram_shape

(128, 32)

In [7]:
input_shape = (spectrogram_shape[0], spectrogram_shape[1], 1)
input_shape

(128, 32, 1)

In [8]:
n_classes = train_data["label"].unique().shape[0]
n_classes

30

**Define architecture:**

In [9]:
from tensorflow import keras

In [10]:
mvm_net = keras.models.Sequential(
    [
        keras.layers.Convolution2D(
            input_shape=input_shape,
            filters=8,
            kernel_size=(3,3),
            activation="relu",
        ),
        keras.layers.MaxPooling2D(
            pool_size=(2,2)
        ),
        keras.layers.Flatten(),
        keras.layers.Dense(
            units=64,
            activation="relu"
        ),
        keras.layers.Dense(
            units=32,
            activation="relu"
        ),
        keras.layers.Dense(
            units=n_classes,
            activation="softmax"
        )
        
    ]
)

In [11]:
from sklearn.model_selection import train_test_split

In [12]:
import numpy

In [13]:
X = numpy.stack(train_data["spectrogram"])
X.shape

(51088, 128, 32)

In [14]:
X = numpy.expand_dims(X, axis=3)
X.shape

(51088, 128, 32, 1)

In [15]:
from sklearn.preprocessing import LabelEncoder

In [16]:
encoder = LabelEncoder()
y = encoder.fit_transform(train_data[["label"]])

  y = column_or_1d(y, warn=True)


In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

_Train: Fit the model to the data and observe training progress._

In [18]:
mvm_net.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=[
        "accuracy", 
    ]
)

In [19]:
history = mvm_net.fit(
    X_train,
    y_train,
    epochs=10,
    verbose=True,
    validation_split=0.2
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


_Test and validate: Evaluate your model's performance using quality metrics, baseline models and real-world testing._

In [23]:
pred = mvm_net.predict(X_test)
y_pred = numpy.argmax(pred, axis=1)
y_pred



array([20, 10, 16, ..., 12, 11, 14], dtype=int64)

In [24]:
y_test

array([25, 10, 26, ..., 12, 11,  8])

In [25]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score, f1_score

In [26]:
accuracy_score(y_test, y_pred)

0.5985515756508123

In [27]:
f1_score(y_test, y_pred, average="macro")

0.5946228709563228

In [28]:
from sklearn.dummy import DummyClassifier

In [29]:
y_dummy = DummyClassifier(strategy="stratified").fit(X_train, y_train).predict(X_test)
y_dummy

array([ 6, 17,  7, ...,  8, 20, 21])

In [30]:
accuracy_score(y_test, y_dummy)

0.03327461342728518

In [31]:
f1_score(y_test, y_dummy, average="macro")

0.032144752091663266

In [32]:
# Now we save in a .npz file the train and test dataset, as we will use them in the next milestone (around 1.5Gb)
numpy.savez('arrays.npz', y_pred=y_pred, y_test=y_test, y_train=y_train, X_test=X_test, X_train=X_train)