## Milestone 3 Example Solution: Training a Convolutional Neural Network for Audio Classification



_Build: Define a working convolutional neural network architecture._

**Featurize data and determine input and output tensor shapes:**

In [350]:
def featurize_clip(file_path):
    """Turn one audio clip into its mel-spectrogram feature array.

    Parameters
    ----------
    file_path : str
        Location of the clip relative to ``train_audio_path``.

    Returns
    -------
    Whatever ``mel_spectrogram`` returns for the padded clip
    (presumably a 2-D array -- confirm against the helper's definition).
    """
    clip_location = f"{train_audio_path}/{file_path}"
    # sr=None asks librosa to keep the file's native sampling rate.
    waveform, native_rate = librosa.load(clip_location, sr=None)
    # NOTE(review): presumably pads clips shorter than `duration` with
    # silence so all clips have equal length -- confirm in the helper.
    padded_waveform = pad_audio_with_silence(waveform, native_rate, duration=1)
    return mel_spectrogram(padded_waveform, native_rate)

In [351]:
%%time
# Featurize every clip listed in train_data["file_path"] and store the result
# in a new "spectrogram" column. This is the slow step of the notebook (see the
# timing reported below), so avoid re-running it unnecessarily.
train_data["spectrogram"] = train_data["file_path"].apply(featurize_clip)

CPU times: user 5min 53s, sys: 12.4 s, total: 6min 6s
Wall time: 1min 35s


In [352]:
# Read the feature dimensions off a single example (the row at position 1).
# Assumes every spectrogram in the column shares this shape -- TODO confirm.
spectrogram_shape = train_data["spectrogram"].iat[1].shape
spectrogram_shape

(128, 32)

In [353]:
# Per-sample input shape for the network: the two spectrogram dimensions
# followed by a trailing axis of size 1 (a single channel).
input_shape = (*spectrogram_shape[:2], 1)
input_shape

(128, 32, 1)

In [354]:
# Number of distinct labels; used as the width of the network's output layer.
n_classes = len(train_data["label"].unique())
n_classes

30

**Define architecture:**

In [222]:
from tensorflow import keras

In [308]:
# Minimal baseline CNN: one convolution/pooling stage feeding a small
# fully connected classifier head.
mvm_net = keras.Sequential([
    # Single convolutional stage; input_shape fixes the per-sample tensor shape.
    keras.layers.Conv2D(
        filters=8, kernel_size=(3, 3), activation="relu", input_shape=input_shape
    ),
    keras.layers.MaxPooling2D(pool_size=(2, 2)),
    # Collapse the pooled feature maps into one vector for the dense layers.
    keras.layers.Flatten(),
    keras.layers.Dense(units=64, activation="relu"),
    keras.layers.Dense(units=32, activation="relu"),
    # One softmax output per class.
    keras.layers.Dense(units=n_classes, activation="softmax"),
])

In [309]:
from sklearn.model_selection import train_test_split

In [310]:
import numpy

In [311]:
# Combine the per-clip spectrograms into a single array with a leading
# sample axis.
X = numpy.stack(train_data["spectrogram"].tolist(), axis=0)
X.shape

(13633, 128, 32)

In [312]:
# Add the trailing channel axis so X matches the model's 4-D input
# (samples, rows, cols, 1). The ndim guard keeps this cell idempotent:
# without it, re-running the cell silently appends a fifth axis and the
# later call to fit() fails on a shape mismatch.
if X.ndim == 3:
    X = numpy.expand_dims(X, axis=3)
X.shape

(13633, 128, 32, 1)

In [313]:
from sklearn.preprocessing import LabelEncoder

In [314]:
# Map each label to an integer class id. LabelEncoder expects a 1-D target,
# so pass the "label" Series rather than a one-column DataFrame (the 2-D form
# triggers a DataConversionWarning). Keep `encoder` around: its
# inverse_transform turns predicted ids back into the original labels.
encoder = LabelEncoder()
y = encoder.fit_transform(train_data["label"])

In [315]:
# Hold out 10% of the clips for final testing. A fixed random_state makes the
# split -- and therefore every test metric reported below -- reproducible
# across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42
)

_Train: Fit the model to the data and observe training progress._

In [316]:
# sparse_categorical_crossentropy takes integer class ids as targets, so the
# encoded labels in y can be used directly without one-hot encoding.
mvm_net.compile(
    optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"]
)

In [317]:
# Train for 10 epochs. Keras sets aside the last 20% of the training arrays
# (validation_split) and reports validation metrics after every epoch; the
# returned History object keeps the per-epoch metric values.
history = mvm_net.fit(
    x=X_train, y=y_train, epochs=10, validation_split=0.2, verbose=True
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


_Test and validate: Evaluate your model's performance using quality metrics, baseline models and real-world testing._

In [355]:
# Sequential.predict_classes is deprecated and has been removed from newer
# TensorFlow releases. For a softmax output layer the equivalent is the
# arg-max over the predicted class probabilities.
y_pred = numpy.argmax(mvm_net.predict(X_test), axis=-1)
y_pred

array([19, 16, 25, ...,  7, 28, 20])

In [356]:
# Ground-truth class ids of the held-out test set, shown for a quick visual
# comparison with y_pred.
y_test

array([ 8, 16, 23, ...,  7, 25,  4])

In [345]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score, f1_score

In [348]:
# Fraction of test clips whose predicted class equals the true class.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.4281524926686217

In [349]:
# Macro average: the unweighted mean of the per-class F1 scores, so every
# class counts equally regardless of how many clips it has.
f1_score(y_true=y_test, y_pred=y_pred, average="macro")

0.4099964609322261

In [357]:
from sklearn.dummy import DummyClassifier

In [359]:
# Chance-level baseline: the "stratified" strategy draws predictions at random
# following the class distribution of y_train and ignores the features.
# random_state makes the baseline scores reproducible across runs.
y_dummy = (
    DummyClassifier(strategy="stratified", random_state=42)
    .fit(X_train, y_train)
    .predict(X_test)
)
y_dummy

array([13,  6, 11, ..., 25, 24,  8])

In [360]:
# Baseline accuracy to compare against the CNN's accuracy above.
accuracy_score(y_true=y_test, y_pred=y_dummy)

0.03152492668621701

In [361]:
# Baseline macro-F1 to compare against the CNN's macro-F1 above.
f1_score(y_true=y_test, y_pred=y_dummy, average="macro")

0.029492052187509075