In [1]:
import gc
import os
import pickle
import random
import shutil

import numpy as np
import pandas as pd
import tensorflow as tf
from keras import losses, optimizers, metrics, callbacks

In [2]:
# List the logical devices TensorFlow can see; the cell output shows whether a GPU is available.
tf.config.list_logical_devices()

[LogicalDevice(name='/device:CPU:0', device_type='CPU'),
 LogicalDevice(name='/device:GPU:0', device_type='GPU')]

In [3]:
SEED = 123  # global seed: used for the Python/NumPy/TensorFlow RNGs and for shuffling the training set
N_CLASS = 12  # number of target classes; passed to CNN_LSTM as the size of the softmax output
MAX_EPOCHS = 200  # upper bound on epochs per run in the repeated-experiments loop

In [4]:
# Seed every RNG the notebook relies on with one call:
# tf.keras.utils.set_random_seed seeds Python's `random`, NumPy and TensorFlow
# (including the Keras backend generator) together, so none can be forgotten.
tf.keras.utils.set_random_seed(SEED)

## Load data

In [5]:
def _load_split(split, shuffle):
    """Load ``data/<split>`` as a batched audio dataset.

    All splits share the same batch size (512) and output sequence length
    (16000), so the three pipelines cannot drift apart through copy-paste.
    ``SEED`` is only passed when shuffling is requested.

    Args:
        split: name of the sub-directory under ``data/`` ("train", "val", "test").
        shuffle: whether the files are shuffled.

    Returns:
        The dataset returned by ``tf.keras.utils.audio_dataset_from_directory``.
    """
    return tf.keras.utils.audio_dataset_from_directory(
        directory=f"data/{split}",
        batch_size=512,
        output_sequence_length=16000,
        shuffle=shuffle,
        seed=SEED if shuffle else None,
    )


# Only the training split is shuffled; validation and test keep a fixed order.
train_ds = _load_split("train", shuffle=True)
val_ds = _load_split("val", shuffle=False)
test_ds = _load_split("test", shuffle=False)

# Label ids are inferred per split from its own class sub-directories, so a
# missing or extra class folder in one split would silently shift the ids.
# Fail loudly instead of training/evaluating against misaligned labels.
for split_name, split_ds in (("val", val_ds), ("test", test_ds)):
    if list(split_ds.class_names) != list(train_ds.class_names):
        raise ValueError(
            f"class folders of the {split_name} split differ from the training split: "
            f"{split_ds.class_names} != {train_ds.class_names}"
        )

label_names = np.array(train_ds.class_names)
print("label names:", label_names)

Found 45586 files belonging to 12 classes.
Found 6513 files belonging to 12 classes.
Found 13024 files belonging to 12 classes.
label names: ['down' 'go' 'left' 'no' 'off' 'on' 'right' 'silence' 'stop' 'unknown'
 'up' 'yes']


In [6]:
def squeeze(audio, labels):
    """Remove the last axis (which must have size 1) from ``audio``; labels pass through untouched."""
    return tf.squeeze(audio, axis=-1), labels


# Apply the same squeeze to all three splits, letting tf.data choose the parallelism.
train_ds, val_ds, test_ds = (
    split.map(squeeze, num_parallel_calls=tf.data.AUTOTUNE)
    for split in (train_ds, val_ds, test_ds)
)

## CNN + LSTM Complex

In [7]:
from SpeechModels import get_melspec_model
import tensorflow.keras.layers as L
from keras.models import Model

def CNN_LSTM(n_out, input_length, kernel_size):
    """Build the multi-scale Conv1D + bidirectional LSTM classifier.

    The first output of ``get_melspec_model(input_length)`` is fed to four
    parallel Conv1D branches (32 filters each, ReLU, 'same' padding) whose
    kernel widths are 1x, 2x, 4x and 8x ``kernel_size``. The branch outputs
    are concatenated along the last (channel) axis, summarised by a
    bidirectional LSTM and classified by a softmax layer.

    Args:
        n_out: number of output classes (units of the softmax layer).
        input_length: forwarded unchanged to ``get_melspec_model``.
        kernel_size: base Conv1D kernel width; the branches use multiples of it.

    Returns:
        An uncompiled Keras ``Model`` going from the front-end's inputs to the
        softmax layer named ``'output'``.
    """
    mel_spec_model = get_melspec_model(input_length)
    inputs, outputs = mel_spec_model.inputs, mel_spec_model.outputs
    features = outputs[0]

    # Functional-API layers take their input shape from the tensor they are
    # called on, so no `input_shape=` argument is needed. 'same' padding keeps
    # the time axis identical across branches so they can be concatenated.
    branches = [
        L.Conv1D(32, kernel_size * scale, activation='relu', padding='same')(features)
        for scale in (1, 2, 4, 8)
    ]

    x = L.Concatenate(axis=-1)(branches)  # [b_s, seq_len, 4 * 32]

    # return_sequences=False: only the final state of each direction is kept,
    # so the result is one vector per clip, not a sequence -> [b_s, 2 * 128]
    x = L.Bidirectional(L.LSTM(128, return_sequences=False))(x)

    output = L.Dense(n_out, activation='softmax', name='output')(x)

    model = Model(inputs=[inputs], outputs=[output])

    return model


# Build the network with N_CLASS outputs, input length 16000 and base kernel width 4, then print its layers.
model = CNN_LSTM(N_CLASS, 16000, 4)
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input (InputLayer)             [(None, 16000)]      0           []                               
                                                                                                  
 tf.signal.stft (TFOpLambda)    (None, 125, 513)     0           ['input[0][0]']                  
                                                                                                  
 tf.math.abs (TFOpLambda)       (None, 125, 513)     0           ['tf.signal.stft[0][0]']         
                                                                                                  
 tf.tensordot (TFOpLambda)      (None, 125, 80)      0           ['tf.math.abs[0][0]']            
                                                                                              

In [8]:
model.compile(
    optimizer=optimizers.Adam(learning_rate=0.001),
    loss=losses.SparseCategoricalCrossentropy(),
    metrics=[metrics.SparseCategoricalAccuracy(), metrics.SparseCategoricalCrossentropy()]
)

# All three callbacks watch validation accuracy, which has to be maximised.
# The direction is pinned with mode="max" on each of them so that none relies
# on Keras guessing it from the metric name.
checkpoint = callbacks.ModelCheckpoint(
    # NOTE(review): the file name says "simple_cnn" although this notebook trains
    # the CNN + LSTM model - confirm it cannot clash with another notebook's checkpoint.
    filepath="simple_cnn.h5",
    monitor="val_sparse_categorical_accuracy",
    verbose=0,
    save_best_only=True,
    save_weights_only=True,
    mode="max"
)

early_stopping = callbacks.EarlyStopping(
    monitor='val_sparse_categorical_accuracy',
    min_delta=0,
    patience=5,
    verbose=0,
    mode='max',
    baseline=None,
    restore_best_weights=True
)

# Halve the learning rate after 3 epochs without improvement, never going below 1e-5.
reduce_lr = callbacks.ReduceLROnPlateau(
    monitor='val_sparse_categorical_accuracy',
    factor=0.5,
    patience=3,
    min_lr=0.00001,
    verbose=1,
    mode='max'
)

In [9]:
# Short 2-epoch run (presumably a smoke test before the repeated experiments).
# `shuffle` is deliberately not passed: Keras ignores it when the input is a
# tf.data.Dataset, and train_ds was already built with shuffle=True.
history = model.fit(
    train_ds,
    epochs=2,
    validation_data=val_ds,
    callbacks=[checkpoint, early_stopping, reduce_lr]
)

Epoch 1/2
Epoch 2/2


## Experiments

Training is repeated 5 times, each time with a different random seed and therefore a different weight initialization.

In [10]:
EXPERIMENT_NAME = "cnn_lstm_complex"

# Start every execution of this cell from an empty output directory:
# remove whatever a previous run left behind, then create the folder anew.
if os.path.exists(EXPERIMENT_NAME):
    shutil.rmtree(EXPERIMENT_NAME)
os.mkdir(EXPERIMENT_NAME)

In [11]:
TRAINING_SEEDS = list(range(5))

# One flat record per seed: {"seed": ..., "loss": ..., "<metric name>": ...}.
run_records = []
for seed in TRAINING_SEEDS:
    # Models built in a loop accumulate in Keras' global state; clear it before
    # building the next one so memory does not grow with every seed. Done
    # before seeding so the reset cannot interfere with the seed just set.
    tf.keras.backend.clear_session()

    # Seeds Python's `random`, NumPy and TensorFlow in one call.
    tf.keras.utils.set_random_seed(seed)

    model = CNN_LSTM(N_CLASS, 16000, 4)

    # NOTE(review): the learning rate here is 0.01 while the earlier single run
    # used 0.001 - confirm the difference is intended.
    model.compile(
        optimizer=optimizers.Adam(learning_rate=0.01),
        loss=losses.SparseCategoricalCrossentropy(),
        metrics=[metrics.SparseCategoricalAccuracy(), metrics.SparseCategoricalCrossentropy()]
    )

    # early_stopping / reduce_lr are the shared instances created earlier;
    # Keras resets their internal counters at the start of every fit().
    # `shuffle` is not passed because Keras ignores it for tf.data.Dataset input.
    history = model.fit(
        train_ds,
        epochs=MAX_EPOCHS,
        validation_data=val_ds,
        callbacks=[early_stopping, reduce_lr]
    )

    with open(os.path.join(EXPERIMENT_NAME, f"history_{seed}.pkl"), "wb") as file:
        pickle.dump(history.history, file)

    # return_dict=True yields {"loss": ..., "<metric name>": ...} directly,
    # so no manual zip with model.metrics_names is needed.
    eval_results = model.evaluate(test_ds, return_dict=True)

    predictions = model.predict(test_ds)
    with open(os.path.join(EXPERIMENT_NAME, f"predictions_{seed}.pkl"), "wb") as file:
        pickle.dump(predictions, file)

    run_records.append({'seed': seed, **eval_results})
    gc.collect()

# Records are already flat, so the frame has the columns: seed, loss, <metrics...>.
results = pd.DataFrame(run_records)
results.to_csv(os.path.join(EXPERIMENT_NAME, 'results.csv'))

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 35: ReduceLROnPlateau reducing learning rate to 0.004999999888241291.
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 46: ReduceLROnPlateau reducing learning rate to 0.0024999999441206455.
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/20

In [12]:
# Test-set metrics of the repeated runs, one row per training seed.
results

Unnamed: 0,seed,loss,sparse_categorical_accuracy,sparse_categorical_crossentropy
0,0,0.544547,0.828547,0.544547
1,1,0.674495,0.798526,0.674495
2,2,0.479161,0.855037,0.479161
3,3,0.335362,0.900108,0.335362
4,4,0.557034,0.823787,0.557034
