In [1]:
import gc
import os
import pickle
import random
import shutil

import numpy as np
import pandas as pd
import tensorflow as tf
from keras import losses, optimizers, metrics, callbacks, Model, layers, backend as K
from augment_layers import FreqMaskLayer, TimeMaskLayer
import SpeechModels

In [2]:
# Show the logical devices visible to TensorFlow (useful to confirm that a GPU is available).
tf.config.list_logical_devices()

[LogicalDevice(name='/device:CPU:0', device_type='CPU'),
 LogicalDevice(name='/device:GPU:0', device_type='GPU')]

In [3]:
SEED = 123  # seed for the global RNGs and for shuffling the training dataset
N_CLASS = 12  # width of the model's softmax output layer
MAX_EPOCHS = 200  # upper bound on epochs per experiment run (early stopping may end a run sooner)

In [4]:
# Seed the Python, NumPy and TensorFlow random number generators with the same value.
for seed_rng in (random.seed, np.random.seed, tf.random.set_seed):
    seed_rng(SEED)

## Load data

In [5]:
# Settings shared by all three splits: batches of 512 clips with a fixed
# output length of 16000 samples.
shared_args = dict(batch_size=512, output_sequence_length=16000)
load_split = tf.keras.utils.audio_dataset_from_directory

# Only the training split is shuffled (with a fixed seed); validation and test
# are read with shuffle=False.
train_ds = load_split(directory="data/train", shuffle=True, seed=SEED, **shared_args)
val_ds = load_split(directory="data/val", shuffle=False, **shared_args)
test_ds = load_split(directory="data/test", shuffle=False, **shared_args)

# Class names as reported by the training dataset.
label_names = np.array(train_ds.class_names)
print("label names:", label_names)

Found 45586 files belonging to 12 classes.
Found 6513 files belonging to 12 classes.
Found 13024 files belonging to 12 classes.
label names: ['down' 'go' 'left' 'no' 'off' 'on' 'right' 'silence' 'stop' 'unknown'
 'up' 'yes']


In [6]:
def squeeze(audio, labels):
    """Remove the trailing axis of ``audio`` and pass ``labels`` through unchanged.

    Assumes the last axis of ``audio`` has size 1 (presumably a single audio
    channel -- TODO confirm); ``tf.squeeze`` cannot remove it otherwise.

    Returns:
        A ``(audio, labels)`` tuple with ``audio`` squeezed on its last axis.
    """
    return tf.squeeze(audio, axis=-1), labels

# Apply `squeeze` to every split, letting tf.data pick the degree of parallelism.
# NOTE: the dataset names are rebound in place, so re-running this cell without
# reloading the data would apply `squeeze` a second time.
train_ds = train_ds.map(squeeze, num_parallel_calls=tf.data.AUTOTUNE)
val_ds = val_ds.map(squeeze, num_parallel_calls=tf.data.AUTOTUNE)
test_ds = test_ds.map(squeeze, num_parallel_calls=tf.data.AUTOTUNE)

## Model from article

In [7]:
def create_model(freq=False, time=False):
    """Build the attention-based recurrent classifier used in this notebook.

    The waveform input goes through the frozen spectrogram front end from
    ``SpeechModels``, the optional masking layers, two convolution +
    batch-normalisation stages, two bidirectional LSTMs, a dot-product
    attention step and a small dense head.

    Args:
        freq: If true, insert ``FreqMaskLayer(10)`` after the spectrogram.
        time: If true, insert ``TimeMaskLayer(10)`` after the spectrogram
            (after the frequency mask when both flags are set).

    Returns:
        An uncompiled ``Model`` ending in a softmax over ``N_CLASS`` classes.
    """
    spec_model = SpeechModels.get_melspec_model(iLen=16000)
    spec_model.trainable = False  # the spectrogram front end is not trained
    inputs = spec_model.inputs

    features = spec_model(inputs)
    # Optional augmentation layers, selected by the caller.
    if freq:
        features = FreqMaskLayer(10)(features)
    if time:
        features = TimeMaskLayer(10)(features)
    # Add a trailing channel axis for the 2-D convolutions.
    features = tf.expand_dims(features, axis=-1, name='mel_stft')

    # Two (5, 1) convolutions, each followed by batch normalisation; the second
    # one reduces the feature maps back to a single channel.
    for n_filters in (10, 1):
        features = layers.Conv2D(n_filters, (5, 1), activation='relu', padding='same')(features)
        features = layers.BatchNormalization()(features)

    # Drop the single channel again before the recurrent layers.
    sequence = layers.Lambda(lambda t: K.squeeze(t, -1), name='squeeze_last_dim')(features)

    # Two stacked bidirectional LSTMs, each returning [b_s, seq_len, vec_dim].
    for _ in range(2):
        sequence = layers.Bidirectional(layers.LSTM(64, return_sequences=True))(sequence)

    # The query is projected from the last time step of the sequence.
    last_step = layers.Lambda(lambda t: t[:, -1])(sequence)  # [b_s, vec_dim]
    query = layers.Dense(128)(last_step)

    # Dot-product attention: one score per time step, normalised by softmax.
    att_scores = layers.Dot(axes=[1, 2])([query, sequence])
    att_scores = layers.Softmax(name='attSoftmax')(att_scores)  # [b_s, seq_len]

    # Attention-weighted sum over the sequence.
    att_vector = layers.Dot(axes=[1, 1])([att_scores, sequence])  # [b_s, vec_dim]

    # Dense classification head with dropout before every dense layer.
    hidden = layers.Dropout(rate=0.3)(att_vector)
    hidden = layers.Dense(64, activation='relu')(hidden)
    hidden = layers.Dropout(rate=0.3)(hidden)
    hidden = layers.Dense(32)(hidden)
    hidden = layers.Dropout(rate=0.3)(hidden)
    output = layers.Dense(N_CLASS, activation='softmax', name='output')(hidden)

    return Model(inputs=[inputs], outputs=[output])

# Build the variant with the time-mask layer enabled and print its layer summary.
model = create_model(time=True)
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input (InputLayer)             [(None, 16000)]      0           []                               
                                                                                                  
 normalized_spectrogram_model (  (None, 125, 80)     0           ['input[0][0]']                  
 Functional)                                                                                      
                                                                                                  
 time_mask_layer (TimeMaskLayer  (None, 125, 80)     0           ['normalized_spectrogram_model[0]
 )                                                               [0]']                            
                                                                                              

In [8]:
# Compile the model built above: Adam optimiser, sparse categorical
# cross-entropy loss, and accuracy plus cross-entropy as reported metrics.
model.compile(
    optimizer=optimizers.Adam(learning_rate=0.001),
    loss=losses.SparseCategoricalCrossentropy(),
    metrics=[metrics.SparseCategoricalAccuracy(), metrics.SparseCategoricalCrossentropy()]
)

# Stop training once validation accuracy has not improved for 5 epochs and
# restore the weights of the best epoch.
early_stopping = callbacks.EarlyStopping(
    monitor='val_sparse_categorical_accuracy',
    min_delta=0,
    patience=5,
    verbose=0,
    mode='max',
    baseline=None,
    restore_best_weights=True
)

# Halve the learning rate after 3 epochs without improvement in validation
# accuracy, never going below 1e-5. `mode='max'` is stated explicitly (as for
# early stopping) instead of relying on the 'auto' mode inferring the
# direction from the metric name.
reduce_lr = callbacks.ReduceLROnPlateau(
    monitor='val_sparse_categorical_accuracy',
    mode='max',
    factor=0.5,
    patience=3,
    min_lr=0.00001,
    verbose=1
)

In [9]:
# Short 2-epoch fit of the model compiled above, using the shared callbacks.
# NOTE(review): Keras documents `shuffle` as ignored when the input is a
# tf.data.Dataset, so batch order comes from `train_ds` itself -- confirm for
# the installed version.
history = model.fit(
    train_ds,
    epochs=2,
    validation_data=val_ds,
    shuffle=True,
    callbacks=[early_stopping, reduce_lr]
)

Epoch 1/2
Epoch 2/2


In [10]:
# Build the variant with the frequency-mask layer enabled and print its layer summary.
model = create_model(freq=True)
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input (InputLayer)             [(None, 16000)]      0           []                               
                                                                                                  
 normalized_spectrogram_model (  (None, 125, 80)     0           ['input[0][0]']                  
 Functional)                                                                                      
                                                                                                  
 freq_mask_layer (FreqMaskLayer  (None, 125, 80)     0           ['normalized_spectrogram_model[0]
 )                                                               [0]']                            
                                                                                            

## Experiments

Training is repeated 5 times for each augmentation, each time with a different random seed (and therefore a different weight initialization).

In [11]:
# Seeds of the repeated training runs (one run per seed and augmentation).
TRAINING_SEEDS = list(range(5))

# Test-set metric tables of every augmentation, keyed by augmentation name.
# `results_df` is reassigned on each pass of the outer loop, so on its own it
# only keeps the table of the last augmentation.
results_by_aug = {}

for aug in ["time", "freq"]:
    results = []
    EXPERIMENT_NAME = f"article_net_{aug}"
    # Start from an empty output directory so artifacts of a previous
    # execution never mix with the new ones.
    if os.path.exists(EXPERIMENT_NAME):
        shutil.rmtree(EXPERIMENT_NAME)
    os.mkdir(EXPERIMENT_NAME)

    for seed in TRAINING_SEEDS:
        # Release the Keras global state left behind by previously built
        # models; when many models are created in a loop this state keeps
        # growing and gc.collect() alone does not free it. Done before
        # seeding so the seeds below apply to a clean state.
        K.clear_session()

        random.seed(seed)
        np.random.seed(seed)
        tf.random.set_seed(seed)

        if aug == "time":
            model = create_model(time=True)
        else:
            model = create_model(freq=True)

        model.compile(
            optimizer=optimizers.Adam(learning_rate=0.01),
            loss=losses.SparseCategoricalCrossentropy(),
            metrics=[metrics.SparseCategoricalAccuracy(), metrics.SparseCategoricalCrossentropy()]
        )

        history = model.fit(
            train_ds,
            epochs=MAX_EPOCHS,
            validation_data=val_ds,
            shuffle=True,
            callbacks=[early_stopping, reduce_lr]
        )

        # Persist the per-epoch training curves of this run.
        with open(os.path.join(EXPERIMENT_NAME, f"history_{seed}.pkl"), "wb") as file:
            pickle.dump(history.history, file)

        eval_results = model.evaluate(test_ds)

        # Persist the raw model outputs on the test set.
        predictions = model.predict(test_ds)
        with open(os.path.join(EXPERIMENT_NAME, f"predictions_{seed}.pkl"), "wb") as file:
            pickle.dump(predictions, file)

        results.append({
            'seed': seed,
            'results': dict(zip(model.metrics_names, eval_results))
        })
        gc.collect()

    # One row per seed: the seed followed by one column per evaluated metric.
    results_df = pd.DataFrame(
        [{'seed': run['seed'], **run['results']} for run in results]
    )
    results_df.to_csv(os.path.join(EXPERIMENT_NAME, 'results.csv'))
    results_by_aug[aug] = results_df

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 16: ReduceLROnPlateau reducing learning rate to 0.004999999888241291.
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 21: ReduceLROnPlateau reducing learning rate to 0.0024999999441206455.
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 27: ReduceLROnPlateau reducing learning rate to 0.0012499999720603228.
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 34: ReduceLROnPlateau reducing learning rate to 0.0006249999860301614.
Epoch 35/200
Epoch 36/200
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 12: ReduceLROnPlateau reducing learning rate to 0.004999999888241291.
Epoch 13/20

In [12]:
# Per-seed test metrics. `results_df` is reassigned on every pass of the
# experiment loop, so this shows the last augmentation processed ("freq").
results_df

Unnamed: 0,seed,loss,sparse_categorical_accuracy,sparse_categorical_crossentropy
0,0,0.138397,0.969671,0.138397
1,1,0.155362,0.969979,0.155362
2,2,0.139853,0.967368,0.139853
3,3,0.159299,0.969134,0.159299
4,4,0.175304,0.95539,0.175304
