In [1]:
import gc
import os
import pickle
import random
import shutil

import numpy as np
import pandas as pd
import tensorflow as tf
from keras import losses, optimizers, metrics, callbacks

import SpeechModels

In [2]:
# Sanity check: list the logical devices TensorFlow can see
# (the saved output below shows one CPU and one GPU).
tf.config.list_logical_devices()

[LogicalDevice(name='/device:CPU:0', device_type='CPU'),
 LogicalDevice(name='/device:GPU:0', device_type='GPU')]

In [3]:
# Seed applied to the Python, NumPy and TensorFlow RNGs and to the
# shuffling of the training dataset.
SEED: int = 123
# Number of output classes passed to the model constructor; the data
# directories contain 12 class folders.
N_CLASS: int = 12
# Upper bound on the number of epochs for each run of the repeated-training
# experiment (early stopping may end a run sooner).
MAX_EPOCHS: int = 200

In [4]:
# Seed every random number generator the notebook touches (Python, NumPy,
# TensorFlow), in that order, with the same value.
for _seed_rng in (random.seed, np.random.seed, tf.random.set_seed):
    _seed_rng(SEED)

## Load data

In [5]:
def _load_split(split, shuffle, seed=None):
    """Build a batched waveform dataset from the ``data/<split>`` directory.

    Args:
        split: Name of the sub-directory of ``data`` to read ("train",
            "val" or "test"); it holds one folder per class.
        shuffle: Whether the dataset is shuffled. When False, Keras yields
            the files in sorted (alphanumeric) order on every pass.
        seed: Shuffle seed; only meaningful when ``shuffle`` is True.

    Returns:
        The ``tf.data.Dataset`` produced by
        ``tf.keras.utils.audio_dataset_from_directory`` with batches of 512
        clips, each truncated or padded to 16000 samples.
    """
    return tf.keras.utils.audio_dataset_from_directory(
        directory=f"data/{split}",
        batch_size=512,
        output_sequence_length=16000,
        shuffle=shuffle,
        seed=seed,
    )


# Only the training split is shuffled. Validation and test data keep a fixed
# order: with the default shuffle=True the dataset is reshuffled on every
# pass, so the rows returned by model.predict(test_ds) could not be matched
# back to their files or labels.
train_ds = _load_split("train", shuffle=True, seed=SEED)
val_ds = _load_split("val", shuffle=False)
test_ds = _load_split("test", shuffle=False)

# Class names are inferred from the folder names of the training split.
label_names = np.array(train_ds.class_names)
print("label names:", label_names)

Found 45586 files belonging to 12 classes.
Found 6513 files belonging to 12 classes.
Found 13024 files belonging to 12 classes.
label names: ['down' 'go' 'left' 'no' 'off' 'on' 'right' 'silence' 'stop' 'unknown'
 'up' 'yes']


In [6]:
def squeeze(audio, labels):
    """Remove the last axis of ``audio`` and pass ``labels`` through unchanged.

    Args:
        audio: Batch of waveforms whose final axis has size 1
            (presumably the channel axis of mono clips — TODO confirm).
        labels: Labels belonging to the batch.

    Returns:
        Tuple ``(audio_without_last_axis, labels)``.
    """
    return tf.squeeze(audio, axis=-1), labels


# Apply the squeeze to all three splits with parallelism chosen by tf.data.
# NOTE(review): this rebinds the dataset names in place, so the cell is
# meant to run once per kernel; a second run presumably fails because the
# last axis is no longer of size 1.
train_ds, val_ds, test_ds = (
    split.map(squeeze, tf.data.AUTOTUNE)
    for split in (train_ds, val_ds, test_ds)
)

## Model from article

In [7]:
# Build the AttRNNSpeechModel network from the local SpeechModels module for
# N_CLASS outputs. inputLength=None leaves the time axis unspecified, as the
# (None, None) input shape in the summary shows.
model = SpeechModels.AttRNNSpeechModel(
    N_CLASS,
    samplingrate=16000,
    inputLength=None,
)
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input (InputLayer)             [(None, None)]       0           []                               
                                                                                                  
 normalized_spectrogram_model (  (None, None, 80)    0           ['input[0][0]']                  
 Functional)                                                                                      
                                                                                                  
 tf.expand_dims (TFOpLambda)    (None, None, 80, 1)  0           ['normalized_spectrogram_model[0]
                                                                 [0]']                            
                                                                                              

In [8]:
# Validation accuracy drives checkpointing, early stopping and the learning
# rate schedule; it must be maximised, so every callback states mode="max"
# explicitly instead of relying on name-based "auto" inference.
MONITORED_METRIC = "val_sparse_categorical_accuracy"

model.compile(
    optimizer=optimizers.Adam(learning_rate=0.001),
    loss=losses.SparseCategoricalCrossentropy(),
    metrics=[metrics.SparseCategoricalAccuracy(), metrics.SparseCategoricalCrossentropy()],
)

# Keep only the weights of the best epoch seen so far.
# NOTE(review): the file name says "simple_cnn" although the model built
# above is AttRNNSpeechModel — confirm no other notebook writes to this path.
checkpoint = callbacks.ModelCheckpoint(
    filepath="simple_cnn.h5",
    monitor=MONITORED_METRIC,
    mode="max",
    verbose=0,
    save_best_only=True,
    save_weights_only=True,
)

# Stop after 5 epochs without improvement and roll back to the best weights.
early_stopping = callbacks.EarlyStopping(
    monitor=MONITORED_METRIC,
    min_delta=0,
    patience=5,
    verbose=0,
    mode="max",
    baseline=None,
    restore_best_weights=True,
)

# Halve the learning rate after 3 stagnant epochs, never going below 1e-5.
reduce_lr = callbacks.ReduceLROnPlateau(
    monitor=MONITORED_METRIC,
    mode="max",
    factor=0.5,
    patience=3,
    min_lr=0.00001,
    verbose=1,
)

In [9]:
# Short 2-epoch run of the model built above (presumably a quick sanity check
# before the full experiment). Shuffling is configured on train_ds itself:
# Model.fit ignores its `shuffle` argument for tf.data.Dataset inputs, so it
# is not passed here.
history = model.fit(
    train_ds,
    epochs=2,
    validation_data=val_ds,
    callbacks=[checkpoint, early_stopping, reduce_lr],
)

Epoch 1/2
Epoch 2/2


## Experiments

Training is repeated 5 times, once for each seed in `TRAINING_SEEDS`, so every run starts from a different weight initialization.

In [10]:
EXPERIMENT_NAME = "article_net"

# Start from an empty output directory: wipe whatever an earlier run left
# behind, then create the directory afresh.
if os.path.exists(EXPERIMENT_NAME):
    shutil.rmtree(EXPERIMENT_NAME)
os.mkdir(EXPERIMENT_NAME)

In [15]:
# Train the same architecture once per seed and collect the test metrics.
# Per run, the training history and the raw test predictions are pickled
# into EXPERIMENT_NAME; the per-seed metrics end up in results.csv.
TRAINING_SEEDS = list(range(5))
run_records = []
for seed in TRAINING_SEEDS:
    # Release the Keras global state left by the previous run's model;
    # gc.collect() alone does not free it when models are built in a loop.
    tf.keras.backend.clear_session()

    # Re-seed every RNG so that each run has its own, reproducible
    # weight initialization.
    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)

    model = SpeechModels.AttRNNSpeechModel(N_CLASS, samplingrate=16000, inputLength=None)

    model.compile(
        # NOTE(review): 0.01 here versus 0.001 in the single-run cell
        # earlier in the notebook — confirm the difference is intended.
        optimizer=optimizers.Adam(learning_rate=0.01),
        loss=losses.SparseCategoricalCrossentropy(),
        metrics=[metrics.SparseCategoricalAccuracy(), metrics.SparseCategoricalCrossentropy()],
    )

    # `shuffle` is not passed: Model.fit ignores it for tf.data.Dataset
    # inputs. The callback instances are the ones defined earlier in the
    # notebook and are reused for every run.
    history = model.fit(
        train_ds,
        epochs=MAX_EPOCHS,
        validation_data=val_ds,
        callbacks=[early_stopping, reduce_lr],
    )

    with open(os.path.join(EXPERIMENT_NAME, f"history_{seed}.pkl"), "wb") as file:
        pickle.dump(history.history, file)

    # return_dict=True yields {metric_name: value} directly, so the values
    # never have to be paired with model.metrics_names by position.
    test_metrics = model.evaluate(test_ds, return_dict=True)

    # NOTE(review): prediction rows follow the iteration order of test_ds;
    # they can only be matched to labels if that order is deterministic.
    predictions = model.predict(test_ds)
    with open(os.path.join(EXPERIMENT_NAME, f"predictions_{seed}.pkl"), "wb") as file:
        pickle.dump(predictions, file)

    # One flat record per run: the seed followed by the metric columns.
    run_records.append({"seed": seed, **test_metrics})
    gc.collect()

# One row per seed; columns are `seed` plus one column per reported metric.
results = pd.DataFrame(run_records)
results.to_csv(os.path.join(EXPERIMENT_NAME, 'results.csv'))

Epoch 1/2
Epoch 2/2
Epoch 1/2
13/90 [===>..........................] - ETA: 12s - loss: 1.6736 - sparse_categorical_accuracy: 0.5939 - sparse_categorical_crossentropy: 1.6736

KeyboardInterrupt: 

In [12]:
# Show the per-seed test metrics gathered by the experiment loop.
results

Unnamed: 0,seed,loss,sparse_categorical_accuracy,sparse_categorical_crossentropy
0,0,1.606008,0.630144,1.606008
1,1,1.025078,0.707463,1.025078
2,2,0.427118,0.867168,0.427118
3,3,0.306876,0.905405,0.306876
4,4,1.357723,0.646806,1.357723
