In [1]:
import gc
import os
import pickle
import random
import shutil

import numpy as np
import pandas as pd
import tensorflow as tf
from keras import losses, optimizers, metrics, callbacks, Model, layers, backend as K
import SpeechModels
from augment_layers import FreqMaskLayer, TimeMaskLayer


In [2]:
# Show the logical devices TensorFlow can see (used to check that a GPU is visible).
tf.config.list_logical_devices()

[LogicalDevice(name='/device:CPU:0', device_type='CPU'),
 LogicalDevice(name='/device:GPU:0', device_type='GPU')]

In [3]:
# Run-wide configuration.
SEED = 123  # shared seed for the Python, NumPy and TensorFlow RNGs
N_CLASS = 12  # width of the softmax output layer
MAX_EPOCHS = 200  # upper bound on the number of epochs passed to model.fit
TEST_PATH = "tensorflow-speech-recognition-challenge/test"  # root of the prediction set

# Seed every random number generator the notebook relies on, in a fixed order.
for _seed_fn in (random.seed, np.random.seed, tf.random.set_seed):
    _seed_fn(SEED)

In [4]:
# Arguments shared by the three labelled splits. Passing the seed explicitly to
# every split (not only to the training one) keeps the shuffle order
# reproducible even when this cell is re-run without restarting the kernel.
_SPLIT_KWARGS = dict(
    batch_size=512,
    output_sequence_length=16000,
    shuffle=True,
    seed=SEED,
)

train_ds = tf.keras.utils.audio_dataset_from_directory(
    directory="data/train", **_SPLIT_KWARGS
)

val_ds = tf.keras.utils.audio_dataset_from_directory(
    directory="data/val", **_SPLIT_KWARGS
)

test_ds = tf.keras.utils.audio_dataset_from_directory(
    directory="data/test", **_SPLIT_KWARGS
)

# Class names are inferred from the sub-directory names of the training split;
# their order defines the integer label encoding.
label_names = np.array(train_ds.class_names)
print("label names:", label_names)

Found 45586 files belonging to 12 classes.
Found 6513 files belonging to 12 classes.
Found 13024 files belonging to 12 classes.
label names: ['down' 'go' 'left' 'no' 'off' 'on' 'right' 'silence' 'stop' 'unknown'
 'up' 'yes']


In [5]:
# Prediction set. shuffle=False keeps the loader's deterministic file order,
# which matters because predictions are later paired with file names by position.
test_dataset = tf.keras.utils.audio_dataset_from_directory(
    directory=TEST_PATH,
    shuffle=False,
    output_sequence_length=16000,
    batch_size=512,
)

Found 158538 files belonging to 1 classes.


In [6]:
# Display the class names inferred for the prediction set (one name per sub-directory).
test_dataset.class_names

['audio']

In [7]:
# Train on the train split followed by the val split; the element spec is
# displayed to check the resulting (audio, label) structure.
train_dataset = train_ds.concatenate(val_ds)
train_dataset.element_spec

(TensorSpec(shape=(None, 16000, None), dtype=tf.float32, name=None),
 TensorSpec(shape=(None,), dtype=tf.int32, name=None))

In [8]:
# The labelled split loaded from "data/test" is used as validation data during
# training; the element spec is displayed to check its structure.
validation_dataset = test_ds
validation_dataset.element_spec

(TensorSpec(shape=(None, 16000, None), dtype=tf.float32, name=None),
 TensorSpec(shape=(None,), dtype=tf.int32, name=None))

In [9]:
def squeeze(audio, labels):
    """Remove the trailing axis of ``audio`` and pass ``labels`` through untouched.

    Intended as a ``tf.data.Dataset.map`` function for (audio, label) pairs.

    Args:
        audio: Tensor whose last axis is dropped.
            NOTE(review): assumes that axis has size 1 (mono audio) — confirm.
        labels: Returned unchanged.

    Returns:
        Tuple ``(squeezed_audio, labels)``.
    """
    return tf.squeeze(audio, axis=-1), labels


def _squeeze_once(dataset):
    """Map ``squeeze`` over ``dataset`` unless its audio component is already rank 2.

    The datasets are re-assigned to their own names below, so without this
    guard re-running the cell would squeeze a second time and remove the wrong
    axis (or fail). The check makes the cell safe to re-run.

    Args:
        dataset: ``tf.data.Dataset`` yielding (audio, label) pairs.

    Returns:
        The dataset with the trailing audio axis removed.
    """
    if dataset.element_spec[0].shape.rank == 3:
        return dataset.map(squeeze, tf.data.AUTOTUNE)
    return dataset


train_dataset = _squeeze_once(train_dataset)
validation_dataset = _squeeze_once(validation_dataset)
test_dataset = _squeeze_once(test_dataset)


def create_model(freq=False, time=False):
    """Build the recurrent attention classifier on top of a frozen mel-spectrogram model.

    Pipeline: spectrogram model -> optional masking layers -> two (5, 1)
    convolutions with batch normalisation -> two bidirectional LSTMs ->
    dot-product attention queried by the last LSTM step -> dense head with
    dropout -> softmax over ``N_CLASS`` classes.

    Args:
        freq: If True, apply ``FreqMaskLayer(10)`` to the spectrogram.
        time: If True, apply ``TimeMaskLayer(10)`` to the spectrogram (after
            the frequency mask when both are enabled).

    Returns:
        An uncompiled Keras ``Model`` taking the spectrogram model's inputs and
        producing class probabilities.
    """
    m = SpeechModels.get_melspec_model(iLen=16000)
    m.trainable = False  # the spectrogram front end is not trained
    inputs = m.inputs

    x = m(inputs)
    if freq:
        x = FreqMaskLayer(10)(x)
    if time:
        x = TimeMaskLayer(10)(x)
    # Add a channel axis so the spectrogram can go through Conv2D layers.
    x = tf.expand_dims(x, axis=-1, name='mel_stft')

    # (5, 1) kernels span only the first spectrogram axis.
    x = layers.Conv2D(10, (5, 1), activation='relu', padding='same')(x)
    x = layers.BatchNormalization()(x)
    x = layers.Conv2D(1, (5, 1), activation='relu', padding='same')(x)
    x = layers.BatchNormalization()(x)

    # The last convolution has a single filter; drop that channel axis again.
    x = layers.Lambda(lambda q: K.squeeze(q, -1), name='squeeze_last_dim')(x)

    x = layers.Bidirectional(layers.LSTM(64, return_sequences=True)
                             )(x)  # [b_s, seq_len, vec_dim]
    x = layers.Bidirectional(layers.LSTM(64, return_sequences=True)
                             )(x)  # [b_s, seq_len, vec_dim]

    # The attention query is derived from the LAST time step of the sequence.
    x_last = layers.Lambda(lambda q: q[:, -1])(x)  # [b_s, vec_dim]
    query = layers.Dense(128)(x_last)

    # dot product attention
    att_scores = layers.Dot(axes=[1, 2])([query, x])
    att_scores = layers.Softmax(name='attSoftmax')(att_scores)  # [b_s, seq_len]

    # rescale sequence
    att_vector = layers.Dot(axes=[1, 1])([att_scores, x])  # [b_s, vec_dim]
    x = layers.Dropout(rate=0.3)(att_vector)
    x = layers.Dense(64, activation='relu')(x)
    x = layers.Dropout(rate=0.3)(x)
    x = layers.Dense(32)(x)
    x = layers.Dropout(rate=0.3)(x)
    output = layers.Dense(N_CLASS, activation='softmax', name='output')(x)

    model = Model(inputs=[inputs], outputs=[output])

    return model


# Build the variant with frequency masking enabled (time masking stays off)
# and print its layer summary.
model = create_model(freq=True)
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input (InputLayer)             [(None, 16000)]      0           []                               
                                                                                                  
 normalized_spectrogram_model (  (None, 125, 80)     0           ['input[0][0]']                  
 Functional)                                                                                      
                                                                                                  
 freq_mask_layer (FreqMaskLayer  (None, 125, 80)     0           ['normalized_spectrogram_model[0]
 )                                                               [0]']                            
                                                                                              

In [10]:
# Metric that drives early stopping, checkpointing and the learning-rate
# schedule. Higher is better, so every callback below states mode='max'.
_MONITOR = 'val_sparse_categorical_accuracy'

model.compile(
    optimizer=optimizers.Adam(learning_rate=0.001),
    loss=losses.SparseCategoricalCrossentropy(),
    metrics=[metrics.SparseCategoricalAccuracy(), metrics.SparseCategoricalCrossentropy()]
)

# Stop after 5 epochs without improvement and roll back to the best weights.
early_stopping = callbacks.EarlyStopping(
    monitor=_MONITOR,
    min_delta=0,
    patience=5,
    verbose=0,
    mode='max',
    baseline=None,
    restore_best_weights=True
)

# Save the model to disk whenever the monitored metric reaches a new best.
model_checkpoint = callbacks.ModelCheckpoint(
    filepath="best_model_checkpoint_weighted",
    monitor=_MONITOR,
    mode='max',
    save_best_only=True)

# Halve the learning rate after 3 epochs without improvement, down to 1e-5.
# mode is stated explicitly instead of relying on 'auto' inferring the
# direction from the metric name, matching the two callbacks above.
reduce_lr = callbacks.ReduceLROnPlateau(
    monitor=_MONITOR,
    factor=0.5,
    patience=3,
    mode='max',
    min_lr=0.00001,
    verbose=1)

In [11]:
# Per-class loss weights keyed by integer class index (the position of the
# class in `label_names`); passed to model.fit as `class_weight`.
# Index 7 carries the largest weight and index 9 the smallest.
# NOTE(review): the values look like inverse class frequencies — confirm how
# they were derived.
weights = dict(enumerate([
    27.606189,   # 0
    27.454890,   # 1
    27.676583,   # 2
    27.420211,   # 3
    27.629614,   # 4
    27.512886,   # 5
    27.512886,   # 6
    161.997512,  # 7
    27.362605,   # 8
    1.586856,    # 9
    27.420211,   # 10
    27.397139,   # 11
]))

In [12]:
# Train for at most MAX_EPOCHS epochs with class-weighted loss; the callbacks
# handle early stopping, learning-rate reduction and checkpointing.
fit_callbacks = [early_stopping, reduce_lr, model_checkpoint]
history = model.fit(
    train_dataset,
    validation_data=validation_dataset,
    epochs=MAX_EPOCHS,
    class_weight=weights,
    callbacks=fit_callbacks,
)

Epoch 1/200



INFO:tensorflow:Assets written to: best_model_checkpoint_weighted\assets


INFO:tensorflow:Assets written to: best_model_checkpoint_weighted\assets


Epoch 2/200



INFO:tensorflow:Assets written to: best_model_checkpoint_weighted\assets


INFO:tensorflow:Assets written to: best_model_checkpoint_weighted\assets


Epoch 3/200



INFO:tensorflow:Assets written to: best_model_checkpoint_weighted\assets


INFO:tensorflow:Assets written to: best_model_checkpoint_weighted\assets


Epoch 4/200



INFO:tensorflow:Assets written to: best_model_checkpoint_weighted\assets


INFO:tensorflow:Assets written to: best_model_checkpoint_weighted\assets


Epoch 5/200



INFO:tensorflow:Assets written to: best_model_checkpoint_weighted\assets


INFO:tensorflow:Assets written to: best_model_checkpoint_weighted\assets


Epoch 6/200



INFO:tensorflow:Assets written to: best_model_checkpoint_weighted\assets


INFO:tensorflow:Assets written to: best_model_checkpoint_weighted\assets


Epoch 7/200
Epoch 8/200



INFO:tensorflow:Assets written to: best_model_checkpoint_weighted\assets


INFO:tensorflow:Assets written to: best_model_checkpoint_weighted\assets


Epoch 9/200
Epoch 10/200



INFO:tensorflow:Assets written to: best_model_checkpoint_weighted\assets


INFO:tensorflow:Assets written to: best_model_checkpoint_weighted\assets


Epoch 11/200



INFO:tensorflow:Assets written to: best_model_checkpoint_weighted\assets


INFO:tensorflow:Assets written to: best_model_checkpoint_weighted\assets


Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 14: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
Epoch 15/200



INFO:tensorflow:Assets written to: best_model_checkpoint_weighted\assets


INFO:tensorflow:Assets written to: best_model_checkpoint_weighted\assets


Epoch 16/200
Epoch 17/200



INFO:tensorflow:Assets written to: best_model_checkpoint_weighted\assets


INFO:tensorflow:Assets written to: best_model_checkpoint_weighted\assets


Epoch 18/200



INFO:tensorflow:Assets written to: best_model_checkpoint_weighted\assets


INFO:tensorflow:Assets written to: best_model_checkpoint_weighted\assets


Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 21: ReduceLROnPlateau reducing learning rate to 0.0002500000118743628.
Epoch 22/200
Epoch 23/200



INFO:tensorflow:Assets written to: best_model_checkpoint_weighted\assets


INFO:tensorflow:Assets written to: best_model_checkpoint_weighted\assets


Epoch 24/200



INFO:tensorflow:Assets written to: best_model_checkpoint_weighted\assets


INFO:tensorflow:Assets written to: best_model_checkpoint_weighted\assets


Epoch 25/200



INFO:tensorflow:Assets written to: best_model_checkpoint_weighted\assets


INFO:tensorflow:Assets written to: best_model_checkpoint_weighted\assets


Epoch 26/200
Epoch 27/200
Epoch 27: ReduceLROnPlateau reducing learning rate to 0.0001250000059371814.
Epoch 28/200



INFO:tensorflow:Assets written to: best_model_checkpoint_weighted\assets


INFO:tensorflow:Assets written to: best_model_checkpoint_weighted\assets


Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 31: ReduceLROnPlateau reducing learning rate to 6.25000029685907e-05.
Epoch 32/200
Epoch 33/200


In [13]:
# Class probabilities for every clip in the prediction set, one row per clip.
# NOTE(review): test_dataset yields (audio, label) pairs; presumably predict
# uses only the audio component — confirm against the Keras docs.
predictions = model.predict(test_dataset)



In [14]:
# Turn each row's highest-probability column into its label name. `label_names`
# is a NumPy array, so indexing it with the argmax array performs the whole
# lookup in one vectorised step instead of a Python-level loop.
predict_class = label_names[np.argmax(predictions, axis=1)]

In [15]:
# Collect the test file names in the order the prediction dataset yields them.
# The dataset was built with shuffle=False, which makes the loader sort files
# alphanumerically, whereas os.walk returns entries in arbitrary filesystem
# order. Sorting explicitly keeps file names aligned with `predictions`.
# Directories are visited in sorted order and every directory's files are kept
# (the previous loop retained only the last directory visited). Only .wav files
# are kept, since the loader documents .wav as the only supported format.
audio_dir = os.path.join(TEST_PATH, 'audio')
files = np.array([
    fname
    for _root, _dirs, fnames in sorted(os.walk(audio_dir), key=lambda entry: entry[0])
    for fname in sorted(fnames)
    if fname.lower().endswith('.wav')
])

In [16]:
# Pair each file name with its predicted label (matched by position) and write
# the submission file without the index column.
submission = pd.DataFrame({"fname": files, "label": predict_class})
submission.to_csv("submission_reweighted.csv", index=False)