- Accent-recognition model, same as in main2.ipynb.
- Uses the YAMNet model as a feature extractor.
- Includes hyperparameter tuning.

In [None]:
# Install/upgrade the packages this notebook needs into the kernel's environment.
%pip install -U -q tensorflow tensorflow_datasets
#apt install --allow-change-held-packages libcudnn8=8.1.0.77-1+cuda11.2
# KerasTuner provides the Hyperband search used in the hyperparameter-tuning section.
%pip install -U -q keras-tuner

In [2]:
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
import numpy as np
import pathlib
import os
from os.path import isfile, join, splitext
import librosa
import soundfile as sf
from pydub import AudioSegment
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit

import tensorflow as tf
from tensorflow import keras
import tensorflow_io as tfio
import tensorflow_hub as tfhub
from keras.layers import Conv1D, MaxPooling1D, Flatten, Dense
from keras.models import Sequential
import keras_tuner as kt

import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import Audio

# Audio sample rate (presumably in Hz); the loader below resamples every clip to 16000.
SAMPLING_RATE = 16000
# Maximum number of epochs passed to the final `fit` call.
EPOCHS = 50
# NOTE(review): dataframe_to_dataset defaults to batch_size=64 -- confirm where this constant is meant to apply.
BATCH_SIZE = 128
# Seed reserved for shuffling/splitting the data (distinct from the global `seed` below).
SHUFFLE_SEED = 43
# File stem used for the saved model checkpoint (MODEL_NAME + ".h5").
MODEL_NAME = "uu_accent_recognition"

# Set the seed value for experiment reproducibility.
seed = 42
tf.random.set_seed(seed)
np.random.seed(seed)

2023-01-19 18:02:13.007373: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Set the target classes.

In [5]:
# Read the metadata table; malformed rows are skipped rather than raising.
df = pd.read_csv(
    os.getcwd() + "/data/audio.csv",
    on_bad_lines='skip',
    delimiter=';',
)

# One class per distinct native language, indexed in order of first appearance.
class_names = df['native_langs'].unique()
lang_idxs = range(len(class_names))
class_dict = {lang: idx for lang, idx in zip(class_names, lang_idxs)}

# Both counts are printed as a quick sanity check that the mapping lost no entries.
for class_collection in (class_names, class_dict):
    print(f"There are {len(class_collection)} classes.")

There are 228 classes.
There are 228 classes.


### Prepare dataset using Yamnet model as feature-extractor

Yamnet requires a slightly different way of preprocessing the dataset.

In [7]:
# Load the YAMNet SavedModel from the local ./yamnet_1 directory (relative to the working directory).
yamnet_model = tf.saved_model.load('./yamnet_1')

In [10]:
# audio_wav = path_to_audio('data/audio_wav/afrikaans1.wav')

def load_16k_audio_wav(filename):
    """Read a WAV file and return its mono waveform resampled to 16 kHz.

    Args:
        filename: Path of the WAV file to read (string or scalar string tensor).

    Returns:
        A 1-D tensor with the decoded samples at a 16000 sample rate.
    """
    raw_bytes = tf.io.read_file(filename)

    # Force a single channel, then drop the trailing channel axis.
    decoded, native_rate = tf.audio.decode_wav(raw_bytes, desired_channels=1)
    mono_wave = tf.squeeze(decoded, axis=-1)

    # The resampler is given the source rate as int64.
    return tfio.audio.resample(
        mono_wave,
        rate_in=tf.cast(native_rate, dtype=tf.int64),
        rate_out=16000,
    )


def filepath_to_embeddings(filename, label):
    """Turn one audio file into per-frame YAMNet embeddings with matching labels.

    Args:
        filename: Path of the WAV file to embed.
        label: Integer class index of the file.

    Returns:
        A tuple `(embeddings, one_hot_labels)` where the file's label is
        repeated once per embedding row and one-hot encoded over
        `len(class_names)` classes (one-hot so that AUC can be used as a metric).
    """
    waveform = load_16k_audio_wav(filename)

    # Only the embeddings (second output) are used; the first output (scores)
    # and the third output are ignored. Relabelling non-speech frames as an
    # 'other' class via the scores is not implemented here.
    _unused_scores, frame_embeddings, _ = yamnet_model(waveform)

    # One label per embedding row produced for this file.
    frame_count = tf.shape(frame_embeddings)[0]
    frame_labels = tf.repeat(label, frame_count)

    return (frame_embeddings, tf.one_hot(frame_labels, len(class_names)))


def dataframe_to_dataset(dataframe, batch_size=64):
    """Build a batched `tf.data.Dataset` of (embedding, one-hot label) pairs.

    The input frame is NOT modified: file paths and integer labels are derived
    into new sequences, so the function can safely be called more than once on
    the same frame (or on a slice of another frame).

    Args:
        dataframe: Frame with a `file_name` column (file stem, without the
            ".wav" extension) and a `native_langs` column (class name that is
            a key of `class_dict`).
        batch_size: Number of embedding rows per batch.

    Returns:
        A cached, batched and prefetched dataset yielding
        `(embeddings, one_hot_labels)` batches.

    Raises:
        KeyError: If a `native_langs` value is not present in `class_dict`.
    """
    audio_dir = os.path.join(os.getcwd(), 'data/audio_wav')

    # Derive paths and numeric labels without writing back into `dataframe`.
    file_paths = [
        os.path.join(audio_dir, stem + ".wav") for stem in dataframe["file_name"]
    ]
    label_ids = [class_dict[lang] for lang in dataframe["native_langs"]]

    # Explicit dtypes keep the element spec stable even for an empty frame.
    dataset = tf.data.Dataset.from_tensor_slices(
        (
            tf.constant(file_paths, dtype=tf.string),
            tf.constant(label_ids, dtype=tf.int64),
        )
    )

    # Each file expands into several embedding rows; unbatch flattens them so
    # that batches are formed over individual rows rather than whole files.
    dataset = dataset.map(
        filepath_to_embeddings,
        num_parallel_calls=tf.data.AUTOTUNE,
    ).unbatch()

    return dataset.cache().batch(batch_size).prefetch(tf.data.AUTOTUNE)

In [23]:
# Languages kept for this experiment; every other accent in the CSV is dropped.
TARGET_LANGS = ['english', 'spanish']

dataframe = pd.read_csv(os.getcwd() + "/data/audio.csv", on_bad_lines='skip', delimiter=';')
dataframe = dataframe[dataframe['native_langs'].isin(TARGET_LANGS)]
print(f'Number of data points: {len(dataframe)}')

# Re-derive the label space from the rows that are actually used. Keeping the
# full list of languages from the CSV would give the classifier output units
# (and class-weight / confusion-matrix entries) for classes with no samples.
class_names = dataframe['native_langs'].unique()
lang_idxs = range(len(class_names))
class_dict = {name: idx for idx, name in enumerate(class_names)}

# Stratified, seeded 80/20 split: a positional slice would make the class
# balance of each part depend on the row order of the CSV.
train_df, valid_df = train_test_split(
    dataframe,
    test_size=0.2,
    stratify=dataframe['native_langs'],
    random_state=SHUFFLE_SEED,
)

train_ds = dataframe_to_dataset(train_df)
valid_ds = dataframe_to_dataset(valid_df)

Number of data points: 886


### Build the model for hyperparameter tuning

In [None]:
def build_and_compile_model(hp):
    """Build and compile the dense classifier used for hyperparameter tuning.

    Tuned hyperparameters:
        - `dense_layer_1`: units of the first Dense layer (32..512, step 32).
        - `learning_rate`: Adam learning rate (1e-3, 1e-4 or 1.9644e-5).

    Args:
        hp: KerasTuner hyperparameter container supplied by the tuner.

    Returns:
        A compiled `keras.Model` mapping a 1024-dimensional embedding to a
        softmax over `len(class_names)` classes, tracking accuracy and AUC.
    """
    # `shape` must be a tuple: `(1024)` is just the integer 1024.
    inputs = keras.layers.Input(shape=(1024,), name="embedding")

    # Tune the number of units in the first Dense layer. Choose an optimal value between 32-512
    hp_units = hp.Int('dense_layer_1', min_value=32, max_value=512, step=32)
    x = keras.layers.Dense(units=hp_units, activation="relu", name="dense_1")(inputs)
    x = keras.layers.Dropout(0.15, name="dropout_1")(x)

    # Remaining fixed-size Dense/Dropout pairs (dense_2..dense_4, dropout_2..dropout_4).
    for idx, (units, rate) in enumerate([(384, 0.2), (192, 0.25), (384, 0.2)], start=2):
        x = keras.layers.Dense(units, activation="relu", name=f"dense_{idx}")(x)
        x = keras.layers.Dropout(rate, name=f"dropout_{idx}")(x)

    # The layer name is kept exactly as it was so existing layer names do not change.
    outputs = keras.layers.Dense(len(class_names), activation="softmax", name="ouput")(x)

    model = keras.Model(inputs=inputs, outputs=outputs, name="accent_recognition")

    # Tune the learning rate for the optimizer. Choose an optimal value from 0.001, 0.0001, or 1.9644e-5
    hp_learning_rate = hp.Choice('learning_rate', values=[1e-3, 1e-4, 1.9644e-5])
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=hp_learning_rate),
        loss=keras.losses.CategoricalCrossentropy(),
        metrics=["accuracy", keras.metrics.AUC(name="auc")],
    )

    return model

In [None]:
# Hyperband search over the hyperparameters declared in build_and_compile_model.
# With overwrite=False, results already stored under `directory`/`project_name`
# are reused instead of starting from scratch.
tuner = kt.Hyperband(
    build_and_compile_model,
    objective='val_accuracy',
    max_epochs=20,
    factor=3,
    hyperband_iterations=10,
    directory='uu_accent_detection_dir',
    project_name='uu_accent_detection',
    overwrite=False,
)

# Abort a trial once validation loss has not improved for 5 consecutive epochs.
stop_early = keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)

tuner.search(
    train_ds,
    epochs=20,
    validation_data=valid_ds,
    callbacks=[stop_early],
    verbose=2,
)

# Keep only the single best hyperparameter set found by the search.
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]

print(f"""
The hyperparameter search is complete. 
The optimal number of units in the first densely-connected layers is {best_hps.get('dense_layer_1')}. 
The optimal learning rate for the optimizer is {best_hps.get('learning_rate')}.
""")


# model.summary()

#### Class weight calculation

Since the dataset is quite unbalanced, we will use the `class_weight` argument during training.

Getting the class weights is a little tricky because even though we know the number of audio files for each class, it does not represent the number of samples for that class since Yamnet transforms each audio file into multiple audio samples of 0.96 seconds each. So every audio file will be split into a number of samples that is proportional to its length.

Therefore, to get those weights, we have to calculate the number of samples for each class after preprocessing through Yamnet.

In [None]:
# Count embedding rows per class over the whole training set.
class_counts = tf.zeros(shape=(len(class_names),), dtype=tf.int32)

for _, y in train_ds:
    # Labels are one-hot, so argmax recovers the class index of every row.
    class_counts = class_counts + tf.math.bincount(
        tf.cast(tf.math.argmax(y, axis=1), tf.int32), minlength=len(class_names)
    )

# Convert once instead of calling .numpy() for every class.
counts_per_class = class_counts.numpy()
total_samples = int(counts_per_class.sum())

# Weight = total / count (rarer classes weigh more). A class with no training
# rows gets weight 0.0 instead of a division by zero; the dict still has one
# entry for every class index.
class_weight = {
    i: (total_samples / int(count) if count > 0 else 0.0)
    for i, count in enumerate(counts_per_class)
}

print(class_weight)

### Train the model

We use Keras callbacks in order to:

- Stop whenever the validation AUC stops improving.
- Save the best model.
- Call TensorBoard in order to later view the training and validation logs.

In [15]:
# Build the model with the optimal hyperparameters and train it for at most EPOCHS epochs
hypermodel = tuner.hypermodel.build(best_hps)

# AUC is a higher-is-better metric, so the direction is stated explicitly
# rather than left to each callback's "auto" heuristic.
early_stopping_cb = keras.callbacks.EarlyStopping(
    monitor="val_auc", mode="max", patience=10, restore_best_weights=True
)
model_checkpoint_cb = keras.callbacks.ModelCheckpoint(
    MODEL_NAME + ".h5", monitor="val_auc", mode="max", save_best_only=True
)
tensorboard_cb = keras.callbacks.TensorBoard(os.path.join(os.curdir, "logs", hypermodel.name))

callbacks = [early_stopping_cb, model_checkpoint_cb, tensorboard_cb]

# No `batch_size` here: train_ds is a tf.data.Dataset that is already batched,
# and `fit` does not accept `batch_size` for dataset inputs.
history = hypermodel.fit(
    train_ds,
    validation_data=valid_ds,
    epochs=EPOCHS,
    class_weight=class_weight,
    callbacks=callbacks,
    verbose=2,
)

# val_acc_per_epoch = history.history['val_accuracy']
# best_epoch = val_acc_per_epoch.index(max(val_acc_per_epoch)) + 1
# print('Best epoch: %d' % (best_epoch,))

#### Evaluation

In [None]:
# evaluate() returns the loss followed by the compiled metrics, here accuracy and AUC in that order.
train_loss, train_acc, train_auc = hypermodel.evaluate(train_ds)
valid_loss, valid_acc, valid_auc = hypermodel.evaluate(valid_ds)

#### Results

Plotting the accuracy and AUC results.

In [None]:
fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(14, 5))

# Early stopping can end training before EPOCHS is reached, so the x-axis is
# taken from the number of epochs actually recorded in the history.
epochs_run = range(len(history.history["accuracy"]))

panels = (
    ("accuracy", "Training & Validation Accuracy"),
    ("auc", "Training & Validation AUC"),
)

for ax, (metric, title) in zip(axs, panels):
    ax.plot(epochs_run, history.history[metric], label="Training")
    ax.plot(epochs_run, history.history["val_" + metric], label="Validation")
    ax.set_xlabel("Epochs")
    ax.set_title(title)
    ax.legend()
    ax.grid(True)

plt.show()

#### Confusion Matrix

Let's now plot the confusion matrix for the validation dataset.

The confusion matrix lets us see, for every class, not only how many samples were correctly classified, but also which other classes were the samples confused with.

It allows us to calculate the precision and recall for every class.

In [None]:
# Create x and y tensors
# Gather every validation batch first and concatenate once; concatenating
# inside the loop re-copies the accumulated arrays on every iteration.
x_batches = []
y_batches = []

for x, y in valid_ds:
    x_batches.append(x.numpy())
    y_batches.append(y.numpy())

x_valid = np.concatenate(x_batches, axis=0)
y_valid = np.concatenate(y_batches, axis=0)

# Generate predictions
y_pred = hypermodel.predict(x_valid)

# Calculate confusion matrix. num_classes pins the matrix to one row/column per
# entry of class_names; otherwise its size follows the largest label index seen
# and may not match the tick labels below.
confusion_mtx = tf.math.confusion_matrix(
    np.argmax(y_valid, axis=1),
    np.argmax(y_pred, axis=1),
    num_classes=len(class_names),
)

# Plot the confusion matrix
plt.figure(figsize=(10, 8))
sns.heatmap(
    confusion_mtx, xticklabels=class_names, yticklabels=class_names, annot=True, fmt="g"
)
plt.xlabel("Prediction")
plt.ylabel("Label")
plt.title("Validation Confusion Matrix")
plt.show()

#### Precision & Recall

For every class:

- Recall is the fraction of samples of a given class that the model classifies correctly, i.e. how many samples of that class the model is able to detect. It is the ratio of the diagonal element to the sum of all elements in the row.
- Precision shows the accuracy of the classifier. It is the ratio of correctly predicted samples among the ones classified as belonging to this class. It is the ratio of diagonal elements to the sum of all elements in the column.

In [None]:
# Work on a plain NumPy copy so indexing and sums do not go through tensor ops.
cm = np.asarray(confusion_mtx)
true_positives = np.diag(cm)
predicted_totals = cm.sum(axis=0)  # column sums: rows predicted as each class
actual_totals = cm.sum(axis=1)     # row sums: rows that truly belong to each class

for i, label in enumerate(class_names):
    # Class indices beyond the matrix size have neither samples nor predictions.
    if i < cm.shape[0]:
        tp, predicted, actual = true_positives[i], predicted_totals[i], actual_totals[i]
    else:
        tp = predicted = actual = 0

    # Precision/recall are undefined (NaN) when the denominator is zero.
    precision = tp / predicted if predicted else float("nan")
    recall = tp / actual if actual else float("nan")
    print(
        "{0:15} Precision:{1:.2f}%; Recall:{2:.2f}%".format(
            label, precision * 100, recall * 100
        )
    )