- Accent-recognizing model, same as in main2.ipynb
- Including Yamnet model.

Not included:
- Including hyperparam tuning.

In [1]:
%pip install -U -q tensorflow tensorflow_datasets
#apt install --allow-change-held-packages libcudnn8=8.1.0.77-1+cuda11.2
%pip install -U -q keras-tuner

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [14]:
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
import numpy as np
import pathlib
import os
from os.path import isfile, join, splitext
import librosa
import soundfile as sf
from pydub import AudioSegment
from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow import keras
import tensorflow_io as tfio
import tensorflow_hub as tfhub
from keras.layers import Conv1D, MaxPooling1D, Flatten, Dense
from keras.models import Sequential
import keras_tuner as kt

import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import Audio

SAMPLING_RATE = 16000
EPOCHS = 50
BATCH_SIZE = 128
SHUFFLE_SEED = 43
MODEL_NAME = "uu_accent_recognition"

# Set the seed value for experiment reproducibility.
seed = 42
tf.random.set_seed(seed)
np.random.seed(seed)

Check how many classes are there.

In [2]:
df = pd.read_csv(os.getcwd() + "/data/audio.csv", on_bad_lines = 'skip' , delimiter= ';')
# print(len(df['native_langs'].unique()))
# for lang in df['native_langs'].unique():
#     print(f'"{lang}",')
class_names = df['native_langs'].unique()
lang_idxs = range(len(class_names))
class_dict = dict(zip(class_names, lang_idxs))

print(f"There are {len(class_names)} classes.")
print(f"There are {len(class_dict)} classes.")

There are 228 classes.
There are 228 classes.


### Prepare dataset

**WARNING** -- This method of loading the audio files does not work with the Yamnet model. A different approach will be implemented.

In [3]:
def path_to_audio(path):
    """Reads and decodes an audio file."""
    audio = tf.io.read_file(path)
    audio, _ = tf.audio.decode_wav(audio, 1, SAMPLING_RATE)
    return audio


def dataframe_to_dataset(dataframe):
    """Constructs a dataset of audios and labels."""
    
    # print(dataframe.info())
    # Rewrite file_name to contain file paths
    dataframe['file_name'] = dataframe.apply(
        lambda row: os.path.join(os.getcwd(), 'data/audio_wav', row["file_name"] + ".wav"), 
        axis=1
    )
    # Convert the labels into numbers
    dataframe['native_langs'] = dataframe.apply(
        lambda row: class_dict[row['native_langs']],
        axis=1
    )

    path_ds = tf.data.Dataset.from_tensor_slices(dataframe['file_name'])
    audio_ds = path_ds.map(lambda x: path_to_audio(x))
    label_ds = tf.data.Dataset.from_tensor_slices(dataframe['native_langs'])
    
    # print(len(path_ds), len(audio_ds), len(label_ds))
    # dataframe = df[(df["native_langs"] == "amharic") | (df["native_langs"] == "indonesian")]
    # print(dataframe.shape)
    return tf.data.Dataset.zip((audio_ds, label_ds))


def audio_to_fft(audio):
    # Since tf.signal.fft applies FFT on the innermost dimension, we need to squeeze the dimensions and then expand them again after FFT
    audio = tf.squeeze(audio, axis=-1)
    fft = tf.signal.fft(
        tf.cast(tf.complex(real=audio, imag=tf.zeros_like(audio)), tf.complex64)
    )
    fft = tf.expand_dims(fft, axis=-1)

    # Return the absolute value of the first half of the FFT which represents the positive frequencies
    return tf.math.abs(fft[:, : (audio.shape[1] // 2), :])

Read the list of audio file paths and labels from the CSV file, as Pandas dataframe.

**IMPORTANT!**
To avoid noises, we are experimenting with only the most frequent classes in our dataset (English and Spanish). In order to work with the entire classes, we are going to identify infrequent classes and remove them from the dataset.

In [5]:
dataframe = pd.read_csv(os.getcwd() + "/data/audio.csv", on_bad_lines = 'skip' , delimiter= ';')
dataframe = dataframe[(dataframe['native_langs'] == 'english') | (dataframe['native_langs'] == 'spanish')]
print(len(dataframe))

886


Split training & validation dataset.

In [6]:
# Shuffle
dataframe = dataframe.sample(frac=1).reset_index(drop=True)
# rng = np.random.RandomState(SHUFFLE_SEED)
# rng.shuffle(audio_paths)
# rng = np.random.RandomState(SHUFFLE_SEED)
# rng.shuffle(labels)

# Splitting training and validation set
split = int(len(dataframe) * 0.8)
train_df = dataframe[:split]
valid_df = dataframe[split:]

train_ds = dataframe_to_dataset(train_df)
valid_ds = dataframe_to_dataset(valid_df)

train_ds = train_ds.shuffle(buffer_size=BATCH_SIZE * 8, seed=SHUFFLE_SEED).batch(BATCH_SIZE)
valid_ds = valid_ds.shuffle(buffer_size=32 * 8, seed=SHUFFLE_SEED).batch(32)

# Transform audio wave to the frequency domain using `audio_to_fft`
train_ds = train_ds.map(
    lambda x, y: (audio_to_fft(x), y), 
    num_parallel_calls=tf.data.AUTOTUNE,
)
valid_ds = valid_ds.map(
    lambda x, y: (audio_to_fft(x), y), 
    num_parallel_calls=tf.data.AUTOTUNE,
)
train_ds = train_ds.prefetch(tf.data.AUTOTUNE)
valid_ds = valid_ds.prefetch(tf.data.AUTOTUNE)

Instructions for updating:
Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089


2023-01-17 21:24:12.479185: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


### Yamnet Model

#### Prepare Dataset

Yamnet requires a slightly different way of preprocessing the dataset. Note that we **do not do data shuffling** as before. 

TODO: Need to see if this affect the performance.

In [7]:
yamnet_model = tf.saved_model.load('./yamnet_1')

In [10]:
# audio_wav = path_to_audio('data/audio_wav/afrikaans1.wav')

def load_16k_audio_wav(filename):
    # Read file content
    file_content = tf.io.read_file(filename)

    # Decode audio wave
    audio_wav, sample_rate = tf.audio.decode_wav(file_content, desired_channels=1)
    audio_wav = tf.squeeze(audio_wav, axis=-1)
    sample_rate = tf.cast(sample_rate, dtype=tf.int64)

    # Resample to 16k
    audio_wav = tfio.audio.resample(audio_wav, rate_in=sample_rate, rate_out=16000)
    return audio_wav


def filepath_to_embeddings(filename, label):
    # Load 16k audio wave
    audio_wav = load_16k_audio_wav(filename)

    # Get audio embeddings & scores.
    # The embeddings are the audio features extracted using transfer learning
    # while scores will be used to identify time slots that are not speech
    # which will then be gathered into a specific new category 'other'
    scores, embeddings, _ = yamnet_model(audio_wav)

    # Number of embeddings in order to know how many times to repeat the label
    embeddings_num = tf.shape(embeddings)[0]
    labels = tf.repeat(label, embeddings_num)

    # Change labels for time-slots that are not speech into a new category 'other'
    # labels = tf.where(tf.argmax(scores, axis=1) == 0, label, len(class_names) - 1)

    # Using one-hot in order to use AUC
    return (embeddings, tf.one_hot(labels, len(class_names)))


def dataframe_to_dataset(dataframe, batch_size=64):
    # print(dataframe.info())
    # Rewrite file_name to contain file paths
    dataframe['file_name'] = dataframe.apply(
        lambda row: os.path.join(os.getcwd(), 'data/audio_wav', row["file_name"] + ".wav"), 
        axis=1
    )
    # Convert the labels into numbers
    dataframe['native_langs'] = dataframe.apply(
        lambda row: class_dict[row['native_langs']],
        axis=1
    )

    dataset = tf.data.Dataset.from_tensor_slices((dataframe["file_name"], dataframe["native_langs"]))

    dataset = dataset.map(
        lambda x, y: filepath_to_embeddings(x, y),
        num_parallel_calls=tf.data.experimental.AUTOTUNE,
    ).unbatch()

    return dataset.cache().batch(batch_size).prefetch(tf.data.AUTOTUNE)

In [23]:
dataframe = pd.read_csv(os.getcwd() + "/data/audio.csv", on_bad_lines = 'skip' , delimiter= ';')
dataframe = dataframe[(dataframe['native_langs'] == 'english') | (dataframe['native_langs'] == 'spanish')]
print(f'Number of data points: {len(dataframe)}')

# Splitting training and validation set
split = int(len(dataframe) * 0.8)
train_df = dataframe[:split]
valid_df = dataframe[split:]

train_ds = dataframe_to_dataset(train_df)
valid_ds = dataframe_to_dataset(valid_df)

Number of data points: 886


### Build the model

In [21]:
def build_and_compile_model():
    inputs = keras.layers.Input(shape=(1024), name="embedding")

    x = keras.layers.Dense(256, activation="relu", name="dense_1")(inputs)
    x = keras.layers.Dropout(0.15, name="dropout_1")(x)

    x = keras.layers.Dense(384, activation="relu", name="dense_2")(x)
    x = keras.layers.Dropout(0.2, name="dropout_2")(x)

    x = keras.layers.Dense(192, activation="relu", name="dense_3")(x)
    x = keras.layers.Dropout(0.25, name="dropout_3")(x)

    x = keras.layers.Dense(384, activation="relu", name="dense_4")(x)
    x = keras.layers.Dropout(0.2, name="dropout_4")(x)

    outputs = keras.layers.Dense(len(class_names), activation="softmax", name="ouput")(
        x
    )

    model = keras.Model(inputs=inputs, outputs=outputs, name="accent_recognition")

    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=1.9644e-5),
        loss=keras.losses.CategoricalCrossentropy(),
        metrics=["accuracy", keras.metrics.AUC(name="auc")],
    )

    return model


model = build_and_compile_model()
model.summary()

Model: "accent_recognition"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (InputLayer)      [(None, 1024)]            0         
                                                                 
 dense_1 (Dense)             (None, 256)               262400    
                                                                 
 dropout_1 (Dropout)         (None, 256)               0         
                                                                 
 dense_2 (Dense)             (None, 384)               98688     
                                                                 
 dropout_2 (Dropout)         (None, 384)               0         
                                                                 
 dense_3 (Dense)             (None, 192)               73920     
                                                                 
 dropout_3 (Dropout)         (None, 192)        

#### Class weight calculation

Since the dataset is quite unbalanced, we wil use class_weight argument during training.

Getting the class weights is a little tricky because even though we know the number of audio files for each class, it does not represent the number of samples for that class since Yamnet transforms each audio file into multiple audio samples of 0.96 seconds each. So every audio file will be split into a number of samples that is proportional to its length.

Therefore, to get those weights, we have to calculate the number of samples for each class after preprocessing through Yamnet.

In [24]:
class_counts = tf.zeros(shape=(len(class_names),), dtype=tf.int32)

for x, y in iter(train_ds):
    class_counts = class_counts + tf.math.bincount(
        tf.cast(tf.math.argmax(y, axis=1), tf.int32), minlength=len(class_names)
    )

class_weight = {
    i: tf.math.reduce_sum(class_counts).numpy() / class_counts[i].numpy()
    for i in range(len(class_counts))
}

print(class_weight)

{0: inf, 1: inf, 2: inf, 3: inf, 4: inf, 5: inf, 6: inf, 7: inf, 8: inf, 9: inf, 10: inf, 11: inf, 12: inf, 13: inf, 14: inf, 15: inf, 16: inf, 17: inf, 18: inf, 19: inf, 20: inf, 21: inf, 22: inf, 23: inf, 24: inf, 25: inf, 26: 1.3341108310447312, 27: inf, 28: inf, 29: inf, 30: inf, 31: inf, 32: inf, 33: inf, 34: inf, 35: inf, 36: inf, 37: inf, 38: inf, 39: inf, 40: inf, 41: inf, 42: inf, 43: inf, 44: inf, 45: inf, 46: inf, 47: inf, 48: inf, 49: inf, 50: inf, 51: inf, 52: inf, 53: inf, 54: inf, 55: inf, 56: inf, 57: inf, 58: inf, 59: inf, 60: inf, 61: inf, 62: inf, 63: inf, 64: inf, 65: inf, 66: inf, 67: inf, 68: inf, 69: inf, 70: inf, 71: inf, 72: inf, 73: inf, 74: inf, 75: inf, 76: inf, 77: inf, 78: inf, 79: inf, 80: inf, 81: inf, 82: inf, 83: inf, 84: inf, 85: inf, 86: inf, 87: inf, 88: 3.9930188041887176, 89: inf, 90: inf, 91: inf, 92: inf, 93: inf, 94: inf, 95: inf, 96: inf, 97: inf, 98: inf, 99: inf, 100: inf, 101: inf, 102: inf, 103: inf, 104: inf, 105: inf, 106: inf, 107: inf,

  i: tf.math.reduce_sum(class_counts).numpy() / class_counts[i].numpy()


#### Callbacks
We use Keras callbacks in order to:

- Stop whenever the validation AUC stops improving.
- Save the best model.
- Call TensorBoard in order to later view the training and validation logs.

In [15]:
early_stopping_cb = keras.callbacks.EarlyStopping(
    monitor="val_auc", patience=10, restore_best_weights=True
)

model_checkpoint_cb = keras.callbacks.ModelCheckpoint(
    MODEL_NAME + ".h5", monitor="val_auc", save_best_only=True
)

tensorboard_cb = keras.callbacks.TensorBoard(
    os.path.join(os.curdir, "logs", model.name)
)

callbacks = [early_stopping_cb, model_checkpoint_cb, tensorboard_cb]

#### Training

In [25]:
history = model.fit(
    train_ds,
    epochs=EPOCHS,
    validation_data=valid_ds,
    class_weight=class_weight,
    callbacks=callbacks,
    verbose=2,
)

Epoch 1/50
555/555 - 59s - loss: 4.5469 - accuracy: 0.5902 - auc: 0.9566 - val_loss: 0.7492 - val_accuracy: 0.5868 - val_auc: 0.9974 - 59s/epoch - 106ms/step
Epoch 2/50
555/555 - 8s - loss: 1.6634 - accuracy: 0.5295 - auc: 0.9978 - val_loss: 0.6929 - val_accuracy: 0.5868 - val_auc: 0.9978 - 8s/epoch - 14ms/step
Epoch 3/50
555/555 - 8s - loss: 1.5572 - accuracy: 0.5191 - auc: 0.9979 - val_loss: 0.6767 - val_accuracy: 0.5868 - val_auc: 0.9985 - 8s/epoch - 14ms/step
Epoch 4/50
555/555 - 8s - loss: 1.5053 - accuracy: 0.5181 - auc: 0.9979 - val_loss: 0.6686 - val_accuracy: 0.5873 - val_auc: 0.9985 - 8s/epoch - 14ms/step
Epoch 5/50
555/555 - 9s - loss: 1.4795 - accuracy: 0.5235 - auc: 0.9979 - val_loss: 0.6639 - val_accuracy: 0.5886 - val_auc: 0.9985 - 9s/epoch - 15ms/step
Epoch 6/50
555/555 - 8s - loss: 1.4612 - accuracy: 0.5265 - auc: 0.9980 - val_loss: 0.6601 - val_accuracy: 0.5910 - val_auc: 0.9985 - 8s/epoch - 14ms/step
Epoch 7/50
555/555 - 9s - loss: 1.4471 - accuracy: 0.5272 - auc: 0.

#### Results

Plotting the accuracy and AUC results.

In [None]:
fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(14, 5))

axs[0].plot(range(EPOCHS), history.history["accuracy"], label="Training")
axs[0].plot(range(EPOCHS), history.history["val_accuracy"], label="Validation")
axs[0].set_xlabel("Epochs")
axs[0].set_title("Training & Validation Accuracy")
axs[0].legend()
axs[0].grid(True)

axs[1].plot(range(EPOCHS), history.history["auc"], label="Training")
axs[1].plot(range(EPOCHS), history.history["val_auc"], label="Validation")
axs[1].set_xlabel("Epochs")
axs[1].set_title("Training & Validation AUC")
axs[1].legend()
axs[1].grid(True)

plt.show()

#### Evaluation

In [18]:
train_loss, train_acc, train_auc = model.evaluate(train_ds)
valid_loss, valid_acc, valid_auc = model.evaluate(valid_ds)



#### Confusion Matrix

Let's now plot the confusion matrix for the validation dataset.

The confusion matrix lets us see, for every class, not only how many samples were correctly classified, but also which other classes were the samples confused with.

It allows us to calculate the precision and recall for every class.

In [None]:
# Create x and y tensors
x_valid = None
y_valid = None

for x, y in iter(valid_ds):
    if x_valid is None:
        x_valid = x.numpy()
        y_valid = y.numpy()
    else:
        x_valid = np.concatenate((x_valid, x.numpy()), axis=0)
        y_valid = np.concatenate((y_valid, y.numpy()), axis=0)

# Generate predictions
y_pred = model.predict(x_valid)

# Calculate confusion matrix
confusion_mtx = tf.math.confusion_matrix(
    np.argmax(y_valid, axis=1), np.argmax(y_pred, axis=1)
)

# Plot the confusion matrix
plt.figure(figsize=(10, 8))
sns.heatmap(
    confusion_mtx, xticklabels=class_names, yticklabels=class_names, annot=True, fmt="g"
)
plt.xlabel("Prediction")
plt.ylabel("Label")
plt.title("Validation Confusion Matrix")
plt.show()

#### Precision & Recall

For every class:

- Recall is the ratio of correctly classified samples i.e. it shows how many samples of this specific class, the model is able to detect. It is the ratio of diagonal elements to the sum of all elements in the row.
- Precision shows the accuracy of the classifier. It is the ratio of correctly predicted samples among the ones classified as belonging to this class. It is the ratio of diagonal elements to the sum of all elements in the column.

In [None]:
for i, label in enumerate(class_names):
    precision = confusion_mtx[i, i] / np.sum(confusion_mtx[:, i])
    recall = confusion_mtx[i, i] / np.sum(confusion_mtx[i, :])
    print(
        "{0:15} Precision:{1:.2f}%; Recall:{2:.2f}%".format(
            label, precision * 100, recall * 100
        )
    )