Accent-recognizing model, based on example from https://www.tensorflow.org/tutorials/audio/simple_audio and https://keras.io/examples/audio/speaker_recognition_using_cnn/

Not included:
- Hyperparams tuning
- Underlying Yamnet model

In [1]:
%pip install -U -q tensorflow tensorflow_datasets
#apt install --allow-change-held-packages libcudnn8=8.1.0.77-1+cuda11.2
%pip install -U -q keras-tuner

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [1]:
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
import numpy as np
import pathlib
import os
from os.path import isfile, join, splitext
import librosa
import soundfile as sf
from pydub import AudioSegment
from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow import keras
import tensorflow_io as tfio
import tensorflow_hub as tfhub
from keras.layers import Conv1D, MaxPooling1D, Flatten, Dense
from keras.models import Sequential
import keras_tuner as kt

import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import Audio

SAMPLING_RATE = 16000
EPOCHS = 50
BATCH_SIZE = 128
SHUFFLE_SEED = 43

# Set the seed value for experiment reproducibility.
seed = 42
tf.random.set_seed(seed)
np.random.seed(seed)

2023-01-16 23:35:59.818934: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Check how many classes are there.

In [2]:
df = pd.read_csv(os.getcwd() + "/data/audio.csv", on_bad_lines = 'skip' , delimiter= ';')
# print(len(df['native_langs'].unique()))
# for lang in df['native_langs'].unique():
#     print(f'"{lang}",')
class_names = df['native_langs'].unique()
lang_idxs = range(len(class_names))
class_dict = dict(zip(class_names, lang_idxs))

print(f"There are {len(class_names)} classes.")
print(f"There are {len(class_dict)} classes.")

There are 228 classes.
There are 228 classes.


### Prepare dataset

In [3]:
def path_to_audio(path):
    """Reads and decodes an audio file."""
    audio = tf.io.read_file(path)
    audio, _ = tf.audio.decode_wav(audio, 1, SAMPLING_RATE)
    return audio


def dataframe_to_dataset(dataframe):
    """Constructs a dataset of audios and labels."""
    
    # print(dataframe.info())
    # Rewrite file_name to contain file paths
    dataframe['file_name'] = dataframe.apply(
        lambda row: os.path.join(os.getcwd(), 'data/audio_wav', row["file_name"] + ".wav"), 
        axis=1
    )
    # Convert the labels into numbers
    dataframe['native_langs'] = dataframe.apply(
        lambda row: class_dict[row['native_langs']],
        axis=1
    )

    path_ds = tf.data.Dataset.from_tensor_slices(dataframe['file_name'])
    audio_ds = path_ds.map(lambda x: path_to_audio(x))
    label_ds = tf.data.Dataset.from_tensor_slices(dataframe['native_langs'])
    
    return tf.data.Dataset.zip((audio_ds, label_ds))    

    # print(len(path_ds), len(audio_ds), len(label_ds))
    # dataframe = df[(df["native_langs"] == "amharic") | (df["native_langs"] == "indonesian")]
    # print(dataframe.shape)



def audio_to_fft(audio):
    # Since tf.signal.fft applies FFT on the innermost dimension,
    # we need to squeeze the dimensions and then expand them again
    # after FFT
    audio = tf.squeeze(audio, axis=-1)
    fft = tf.signal.fft(
        tf.cast(tf.complex(real=audio, imag=tf.zeros_like(audio)), tf.complex64)
    )
    fft = tf.expand_dims(fft, axis=-1)

    # Return the absolute value of the first half of the FFT
    # which represents the positive frequencies
    return tf.math.abs(fft[:, : (audio.shape[1] // 2), :])

Read the list of audio file paths and labels from the CSV file, as Pandas dataframe.

**IMPORTANT!**
To avoid noises, we are experimenting with only the most frequent classes in our dataset (English and Spanish). In order to work with the entire classes, we are going to identify infrequent classes and remove them from the dataset.

In [4]:
dataframe = pd.read_csv(os.getcwd() + "/data/audio.csv", on_bad_lines = 'skip' , delimiter= ';')
dataframe = dataframe[(dataframe['native_langs'] == 'english') | (dataframe['native_langs'] == 'spanish')]
print(len(dataframe))

886


Split training & validation set.

In [5]:
# Shuffle
dataframe = dataframe.sample(frac=1).reset_index(drop=True)
# rng = np.random.RandomState(SHUFFLE_SEED)
# rng.shuffle(audio_paths)
# rng = np.random.RandomState(SHUFFLE_SEED)
# rng.shuffle(labels)

# Splitting training and validation set
split = int(len(dataframe) * 0.8)
train_df = dataframe[:split]
valid_df = dataframe[split:]

train_ds = dataframe_to_dataset(train_df)
valid_ds = dataframe_to_dataset(valid_df)

train_ds = train_ds.shuffle(buffer_size=BATCH_SIZE * 8, seed=SHUFFLE_SEED).batch(BATCH_SIZE)
valid_ds = valid_ds.shuffle(buffer_size=32 * 8, seed=SHUFFLE_SEED).batch(32)

# Transform audio wave to the frequency domain using `audio_to_fft`
train_ds = train_ds.map(
    lambda x, y: (audio_to_fft(x), y), 
    num_parallel_calls=tf.data.AUTOTUNE,
)
valid_ds = valid_ds.map(
    lambda x, y: (audio_to_fft(x), y), 
    num_parallel_calls=tf.data.AUTOTUNE,
)
train_ds = train_ds.prefetch(tf.data.AUTOTUNE)
valid_ds = valid_ds.prefetch(tf.data.AUTOTUNE)

Instructions for updating:
Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089


2023-01-16 23:41:06.968474: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


### Model definition

In [None]:
def residual_block(x, filters, conv_num=3, activation="relu"):
    # Shortcut
    s = keras.layers.Conv1D(filters, 1, padding="same")(x)
    for i in range(conv_num - 1):
        x = keras.layers.Conv1D(filters, 3, padding="same")(x)
        x = keras.layers.Activation(activation)(x)
    x = keras.layers.Conv1D(filters, 3, padding="same")(x)
    x = keras.layers.Add()([x, s])
    x = keras.layers.Activation(activation)(x)
    return keras.layers.MaxPool1D(pool_size=2, strides=2)(x)


def build_model(input_shape, num_classes):
    inputs = keras.layers.Input(shape=input_shape, name="input")

    x = residual_block(inputs, 16, 2)
    x = residual_block(x, 32, 2)
    x = residual_block(x, 64, 3)
    x = residual_block(x, 128, 3)
    x = residual_block(x, 128, 3)

    x = keras.layers.AveragePooling1D(pool_size=3, strides=3)(x)
    x = keras.layers.Flatten()(x)
    x = keras.layers.Dense(256, activation="relu")(x)
    x = keras.layers.Dense(128, activation="relu")(x)

    outputs = keras.layers.Dense(num_classes, activation="softmax", name="output")(x)

    return keras.models.Model(inputs=inputs, outputs=outputs)

In [None]:
model = build_model((SAMPLING_RATE // 2, 1), len(class_names))
model.summary()

# Compile the model using Adam's default learning rate
model.compile(
    optimizer="Adam", 
    loss="sparse_categorical_crossentropy", 
    metrics=["accuracy"]
)
model_save_filename = "model.h5"

# Add callbacks:
# 'EarlyStopping' to stop training when the model is not enhancing anymore
# 'ModelCheckPoint' to always keep the model that has the best val_accuracy
earlystopping_cb = keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True)
mdlcheckpoint_cb = keras.callbacks.ModelCheckpoint(
    model_save_filename, monitor="val_accuracy", save_best_only=True
)

### Model training

In [105]:
history = model.fit(
    train_ds,
    epochs=EPOCHS,
    validation_data=valid_ds,
    callbacks=[earlystopping_cb, mdlcheckpoint_cb],
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50


In [106]:
print(model.evaluate(valid_ds))

[0.5344429016113281, 0.7247191071510315]


### Experiment with data split

When previously we had 80-20 training-validation split, now we are experimenting with 90-10 split.

In [7]:
split = int(len(dataframe) * 0.9)
train_df2 = dataframe[:split]
valid_df2 = dataframe[split:]

train_ds2 = dataframe_to_dataset(train_df2)
valid_ds2 = dataframe_to_dataset(valid_df2)

train_ds2 = train_ds2.shuffle(buffer_size=BATCH_SIZE * 8, seed=SHUFFLE_SEED).batch(BATCH_SIZE)
valid_ds2 = valid_ds2.shuffle(buffer_size=32 * 8, seed=SHUFFLE_SEED).batch(32)

# Transform audio wave to the frequency domain using `audio_to_fft`
train_ds2 = train_ds2.map(
    lambda x, y: (audio_to_fft(x), y), 
    num_parallel_calls=tf.data.AUTOTUNE,
)
valid_ds2 = valid_ds2.map(
    lambda x, y: (audio_to_fft(x), y), 
    num_parallel_calls=tf.data.AUTOTUNE,
)
train_ds2 = train_ds2.prefetch(tf.data.AUTOTUNE)
valid_ds2 = valid_ds2.prefetch(tf.data.AUTOTUNE)

In [8]:
model2 = build_model((SAMPLING_RATE // 2, 1), len(class_names))

model2.summary()

# Compile the model using Adam's default learning rate
model2.compile(
    optimizer="Adam", 
    loss="sparse_categorical_crossentropy", 
    metrics=["accuracy"]
)

# Add callbacks:
# 'EarlyStopping' to stop training when the model is not enhancing anymore
# 'ModelCheckPoint' to always keep the model that has the best val_accuracy
model_save_filename = "model2.h5"

earlystopping_cb = keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True)
mdlcheckpoint_cb = keras.callbacks.ModelCheckpoint(
    model_save_filename, monitor="val_accuracy", save_best_only=True
)

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input (InputLayer)             [(None, 8000, 1)]    0           []                               
                                                                                                  
 conv1d_19 (Conv1D)             (None, 8000, 16)     64          ['input[0][0]']                  
                                                                                                  
 activation_13 (Activation)     (None, 8000, 16)     0           ['conv1d_19[0][0]']              
                                                                                                  
 conv1d_20 (Conv1D)             (None, 8000, 16)     784         ['activation_13[0][0]']          
                                                                                            

In [13]:
history = model2.fit(
    train_ds2,
    epochs=EPOCHS,
    validation_data=valid_ds2,
    callbacks=[earlystopping_cb, mdlcheckpoint_cb],
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50


In [14]:
print(model2.evaluate(valid_ds2))

[0.5742688775062561, 0.6853932738304138]


Experiment with 70-30 split.

In [11]:
split = int(len(dataframe) * 0.9)
train_df3 = dataframe[:split]
valid_df3 = dataframe[split:]

train_ds3 = dataframe_to_dataset(train_df3)
valid_ds3 = dataframe_to_dataset(valid_df3)

train_ds3 = train_ds3.shuffle(buffer_size=BATCH_SIZE * 8, seed=SHUFFLE_SEED).batch(BATCH_SIZE)
valid_ds3 = valid_ds3.shuffle(buffer_size=32 * 8, seed=SHUFFLE_SEED).batch(32)

# Transform audio wave to the frequency domain using `audio_to_fft`
train_ds3 = train_ds3.map(
    lambda x, y: (audio_to_fft(x), y), 
    num_parallel_calls=tf.data.AUTOTUNE,
)
valid_ds3 = valid_ds3.map(
    lambda x, y: (audio_to_fft(x), y), 
    num_parallel_calls=tf.data.AUTOTUNE,
)
train_ds3 = train_ds3.prefetch(tf.data.AUTOTUNE)
valid_ds3 = valid_ds3.prefetch(tf.data.AUTOTUNE)

In [12]:
model3 = build_model((SAMPLING_RATE // 2, 1), len(class_names))

model3.summary()

# Compile the model using Adam's default learning rate
model3.compile(
    optimizer="Adam", 
    loss="sparse_categorical_crossentropy", 
    metrics=["accuracy"]
)

# Add callbacks:
# 'EarlyStopping' to stop training when the model is not enhancing anymore
# 'ModelCheckPoint' to always keep the model that has the best val_accuracy
model_save_filename = "model3.h5"

earlystopping_cb = keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True)
mdlcheckpoint_cb = keras.callbacks.ModelCheckpoint(
    model_save_filename, monitor="val_accuracy", save_best_only=True
)

Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input (InputLayer)             [(None, 8000, 1)]    0           []                               
                                                                                                  
 conv1d_37 (Conv1D)             (None, 8000, 16)     64          ['input[0][0]']                  
                                                                                                  
 activation_26 (Activation)     (None, 8000, 16)     0           ['conv1d_37[0][0]']              
                                                                                                  
 conv1d_38 (Conv1D)             (None, 8000, 16)     784         ['activation_26[0][0]']          
                                                                                            

In [16]:
history = model3.fit(
    train_ds3,
    epochs=EPOCHS,
    validation_data=valid_ds3,
    callbacks=[earlystopping_cb, mdlcheckpoint_cb],
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50


In [17]:
print(model3.evaluate(valid_ds3))

[0.574015736579895, 0.6853932738304138]
