Accent-recognizing model, same as in main2.ipynb
Including hyperparam tuning.

Not included:
- Yamnet model

In [1]:
# Install/upgrade TensorFlow, TensorFlow Datasets and KerasTuner into the
# kernel's environment (-U upgrades, -q keeps pip quiet).
# NOTE(review): versions are not pinned, so a re-run may pull newer releases
# than the ones that produced the saved outputs. The import cell also uses
# tensorflow_io, tensorflow_hub, librosa, soundfile and pydub, which are not
# installed here -- presumably they are preinstalled; confirm.
%pip install -U -q tensorflow tensorflow_datasets
# Disabled system-level cuDNN install, kept for reference only.
#apt install --allow-change-held-packages libcudnn8=8.1.0.77-1+cuda11.2
%pip install -U -q keras-tuner

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
import numpy as np
import pathlib
import os
from os.path import isfile, join, splitext
import librosa
import soundfile as sf
from pydub import AudioSegment
from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow import keras
import tensorflow_io as tfio
import tensorflow_hub as tfhub
from keras.layers import Conv1D, MaxPooling1D, Flatten, Dense
from keras.models import Sequential
import keras_tuner as kt

import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import Audio

# Number of samples requested per clip when decoding WAV files in
# `path_to_audio`; the model input length is SAMPLING_RATE // 2 FFT bins.
SAMPLING_RATE = 16000
# Upper bound on epochs for the final training run (early stopping may end it sooner).
EPOCHS = 50
# Batch size of the training dataset.
BATCH_SIZE = 128
# Seed passed to the tf.data shuffle of the training and validation datasets.
SHUFFLE_SEED = 43

# Set the seed value for experiment reproducibility.
# This seeds the global TensorFlow and NumPy generators.
seed = 42
tf.random.set_seed(seed)
np.random.seed(seed)

2023-01-17 21:23:42.574143: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Check how many classes there are.

In [3]:
# Load the speaker metadata (semicolon-separated; malformed rows are skipped).
df = pd.read_csv(os.getcwd() + "/data/audio.csv", delimiter=';', on_bad_lines='skip')

# Every distinct native language is a class; its integer label is its
# position in `class_names`.
class_names = df['native_langs'].unique()
lang_idxs = range(len(class_names))
class_dict = {name: idx for name, idx in zip(class_names, lang_idxs)}

# Report the size of both the name array and the lookup table; they should match.
for mapping in (class_names, class_dict):
    print(f"There are {len(mapping)} classes.")

There are 228 classes.
There are 228 classes.


### Prepare dataset

In [4]:
def path_to_audio(path):
    """Read the WAV file at ``path`` and return its decoded audio tensor.

    The file is decoded with one desired channel and ``SAMPLING_RATE``
    desired samples; the sample rate reported by the decoder is discarded.
    """
    raw_bytes = tf.io.read_file(path)
    decoded = tf.audio.decode_wav(
        raw_bytes,
        desired_channels=1,
        desired_samples=SAMPLING_RATE,
    )
    # decode_wav returns (audio, sample_rate); only the audio is needed.
    return decoded[0]


def dataframe_to_dataset(dataframe):
    """Constructs a dataset of audios and labels.

    The input frame is only read, never modified, so the function can be
    called repeatedly on the same frame (or on a slice of another frame)
    with the same result.

    Args:
        dataframe: pandas DataFrame with a ``file_name`` column holding file
            names without directory or ``.wav`` extension, and a
            ``native_langs`` column whose values are keys of the module-level
            ``class_dict``.

    Returns:
        A ``tf.data.Dataset`` of ``(audio, label)`` pairs, where ``audio`` is
        produced by ``path_to_audio`` and ``label`` is the int64 class index
        looked up in ``class_dict``.

    Raises:
        KeyError: if a ``native_langs`` value is not a key of ``class_dict``.
    """
    # Build absolute paths to the WAV files under <cwd>/data/audio_wav.
    audio_dir = os.path.join(os.getcwd(), 'data/audio_wav')
    paths = [
        os.path.join(audio_dir, name + ".wav")
        for name in dataframe['file_name']
    ]
    # Convert the labels into numbers.
    labels = [class_dict[lang] for lang in dataframe['native_langs']]

    # Explicit dtypes keep the element types stable even for an empty frame.
    path_ds = tf.data.Dataset.from_tensor_slices(tf.constant(paths, dtype=tf.string))
    audio_ds = path_ds.map(path_to_audio)
    label_ds = tf.data.Dataset.from_tensor_slices(tf.constant(labels, dtype=tf.int64))

    return tf.data.Dataset.zip((audio_ds, label_ds))


def audio_to_fft(audio):
    """Return the FFT magnitude of the positive-frequency half of ``audio``.

    ``audio`` must have a trailing axis of size 1 (it is squeezed away) and a
    statically known length on axis 1. The result keeps a trailing axis of
    size 1 and has ``audio.shape[1] // 2`` entries along axis 1.
    """
    # tf.signal.fft transforms the innermost dimension, so the trailing axis
    # is removed before the FFT and added back afterwards.
    samples = tf.squeeze(audio, axis=-1)
    as_complex = tf.complex(real=samples, imag=tf.zeros_like(samples))
    spectrum = tf.signal.fft(tf.cast(as_complex, tf.complex64))
    spectrum = tf.expand_dims(spectrum, axis=-1)

    # Only the first half of the spectrum (positive frequencies) is kept.
    half_length = samples.shape[1] // 2
    return tf.math.abs(spectrum[:, :half_length, :])

Read the list of audio file paths and labels from the CSV file as a pandas DataFrame.

**IMPORTANT!**
To reduce noise, we are experimenting with only the most frequent classes in our dataset (English and Spanish). To work with all of the classes, we are going to identify the infrequent classes and remove them from the dataset.

In [5]:
# Load the metadata again and keep only the rows whose native language is
# English or Spanish.
dataframe = pd.read_csv(os.getcwd() + "/data/audio.csv", delimiter=';', on_bad_lines='skip')
dataframe = dataframe[dataframe['native_langs'].isin(['english', 'spanish'])]
print(len(dataframe))

886


Split the data into training and validation datasets.

In [6]:
# Batch size used for the validation dataset.
VALID_BATCH_SIZE = 32


def _to_frequency_domain(dataset):
    """Map batched (audio, label) pairs to (FFT magnitude, label) and prefetch."""
    return dataset.map(
        lambda x, y: (audio_to_fft(x), y),
        num_parallel_calls=tf.data.AUTOTUNE,
    ).prefetch(tf.data.AUTOTUNE)


# Shuffle the rows with an explicit seed so the train/validation split stays
# the same when this cell is re-run, not only on a fresh kernel.
dataframe = dataframe.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)

# Splitting training and validation set (first 80 % of rows / remaining 20 %).
split = int(len(dataframe) * 0.8)
# Independent copies, so later processing never acts on a view of `dataframe`.
train_df = dataframe[:split].copy()
valid_df = dataframe[split:].copy()

train_ds = dataframe_to_dataset(train_df)
valid_ds = dataframe_to_dataset(valid_df)

# Shuffle individual examples, then batch.
train_ds = train_ds.shuffle(buffer_size=BATCH_SIZE * 8, seed=SHUFFLE_SEED).batch(BATCH_SIZE)
valid_ds = valid_ds.shuffle(buffer_size=VALID_BATCH_SIZE * 8, seed=SHUFFLE_SEED).batch(VALID_BATCH_SIZE)

# Transform audio wave to the frequency domain using `audio_to_fft`
train_ds = _to_frequency_domain(train_ds)
valid_ds = _to_frequency_domain(valid_ds)

Instructions for updating:
Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089


2023-01-17 21:24:12.479185: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


### Hyperparam tuning

Reference https://www.tensorflow.org/tutorials/keras/keras_tuner

In [10]:
def residual_block(x, filters, conv_num=3, activation="relu"):
    """Apply a 1-D convolutional residual block followed by max pooling.

    Args:
        x: input tensor.
        filters: number of filters for every convolution in the block.
        conv_num: total number of kernel-size-3 convolutions on the main path.
        activation: activation applied after each convolution except the last,
            and again after the residual addition.

    Returns:
        The block output after a max pooling with pool size 2 and stride 2.
    """
    layers = keras.layers

    # Shortcut: a kernel-size-1 convolution so the channel count matches `filters`.
    shortcut = layers.Conv1D(filters, 1, padding="same")(x)

    # Main path: (conv_num - 1) conv+activation pairs, then one bare convolution.
    for _ in range(conv_num - 1):
        x = layers.Conv1D(filters, 3, padding="same")(x)
        x = layers.Activation(activation)(x)
    x = layers.Conv1D(filters, 3, padding="same")(x)

    merged = layers.Add()([x, shortcut])
    activated = layers.Activation(activation)(merged)
    return layers.MaxPool1D(pool_size=2, strides=2)(activated)



def model_builder(hp):
    """Build and compile the residual 1-D CNN for one KerasTuner trial.

    Tuned hyperparameters:
        dense_layer_1, dense_layer_2: units of the two dense layers,
            from 32 to 512 in steps of 32.
        learning_rate: Adam learning rate, one of 0.01, 0.001 or 0.0001.

    Args:
        hp: KerasTuner hyperparameters object used to sample the values above.

    Returns:
        A compiled Keras model taking inputs of shape (SAMPLING_RATE // 2, 1).
    """
    layers = keras.layers
    inputs = layers.Input(shape=(SAMPLING_RATE // 2, 1), name="input")

    # (filters, conv_num) of each residual stage, applied in this order.
    x = inputs
    for filters, conv_num in ((16, 2), (32, 2), (64, 3), (128, 3), (128, 3)):
        x = residual_block(x, filters, conv_num)

    x = layers.AveragePooling1D(pool_size=3, strides=3)(x)
    x = layers.Flatten()(x)

    # Sample both dense-layer widths first, then stack the layers.
    dense_units = [
        hp.Int(name, min_value=32, max_value=512, step=32)
        for name in ('dense_layer_1', 'dense_layer_2')
    ]
    for units in dense_units:
        x = layers.Dense(units=units, activation="relu")(x)

    # NOTE(review): the output has one unit per language in the full CSV
    # (len(class_names)), although the training frame is filtered to English
    # and Spanish only -- confirm this is intended.
    outputs = layers.Dense(len(class_names), activation="softmax", name="output")(x)
    model = keras.models.Model(inputs=inputs, outputs=outputs)

    learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=learning_rate),
        loss=keras.losses.SparseCategoricalCrossentropy(),
        metrics=['accuracy'],
    )
    return model

In [None]:
# Hyperband tuner over `model_builder`, optimising validation accuracy.
# overwrite=False resumes the search stored in `directory`/`project_name`;
# set overwrite=True to start a new search every time.
tuner = kt.Hyperband(
    model_builder,
    objective='val_accuracy',
    max_epochs=20,
    factor=3,
    hyperband_iterations=10,
    directory='uu_accent_detection_dir',
    project_name='uu_accent_detection',
    overwrite=False,
)

# End a trial once the validation loss has not improved for 5 epochs.
stop_early = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)

tuner.search(
    train_ds,
    validation_data=valid_ds,
    epochs=20,
    callbacks=[stop_early],
    verbose=2,
)

# Best hyperparameter set found by the search.
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]

print(f"""
The hyperparameter search is complete. 
The optimal number of units in the first and second densely-connected layers 
are {best_hps.get('dense_layer_1')} and {best_hps.get('dense_layer_2')}. 
The optimal learning rate for the optimizer is {best_hps.get('learning_rate')}.
""")

### Train the model

In [None]:
# Callback to stop training when the validation loss has not improved for
# 10 epochs; the weights of the best epoch are restored afterwards.
stop_early_cb = keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Build the model with the optimal hyperparameters and train it on the data
# for at most EPOCHS epochs.
hypermodel = tuner.hypermodel.build(best_hps)
# No `batch_size` argument: `train_ds` is a tf.data.Dataset that is already
# batched, and Keras rejects `batch_size` for dataset inputs. The previous
# `kt.HyperParameters.Choice('batch_size', [16, 32])` was also invalid, since
# `Choice` is an instance method and cannot be called on the class.
history = hypermodel.fit(train_ds,
                        validation_data=valid_ds,
                        epochs=EPOCHS,
                        callbacks=[stop_early_cb])

# val_acc_per_epoch = history.history['val_accuracy']
# best_epoch = val_acc_per_epoch.index(max(val_acc_per_epoch)) + 1
# print('Best epoch: %d' % (best_epoch,))

Evaluate the hypermodel on the validation data (this notebook does not hold out a separate test set).

In [None]:
# `valid_ds` is the validation split built earlier, so the metrics are
# labelled as validation metrics rather than test metrics.
eval_result = hypermodel.evaluate(valid_ds)
print("[validation loss, validation accuracy]:", eval_result)

### Hyperparameter Tuning (Georgios')

In [None]:
# def build_model(hp):
#     """
#     Builds model and sets up hyperparameter space to search.

#     Parameters
#     ----------
#     hp : HyperParameter object
#         Configures hyperparameters to tune.

#     Returns
#     -------
#     model : keras model
#         Compiled model with hyperparameters to tune.
#     """
#     # Initialize sequential API and start building model.
#     num_classes = len(y_train[0])
#     model = Sequential()
#     model.add(Conv2D(hp.Int('filters_1', min_value=32, max_value=256, step=32), (3, 3), activation='relu',
#                      input_shape=(X_train.shape[1], X_train.shape[2], 1),padding='same'))
#     model.add(MaxPooling2D((2, 2)))
#     for i in range(1, hp.Int("num_layers", 2, 6)):
#         model.add(
#             keras.layers.Conv2D(hp.Int('filters_'+str(i), min_value=32, max_value=256, step=32), (3, 3), activation='relu', padding='same')
#         )
#         model.add(keras.layers.MaxPooling2D(pool_size=(2, 2),padding='same', data_format='channels_last'))

#     model.add(Flatten())
#     for i in range(1, hp.Int("num_layers", 2, 6)):
#         model.add(
#             keras.layers.Dense(
#                 units=hp.Int("units_" + str(i), min_value=32, max_value=512, step=32),
#                 activation="relu")
#         )
#     model.add(Dense(num_classes, activation='softmax'))
#     model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

#     return model


# hp = HyperParameters()

# tuner = Hyperband(build_model,
#                 objective="val_accuracy",
#                 max_epochs=20,
#                 factor=3,
#                 hyperband_iterations=10,
#                 directory="kt_dir",
#                 project_name="kt_hyperband")

# stop_early = EarlyStopping(monitor='val_loss', patience=5)

# tuner.search(X_train, y_train, epochs=20, validation_split=0.2, callbacks=[stop_early], verbose=2)
# best_hps = tuner.get_best_hyperparameters()[0]

# #### Build model

# h_model = tuner.hypermodel.build(best_hps)

# #### Train the hypertuned model

# h_model.fit(X_train, y_train, epochs=100, validation_split=0.2, callbacks=[stop_early],batch_size=hp.Choice("batch_size", [16, 32]), verbose=2)
# y_pred = h_model.predict(X_test)