In [7]:
%pip install -U -q tensorflow_io
%pip install -U -q tensorflow_hub

In [41]:
import pandas as pd
import numpy as np
import os
from os.path import isfile, join, splitext
import librosa
import soundfile as sf
from pydub import AudioSegment
from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow import keras
import tensorflow_io as tfio
import tensorflow_hub as tfhub
from keras.layers import Conv1D, MaxPooling1D, Flatten, Dense
from keras.models import Sequential

import matplotlib.pyplot as plt
from IPython.display import Audio


EPOCHS = 100
SEED = 1337
MODEL_NAME = "english_accent_recognition"
AUDIOPATH = os.getcwd() + '/data/audio'

class_names = [
    "afrikaans",
    "agni",
    "akan",
    "albanian",
    "amharic",
    "arabic",
    "armenian",
    "azerbaijani",
    "belarusan",
    "bafang",
    "baga",
    "bai",
    "bambara",
    "bamun",
    "bavarian",
    "burmese",
    "balanta ganja",
    "bari",
    "basque",
    "bengali",
    "bosnian",
    "bulgarian",
    "chittagonian",
    "croatian",
    "cebuano",
    "charapa-spanish",
    "cantonese",
    "chaldean",
    "carolinian",
    "catalan",
    "chamorro",
    "czech",
    "dinka",
    "danish",
    "dari",
    "dutch",
    "ebira",
    "edo",
    "english",
    "ewe",
    "fang",
    "faroese",
    "fanti",
    "farsi",
    "filipino",
    "finnish",
    "french",
    "frisian",
    "gaddang",
    "ga",
    "gedeo",
    "ganda",
    "georgian",
    "german",
    "greek",
    "gujarati",
    "gusii",
    "hebrew",
    "hindi",
    "haitian creole french",
    "icelandic",
    "xiang",
    "hungarian",
    "igbo",
    "ibibio",
    "indonesian",
    "italian",
    "japanese",
    "javanese",
    "kannada",
    "kazakh",
    "khmer",
    "kyrgyz",
    "kiswahili",
    "korean",
    "krio",
    "kurdish",
    "kalanga",
    "kabyle",
    "lingala",
    "lamaholot",
    "lao",
    "latvian",
    "lithuanian",
    "luo",
    "lamotrekese",
    "macedonian",
    "malay",
    "malayalam",
    "mandinka",
    "mankanya",
    "mandarin",
    "mandingo",
    "marathi",
    "mauritian",
    "mongolian",
    "moore",
    "mortlockese",
    "nepali",
    "nama",
    "nandi",
    "northern sotho",
    "norwegian",
    "oromo",
    "pohnpeian",
    "polish",
    "portuguese",
    "punjabi",
    "pahari",
    "quechua",
    "romanian",
    "russian",
    "sardinian",
    "sarua",
    "sa&#39;a",
    "satawalese",
    "sindhi",
    "serbian",
    "swiss german",
    "sicilian",
    "sinhala",
    "slovak",
    "somali",
    "sundanese",
    "spanish",
    "swedish",
    "synthesized",
    "tagalog",
    "taishan",
    "taiwanese",
    "tamil",
    "tatar",
    "telugu",
    "thai",
    "tibetan",
    "tigrigna",
    "tok pisin",
    "turkish",
    "twi",
    "urdu",
    "uyghur",
    "uzbek",
    "vietnamese",
    "wolof",
    "yiddish",
    "zulu",
    "maltese",
    "yoruba",
    "yapese",
    "mende",
    "konkani",
    "kikongo",
    "kikuyu",
    "oriya",
    "tswana",
    "teochew",
    "yupik",
    "ngemba",
    "hindko",
    "estonian",
    "shona",
    "amazigh",
    "slovenian",
    "ukrainian",
    "fijian",
    "rotuman",
    "pashto",
    "sesotho",
    "newari",
    "sylheti",    
    "pulaar",
    "serer",
    "jola",
    "xasonga",
    "vlaams",
    "hainanese",
    "jamaican creole english",
    "kambaata",    
    "moba",
    "fataluku",
    "tetun-dili",
    "susu",
    "chichewa",
    "hadiyya",
    "shilluk",    
    "hausa",
    "fulfulde adamawa",
    "rwanda",
    "kanuri",
    "yakut",
    "rundi",
    "luxembourgeois",
    "malagasy",
    "nuer",
    "gan",
    "wu",
    "hakka",
    "hawaii creole english",
    "turkmen",
    "garifuna",
    "kru",
    "irish",
    "liberian pidgin english",
    "papiamentu",
    "ife",
    "hiligaynon",
    "temne",
    "cameroon creole english",
    "shan",
    "ashanti",
    "hmong",
    "miskito",
    "mizo",
    "nicaragua creole english",
    "tajiki",
    "naxi",
    "luba-kasai",
    "tigre",
    "wali",
    "american sign language",
    "home sign",
    "ndebele",
    "kamba",
    "ossetic",
    "voro",
    "masbatenyo",
    "min nan",
    "tumbuka",
]

keras.utils.set_random_seed(SEED)

### Use Yamnet as feature extractor

In [42]:
yamnet_model = tf.saved_model.load(os.getcwd() + '/yamnet_1')

### Prepare training and validation sets

In [43]:
df = pd.read_csv(os.getcwd() + "/data/audio.csv", on_bad_lines = 'skip' , delimiter= ';')
# dataframe = df[(df["native_langs"] == "amharic") | (df["native_langs"] == "indonesian")]
# print(len(df['native_langs'].unique()))
# for lang in df['native_langs'].unique():
#     print(f'"{lang}",')
# print(dataframe.shape)
# dataframe.head()


split = int(len(df) * 0.8)
train_df = df[:split]
valid_df = df[split:]
print(f"There are {len(train_df)} training set and {len(valid_df)} validation set.")

There are 2401 training set and 601 validation set.


### Prepare Tensorflow dataset

In [46]:
@tf.function
def load_16k_audio_wav(filename):
    # Read file content
    # audio = tfio.audio.AudioIOTensor(AUDIOPATH + '/indonesian1.mp3')
    file_content = tfio.audio.AudioIOTensor(filename)

    # Decode audio
    audio_decoded = tfio.audio.decode_mp3(file_content)
    audio_decoded = tf.squeeze(audio_decoded, axis=-1)
    sample_rate = tf.cast(file_content.rate, dtype=tf.int64)

    # Resample to 16k
    audio_wav = tfio.audio.resample(audio_decoded, rate_in=sample_rate, rate_out=16000)

    return audio_wav



def filepath_to_embeddings(filename, label):
    # Load 16k audio wave
    audio_wav = load_16k_audio_wav(filename)

    # Get audio embeddings & scores.
    # The embeddings are the audio features extracted using transfer learning
    # while scores will be used to identify time slots that are not speech
    # which will then be gathered into a specific new category 'other'
    scores, embeddings, _ = yamnet_model(audio_wav)

    # Number of embeddings in order to know how many times to repeat the label
    embeddings_num = tf.shape(embeddings)[0]
    labels = tf.repeat(label, embeddings_num)

    # Change labels for time-slots that are not speech into a new category 'other'
    labels = tf.where(tf.argmax(scores, axis=1) == 0, label, len(class_names) - 1)

    # Using one-hot in order to use AUC
    return (embeddings, tf.one_hot(labels, len(class_names)))



def dataframe_to_dataset(dataframe, batch_size=64):
    dataset = tf.data.Dataset.from_tensor_slices(
        (dataframe["file_name"], dataframe["native_langs"])
    )

    dataset = dataset.map(
        lambda x, y: filepath_to_embeddings(x, y),
        num_parallel_calls=tf.data.experimental.AUTOTUNE,
    ).unbatch()

    return dataset.cache().batch(batch_size).prefetch(tf.data.AUTOTUNE)


train_ds = dataframe_to_dataset(train_df)
valid_ds = dataframe_to_dataset(valid_df)

AssertionError: in user code:

    File "/var/folders/3p/fc9cp69s1lsfgnzqtqtddvz00000gn/T/ipykernel_47199/3737147357.py", line 47, in None  *
        lambda x, y: filepath_to_embeddings(x, y)
    File "/var/folders/3p/fc9cp69s1lsfgnzqtqtddvz00000gn/T/ipykernel_47199/3737147357.py", line 21, in filepath_to_embeddings  *
        audio_wav = load_16k_audio_wav(filename)
    File "/var/folders/3p/fc9cp69s1lsfgnzqtqtddvz00000gn/T/ipykernel_47199/3737147357.py", line 5, in load_16k_audio_wav  *
        file_content = tfio.audio.AudioIOTensor(filename)
    File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/tensorflow_io/python/ops/audio_ops.py", line 663, in __init__  **
        assert dtype is not None, "dtype must be provided in graph mode"

    AssertionError: dtype must be provided in graph mode


### Build the model

In [None]:
keras.backend.clear_session()


def build_and_compile_model():
    inputs = keras.layers.Input(shape=(1024), name="embedding")

    x = keras.layers.Dense(256, activation="relu", name="dense_1")(inputs)
    x = keras.layers.Dropout(0.15, name="dropout_1")(x)

    x = keras.layers.Dense(384, activation="relu", name="dense_2")(x)
    x = keras.layers.Dropout(0.2, name="dropout_2")(x)

    x = keras.layers.Dense(192, activation="relu", name="dense_3")(x)
    x = keras.layers.Dropout(0.25, name="dropout_3")(x)

    x = keras.layers.Dense(384, activation="relu", name="dense_4")(x)
    x = keras.layers.Dropout(0.2, name="dropout_4")(x)

    outputs = keras.layers.Dense(len(class_names), activation="softmax", name="ouput")(x)

    model = keras.Model(inputs=inputs, outputs=outputs, name="accent_recognition")

    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=1.9644e-5),
        loss=keras.losses.CategoricalCrossentropy(),
        metrics=["accuracy", keras.metrics.AUC(name="auc")],
    )

    return model


model = build_and_compile_model()
model.summary()

### Class weights calculation

In [None]:
class_counts = tf.zeros(shape=(len(class_names),), dtype=tf.int32)

for x, y in iter(train_ds):
    class_counts = class_counts + tf.math.bincount(
        tf.cast(tf.math.argmax(y, axis=1), tf.int32), minlength=len(class_names)
    )

class_weight = {
    i: tf.math.reduce_sum(class_counts).numpy() / class_counts[i].numpy()
    for i in range(len(class_counts))
}

print(class_weight)

### Prepare callbacks

In [None]:
early_stopping_cb = keras.callbacks.EarlyStopping(
    monitor="val_auc", patience=10, restore_best_weights=True
)

model_checkpoint_cb = keras.callbacks.ModelCheckpoint(
    MODEL_NAME + ".h5", monitor="val_auc", save_best_only=True
)

tensorboard_cb = keras.callbacks.TensorBoard(
    os.path.join(os.curdir, "logs", model.name)
)

callbacks = [early_stopping_cb, model_checkpoint_cb, tensorboard_cb]

### Training

In [None]:
history = model.fit(
    train_ds,
    epochs=EPOCHS,
    validation_data=valid_ds,
    class_weight=class_weight,
    callbacks=callbacks,
    verbose=2,
)

### Results

Plot the training and validation AUC and accuracy.

In [None]:
fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(14, 5))

axs[0].plot(range(EPOCHS), history.history["accuracy"], label="Training")
axs[0].plot(range(EPOCHS), history.history["val_accuracy"], label="Validation")
axs[0].set_xlabel("Epochs")
axs[0].set_title("Training & Validation Accuracy")
axs[0].legend()
axs[0].grid(True)

axs[1].plot(range(EPOCHS), history.history["auc"], label="Training")
axs[1].plot(range(EPOCHS), history.history["val_auc"], label="Validation")
axs[1].set_xlabel("Epochs")
axs[1].set_title("Training & Validation AUC")
axs[1].legend()
axs[1].grid(True)

plt.show()

### Evaluate

In [None]:
train_loss, train_acc, train_auc = model.evaluate(train_ds)
valid_loss, valid_acc, valid_auc = model.evaluate(valid_ds)