In [None]:
import numpy as np

In [None]:
# Load the phoneme arrays and their language-name labels from the .npz archive.
# NOTE(review): allow_pickle=True lets np.load unpickle object arrays, which can
# run arbitrary code -- only load archives that come from a trusted source.
# The `with` block closes the archive's file handle once the arrays are read.
with np.load("./data/language_phonemes.npz", allow_pickle=True) as data:
    language_phonemes_all = data["language_phonemes_all"]
    language_names_all = data["language_names_all"]
    language_phonemes_selected = data["language_phonemes_selected"]
    language_names_selected = data["language_names_selected"]

# Each phoneme sample must be paired with exactly one language name.
assert language_phonemes_all.shape[0] == language_names_all.shape[0]
assert language_phonemes_selected.shape[0] == language_names_selected.shape[0]
language_phonemes_all.shape, language_names_all.shape, language_phonemes_selected.shape, language_names_selected.shape

In [None]:
# Count how many samples each language contributes.
unique_names, unique_names_count = np.unique(language_names_all, return_counts=True)

# Languages with more than one sample; only these are kept in X / y below.
multiple_samples = unique_names[unique_names_count > 1]

# (name, count) pairs ordered from the most to the least frequent language.
count_sort_idx_desc = np.argsort(-unique_names_count)

most_common_languages = list(
    zip(unique_names[count_sort_idx_desc], unique_names_count[count_sort_idx_desc])
)

print(most_common_languages[0:5])

# np.isin replaces the deprecated np.in1d; the mask is built once and reused.
# Assumes language_names_all is 1-D -- TODO confirm.
has_multiple_samples = np.isin(language_names_all, multiple_samples)
X = language_phonemes_all[has_multiple_samples]
y = language_names_all[has_multiple_samples]

sample_shape = X[0].shape
# Only the languages kept in y can occur as labels, so count those rather than
# every name in the unfiltered data.
num_classes = len(multiple_samples)

y.shape, X.shape, sample_shape, num_classes

In [None]:
from sklearn.preprocessing import LabelEncoder

# Fit an encoder on the language names, then replace y with the integer ids.
# The fitted encoder is kept so ids can be mapped back to names later.
label_encoder = LabelEncoder().fit(y)
y = label_encoder.transform(y)

In [None]:
from sklearn.model_selection import train_test_split
from keras.utils import to_categorical

# Stratified hold-out split: 39% of the samples go to the test set and the
# fixed random_state makes the split repeatable.
split_options = {"stratify": y, "test_size": 0.39, "random_state": 33}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# One-hot encode the integer labels of both splits using num_classes columns.
y_train, y_test = (
    to_categorical(labels, num_classes) for labels in (y_train, y_test)
)

y_train.shape, y_test.shape

In [None]:
import matplotlib.pyplot as plt
# Show the first sample as an image using the "binary" colormap.
fig, ax = plt.subplots()
ax.imshow(X[0], cmap=plt.cm.binary)
plt.show()

In [None]:
from keras import Input, Sequential, layers

# Network architecture: flatten each sample, one 128-unit ReLU hidden layer
# with 20% dropout, then a softmax layer with one output per class.

first_sample = X[0]
input_shape = first_sample.shape

print(f"shape={input_shape}")

model = Sequential(name="language_model")
model.add(Input(shape=input_shape))
model.add(layers.Flatten())
model.add(layers.Dense(128, activation="relu"))
model.add(layers.Dropout(0.2))
model.add(layers.Dense(num_classes, activation="softmax"))

model.summary()

In [None]:
# Compile with the Adam optimizer, categorical cross-entropy loss (the labels
# are one-hot encoded) and accuracy as the reported metric.

compile_options = {
    "optimizer": "adam",
    "loss": "categorical_crossentropy",
    "metrics": ["accuracy"],
}
model.compile(**compile_options)

In [None]:
# Training: echo the shapes of the training arrays as a sanity check, then fit
# for 10 epochs with a batch size of 64.

for split in (X_train, y_train):
    print(split.shape)

model.fit(X_train, y_train, batch_size=64, epochs=10)

In [None]:
# Accuracy doesn't look great, but it's not bad considering there is only one
# training sample for many of the languages and the number of classes is very
# large.

test_loss, test_acc = model.evaluate(X_test, y_test)

# The split partitions the filtered dataset, so train + test is every sample
# the model could be asked about.
total_samples = len(y_train) + len(y_test)
# Sample counts of the five most frequent languages (the list is sorted in
# descending order of count).
top_5_counts = [count for _, count in most_common_languages[0:5]]

print(f"Classes: {num_classes}")
# Samples divided by classes (the ratio was previously inverted).
print(f"Samples per language: {len(y_train) / num_classes:.2f}")
print(f"Random guess probability: {1 / num_classes:.5f}")
# Baselines for "always guess the most common language(s)": the share of
# samples those languages account for, i.e. count / number of samples.
print(
    f"Most common language probability: {most_common_languages[0][1] / total_samples:.5f}"
)
print(
    f"5 most common language probability: {np.sum(top_5_counts) / total_samples:.5f}"
)
print(f"Model accuracy: {test_acc:.5f}")

In [None]:
# The predictions are heavily skewed towards languages that appear multiple
# times in the dataset (to be expected really). So, to generate synthetic
# examples that don't just copy the most common languages, it'll probably be
# necessary to limit the training data for synthetic data generation to one
# example, or a small number of examples, per language.

num_samples = 50
top_n_languages = {name for name, _ in most_common_languages[0:5]}

# Compare predicted and true class ids on the first num_samples test rows,
# mapping both back to language names.
test_slice = slice(0, num_samples)
predicted_ids = model.predict(X_test[test_slice]).argmax(axis=-1)
actual_ids = y_test[test_slice].argmax(axis=-1)

predictions = label_encoder.inverse_transform(predicted_ids)
actual = label_encoder.inverse_transform(actual_ids)

# Number of exact matches between prediction and ground truth.
correct_predictions = sum(1 for p, a in zip(predictions, actual) if p == a)

# Number of predictions that fall on one of the five most frequent languages.
predicted_from_common = sum(1 for p in predictions if p in top_n_languages)

print(f"Correct preditions: {correct_predictions / len(predictions) * 100}%")
print(f"Predicted from top 5: {predicted_from_common / len(predictions) * 100}%")