# Language Identification

- https://arxiv.org/pdf/1701.03682.pdf
- https://cs229.stanford.edu/proj2015/324_report.pdf
- https://cs229.stanford.edu/proj2015/324_poster.pdf
- https://sites.google.com/view/vardial2021/home
- http://ttg.uni-saarland.de/resources/DSLCC/
- https://mzampieri.com/publications.html
- https://mzampieri.com/papers/dsl2016.pdf

## Imports

In [None]:
import os
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras

## Make workspace

In [None]:
# Make directories if they don't exist.
# exist_ok=True keeps the cell safe to re-run and avoids a separate
# "check, then create" step for the models directory.
os.makedirs('datasets/DSLCC-v2.0', exist_ok=True)
os.makedirs('models', exist_ok=True)

## Download dataset

We use the [DSLCC v2.0](https://github.com/alvations/bayesmax/tree/master/bayesmax/data/DSLCC-v2.0) dataset from the [DSL Shared Task 2015](http://ttg.uni-saarland.de/lt4vardial2015/dsl.html)

In [None]:
# DSLCC v2.0
# Download each split (train / devel / test) into datasets/DSLCC-v2.0,
# skipping any file that is already present on disk.
# The `!` lines are IPython shell escapes, so this cell only runs inside a
# notebook / IPython session, not under the plain Python interpreter.
# NOTE(review): curl is invoked without --fail, so an HTTP error page would
# presumably be saved as the dataset file and then never re-downloaded because
# the file exists — confirm, and consider adding -f.
if not os.path.exists('datasets/DSLCC-v2.0/train.txt'):
    !cd datasets/DSLCC-v2.0 && curl -o train.txt https://raw.githubusercontent.com/alvations/bayesmax/master/bayesmax/data/DSLCC-v2.0/train-dev/train.txt
if not os.path.exists('datasets/DSLCC-v2.0/devel.txt'):
    !cd datasets/DSLCC-v2.0 && curl -o devel.txt https://raw.githubusercontent.com/alvations/bayesmax/master/bayesmax/data/DSLCC-v2.0/train-dev/devel.txt
if not os.path.exists('datasets/DSLCC-v2.0/test.txt'):
    !cd datasets/DSLCC-v2.0 && curl -o test.txt https://raw.githubusercontent.com/alvations/bayesmax/master/bayesmax/data/DSLCC-v2.0/test/test.txt

## Data

The corpus contains 20,000 instances per language (18,000 training + 2,000 development). Each instance is an excerpt extracted from journalistic texts containing 20 to 100 tokens and tagged with the country of origin of the text. A list of languages and the corresponding codes is shown in the following table:

<table>
    <tr>
        <th>Group Name</th>
        <th>Language Name</th>
        <th>Language Code</th>
    </tr>
    <tr>
        <td rowspan=2>South Eastern Slavic</td>
        <td>Bulgarian</td>
        <td>bg</td>
    </tr>
    <tr>
        <td>Macedonian</td>
        <td>mk</td>
    </tr>
    <tr>
        <td rowspan=3>South Western Slavic</td>
        <td>Bosnian</td>
        <td>bs</td>
    </tr>
    <tr>
        <td>Croatian</td>
        <td>hr</td>
    </tr>
    <tr>
        <td>Serbian</td>
        <td>sr</td>
    </tr>
    <tr>
        <td rowspan=2>West-Slavic</td>
        <td>Czech</td>
        <td>cz</td>
    </tr>
    <tr>
        <td>Slovak</td>
        <td>sk</td>
    </tr>
    <tr>
        <td rowspan=2>Ibero-Romance (Spanish)</td>
        <td>Peninsular Spanish</td>
        <td>es-ES</td>
    </tr>
    <tr>
        <td>Argentinian Spanish</td>
        <td>es-AR</td>
    </tr>
    <tr>
        <td rowspan=2>Ibero-Romance (Portuguese)</td>
        <td>Brazilian Portuguese</td>
        <td>pt-BR</td>
    </tr>
    <tr>
        <td>European Portuguese</td>
        <td>pt-PT</td>
    </tr>
    <tr>
        <td rowspan=2>Austronesian</td>
        <td>Indonesian</td>
        <td>id</td>
    </tr>
    <tr>
        <td>Malay</td>
        <td>my</td>
    </tr>
    <tr>
        <td>Other</td>
        <td>Various Languages</td>
        <td>xx</td>
    </tr>
</table>

In [None]:
import csv

# Every line of the corpus files is "<sentence>\t<language code>".
# quoting=csv.QUOTE_NONE makes pandas treat double quotes as ordinary text.
# With the default quoting, an unbalanced '"' inside a sentence opens a quoted
# field that swallows the following tabs and newlines, gluing several instances
# into one very long row with a bogus label.
train = pd.read_csv(
    'datasets/DSLCC-v2.0/train.txt', sep='\t', names=['sentence', 'language'], quoting=csv.QUOTE_NONE
)
validation = pd.read_csv(
    'datasets/DSLCC-v2.0/devel.txt', sep='\t', names=['sentence', 'language'], quoting=csv.QUOTE_NONE
)
test = pd.read_csv(
    'datasets/DSLCC-v2.0/test.txt', sep='\t', names=['sentence', 'language'], quoting=csv.QUOTE_NONE
)

In [None]:
# Report how many rows each split contains
print(f'Training set size:   {len(train)}')
print(f'Validation set size: {len(validation)}')
print(f'Test set size:       {len(test)}')

In [None]:
# Print number of instances per label
print(train['language'].value_counts())

In [None]:
# Show the first training rows that carry the catch-all 'xx' label
train[train['language'] == 'xx'].head()

In [None]:
# Show the first training rows
print(train.head())

In [None]:
# TODO use CLASSES with OneHotEncoder and CLASS_NAMES for output
# Each (code, display name) pair is written once and then split into the two
# parallel lists, so codes and names cannot drift out of alignment.
CLASS_UNKNOWN = 'xx'
CLASSES, CLASS_NAMES = map(list, zip(
    ('bg', 'Bulgarian'),
    ('mk', 'Macedonian'),
    ('bs', 'Bosnian'),
    ('hr', 'Croatian'),
    ('sr', 'Serbian'),
    ('cz', 'Czech'),
    ('sk', 'Slovak'),
    ('es-ES', 'Peninsular Spanish'),
    ('es-AR', 'Argentinian Spanish'),
    ('pt-BR', 'Brazilian Portuguese'),
    ('pt-PT', 'European Portuguese'),
    ('id', 'Indonesian'),
    ('my', 'Malay'),
    (CLASS_UNKNOWN, 'Other'),
))
NUM_CLASSES = len(CLASSES)

In [None]:
# NUM_CLASSES = len(train['language'].value_counts())

In [None]:
# Display the number of target classes (the length of CLASSES)
NUM_CLASSES

In [None]:
# Change all other language codes to xx
def mark_unknown_languages(data, known_classes=None, unknown_label=None):
    """Replace, in place, every language code that is not a known class.

    Args:
        data: DataFrame with a 'language' column; the column is overwritten.
        known_classes: Codes to keep unchanged. Defaults to the module-level
            CLASSES list.
        unknown_label: Code written in place of every other value (including
            missing values). Defaults to the module-level CLASS_UNKNOWN.

    Returns:
        None. ``data`` is modified in place.
    """
    if known_classes is None:
        known_classes = CLASSES
    if unknown_label is None:
        unknown_label = CLASS_UNKNOWN
    labels = data['language']
    # Assign the result back to the column instead of calling
    # data['language'].where(..., inplace=True): that chained in-place call acts
    # on an intermediate Series and does not update the frame when pandas
    # Copy-on-Write is active.
    data['language'] = labels.where(labels.isin(known_classes), unknown_label)
# Normalise the labels of all three splits
mark_unknown_languages(train)
mark_unknown_languages(validation)
mark_unknown_languages(test)

## Common preprocessing

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical

In [None]:
X_train = train['sentence']
y_train = train['language']
X_validation = validation['sentence']
y_validation = validation['language']

In [None]:
print(X_train.head())
print(y_train.head())

In [None]:
# y_train = pd.get_dummies(y_train).to_numpy()

In [None]:
# print(y_train)

In [None]:
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Use OneHotEncoder for target variable
# this is better than get_dummies because the encoder is fitted on the full
# CLASSES list, so every class gets a column even if a split lacks it
# if the language code is unknown, an error is thrown
# NOTE(review): with the default categories='auto' the fitted column order is
# presumably the sorted order of the codes, not the order of CLASSES — confirm
# via target_encoder.categories_ before pairing columns with CLASS_NAMES.
# NOTE(review): newer scikit-learn releases rename `sparse` to `sparse_output`
# — confirm against the installed version.
target_encoder = OneHotEncoder(sparse=False, dtype=np.int32)
# fit() expects a 2-D array, hence the reshape into a single column
target_encoder.fit(np.array(CLASSES).reshape(-1, 1))

In [None]:
# Sanity check: encode ten training labels
target_encoder.transform(np.asarray(y_train[30000:30010]).reshape(-1, 1))

In [None]:
# Build a tokenizer and fit it on the given texts
def get_tokenizer(data, num_words=None):
    """Return a Keras ``Tokenizer`` fitted on ``data``.

    Args:
        data: Texts handed to ``Tokenizer.fit_on_texts``.
        num_words: Forwarded to ``Tokenizer(num_words=...)``.

    Returns:
        The fitted tokenizer, using '<OOV>' as its out-of-vocabulary token.
    """
    # Characters the tokenizer filters out: ASCII punctuation, tab, newline,
    # typographic quotes and the en dash.
    stripped_characters = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n“„”–'
    fitted = Tokenizer(num_words=num_words, filters=stripped_characters, oov_token='<OOV>')
    fitted.fit_on_texts(data)
    return fitted

In [None]:
# trim and pad data
def preprocess_data(X, y, max_length=None):
    """Drop over-long sequences and pad the rest to a common length.

    Args:
        X: List of token-index sequences.
        y: Labels aligned with ``X``; must support boolean-mask indexing
            (for example a pandas Series or a NumPy array).
        max_length: When given, sequences longer than this are discarded
            together with their labels, and the result is padded to exactly
            ``max_length`` columns. When None, nothing is discarded and the
            result is padded to the longest sequence.

    Returns:
        Tuple ``(X, y)``: the padded 2-D array and the matching labels.
    """
    if max_length is not None:
        # Compute the mask once so X and y are filtered by the same decisions
        keep = [len(x) <= max_length for x in X]
        y = y[keep]
        X = [x for x, kept in zip(X, keep) if kept]
    # TODO pre or post padding?
    # Passing maxlen fixes the output width. Without it the width depends on
    # the longest surviving sequence, so two splits could end up with
    # different shapes.
    X = pad_sequences(X, maxlen=max_length)
    return X, y

## Model

In [None]:
from tensorflow.keras import Model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import InputLayer, GRU, Dropout, Dense

In [None]:
def get_model(input_shape, hidden_layer_size, dropout_rate):
    """Build and compile the recurrent classifier.

    Args:
        input_shape: Shape handed to the ``InputLayer``.
        hidden_layer_size: Number of units in the GRU layer.
        dropout_rate: Used as the GRU's ``recurrent_dropout``.

    Returns:
        A compiled ``Sequential`` model: GRU followed by a softmax ``Dense``
        layer with NUM_CLASSES units, using the adam optimizer, categorical
        cross-entropy loss and the accuracy metric.
    """
    # TODO Test with LSTM instead of GRU
    # TODO Test with dropout after hidden layer
    network = Sequential()
    network.add(InputLayer(input_shape=input_shape))
    network.add(GRU(hidden_layer_size, recurrent_dropout=dropout_rate))
    network.add(Dense(NUM_CLASSES, activation='softmax'))
    network.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return network

In [None]:
# Plot model training history
def plot_history(history):
    """Plot per-epoch accuracy curves from a Keras ``History`` object.

    Args:
        history: Object exposing ``epoch`` (x values) and ``history`` (a dict
            of metric name -> per-epoch values), as returned by ``model.fit``.

    Only metrics present in ``history.history`` are drawn, so a run trained
    without validation data plots the training curve alone instead of raising
    ``KeyError`` on 'val_accuracy'.
    """
    curves = (
        ('accuracy', 'Training accuracy'),
        ('val_accuracy', 'Validation accuracy'),
    )
    legend_labels = []
    for metric, label in curves:
        if metric in history.history:
            plt.plot(history.epoch, history.history[metric])
            legend_labels.append(label)
    plt.legend(legend_labels)

In [None]:
# Save trained model, compilation and training data and history plot
def save_model(model, epochs, batch_size, hidden_layer_size, dropout_rate, history=None):
    """Save a trained model together with a training report and history plot.

    Args:
        model: Trained Keras model.
        epochs: Number of training epochs (used in the name and the report).
        batch_size: Training batch size (used in the name and the report).
        hidden_layer_size: Hidden layer size (used in the name and the report).
        dropout_rate: Dropout rate; written to the name as an integer percent.
        history: ``History`` object returned by ``model.fit``. When omitted,
            ``model.history`` is used (presumably set by Keras on the most
            recent ``fit`` call — confirm for the installed version).

    Returns:
        The path the model was saved under.

    Raises:
        ValueError: If no training history is available.
    """
    if history is None:
        history = getattr(model, 'history', None)
    if history is None or not getattr(history, 'history', None):
        raise ValueError('save_model needs the History returned by model.fit()')

    timestamp = time.strftime("%Y%m%d_%H%M%S")
    model_name = f'models/model_{epochs}_{batch_size}_{hidden_layer_size}_{int(100*dropout_rate)}_{timestamp}'
    model.save(model_name)
    # Explicit encoding so the report does not depend on the platform default
    with open(f'{model_name}/training.txt', 'w', encoding='utf-8') as f:
        # Report the values this model was actually trained with (the
        # arguments), not the module-level constants.
        f.write(f'EPOCHS:            \t {epochs}\n')
        f.write(f'BATCH SIZE:        \t {batch_size}\n')
        f.write(f'HIDDEN LAYER SIZE: \t {hidden_layer_size}\n')
        f.write(f'DROPOUT RATE:      \t {dropout_rate}\n')
        model.summary(print_fn=lambda line: f.write(line + '\n'))
        f.write(f'ACCURACY:     \t {history.history.get("accuracy", "n/a")}\n')
        # val_accuracy is absent when the model was trained without validation data
        f.write(f'VAL ACCURACY: \t {history.history.get("val_accuracy", "n/a")}\n')
    # Start a fresh figure so curves from previously saved models do not pile
    # up in the same plot.
    plt.figure()
    plot_history(history)
    plt.savefig(f'{model_name}/history.png')
    return model_name

## Character n-grams

## Word unigrams

### Preprocessing

In [None]:
# Value passed to the tokenizer as num_words
NUM_UNIQUE_WORDS = 10_000

In [None]:
# tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n“„”–', num_words=NUM_UNIQUE_WORDS, oov_token='<OOV>')
# tokenizer.fit_on_texts(X_train)
# Fit the word-unigram tokenizer on the training sentences
tokenizer_w1 = get_tokenizer(X_train, NUM_UNIQUE_WORDS)

In [None]:
# Replace the raw sentences with integer sequences.
# This rebinds X_train, so the cell is not safe to run twice in a row.
X_train = tokenizer_w1.texts_to_sequences(X_train)

In [None]:
# Find max word dict index
max([max(x) for x in X_train])

In [None]:
# Find max length of train sequences
max([len(x) for x in X_train])

In [None]:
# Remove texts longer than 50 words
# Max text length before this step is 2000+ words
# y_train = y_train[[len(x)<=50 for x in X_train]]
# X_train = [X for X in X_train if len(X)<=50]

In [None]:
# X_train = pad_sequences(X_train)

In [None]:
# Drop sequences longer than 50 tokens (with their labels) and pad the rest
X_train, y_train = preprocess_data(X_train, y_train, 50)

In [None]:
# Shape of the padded training matrix
X_train.shape

In [None]:
# The num_words value the tokenizer was configured with
tokenizer_w1.num_words

In [None]:
# Inputs and labels must still have the same number of rows after filtering
print(len(X_train))
print(len(y_train))

In [None]:
# Number of entries in the tokenizer's word index
len(tokenizer_w1.word_index)

In [None]:
# Full word -> index mapping
tokenizer_w1.word_index

In [None]:
# Number of words that occur more than 10 times
len([x for x in tokenizer_w1.word_counts.items() if x[1] > 10])

In [None]:
# The 150 least frequent words with their counts (ascending by count)
sorted(tokenizer_w1.word_counts.items(), key=lambda w: w[1], reverse=False)[:150]

In [None]:
X_train.shape

In [None]:
# First padded training sequence
print(X_train[0])

### Model

In [None]:
from tensorflow.keras.utils import Sequence

In [None]:
class DataGenerator(Sequence):
    """Keras ``Sequence`` that one-hot encodes token sequences batch by batch.

    Labels are one-hot encoded once, up front, with the module-level
    ``target_encoder``. The inputs are expanded with ``to_categorical`` only
    for the batch being requested.

    Args:
        input_sequences: Indexable collection of equal-length integer sequences.
        vocabulary_size: ``num_classes`` used when one-hot encoding the inputs.
        labels: Language codes aligned with ``input_sequences``.
        batch_size: Number of samples per batch.
        shuffle: Whether to reshuffle the sample order after every epoch.
    """

    def __init__(self, input_sequences, vocabulary_size, labels, batch_size=32, shuffle=True):
        # Run the base-class initialiser; newer Keras versions expect
        # Sequence subclasses to call it.
        super().__init__()
        self.input_sequences = input_sequences
        self.vocabulary_size = vocabulary_size
        # TODO use target_encoder and move to __getitem__
        self.labels = target_encoder.transform(np.asarray(labels).reshape(-1, 1))
        self.batch_size = batch_size
        self.shuffle = shuffle
        # Build the initial (optionally shuffled) sample order
        self.on_epoch_end()

    # Number of batches per epoch
    def __len__(self):
        # Only full batches are served; a trailing partial batch is dropped.
        return len(self.input_sequences) // self.batch_size

    # Generate one batch
    def __getitem__(self, index):
        start = index * self.batch_size
        batch_indexes = self.indexes[start:start + self.batch_size]
        X = to_categorical(
            [self.input_sequences[i] for i in batch_indexes],
            num_classes=self.vocabulary_size
        )
        y = np.asarray([self.labels[i] for i in batch_indexes])
        return X, y

    # Update indexes for next epoch
    def on_epoch_end(self):
        self.indexes = np.arange(len(self.input_sequences))
        if self.shuffle:
            np.random.shuffle(self.indexes)

In [None]:
# Training hyperparameters for the word-unigram model
EPOCHS = 20
BATCH_SIZE = 32
HIDDEN_LAYER_SIZE = 768
DROPOUT_RATE = 0.45

In [None]:
# Batches of one-hot encoded training sequences; no validation generator yet
train_generator = DataGenerator(X_train, tokenizer_w1.num_words, y_train, batch_size=BATCH_SIZE)
# validation_generator = DataGenerator()

In [None]:
# Input shape is (padded sequence length, one-hot width)
model = get_model((X_train.shape[1], tokenizer_w1.num_words), HIDDEN_LAYER_SIZE, DROPOUT_RATE)

In [None]:
# Save the model weights to models/checkpoint during training.
# NOTE(review): the callback monitors 'val_accuracy', but fit() below receives
# no validation data (validation_data is commented out) — confirm intent.
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath='models/checkpoint',
    save_weights_only=True,
    monitor='val_accuracy',
    mode='max',
    verbose=1
)

# NOTE(review): train_generator already yields batches of BATCH_SIZE; Keras
# documents that batch_size should not be passed for Sequence inputs — confirm
# whether the argument is ignored or rejected by the installed version.
history = model.fit(
    train_generator,
#     validation_data=validation_generator,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    callbacks=[checkpoint_callback]
)

In [None]:
# Persist the trained model, a training report and the history plot
save_model(model, EPOCHS, BATCH_SIZE, HIDDEN_LAYER_SIZE, DROPOUT_RATE)

## Hyperparameter search

In [1]:
from sklearn.model_selection import train_test_split

In [3]:
def grid_search_model(devel_X_processor=lambda x: x):
    """Grid-search hidden layer size and dropout rate on the development set.

    The development sentences are split 75/25 (stratified by language) into a
    training and a held-out part. One model is trained for EPOCHS epochs per
    (hidden_layer_size, dropout_rate) combination and saved with
    ``save_model``. The final-epoch training and validation accuracies are
    collected into two tables written to 'grid_search_acc.csv' and
    'grid_search_val_acc.csv'.

    Args:
        devel_X_processor: Callable applied to a copy of ``X_validation``
            before splitting. Defaults to the identity.

    Returns:
        None. Results are printed and written to the two CSV files.
    """
    X = devel_X_processor(X_validation.copy())

    devel_train_X, devel_test_X, devel_train_Y, devel_test_Y = train_test_split(
        X, y_validation, train_size=0.75, stratify=y_validation
    )

    # Fit the vocabulary on the training part only
    tokenizer = get_tokenizer(devel_train_X, 10_000)

    devel_train_X = tokenizer.texts_to_sequences(devel_train_X)
    devel_test_X = tokenizer.texts_to_sequences(devel_test_X)

    devel_train_X, devel_train_Y = preprocess_data(devel_train_X, devel_train_Y, 50)
    devel_test_X, devel_test_Y = preprocess_data(devel_test_X, devel_test_Y, 50)

    hidden_layer_sizes = [768, 1024, 1280]
    dropout_rates = [0.2, 0.25, 0.35, 0.4, 0.45]

    grid_shape = (len(hidden_layer_sizes), len(dropout_rates))
    results_acc = np.zeros(grid_shape)
    results_val_acc = np.zeros(grid_shape)

    # The generators do not depend on the hyperparameters, so build them once
    devel_train_generator = DataGenerator(devel_train_X, tokenizer.num_words, devel_train_Y, batch_size=BATCH_SIZE)
    devel_test_generator = DataGenerator(devel_test_X, tokenizer.num_words, devel_test_Y, batch_size=BATCH_SIZE)

    for i, hidden_layer_size in enumerate(hidden_layer_sizes):
        for j, dropout_rate in enumerate(dropout_rates):
            print('Training network with params:')
            print(f' - hidden_layer_size = {hidden_layer_size}')
            print(f' - dropout_rate      = {dropout_rate}')

            # TODO len(tokenizer.word_index)+1 ?
            model = get_model((devel_train_X.shape[1], tokenizer.num_words), hidden_layer_size, dropout_rate)
            history = model.fit(
                devel_train_generator,
                validation_data=devel_test_generator,
                epochs=EPOCHS
            )
            # save_model requires the full training configuration, not just the model
            save_model(model, EPOCHS, BATCH_SIZE, hidden_layer_size, dropout_rate)

            # history.history holds one value per epoch; each grid cell stores
            # a single number, so keep the final epoch's score.
            results_acc[i, j] = history.history["accuracy"][-1]
            results_val_acc[i, j] = history.history["val_accuracy"][-1]
            print(f'Results for {hidden_layer_size}, {dropout_rate} ({i}, {j}):')
            print(f'accuracy:     {history.history["accuracy"]}')
            print(f'val_accuracy: {history.history["val_accuracy"]}')

    grid_search_results_acc_df = pd.DataFrame(results_acc, index=hidden_layer_sizes, columns=dropout_rates)
    grid_search_results_val_acc_df = pd.DataFrame(results_val_acc, index=hidden_layer_sizes, columns=dropout_rates)
    # Separate files: writing both tables to the same path would leave only
    # the second one on disk.
    grid_search_results_acc_df.to_csv('grid_search_acc.csv')
    grid_search_results_val_acc_df.to_csv('grid_search_val_acc.csv')

## Ensemble

In [None]:
# Run the hyperparameter grid search with the default (identity) processor
grid_search_model()