# Dependencies

In [None]:
!pip install keras_tuner --upgrade

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting keras_tuner
  Downloading keras_tuner-1.1.3-py3-none-any.whl (135 kB)
[K     |████████████████████████████████| 135 kB 31.5 MB/s 
Collecting kt-legacy
  Downloading kt_legacy-1.0.4-py3-none-any.whl (9.6 kB)
Collecting jedi>=0.10
  Downloading jedi-0.18.1-py2.py3-none-any.whl (1.6 MB)
[K     |████████████████████████████████| 1.6 MB 54.6 MB/s 
Installing collected packages: jedi, kt-legacy, keras-tuner
Successfully installed jedi-0.18.1 keras-tuner-1.1.3 kt-legacy-1.0.4


# Libraries

In [None]:
import re
import json
import pickle
import random
import string
import pathlib
import numpy as np
import pandas as pd
import tensorflow as tf
import keras_tuner as kt
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import TextVectorization
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction

# Dataset

The dataset is taken from [Tatoeba](https://tatoeba.org) and pre-processed; it contains about 13k+ Indonesian-English sentence pairs.

## Convert Dataset to Tab-Separated Texts

In [None]:
# One time call
# df = pd.read_csv("translation.tsv", sep="\t")
# df.to_csv("translation.txt", header=None, index=None, sep="\t", mode="a")

## Load the Dataset

In [None]:
# Read the tab-separated corpus; every non-empty line is one sentence pair.
data_path = "translation.txt"
# An explicit encoding keeps the read independent of the platform's locale.
with open(data_path, encoding="utf-8") as f:
    # Drop empty lines (such as the one left by a trailing newline) so the file
    # is parsed the same way whether or not it ends with "\n".
    texts = [line for line in f.read().split("\n") if line]

## Add Start-of-Sentence (SOS) and End-of-Sentence (EOS) Token

In [None]:
# Build (ID, EN) pairs; the English side is wrapped in explicit start/end
# marker words ("thisissos" / "thisiseos").
text_pairs = []

for text in texts:
    # Each line must hold exactly two tab-separated fields (ID, EN); anything
    # else raises ValueError here. The names avoid shadowing the built-in `id`.
    id_sentence, en_sentence = text.split("\t")
    text_pairs.append((id_sentence, f"thisissos {en_sentence} thisiseos"))

In [None]:
# Spot-check one randomly chosen (ID, EN) pair
random.choice(text_pairs)

('Dia dikenal sebagai seorang penyanyi rok.',
 'thisissos He is known as a rock singer. thisiseos')

## Little EDA

In [None]:
# Find the longest sentence of each language.
# NOTE: len() of a string counts characters, not tokens, and the English side
# already includes the "thisissos"/"thisiseos" marker words.
max_seq_len_id = max((len(id_text) for id_text, _ in text_pairs), default=0)
max_seq_len_en = max((len(en_text) for _, en_text in text_pairs), default=0)

In [None]:
# Longest sentence length per language, shown as (ID, EN)
max_seq_len_id, max_seq_len_en

(447, 414)

## Split Dataset

In [None]:
# Shuffle the texts reproducibly.
# Sorting first makes the cell idempotent (re-running it yields the same
# order), and a dedicated seeded generator keeps the split independent of any
# earlier use of the global `random` state.
SPLIT_SEED = 42
text_pairs.sort()
random.Random(SPLIT_SEED).shuffle(text_pairs)

# Determine the size of each subset:
# 10% each for the validation and test sets, the remainder for training
n_val = int(0.10 * len(text_pairs))
n_train = len(text_pairs) - 2 * n_val

In [None]:
# Cut the shuffled list into consecutive train / validation / test segments
train_pairs, val_pairs, test_pairs = (
    text_pairs[:n_train],
    text_pairs[n_train:n_train + n_val],
    text_pairs[n_train + n_val:],
)

In [None]:
# Sanity check: sizes of the full set and of the train / val / test subsets
len(text_pairs), len(train_pairs), len(val_pairs), len(test_pairs)

(13846, 11078, 1384, 1384)

# Vectorization

## Instantiate the vectorizer

In [None]:
# Every vectorized sequence is padded (or cut) to max_seq_len entries.
# Only half of the longest sentence length is used; presumably the full
# length ran out of memory (the original note just says "OOM").
max_seq_len = max(max_seq_len_id, max_seq_len_en) // 2

def create_vectorizer(vocab_size=20000, seq_len=None):
    """Create the (not yet adapted) source and target text vectorizers.

    Args:
        vocab_size: Maximum vocabulary size of each vectorizer.
        seq_len: Output length of the Indonesian vectorizer. When omitted, the
            notebook-level `max_seq_len` is used.

    Returns:
        Tuple `(id_vectorizer, en_vectorizer)`. The English vectorizer emits
        `seq_len + 1` tokens because `vectorize_data` slices it into decoder
        inputs (`[:, :-1]`) and shifted labels (`[:, 1:]`).
    """
    if seq_len is None:
        seq_len = max_seq_len

    def _make(output_len):
        # Both languages share every setting except the output length.
        return TextVectorization(
            max_tokens=vocab_size,
            output_mode="int",
            output_sequence_length=output_len,
            standardize="lower_and_strip_punctuation",
        )

    id_vectorizer = _make(seq_len)
    en_vectorizer = _make(seq_len + 1)

    return id_vectorizer, en_vectorizer

## Adapt the Vectorizer

In [None]:
# Split a list of sentence pairs into one list per language
def split_data_lang(pairs):
    """Unzip `pairs` into two parallel lists.

    Args:
        pairs: Sequence of 2-tuples, e.g. `(id_sentence, en_sentence)`.

    Returns:
        Tuple `(first_items, second_items)` of lists; two empty lists when
        `pairs` is empty.
    """
    # zip(*[]) yields nothing to unpack, so handle the empty case explicitly.
    if not pairs:
        return [], []

    # Use the argument itself: reading the global training set here would make
    # every split (validation included) return the training sentences.
    lang_1, lang_2 = zip(*pairs)

    return list(lang_1), list(lang_2)

In [None]:
# Fit a vectorizer on a training set
def adapt_vectorizer(vectorizer, train_set):
    """Call `vectorizer.adapt(train_set)`; the vectorizer is updated in place.

    Args:
        vectorizer: Object exposing an `adapt` method.
        train_set: Data handed to `adapt` unchanged.

    Returns:
        None.
    """
    vectorizer.adapt(train_set)

## Transform the Dataset

In [None]:
def vectorize_data(id_text, en_text):
    """Turn a batch of raw sentence pairs into model inputs and labels.

    Uses the notebook-level `id_vectorizer` and `en_vectorizer`.

    Returns:
        Tuple `(inputs, labels)` where `inputs` is a dict with keys
        "enc_inputs" and "dec_inputs". The decoder input drops the last
        target token and the labels drop the first one, so each position is
        paired with the token that follows it.
    """
    source_tokens = id_vectorizer(id_text)
    target_tokens = en_vectorizer(en_text)

    model_inputs = {
        "enc_inputs": source_tokens,
        "dec_inputs": target_tokens[:, :-1],
    }
    labels = target_tokens[:, 1:]

    return (model_inputs, labels)

In [None]:
def transform_dataset(pairs, batch_size=64):
    """Build a batched, vectorized `tf.data.Dataset` from sentence pairs.

    Args:
        pairs: Sentence pairs, split per language via `split_data_lang`.
        batch_size: Number of pairs per batch.

    Returns:
        A dataset yielding `vectorize_data` outputs, cached, shuffled and
        prefetched.
    """
    lang_1, lang_2 = split_data_lang(pairs)
    dataset = tf.data.Dataset.from_tensor_slices((lang_1, lang_2))
    dataset = dataset.batch(batch_size)
    dataset = dataset.map(vectorize_data)
    # Cache BEFORE shuffling: caching a shuffled dataset stores the first
    # epoch's order and replays it, so later epochs would not be reshuffled.
    # Prefetch goes last so it overlaps with training.
    # Batching happens earlier, so shuffle(2048) reorders whole batches.
    return dataset.cache().shuffle(2048).prefetch(16)

# Transformer

## Encoder

The `Encoder` consists of a `Multi-Head Attention` layer, `Normalization` layers, and a fully connected `Dense` projection. It receives the input sequences, produces a new representation of them, and passes that representation to the `Decoder`.

In [None]:
class Encoder(layers.Layer):
    """Transformer encoder block.

    Applies multi-head self-attention and a two-layer feed-forward projection,
    each followed by a residual connection and layer normalization.

    Args:
        embed_dim: Size of the token vectors; also used as the attention
            `key_dim` and as the output width of the projection.
        dense_dim: Width of the hidden layer of the feed-forward projection.
        n_heads: Number of attention heads.
        **kwargs: Forwarded to `layers.Layer`.
    """

    def __init__(self, embed_dim, dense_dim, n_heads, **kwargs):
        super().__init__(**kwargs)

        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        self.n_heads = n_heads
        self.supports_masking = True

        self.attention = layers.MultiHeadAttention(
            num_heads=n_heads, key_dim=embed_dim
        )
        self.dense_proj = keras.Sequential(
            [
                layers.Dense(dense_dim, activation="relu"),
                layers.Dense(embed_dim),
            ]
        )

        self.norm_1 = layers.LayerNormalization()
        self.norm_2 = layers.LayerNormalization()

    def call(self, inputs, mask=None):
        """Encode `inputs`.

        Args:
            inputs: Embedded input sequence.
            mask: Optional padding mask; presumably (batch, seq) as produced
                by an upstream embedding layer. When omitted, attention runs
                unmasked.

        Returns:
            The normalized output of the feed-forward projection.
        """
        # Default to "no mask" so the attention call below never references
        # an unassigned name when the layer is called without a mask.
        padding_mask = None
        if mask is not None:
            # Add two singleton axes so the mask broadcasts across them.
            padding_mask = tf.cast(mask[:, tf.newaxis, tf.newaxis, :], dtype="int32")

        attention_output = self.attention(
            query=inputs, value=inputs, key=inputs, attention_mask=padding_mask
        )

        # Residual connection + normalization around each sub-layer
        proj_input = self.norm_1(inputs + attention_output)
        proj_output = self.dense_proj(proj_input)

        return self.norm_2(proj_input + proj_output)

    def get_config(self):
        """Return the layer config including the constructor arguments."""
        config = super().get_config()
        config.update({
            "embed_dim": self.embed_dim,
            "dense_dim": self.dense_dim,
            "n_heads": self.n_heads,
        })

        return config

## Decoder

Aside from the output of the `Encoder`, the `Decoder` also receives the target sequences that it will learn to predict. To ensure that the `Decoder` cannot peek into the future, we mask out every position that comes after the token currently being predicted.

In [None]:
class Decoder(layers.Layer):
    """Transformer decoder block.

    Applies causal self-attention over the target sequence, attention over
    the encoder outputs, and a two-layer feed-forward projection; each
    sub-layer is followed by a residual connection and layer normalization.

    Args:
        embed_dim: Size of the token vectors; also used as the attention
            `key_dim` and as the output width of the projection.
        latent_dim: Width of the hidden layer of the feed-forward projection.
        n_heads: Number of attention heads.
        **kwargs: Forwarded to `layers.Layer`.
    """

    def __init__(self, embed_dim, latent_dim, n_heads, **kwargs):
        super().__init__(**kwargs)

        self.n_heads = n_heads
        self.embed_dim = embed_dim
        self.supports_masking = True
        self.latent_dim = latent_dim

        self.attention_1 = layers.MultiHeadAttention(
            num_heads=n_heads, key_dim=embed_dim
        )
        self.attention_2 = layers.MultiHeadAttention(
            num_heads=n_heads, key_dim=embed_dim
        )

        self.dense_proj = keras.Sequential(
            [
                layers.Dense(latent_dim, activation="relu"),
                layers.Dense(embed_dim),
            ]
        )
        self.norm_1 = layers.LayerNormalization()
        self.norm_2 = layers.LayerNormalization()
        self.norm_3 = layers.LayerNormalization()

    def get_causal_mask(self, inputs):
        """Build a lower-triangular int32 mask tiled over the batch.

        Entry (i, j) is 1 when j <= i, so position i may only attend to
        itself and to earlier positions. The result has one
        (seq_len, seq_len) matrix per batch element.
        """
        input_shape = tf.shape(inputs)
        batch_size, sequence_length = input_shape[0], input_shape[1]
        i = tf.range(sequence_length)[:, tf.newaxis]
        j = tf.range(sequence_length)

        mask = tf.cast(i >= j, dtype="int32")
        mask = tf.reshape(mask, (1, input_shape[1], input_shape[1]))
        # Repeat the single matrix `batch_size` times along the first axis
        mult = tf.concat(
            [
                tf.expand_dims(batch_size, -1),
                tf.constant([1, 1], dtype=tf.int32)
            ],
            axis=0,
        )

        return tf.tile(mask, mult)

    def call(self, inputs, encoder_outputs, mask=None):
        """Run the decoder block.

        Args:
            inputs: Embedded target sequence.
            encoder_outputs: Encoder representation attended to by the second
                attention layer.
            mask: Optional padding mask; presumably the mask of `inputs`
                propagated by Keras. When omitted, the second attention layer
                runs unmasked.

        Returns:
            The normalized output of the feed-forward projection.
        """
        causal_mask = self.get_causal_mask(inputs)

        # Default to "no mask" so the second attention call never references
        # an unassigned name when the layer is called without a mask.
        padding_mask = None
        if mask is not None:
            padding_mask = tf.cast(mask[:, tf.newaxis, :], dtype="int32")
            padding_mask = tf.minimum(padding_mask, causal_mask)

        attention_output_1 = self.attention_1(
            query=inputs, value=inputs, key=inputs, attention_mask=causal_mask
        )
        out_1 = self.norm_1(inputs + attention_output_1)

        # NOTE(review): `padding_mask` is derived from the target-side mask
        # but is applied to attention over `encoder_outputs`; this looks like
        # it relies on source and target having the same padded length.
        # Behavior is left unchanged here; confirm the intent.
        attention_output_2 = self.attention_2(
            query=out_1,
            value=encoder_outputs,
            key=encoder_outputs,
            attention_mask=padding_mask,
        )
        out_2 = self.norm_2(out_1 + attention_output_2)
        proj_output = self.dense_proj(out_2)

        return self.norm_3(out_2 + proj_output)

    def get_config(self):
        """Return the layer config including the constructor arguments."""
        config = super().get_config()
        config.update({
            "embed_dim": self.embed_dim,
            "latent_dim": self.latent_dim,
            "n_heads": self.n_heads,
        })

        return config

## Positional Embeddings

We need `Positional Embeddings` so that the model built later is aware of the order of the words in each sequence.

In [None]:
class PositionalEmbedding(layers.Layer):
    """Sum of a learned token embedding and a learned position embedding.

    Args:
        seq_len: Number of rows of the position-embedding table.
        vocab_size: Number of rows of the token-embedding table.
        embed_dim: Width of both embeddings.
        **kwargs: Forwarded to `layers.Layer`.
    """

    def __init__(self, seq_len, vocab_size, embed_dim, **kwargs):
        super().__init__(**kwargs)

        self.seq_len, self.vocab_size, self.embed_dim = seq_len, vocab_size, embed_dim

        self.token_embeddings = layers.Embedding(
            input_dim=vocab_size, output_dim=embed_dim
        )
        self.position_embeddings = layers.Embedding(
            input_dim=seq_len, output_dim=embed_dim
        )

    def compute_mask(self, inputs, mask=None):
        """Mark every entry of `inputs` that is not 0; any incoming mask is ignored."""
        return tf.math.not_equal(inputs, 0)

    def call(self, inputs):
        """Embed `inputs` and add the embedding of each position index."""
        n_positions = tf.shape(inputs)[-1]
        position_ids = tf.range(start=0, limit=n_positions, delta=1)

        return self.token_embeddings(inputs) + self.position_embeddings(position_ids)

    def get_config(self):
        """Return the layer config including the constructor arguments."""
        base_config = super().get_config()

        return {
            **base_config,
            "seq_len": self.seq_len,
            "vocab_size": self.vocab_size,
            "embed_dim": self.embed_dim,
        }

# Building the Model

In [None]:
# Model Builder
def build_model(hp, vocab_size=20000, seq_len=223):
    """Build and compile the encoder-decoder transformer for one trial.

    Args:
        hp: Hyperparameter container exposing `Int` and `Choice`, as passed
            in by the tuner.
        vocab_size: Size of the token-embedding tables and of the output
            softmax.
        seq_len: Size of the position-embedding tables. It must be at least
            the padded sequence length fed to the model, because
            `PositionalEmbedding` looks up one row per input position.

    Returns:
        The compiled `keras.Model` named "vanilla_transformer".
    """
    # Hyperparameters
    # Embed dim -> dim of input token vectors
    # Latent dim -> dim of dense layer
    # Num heads -> number of heads in each multi-head attention layer
    hp_embed_dim = hp.Int("embed_dim", min_value=32, max_value=128, step=32)
    hp_latent_dim = hp.Choice("latent_dim", values=[1024, 2048])
    hp_num_heads = hp.Choice("num_heads", values=[4, 8, 16])

    # Encoder
    # (intermediate tensors are called `x`, not `tf`, so the tensorflow
    # module is never shadowed inside this function)
    encoder_inputs = keras.Input(shape=(None,), dtype="int64", name="enc_inputs")
    x = PositionalEmbedding(seq_len, vocab_size, hp_embed_dim)(encoder_inputs)
    encoder_outputs = Encoder(hp_embed_dim, hp_latent_dim, hp_num_heads)(x)

    # Decoder Inputs
    decoder_inputs = keras.Input(shape=(None,), dtype="int64", name="dec_inputs")
    encoded_seq_inputs = keras.Input(shape=(None, hp_embed_dim), name="dec_state_inputs")

    # Positional Embeddings
    x = PositionalEmbedding(seq_len, vocab_size, hp_embed_dim)(decoder_inputs)
    x = Decoder(hp_embed_dim, hp_latent_dim, hp_num_heads)(x, encoded_seq_inputs)
    x = layers.Dropout(0.5)(x)

    # Decoder Outputs
    decoder_outputs = layers.Dense(vocab_size, activation="softmax")(x)
    decoder = keras.Model([decoder_inputs, encoded_seq_inputs], decoder_outputs)
    decoder_outputs = decoder([decoder_inputs, encoder_outputs])

    # Combine the components
    transformer = keras.Model(
        [encoder_inputs, decoder_inputs], decoder_outputs, name="vanilla_transformer"
    )

    transformer.compile(loss='sparse_categorical_crossentropy', optimizer="rmsprop", metrics=["accuracy"])

    return transformer

In [None]:
def create_tuner(max_epochs=4, factor=3, directory='models',
                 project_name='nlp-lang-trans', patience=5):
    """Create the Hyperband tuner and an early-stopping callback.

    Args:
        max_epochs: `max_epochs` passed to `kt.Hyperband`.
        factor: `factor` passed to `kt.Hyperband`.
        directory: Directory in which the tuner stores its results.
        project_name: Sub-directory name for this search.
        patience: `patience` of the `EarlyStopping` callback on `val_loss`.

    Returns:
        Tuple `(tuner, early_stopping_callback)`.
    """
    tuner = kt.Hyperband(build_model,
                         objective='val_loss',
                         max_epochs=max_epochs,
                         factor=factor,
                         directory=directory,
                         project_name=project_name)

    # NOTE(review): with the defaults, patience (5) is larger than max_epochs
    # (4), so this callback looks unable to trigger during the search;
    # lower `patience` if early stopping is actually wanted.
    cb = EarlyStopping(monitor='val_loss', patience=patience)

    return tuner, cb

In [None]:
def execute_tuning(tuner, cb, train, val, epochs=2):
    """Run the hyperparameter search and pick the best configuration.

    Args:
        tuner: Tuner exposing `search` and `get_best_hyperparameters`.
        cb: Callback passed to the search as a single-element list.
        train: Training data.
        val: Validation data.
        epochs: `epochs` value forwarded to `tuner.search`.

    Returns:
        Tuple `(tuner, best_hps)` where `best_hps` is the top-ranked entry
        returned by the tuner.
    """
    search_options = {
        "epochs": epochs,
        "validation_data": val,
        "callbacks": [cb],
    }
    tuner.search(train, **search_options)

    top_candidates = tuner.get_best_hyperparameters(num_trials=1)
    return tuner, top_candidates[0]

In [None]:
# Train with the tuned hyperparameters, then retrain up to the best epoch
def fit_train(tuner, best_hps, train, val, epochs=30):
    """Train twice: once to find the best epoch, once to stop there.

    Args:
        tuner: Tuner whose `hypermodel.build` creates a fresh model.
        best_hps: Hyperparameters handed to `hypermodel.build`.
        train: Training data.
        val: Validation data.
        epochs: Length of the first (probing) run.

    Returns:
        The model from the second run, trained for the number of epochs at
        which the first run reached its highest `val_accuracy`.
    """
    # First pass: full-length run, used only to locate the best epoch
    probe_model = tuner.hypermodel.build(best_hps)
    probe_history = probe_model.fit(train, epochs=epochs, validation_data=val)

    val_accuracies = probe_history.history['val_accuracy']
    # 1-based number of the first epoch with the highest validation accuracy
    best_epoch = 1 + val_accuracies.index(max(val_accuracies))

    # Second pass: a fresh model trained for exactly `best_epoch` epochs
    final_model = tuner.hypermodel.build(best_hps)
    final_model.fit(train, epochs=best_epoch, validation_data=val)

    return final_model

# Evaluation

In [None]:
# Greedy decoding of a single sentence
def predict(model, input_sentence, input_vectorizer, output_vectorizer, max_seq_len):
    """Translate one sentence by picking the highest-scoring token at each step.

    Args:
        model: Callable taking `[source_ids, target_ids]` and returning
            scores indexed as `[batch, position, token]`.
        input_sentence: Raw source sentence.
        input_vectorizer: Vectorizer for the source language.
        output_vectorizer: Vectorizer for the target language; its vocabulary
            maps predicted indices back to words.
        max_seq_len: Maximum number of decoding steps.

    Returns:
        The decoded sentence as one string. It starts with "thisissos" and
        ends with "thisiseos" if that token was produced before the step
        limit.
    """
    index_to_token = dict(enumerate(output_vectorizer.get_vocabulary()))
    source_ids = input_vectorizer([input_sentence])

    decoded_tokens = ["thisissos"]
    for step in range(max_seq_len):
        # Re-vectorize everything decoded so far; the last column is dropped
        # before feeding the model
        partial_sentence = " ".join(decoded_tokens)
        target_ids = output_vectorizer([partial_sentence])[:, :-1]

        step_scores = model([source_ids, target_ids])[0, step, :]
        next_token = index_to_token[np.argmax(step_scores)]
        decoded_tokens.append(next_token)

        if next_token == "thisiseos":
            break

    return " ".join(decoded_tokens)

In [None]:
# Evaluate the Model
def evaluate(model, input_vectorizer, output_vectorizer, max_seq_len, test_pairs, sf=SmoothingFunction().method7):
    """Compute the corpus-level BLEU score of the model on `test_pairs`.

    Args:
        model: Trained translation model, passed to `predict`.
        input_vectorizer: Vectorizer for the source language.
        output_vectorizer: Vectorizer for the target language.
        max_seq_len: Maximum number of decoding steps per sentence.
        test_pairs: Iterable of `(source_sentence, target_sentence)` pairs.
        sf: NLTK smoothing function handed to `corpus_bleu`.

    Returns:
        The BLEU score returned by `corpus_bleu`.
    """
    strip_punctuation = str.maketrans("", "", string.punctuation)

    def _tokenize(sentence):
        # Lower-case and drop punctuation, mirroring the vectorizers'
        # "lower_and_strip_punctuation" setting, so that references are
        # compared on the same footing as the model output.
        return sentence.lower().translate(strip_punctuation).split()

    hypotheses = []
    references = []
    for source_sentence, target_sentence in test_pairs:
        translated = predict(model, source_sentence, input_vectorizer, output_vectorizer, max_seq_len)
        # corpus_bleu expects each hypothesis as a token list ...
        hypotheses.append(_tokenize(translated))
        # ... and, per hypothesis, a LIST of reference token lists.
        references.append([_tokenize(target_sentence)])

    # Both sides still contain the "thisissos"/"thisiseos" marker words.
    bleu_score = corpus_bleu(references, hypotheses, smoothing_function=sf)
    return bleu_score

# Export

In [None]:
# Persist the transformer model
def save_model(model, path="translation.h5"):
    """Write `model` to `path` through its own `save` method.

    Args:
        model: Object exposing `save(path)`.
        path: Destination file; defaults to "translation.h5".

    Returns:
        None.
    """
    model.save(path)

In [None]:
# Save Vectorizer
def save_vectorizer(vectorizer, path):
    """Pickle a vectorizer's config, weights and vocabulary to `path`.

    Args:
        vectorizer: Object exposing `get_config`, `get_weights` and
            `get_vocabulary`.
        path: Destination file, opened in binary write mode.

    Returns:
        None.
    """
    state = {
        "config": vectorizer.get_config(),
        "weights": vectorizer.get_weights(),
        "vocab": vectorizer.get_vocabulary(),
    }

    # The context manager guarantees the file is flushed and closed, even if
    # pickling fails part-way through.
    with open(path, "wb") as f:
        pickle.dump(state, f)

In [None]:
# Restore the transformer model
def load_model(custom_objects, path="translation.h5"):
    """Load a saved Keras model.

    Args:
        custom_objects: Passed to Keras as `custom_objects` so that the
            custom classes stored in the file can be resolved.
        path: File to read; defaults to "translation.h5".

    Returns:
        Whatever `keras.models.load_model` returns for `path`.
    """
    restored = keras.models.load_model(path, custom_objects=custom_objects)
    return restored

In [None]:
# Load Vectorizer
def load_vectorizer(path):
    """Rebuild a `TextVectorization` layer from a file written by `save_vectorizer`.

    Only the "config" and "vocab" entries are used; the stored "weights"
    entry is not read.

    Args:
        path: Pickle file to read.

    Returns:
        A `TextVectorization` layer with the saved config and vocabulary.
    """
    # SECURITY: unpickling can execute arbitrary code. Only load files that
    # were produced by this notebook or come from a trusted source.
    with open(path, "rb") as f:
        v = pickle.load(f)

    vec = TextVectorization.from_config(v["config"])
    vec.set_vocabulary(v["vocab"])
    return vec

# End-to-End

## Vectorizer

In [None]:
# Split the training pairs into one list per language (ID, EN)
train_id, train_en = split_data_lang(train_pairs)

In [None]:
# Instantiate the vectorizers (default vocabulary size)
id_vectorizer, en_vectorizer = create_vectorizer()

In [None]:
# Adapt each vectorizer on the training sentences of its own language only
adapt_vectorizer(id_vectorizer, train_id)
adapt_vectorizer(en_vectorizer, train_en)

In [None]:
# Transform the train and validation pairs into vectorized datasets
train_data = transform_dataset(train_pairs)
val_data = transform_dataset(val_pairs)
# For the test split keep only the raw source (ID) sentences
test_data = [pair[0] for pair in test_pairs]

## Model Training

In [None]:
# Build the Hyperband tuner and run the hyperparameter search
tuner, cb = create_tuner()
tuner, best_hps = execute_tuning(tuner, cb, train_data, val_data)

Trial 10 Complete [00h 04m 47s]
val_loss: 0.14645525813102722

Best val_loss So Far: 0.11296574026346207
Total elapsed time: 00h 39m 53s


In [None]:
# Tuning results: best value found for each searched hyperparameter
print(best_hps.get("embed_dim"))
print(best_hps.get("latent_dim"))
print(best_hps.get("num_heads"))

128
1024
8


In [None]:
# Train with the best hyperparameters, then retrain up to the best epoch
# (fit_train runs two training passes, hence the two epoch logs below)
hypermodel = fit_train(tuner, best_hps, train_data, val_data)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


## Model Saving

In [None]:
# Save the model to the default path ("translation.h5")
save_model(hypermodel)

In [None]:
# Save both vectorizers, one pickle file per language
save_vectorizer(id_vectorizer, "id_vectorizer.pkl")
save_vectorizer(en_vectorizer, "en_vectorizer.pkl")

## Model Evaluation

In [None]:
# Evaluate with the corpus-level BLEU score on the test pairs
evaluate(hypermodel, id_vectorizer, en_vectorizer, max_seq_len, test_pairs)

0.4021032423522424

# Error Analysis

## Saving Data Distribution for Further Analysis

In [None]:
# Persist the training pairs so the exact split can be inspected later
with open('train.json', 'w') as f:
    json.dump(train_pairs,f)

In [None]:
# Persist the validation pairs so the exact split can be inspected later
with open('val.json', 'w') as f:
    json.dump(val_pairs,f)

In [None]:
# Persist the test pairs so the exact split can be inspected later
with open('test.json', 'w') as f:
    json.dump(test_pairs,f)

## Analysis

In [None]:
def load_data(path):
    """Read a JSON file holding a list of pairs and return a list of tuples.

    JSON has no tuple type, so each stored item is converted back with
    `tuple`.
    """
    with open(path) as json_file:
        raw_pairs = json.load(json_file)

    return list(map(tuple, raw_pairs))

In [None]:
# Reload the three saved splits as lists of tuples
loaded_train = load_data("train.json")
loaded_val = load_data("val.json")
loaded_test = load_data("test.json")