# Libraries

In [1]:
import re
import pickle
import random
import string
import pathlib
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import TextVectorization
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction

# Dataset

The dataset is taken and pre-modified from [Tatoeba](https://tatoeba.org), contains about 13k+ data of Indonesian-English translated sentences.

## Convert Dataset to Tab-Separated Texts

In [2]:
# One time call
# df = pd.read_csv("translation.tsv", sep="\t")
# df.to_csv("translation.txt", header=None, index=None, sep="\t", mode="a")

## Load the Dataset

In [3]:
data_path = "translation.txt"
with open(data_path) as f:
    texts = f.read().split("\n")[:-1]

## Add Start-of-Sentence (SOS) and End-of-Sentence (EOS) Token

In [4]:
text_pairs = []

for text in texts:
    # Split and add the sentence into a tuple pair (ID, EN)
    id, en = text.split("\t")
    text_pairs.append((id, f"thisissos {en} thisiseos"))

In [5]:
random.choice(text_pairs)

('Aku lebih memilih tinggal di rumah.',
 'thisissos I would rather stay at home. thisiseos')

## Little EDA

In [6]:
# Find max number of tokens in a sentence for each language
max_seq_len_id = 0
max_seq_len_en = 0

for pair in text_pairs:
    if len(pair[0]) > max_seq_len_id:
        max_seq_len_id = len(pair[0])

    if len(pair[1]) > max_seq_len_en:
        max_seq_len_en = len(pair[1])

In [7]:
max_seq_len_id, max_seq_len_en

(447, 414)

## Split Dataset

In [8]:
# Shuffle the texts
random.shuffle(text_pairs)

# Determine the number of each datasets
# 10% of validation and test set
n_val = int(0.10 * len(text_pairs))
n_train = len(text_pairs) - 2 * n_val

In [9]:
# Slice the pairs list
train_pairs = text_pairs[: n_train]
val_pairs = text_pairs[n_train : n_train + n_val]
test_pairs = text_pairs[n_train + n_val :]

In [10]:
len(text_pairs), len(train_pairs), len(val_pairs), len(test_pairs)

(13846, 11078, 1384, 1384)

# Vectorization

## Instantiate the vectorizer

In [11]:
# The output sequence of the vectorizer will be padded to max_seq_len
# OOM :(
max_seq_len = int(max(max_seq_len_id, max_seq_len_en) / 2)

def create_vectorizer(vocab_size=20000):
    id_vectorizer = TextVectorization(
        max_tokens=vocab_size, 
        output_mode="int", 
        output_sequence_length=max_seq_len,
        standardize="lower_and_strip_punctuation"
    )

    en_vectorizer = TextVectorization(
        max_tokens=vocab_size,
        output_mode="int",
        output_sequence_length=max_seq_len + 1,
        standardize="lower_and_strip_punctuation",
    )

    return id_vectorizer, en_vectorizer

## Adapt the Vectorizer

In [12]:
# Split the training set for each language
def split_data_lang(pairs):
    train_1, train_2 = zip(*train_pairs)
    train_1 = list(train_1)
    train_2 = list(train_2)

    return train_1, train_2

In [13]:
# Adapt the Vectorizer to the training set
def adapt_vectorizer(vectorizer, train_set):
    vectorizer.adapt(train_set)

## Transform the Dataset

In [14]:
def vectorize_data(id_text, en_text):
    tfed_id = id_vectorizer(id_text)
    tfed_en = en_vectorizer(en_text)

    return ({"enc_inputs": tfed_id, "dec_inputs": tfed_en[:, :-1],}, tfed_en[:, 1:])

In [15]:
def transform_dataset(pairs, batch_size=64):
    lang_1, lang_2 = split_data_lang(pairs)
    dataset = tf.data.Dataset.from_tensor_slices((lang_1, lang_2))
    dataset = dataset.batch(batch_size)
    dataset = dataset.map(vectorize_data)
    return dataset.shuffle(2048).prefetch(16).cache()

# Transformer

## Encoder

The `Encoder` consists of a `Multi-Head Attention` layer, `Normalization` layer, and fully-connected `Dense` layer, will receive the training sequences as input and produce new representation of the sequence and pass it to the `Decoder`.

In [16]:
class Encoder(layers.Layer):
    def __init__(self, embed_dim, dense_dim, n_heads, **kwargs):
        super(Encoder, self).__init__(**kwargs)

        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        self.n_heads = n_heads
        self.supports_masking = True

        self.attention = layers.MultiHeadAttention(
            num_heads=n_heads, key_dim=embed_dim
        )
        self.dense_proj = keras.Sequential(
            [
                layers.Dense(dense_dim, activation="relu"),
                layers.Dense(embed_dim),
            ]
        )
        
        self.norm_1 = layers.LayerNormalization()
        self.norm_2 = layers.LayerNormalization()

    def call(self, inputs, mask=None):
        if mask is not None:
            padding_mask = tf.cast(mask[:, tf.newaxis, tf.newaxis, :], dtype="int32")
    
        attention_output = self.attention(
            query=inputs, value=inputs, key=inputs, attention_mask=padding_mask
        )

        proj_input = self.norm_1(inputs + attention_output)
        proj_output = self.dense_proj(proj_input)

        return self.norm_2(proj_input + proj_output)

    def get_config(self):
        config = super().get_config()
        config.update({
            "embed_dim": self.embed_dim,
            "dense_dim": self.dense_dim,
            "n_heads": self.n_heads,
        })

        return config

## Decoder

Aside of the output from the `Encoder`, the `Decoder` also receives the target sequences that will be predicted later on. Because of it, we ensure that the `Decoder` cannot peek to the future by masking the information of the sequences after the target prediction.

In [17]:
class Decoder(layers.Layer):
    def __init__(self, embed_dim, latent_dim, n_heads, **kwargs):
        super(Decoder, self).__init__(**kwargs)

        self.n_heads = n_heads
        self.embed_dim = embed_dim
        self.supports_masking = True
        self.latent_dim = latent_dim

        self.attention_1 = layers.MultiHeadAttention(
            num_heads=n_heads, key_dim=embed_dim
        )
        self.attention_2 = layers.MultiHeadAttention(
            num_heads=n_heads, key_dim=embed_dim
        )

        self.dense_proj = keras.Sequential(
            [
                layers.Dense(latent_dim, activation="relu"), 
                layers.Dense(embed_dim),
            ]
        )
        self.norm_1 = layers.LayerNormalization()
        self.norm_2 = layers.LayerNormalization()
        self.norm_3 = layers.LayerNormalization()

    def get_causal_mask(self, inputs):
        input_shape = tf.shape(inputs)
        batch_size, sequence_length = input_shape[0], input_shape[1]
        i = tf.range(sequence_length)[:, tf.newaxis]
        j = tf.range(sequence_length)

        mask = tf.cast(i >= j, dtype="int32")
        mask = tf.reshape(mask, (1, input_shape[1], input_shape[1]))
        mult = tf.concat(
            [
                tf.expand_dims(batch_size, -1), 
                tf.constant([1, 1], dtype=tf.int32)
            ],
            axis=0,
        )

        return tf.tile(mask, mult)

    def call(self, inputs, encoder_outputs, mask=None):
        causal_mask = self.get_causal_mask(inputs)
        if mask is not None:
            padding_mask = tf.cast(mask[:, tf.newaxis, :], dtype="int32")
            padding_mask = tf.minimum(padding_mask, causal_mask)

        attention_output_1 = self.attention_1(
            query=inputs, value=inputs, key=inputs, attention_mask=causal_mask
        )
        out_1 = self.norm_1(inputs + attention_output_1)

        attention_output_2 = self.attention_2(
            query=out_1,
            value=encoder_outputs,
            key=encoder_outputs,
            attention_mask=padding_mask,
        )
        out_2 = self.norm_2(out_1 + attention_output_2)
        proj_output = self.dense_proj(out_2)

        return self.norm_3(out_2 + proj_output)

    def get_config(self):
        config = super().get_config()
        config.update({
            "embed_dim": self.embed_dim,
            "latent_dim": self.latent_dim,
            "n_heads": self.n_heads,
        })

        return config

## Positional Embeddings

We need `Positional Embeddings` to make sure that the model built later will be aware of the orders behind the sequences of every words.

In [18]:
class PositionalEmbedding(layers.Layer):
    def __init__(self, seq_len, vocab_size, embed_dim, **kwargs):
        super(PositionalEmbedding, self).__init__(**kwargs)

        self.seq_len = seq_len
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim

        self.token_embeddings = layers.Embedding(
            input_dim=vocab_size, output_dim=embed_dim
        )
        self.position_embeddings = layers.Embedding(
            input_dim=seq_len, output_dim=embed_dim
        )

    def compute_mask(self, inputs, mask=None):
        return tf.math.not_equal(inputs, 0)

    def call(self, inputs):
        length = tf.shape(inputs)[-1]
        positions = tf.range(start=0, limit=length, delta=1)
        embedded_tokens = self.token_embeddings(inputs)
        embedded_positions = self.position_embeddings(positions)

        return embedded_tokens + embedded_positions

    def get_config(self):
        config = super().get_config()
        config.update({
            "seq_len": self.seq_len,
            "vocab_size": self.vocab_size,
            "embed_dim": self.embed_dim,
        })

        return config

## Building the Model

In [19]:
# Model Builder
def build_model(seq_len, vocab_size=20000, embed_dim=64, latent_dim=2048, num_heads=8):
    # Encoder
    encoder_inputs = keras.Input(shape=(None,), dtype="int64", name="enc_inputs")
    tf = PositionalEmbedding(seq_len, vocab_size, embed_dim)(encoder_inputs)
    encoder_outputs = Encoder(embed_dim, latent_dim, num_heads)(tf)
    encoder = keras.Model(encoder_inputs, encoder_outputs)

    # Decoder Inputs
    decoder_inputs = keras.Input(shape=(None,), dtype="int64", name="dec_inputs")
    encoded_seq_inputs = keras.Input(shape=(None, embed_dim), name="dec_state_inputs")

    # Positional Embeddings
    tf = PositionalEmbedding(seq_len, vocab_size, embed_dim)(decoder_inputs)
    tf = Decoder(embed_dim, latent_dim, num_heads)(tf, encoded_seq_inputs)
    tf = layers.Dropout(0.5)(tf)

    # Decoder Outputs
    decoder_outputs = layers.Dense(vocab_size, activation="softmax")(tf)
    decoder = keras.Model([decoder_inputs, encoded_seq_inputs], decoder_outputs)
    decoder_outputs = decoder([decoder_inputs, encoder_outputs])

    # Combine the components
    transformer = keras.Model(
        [encoder_inputs, decoder_inputs], decoder_outputs, name="vanilla_transformer"
    )

    return transformer

In [20]:
# Model Training
def train_model(
    model, 
    train_data, 
    val_data, 
    epochs=30, 
    opt="rmsprop", 
    loss="sparse_categorical_crossentropy", 
    metrics=["accuracy"]
):
    model.summary()
    model.compile(opt, loss, metrics)
    model.fit(train_data, epochs=epochs, validation_data=val_data)

# Evaluation

In [21]:
# Predict
def predict(model, input_sentence, input_vectorizer, output_vectorizer, max_seq_len):
    output_vocab = output_vectorizer.get_vocabulary()
    output_lookup = dict(zip(range(len(output_vocab)), output_vocab))

    tokenized_input = input_vectorizer([input_sentence])
    output = "thisissos"

    for i in range(max_seq_len):
        tokenized_target = output_vectorizer([output])[:, :-1]
        predictions = model([tokenized_input, tokenized_target])

        sampled_token_index = np.argmax(predictions[0, i, :])
        sampled_token = output_lookup[sampled_token_index]
        output += " " + sampled_token

        if sampled_token == "thisiseos":
            break

    return output

In [22]:
# Evaluate the Model
def evaluate(model, input_vectorizer, output_vectorizer, max_seq_len, test_pairs, sf=SmoothingFunction().method7):
    hypotheses = []
    references = []
    for input, output in test_pairs:
        translated = predict(model, input, input_vectorizer, output_vectorizer, max_seq_len)
        hypotheses.append(translated)
        references.append(output.split())
        
    bleu_score = corpus_bleu(references, hypotheses, smoothing_function=sf)
    return bleu_score

# Export

In [23]:
# Save Transformer Model
def save_model(model, path="translation.h5"):
    model.save(path)

In [24]:
# Save Vectorizer
def save_vectorizer(vectorizer, path):
    pickle.dump(
        {
            "config": vectorizer.get_config(),
            "weights": vectorizer.get_weights(),
            "vocab": vectorizer.get_vocabulary()
        }, 
        open(path, "wb")
    )

In [25]:
# Load Transformer Model
def load_model(custom_objects, path="translation.h5"):
    return keras.models.load_model(path, custom_objects=custom_objects)

In [26]:
# Load Vectorizer
def load_vectorizer(path):
    v = pickle.load(open(path, "rb"))
    vec = TextVectorization.from_config(v["config"])
    vec.set_vocabulary(v["vocab"])
    return vec

# End-to-End

## Vectorizer

In [27]:
# Split the Dataset
train_id, train_en = split_data_lang(train_pairs)

In [28]:
# Instantiate the vectorizer
id_vectorizer, en_vectorizer = create_vectorizer()

In [29]:
# Adapt the vectorizer
adapt_vectorizer(id_vectorizer, train_id)
adapt_vectorizer(en_vectorizer, train_en)

In [30]:
# Transform the dataset
train_data = transform_dataset(train_pairs)
val_data = transform_dataset(val_pairs)
test_data = [pair[0] for pair in test_pairs]

## Model Training

In [31]:
# Instantiate the model
transformer = build_model(max_seq_len)

In [32]:
# Train the model
train_model(transformer, train_data, val_data)

Model: "vanilla_transformer"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 enc_inputs (InputLayer)        [(None, None)]       0           []                               
                                                                                                  
 positional_embedding (Position  (None, None, 64)    1294272     ['enc_inputs[0][0]']             
 alEmbedding)                                                                                     
                                                                                                  
 dec_inputs (InputLayer)        [(None, None)]       0           []                               
                                                                                                  
 encoder (Encoder)              (None, None, 64)     397184      ['positional_em

## Model Saving

In [33]:
# Save the Model
save_model(transformer)

In [34]:
# Save the Vectorizer
save_vectorizer(id_vectorizer, "id_vectorizer.pkl")
save_vectorizer(en_vectorizer, "en_vectorizer.pkl")

## Model Evaluation

In [35]:
# Evaluate with Corpus-Level BLEU Score
evaluate(transformer, id_vectorizer, en_vectorizer, max_seq_len, test_pairs)

0.3661214545980486