#### Mounting Google Drive

In [None]:
# Mount Google Drive inside the Colab VM so the project files can be reached by path.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
cd "/content/drive/MyDrive/IASNLP"

## Importing Necessary Libraries

In [None]:
!pip install sentencepiece
!pip install --upgrade -q jax
!pip install --upgrade -q jaxlib
!pip install --upgrade -q trax

In [None]:
import numpy as np
import pandas as pd

import sentencepiece as spm
from tensorflow.keras.preprocessing.sequence import pad_sequences

import trax
from trax.data import inputs
from trax.fastmath import numpy as fnp
from trax import layers as tl
from trax.supervised import training
from trax.models import Transformer

import matplotlib.pyplot as plt 

from functools import reduce
from collections import Counter
import os

In [None]:
# Make sure the Colab Runtime is set to Accelerator: TPU.
import os
import requests

# The driver-version request is only posted once per kernel session; the
# TPU_DRIVER_MODE global acts as the "already done" marker on re-runs.
if 'TPU_DRIVER_MODE' not in globals():
    tpu_host = os.environ['COLAB_TPU_ADDR'].split(':')[0]
    url = f'http://{tpu_host}:8475/requestversion/tpu_driver0.1-dev20191206'
    resp = requests.post(url)
    TPU_DRIVER_MODE = 1

# The following is required to use TPU Driver as JAX's backend.
from jax.config import config
config.FLAGS.jax_xla_backend = "tpu_driver"
config.FLAGS.jax_backend_target = f"grpc://{os.environ['COLAB_TPU_ADDR']}"
print(config.FLAGS.jax_backend_target)

## Setting up Data Generator

Loading the `train_data` and `train_dev_data` along with the SentencePiece BPE models for English and Bengali.

In [None]:
# Parallel corpora: only the source ('src') and target ('tgt') columns are kept.
train_data = pd.read_csv("train_data.csv")[['src', 'tgt']]
train_dev_data = pd.read_csv("train_dev.csv")[['src', 'tgt']]

# One SentencePiece processor per language, each loaded from its own model file.
sp_en_bpe = spm.SentencePieceProcessor()
sp_ben_bpe = spm.SentencePieceProcessor()
sp_en_bpe.load('eng_bpe.model')
sp_ben_bpe.load('ben_bpe.model');  # trailing ';' keeps the return value out of the cell output

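These helper functions are taken from the data preprocessing part: `tokenize` converts a sentence into its token ids and `detokenize` converts token ids back into a sentence. `data_generator` then batches and pads the tokenized sentence pairs.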

In [None]:
def tokenize(sentence, sp_model):
    """Encode a sentence into a ``(1, n_tokens)`` array of token ids.

    Args:
        sentence: Text to encode.
        sp_model: Tokenizer exposing ``encode_as_ids`` and ``eos_id``
            (a SentencePiece processor in this notebook).

    Returns:
        A NumPy array with a leading axis of size 1 that holds the ids of
        ``sentence`` followed by the tokenizer's end-of-sentence id.
    """
    token_ids = list(sp_model.encode_as_ids(sentence))
    # Every encoded sentence is terminated with the EOS id.
    token_ids.append(sp_model.eos_id())
    return np.array(token_ids).reshape(1, -1)
def detokenize(tokenized, sp_model):
    """Decode token ids back to text, dropping the EOS id and everything after it.

    Args:
        tokenized: Token ids as a list or array; it is flattened to one
            dimension before decoding.
        sp_model: Tokenizer exposing ``eos_id`` and ``DecodeIdsWithCheck``.

    Returns:
        The result of ``sp_model.DecodeIdsWithCheck`` for the ids that precede
        the first EOS id, or for all ids when no EOS id is present.
    """
    # np.squeeze would collapse a one-token sequence to a 0-d array whose
    # tolist() is a bare int (no .index); flattening always yields a list.
    integers = np.asarray(tokenized).reshape(-1).tolist()
    eos = sp_model.eos_id()
    if eos in integers:
        integers = integers[:integers.index(eos)]
    return sp_model.DecodeIdsWithCheck(integers)
def data_generator(batch_size, src, tgt, maxlen=60, shuffle=False, verbose=False):
    """Yield an endless stream of padded ``(batch_src, batch_tgt)`` id batches.

    The corpus is walked in order (or in a freshly shuffled order on every
    pass when ``shuffle`` is set) and restarts from the beginning when it is
    exhausted, so a batch may span the end of one pass and the start of the
    next.

    Args:
        batch_size: Number of sentence pairs per batch.
        src: Indexable collection of tokenized source sentences.
        tgt: Indexable collection of tokenized target sentences, aligned with ``src``.
        maxlen: Every sequence is post-padded / post-truncated to this length.
        shuffle: Shuffle the visiting order with ``np.random.shuffle``.
        verbose: Print the running line index after each batch is assembled.

    Yields:
        Tuples ``(batch_src, batch_tgt)`` as returned by ``pad_sequences``.

    Raises:
        ValueError: On first iteration, if ``src`` is empty or ``src`` and
            ``tgt`` differ in length.
    """
    num_lines = len(src)
    if num_lines == 0:
        raise ValueError("data_generator needs at least one sentence pair")
    if len(tgt) != num_lines:
        raise ValueError("src and tgt must contain the same number of sentences")

    lines_index = list(range(num_lines))
    if shuffle:
        np.random.shuffle(lines_index)

    index = 0
    while True:
        buffer_src, buffer_tgt = [], []
        for _ in range(batch_size):
            if index >= num_lines:
                # Corpus exhausted: start a new pass (reshuffled if requested).
                index = 0
                if shuffle:
                    np.random.shuffle(lines_index)
            line = lines_index[index]
            # tokenize() returns (1, n) arrays; pad_sequences pads along the
            # first axis, so each sample must be flattened to 1-D first.
            buffer_src.append(np.ravel(src[line]))
            buffer_tgt.append(np.ravel(tgt[line]))
            index += 1
        batch_src = pad_sequences(buffer_src, maxlen=maxlen, padding='post', truncating='post')
        batch_tgt = pad_sequences(buffer_tgt, maxlen=maxlen, padding='post', truncating='post')
        if verbose:
            print("index=", index)
        yield (batch_src, batch_tgt)

In [None]:
# Tokenize every sentence once up front: English with the English BPE model,
# Bengali with the Bengali BPE model.
src_train_data_enc = [tokenize(text, sp_en_bpe) for text in train_data['src']]
tgt_train_data_enc = [tokenize(text, sp_ben_bpe) for text in train_data['tgt']]
src_train_dev_data_enc = [tokenize(text, sp_en_bpe) for text in train_dev_data['src']]
tgt_train_dev_data_enc = [tokenize(text, sp_ben_bpe) for text in train_dev_data['tgt']]

Data Generators for `train_data` and `train_dev_data`.

In [None]:
# Batches of 128 pairs for training and 32 pairs for the train-dev split.
train_data_gen = data_generator(batch_size=128, src=src_train_data_enc, tgt=tgt_train_data_enc)
train_dev_data_gen = data_generator(batch_size=32, src=src_train_dev_data_enc, tgt=tgt_train_dev_data_enc)

## Model

### 1. Model-1

#### 1.1 Basic Transformer

Now, we are ready to train our first model. We will start by training a basic transformer model from scratch with `embedding dimension = 256`, `dense layer units = 512`, `number of heads = 4`, `number of encoder layers = number of decoder layers = 3` and `maximum number of tokens = 60`.\
The model takes two inputs, i.e. the source and the corresponding target sentences as a tuple, and produces two outputs, i.e. the predicted scores over the target vocabulary for every target position, and the target tokens themselves (which are passed along so that the loss can be computed against them).

In [None]:
# Encoder-decoder Transformer built in training mode; the trailing bare
# expression displays the layer structure as the cell output.
model = Transformer(
    input_vocab_size=16000,
    output_vocab_size=16000,
    d_model=256,
    d_ff=512,
    dropout=0.1,
    n_heads=4,
    n_encoder_layers=3,
    n_decoder_layers=3,
    max_len=60,
    mode='train',
)
model

Serial_in2_out2[
  Select[0,1,1]_in2_out3
  Branch_out2[
    []
    Serial[
      PaddingMask(0)
    ]
  ]
  Serial_in2_out2[
    Embedding_16000_256
    Dropout
    PositionalEncoding
    Serial_in2_out2[
      Branch_in2_out3[
        None
        Serial_in2_out2[
          LayerNorm
          Serial_in2_out2[
            _in2_out2
            Serial_in2_out2[
              Select[0,0,0]_out3
              Serial_in4_out2[
                _in4_out4
                Serial_in4_out2[
                  Parallel_in3_out3[
                    Dense_256
                    Dense_256
                    Dense_256
                  ]
                  PureAttention_in4_out2
                  Dense_256
                ]
                _in2_out2
              ]
            ]
            _in2_out2
          ]
          Dropout
        ]
      ]
      Add_in2
    ]
    Serial[
      Branch_out2[
        None
        Serial[
          LayerNorm
          Dense_512
          Serial[
            Re

#### 1.2 Training

Converting our data generator to be fed to the training.

In [None]:
np.random.seed(43)

# The batches are padded by pad_sequences, which fills with 0, so 0 is the id
# whose loss weight must be masked out. Taking the id from the *source*
# tokenizer (sp_en_bpe.pad_id()) only works if that happens to be 0 as well;
# the weights are computed against the target batch.
PAD_ID = 0
train_generator = trax.data.inputs.add_loss_weights(train_data_gen, id_to_mask=PAD_ID)
train_dev_generator = trax.data.inputs.add_loss_weights(train_dev_data_gen, id_to_mask=PAD_ID)

Setting up `train_task` on which our model will be trained. We use `loss function = CrossEntropyLossWithLogSoftmax`, `optimizer = Adam with learning rate 0.01`, `learning rate schedule = warm up and square root decay`.

In [None]:
# Training task: cross-entropy loss, Adam, warm-up + rsqrt-decay schedule.
# Metrics are reported every 100 steps; a permanent checkpoint is kept every 1000.
train_task = training.TrainTask(
    labeled_data=train_generator,
    loss_layer=tl.CrossEntropyLossWithLogSoftmax(),
    optimizer=trax.optimizers.Adam(0.01),
    lr_schedule=trax.lr.warmup_and_rsqrt_decay(1000, 0.01),
    n_steps_per_checkpoint=100,
    n_steps_per_permanent_checkpoint=1000,
)

Setting up `eval_task` for evaluating our model performance. We monitor two metrics on our evaluation dataset i.e. `train_dev_data` which are `CrossEntropyLossWithLogSoftmax` and `WeightedCategoryAccuracy`.

In [None]:
# Evaluation task on the train-dev stream, tracking loss and weighted accuracy.
eval_metrics = [tl.CrossEntropyLossWithLogSoftmax(), tl.WeightedCategoryAccuracy()]
eval_task = training.EvalTask(
    labeled_data=train_dev_generator,
    metrics=eval_metrics,
)

Setting up the `training_loop` for training our model.

In [None]:
# Checkpoints and logs of the training loop are written under output_dir.
output_dir = './Model'
training_loop = training.Loop(
    model,
    train_task,
    eval_tasks=[eval_task],
    output_dir=output_dir,
)

In [None]:
# training_loop.run(5000)


Step    100: Ran 99 train steps in 118.46 secs
Step    100: train CrossEntropyLossWithLogSoftmax |  8.31935406
Step    100: eval  CrossEntropyLossWithLogSoftmax |  7.54077768
Step    100: eval        WeightedCategoryAccuracy |  0.10845987

Step    200: Ran 100 train steps in 42.03 secs
Step    200: train CrossEntropyLossWithLogSoftmax |  7.46800280
Step    200: eval  CrossEntropyLossWithLogSoftmax |  6.85576916
Step    200: eval        WeightedCategoryAccuracy |  0.16158536

Step    300: Ran 100 train steps in 44.55 secs
Step    300: train CrossEntropyLossWithLogSoftmax |  7.26776886
Step    300: eval  CrossEntropyLossWithLogSoftmax |  7.23670197
Step    300: eval        WeightedCategoryAccuracy |  0.10821643

Step    400: Ran 100 train steps in 44.16 secs
Step    400: train CrossEntropyLossWithLogSoftmax |  7.04190826
Step    400: eval  CrossEntropyLossWithLogSoftmax |  6.91076279
Step    400: eval        WeightedCategoryAccuracy |  0.15454544

Step    500: Ran 100 train steps in 49.

In [None]:
# If the Model directory does not already contain the trained model weights, uncomment and run the previous cell (training_loop.run) and the line below.
# training_loop.load_checkpoint(directory=output_dir, filename="model.pkl.gz")

In [None]:
# Initialise the model from the checkpoint file stored in the training output directory.
checkpoint_path = os.path.join(output_dir, "model.pkl.gz")
model.init_from_file(checkpoint_path)

#### 1.3 Testing

##### Greedy Search

At every step of prediction, we feed in the input sequence and the part of the translation predicted up to that point. Based on this, the model predicts the id of the next token. We use `temperature=0.0`, so instead of sampling the next token, it chooses the one with the maximum probability.

In [None]:
def next_symbol(model, input_tokens, cur_output_tokens, temperature, padded_length=60):
    """Pick the next output token given the source and the output produced so far.

    Args:
        model: Callable taking a ``(source_batch, target_batch)`` tuple and
            returning two values, the first indexable as
            ``[batch, position, vocab]``.
        input_tokens: Tokenized source sentence with a leading batch axis.
        cur_output_tokens: Sequence of token ids generated so far.
        temperature: Passed to ``tl.logsoftmax_sample``.
        padded_length: Length the partial output is zero-padded to before it
            is fed to the model. Defaults to 60, the ``max_len`` the model in
            this notebook is built with.

    Returns:
        ``(symbol, score)``: the chosen token id and the model output value at
        that id for the position being predicted.

    Raises:
        ValueError: If ``cur_output_tokens`` already fills the padded buffer,
            so there is no position left to predict.
    """
    token_length = len(cur_output_tokens)
    if token_length >= padded_length:
        # Without this guard the index below falls outside the padded buffer
        # (array libraries that clamp indices would silently return the wrong row).
        raise ValueError(
            "cur_output_tokens has %d tokens; no room left within padded_length=%d"
            % (token_length, padded_length)
        )
    padded = list(cur_output_tokens) + [0] * (padded_length - token_length)
    padded_with_batch = np.expand_dims(padded, axis=0)
    output, _ = model((input_tokens, padded_with_batch))
    # Row `token_length` holds the prediction for the first not-yet-generated position.
    log_probs = output[0, token_length, :]
    symbol = int(tl.logsoftmax_sample(log_probs, temperature))
    return symbol, float(log_probs[symbol])

In [None]:
def sampling_decode(input_sentence, model = None, temperature=0.0, max_len=60):
    """Translate a sentence token by token until EOS or ``max_len`` tokens.

    Args:
        input_sentence: English sentence to translate.
        model: Model handed to ``next_symbol``.
        temperature: Sampling temperature handed to ``next_symbol``.
        max_len: Upper bound on the number of generated tokens. It must not
            exceed the length ``next_symbol`` pads the output to (60).

    Returns:
        ``(tokens, log_prob, sentence)``: the generated token ids (always
        ending with the EOS id), the value ``next_symbol`` returned for the
        last generated token, and the detokenized Bengali sentence.

    Raises:
        ValueError: If ``max_len`` is smaller than 1.
    """
    if max_len < 1:
        raise ValueError("max_len must be at least 1")
    input_tokens = tokenize(input_sentence, sp_en_bpe)
    # Stop on the id that tokenize()/detokenize() use as end-of-sentence
    # rather than on a hard-coded constant that could disagree with it.
    EOS = sp_ben_bpe.eos_id()
    cur_output_tokens = []
    cur_output = None
    log_prob = 0.0
    # The length bound keeps the loop finite when the model never emits EOS.
    while cur_output != EOS and len(cur_output_tokens) < max_len:
        cur_output, log_prob = next_symbol(model, input_tokens, cur_output_tokens, temperature)
        cur_output_tokens.append(cur_output)
    if cur_output != EOS:
        # Cut off by max_len: terminate explicitly so callers can rely on a trailing EOS.
        cur_output_tokens.append(EOS)
    sentence = detokenize(cur_output_tokens, sp_ben_bpe)
    return cur_output_tokens, log_prob, sentence

In [None]:
def greedy_decode_test(sentence, model=None):
    """Return only the translated text from ``sampling_decode``.

    ``sampling_decode`` is called with its default temperature; the token ids
    and log-probability it also returns are discarded.
    """
    _tokens, _log_prob, text = sampling_decode(sentence, model)
    return text

In [None]:
# Greedy translation of a single example sentence.
sentence = 'I love you.'
translated_sentence = greedy_decode_test(sentence, model=model)
print("English: ", sentence)
print("Bengali: ", translated_sentence)

English:  I love you.
Bengali:  আমি আমার কথা বলার।


##### Minimum Bayes-Risk Decoding

We sample multiple sentences (with a non-zero `temperature`). Then we calculate the similarity score (`rouge1_similarity`) of each sampled sentence with every other sample, and take the weighted average of these scores (weighted by the exponentiated log-probability returned for each sample) as the score of that sentence. The sentence with the highest score is selected.

In [None]:
def generate_samples(sentence, n_samples, model=None, temperature=0.6):
    """Decode ``sentence`` ``n_samples`` times with ``sampling_decode``.

    Returns:
        ``(samples, log_probs)``: two parallel lists holding, for each draw,
        the token ids and the log-probability value returned by
        ``sampling_decode``. The decoded text of each draw is discarded.
    """
    draws = [sampling_decode(sentence, model, temperature) for _ in range(n_samples)]
    samples = [tokens for tokens, _, _ in draws]
    log_probs = [logp for _, logp, _ in draws]
    return samples, log_probs

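We use the ROUGE-1 score (the F1 score of the token overlap between two sentences) as the similarity metric for two sentences.
$$\text{ROUGE-1 score} = 2 \times \frac{\text{precision} \times \text{recall}}{\text{precision} + \text{recall}}$$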

In [None]:
def rouge1_similarity(system, reference):
    """Return the F1 score of the token overlap between two token sequences.

    Args:
        system: Iterable of hashable tokens for the candidate.
        reference: Iterable of hashable tokens for the reference.

    Returns:
        ``2 * precision * recall / (precision + recall)``, where the overlap
        counts each token at most as often as it appears in both sequences.
        Returns 0 when there is no overlap or when either sequence is empty.
    """
    sys_counter = Counter(system)
    ref_counter = Counter(reference)
    sys_total = sum(sys_counter.values())
    ref_total = sum(ref_counter.values())
    # An empty side would otherwise divide by zero below.
    if sys_total == 0 or ref_total == 0:
        return 0
    # Counter intersection keeps the minimum count of each shared token.
    overlap = sum((sys_counter & ref_counter).values())
    precision = overlap / sys_total
    recall = overlap / ref_total
    if precision + recall == 0:
        return 0
    return 2 * ((precision * recall) / (precision + recall))

In [None]:
def weighted_avg_overlap(samples, log_probs):
    """Score each sample by its weighted mean similarity to all other samples.

    Args:
        samples: List of token-id sequences.
        log_probs: Log-probability of each sample, aligned with ``samples``.

    Returns:
        Dict mapping each sample index to the average of
        ``rouge1_similarity(candidate, other)`` over all other samples,
        weighted by ``exp(log_prob)`` of the other sample. The score is 0.0
        when there is nothing to compare against (a single sample) or when
        the other samples carry zero total weight.
    """
    # Exponentiate once per sample instead of once per (candidate, sample) pair.
    sample_probs = [float(np.exp(logp)) for logp in log_probs]
    scores = {}
    for index_candidate, candidate in enumerate(samples):
        overlap, weight_sum = 0.0, 0.0
        for index_sample, (sample, sample_p) in enumerate(zip(samples, sample_probs)):
            if index_candidate == index_sample:
                continue  # a sample is never compared with itself
            weight_sum += sample_p
            overlap += sample_p * rouge1_similarity(candidate, sample)
        # Guard against division by zero (single sample or underflowed weights).
        scores[index_candidate] = overlap / weight_sum if weight_sum > 0 else 0.0
    return scores

In [None]:
def mbr_decode(sentence, n_samples, model=None, temperature=0.6):
    """Translate ``sentence`` by choosing the best-scoring of several samples.

    ``n_samples`` candidates are drawn with ``generate_samples`` and scored
    with ``weighted_avg_overlap``; the candidate with the highest score wins
    (the earliest one on ties).

    Returns:
        ``(translated_sentence, best_index, scores)``: the detokenized winner,
        its index among the samples, and the score of every sample.
    """
    samples, log_probs = generate_samples(sentence, n_samples, model, temperature)
    scores = weighted_avg_overlap(samples, log_probs)
    best_index = max(scores, key=lambda idx: scores[idx])
    best_sentence = detokenize(samples[best_index], sp_ben_bpe)
    return (best_sentence, best_index, scores)

In [92]:
# MBR translation of a single example sentence: 4 samples drawn at temperature 1.0.
sentence = "I love you"
translated_sentence = mbr_decode(sentence, n_samples=4, model=model, temperature=1.0)
print("English: ", sentence)
# mbr_decode returns (sentence, best index, scores); only the sentence is shown.
print("Bengali: ", translated_sentence[0])

English:  I love you
Bengali:  আমি ঘটবে আমি ভাবছি।
