#### Mounting Google Drive

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
cd "/content/drive/MyDrive/IASNLP"

## Importing Necessary Libraries

In [None]:
!pip install sentencepiece
# !pip install --upgrade jax # --> Doesn't require GPU # --> Needed in Basic, Bigger Transformer and LSTM with Attention
# !pip install jax[cuda11_cudnn82] -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html # Requires GPU # --> Needed in Basic, Bigger Transformer and LSTM with Attention
# !pip install --upgrade -q jax # --> Needed in Basic, Bigger Transformer and LSTM with Attention
# !pip install --upgrade -q jaxlib # --> Needed in Basic, Bigger Transformer and LSTM with Attention
# !pip install --upgrade -q trax # --> Needed in Basic, Bigger Transformer and LSTM with Attention
!pip install tensorflow-text

In [5]:
import numpy as np
import pandas as pd

import sentencepiece as spm
import tensorflow as tf

from tensorflow.keras.preprocessing.sequence import pad_sequences

# Needed in Basic Transformer and Bigger Transformer and LSTM with Attention
# import jax
# import trax
# from trax.data import inputs
# from trax.fastmath import numpy as fnp
# from trax import layers as tl
# from trax.supervised import training
# from trax.models import Transformer

import matplotlib.pyplot as plt 

from functools import reduce
from collections import Counter
import os
import time

In [9]:
# Make sure the Colab Runtime is set to Accelerator: TPU.
import requests
import os
if 'TPU_DRIVER_MODE' not in globals():
  url = 'http://' + os.environ['COLAB_TPU_ADDR'].split(':')[0] + ':8475/requestversion/tpu_driver0.1-dev20191206'
  resp = requests.post(url)
  TPU_DRIVER_MODE = 1

# The following is required to use TPU Driver as JAX's backend.
from jax.config import config
config.FLAGS.jax_xla_backend = "tpu_driver"
config.FLAGS.jax_backend_target = "grpc://" + os.environ['COLAB_TPU_ADDR']
print(config.FLAGS.jax_backend_target)

grpc://10.13.197.50:8470


## Setting up Data Generator

Loading the `train_data` and `train_dev_data` along with the SentencePiece BPE models for English and Bengali.

In [6]:
train_data, train_dev_data = pd.read_csv("train_data.csv")[['src', 'tgt']], pd.read_csv("train_dev.csv")[['src', 'tgt']] 
test_val_data, test_data = pd.read_csv("test_val.csv")[['src', 'tgt']], pd.read_csv("test_data.csv")[['src', 'tgt']]
sp_en_bpe, sp_ben_bpe = spm.SentencePieceProcessor(), spm.SentencePieceProcessor()
sp_en_bpe.load('eng_bpe.model'); sp_ben_bpe.load('ben_bpe.model');

These functions are taken from the data preprocessing part to get tokenized representation of sentences as well as detokenized sentences.

In [7]:
def tokenize(sentence, sp_model):
    # We add the EOS token at the end of each encoded sentence
    inputs = [sp_model.bos_id()] + sp_model.encode_as_ids(sentence) + [sp_model.eos_id()]
    return np.reshape(np.array(inputs), [1, -1])
def detokenize(tokenized, sp_model):
    integers = np.squeeze(tokenized).tolist()
    return sp_model.DecodeIdsWithCheck(integers[1:integers.index(sp_model.eos_id())])
def data_generator(batch_size, src, tgt, maxlen=60, shuffle=False, verbose=False):
    num_lines = len(src)
    lines_index = [*range(num_lines)]
    if shuffle:
        np.random.shuffle(lines_index)
    index = 0
    while True:
        buffer_src = list()
        buffer_tgt = list() 
        max_len = 0 
        for i in range(batch_size):
            if index >= num_lines:
                index = 0
                if shuffle:
                    np.random.shuffle(lines_index)
            buffer_src.append(src[lines_index[index]])
            buffer_tgt.append(tgt[lines_index[index]])
            index += 1
        batch_src = pad_sequences(buffer_src, maxlen = maxlen, padding='post', truncating='post')
        batch_tgt = pad_sequences(buffer_tgt, maxlen = maxlen, padding='post', truncating='post')
        if verbose: print("index=", index)
        yield((batch_src, batch_tgt))

In [8]:
src_train_data_enc = [np.squeeze(tokenize(train_data['src'].iloc[i], sp_en_bpe)) for i in range(train_data.shape[0])]
tgt_train_data_enc = [np.squeeze(tokenize(train_data['tgt'].iloc[i], sp_ben_bpe)) for i in range(train_data.shape[0])]
src_train_dev_data_enc = [np.squeeze(tokenize(train_dev_data['src'].iloc[i], sp_en_bpe)) for i in range(train_dev_data.shape[0])]
tgt_train_dev_data_enc = [np.squeeze(tokenize(train_dev_data['tgt'].iloc[i], sp_ben_bpe)) for i in range(train_dev_data.shape[0])]
src_test_val_data_enc = [np.squeeze(tokenize(test_val_data['src'].iloc[i], sp_en_bpe)) for i in range(test_val_data.shape[0])]
tgt_test_val_data_enc = [np.squeeze(tokenize(test_val_data['tgt'].iloc[i], sp_ben_bpe)) for i in range(test_val_data.shape[0])]

Data Generators for `train_data` and `train_dev_data`.

In [51]:
train_data_gen = data_generator(256, src_train_data_enc, tgt_train_data_enc)
train_dev_data_gen = data_generator(128, src_train_dev_data_enc, tgt_train_dev_data_enc)
test_val_data_gen = data_generator(64, src_test_val_data_enc, tgt_test_val_data_enc)

## Model

### 1. Transformer Based Models

#### 1.1 Basic Transformer (Trax)

We take `vocabulary size = 16000` for source and target.(Train the Sentence Piece model with 16000 vocab_size from Data-Preparation part before training this).\
Now, we are ready to train our first model. We will start by training a basic transformer model from scratch with:
-  `embedding dimension = 256`
-  `dense layer units = 512`
-  `number of heads = 4`
-  `number of encoder layers = number of decoder layers = 3`
- `maximum number of tokens = 60`

The model takes two inputs i.e. the source and corresponding target sentences as a tuple and results in two outputs.)

In [6]:
model = Transformer(input_vocab_size=16000, output_vocab_size=16000, d_model=256, d_ff=512, dropout = 0.1, n_heads=4, n_encoder_layers=3, n_decoder_layers=3, max_len=60, mode='train')
model

Serial_in2_out2[
  Select[0,1,1]_in2_out3
  Branch_out2[
    []
    Serial[
      PaddingMask(0)
    ]
  ]
  Serial_in2_out2[
    Embedding_16000_256
    Dropout
    PositionalEncoding
    Serial_in2_out2[
      Branch_in2_out3[
        None
        Serial_in2_out2[
          LayerNorm
          Serial_in2_out2[
            _in2_out2
            Serial_in2_out2[
              Select[0,0,0]_out3
              Serial_in4_out2[
                _in4_out4
                Serial_in4_out2[
                  Parallel_in3_out3[
                    Dense_256
                    Dense_256
                    Dense_256
                  ]
                  PureAttention_in4_out2
                  Dense_256
                ]
                _in2_out2
              ]
            ]
            _in2_out2
          ]
          Dropout
        ]
      ]
      Add_in2
    ]
    Serial[
      Branch_out2[
        None
        Serial[
          LayerNorm
          Dense_512
          Serial[
            Re

#### 1.2 Bigger Transformer (Trax)

We take `vocabulary size = 32000` for source and target.\
Now, we are ready to train our first model. We will start by training a basic transformer model from scratch with:
-  `embedding dimension = 256`
-  `dense layer units = 512`
-  `number of heads = 4`
-  `number of encoder layers = number of decoder layers = 6`
- `maximum number of tokens = 60`

The model takes two inputs i.e. the source and corresponding target sentences as a tuple and results in two outputs.)

In [11]:
model = Transformer(input_vocab_size=32000, output_vocab_size=32000, d_model=256, d_ff=512, dropout = 0.3, n_heads=4, n_encoder_layers=6, n_decoder_layers=6, max_len=60, mode='train')
model

Serial_in2_out2[
  Select[0,1,1]_in2_out3
  Branch_out2[
    []
    Serial[
      PaddingMask(0)
    ]
  ]
  Serial_in2_out2[
    Embedding_32000_256
    Dropout
    PositionalEncoding
    Serial_in2_out2[
      Branch_in2_out3[
        None
        Serial_in2_out2[
          LayerNorm
          Serial_in2_out2[
            _in2_out2
            Serial_in2_out2[
              Select[0,0,0]_out3
              Serial_in4_out2[
                _in4_out4
                Serial_in4_out2[
                  Parallel_in3_out3[
                    Dense_256
                    Dense_256
                    Dense_256
                  ]
                  PureAttention_in4_out2
                  Dense_256
                ]
                _in2_out2
              ]
            ]
            _in2_out2
          ]
          Dropout
        ]
      ]
      Add_in2
    ]
    Serial[
      Branch_out2[
        None
        Serial[
          LayerNorm
          Dense_512
          Serial[
            Re

#### 1.2 Training (Trax)

Converting our data generator to be fed to the training.

In [12]:
np.random.seed(43)
train_generator = trax.data.inputs.add_loss_weights(train_data_gen, id_to_mask= sp_en_bpe.pad_id())
train_dev_generator = trax.data.inputs.add_loss_weights(train_dev_data_gen, id_to_mask= sp_en_bpe.pad_id())
test_val_generator = trax.data.inputs.add_loss_weights(test_val_data_gen, id_to_mask= sp_en_bpe.pad_id())

Setting up `train_task` on which our model will be trained. We use:
- `loss function = CrossEntropyLossWithLogSotmax`
- `optimizer = Adam with learning rate 0.01`
- `learning rate schedule = warm up and square root decay`.

In [13]:
train_task = training.TrainTask(labeled_data= train_generator, 
                                loss_layer= tl.CrossEntropyLossWithLogSoftmax(),
                                optimizer= trax.optimizers.Adam(0.01),
                                lr_schedule= trax.lr.warmup_and_rsqrt_decay(1000, 0.01),
                                n_steps_per_checkpoint= 100,
                                n_steps_per_permanent_checkpoint= 1000)

Setting up `eval_task` for evaluating our model performance on `train_dev_data` and `test_val_data`. We monitor quite a few metrics on our evaluation datasets `train_dev_data` and `test_val_data` which are:
- `CrossEntropyLossWithLogSoftmax`
- `WeightedCategoryAccuracy`.

In [14]:
eval_train_dev_task = training.EvalTask(labeled_data=train_dev_generator,
                              metrics=[tl.CrossEntropyLossWithLogSoftmax(), tl.WeightedCategoryAccuracy()])

In [15]:
eval_test_val_task = training.EvalTask(labeled_data=test_val_generator,
                              metrics=[tl.CrossEntropyLossWithLogSoftmax(), tl.WeightedCategoryAccuracy(), ])

Setting up the `training_loop` for training our model.

In [None]:
output_dir = './Model-1'
training_loop = training.Loop(model,
                              train_task,
                              eval_tasks=[eval_train_dev_task, eval_test_val_task],
                              output_dir=output_dir)

In [None]:
# training_loop.run(5000) # -- Basic Transformer


Step    100: Ran 99 train steps in 118.46 secs
Step    100: train CrossEntropyLossWithLogSoftmax |  8.31935406
Step    100: eval  CrossEntropyLossWithLogSoftmax |  7.54077768
Step    100: eval        WeightedCategoryAccuracy |  0.10845987

Step    200: Ran 100 train steps in 42.03 secs
Step    200: train CrossEntropyLossWithLogSoftmax |  7.46800280
Step    200: eval  CrossEntropyLossWithLogSoftmax |  6.85576916
Step    200: eval        WeightedCategoryAccuracy |  0.16158536

Step    300: Ran 100 train steps in 44.55 secs
Step    300: train CrossEntropyLossWithLogSoftmax |  7.26776886
Step    300: eval  CrossEntropyLossWithLogSoftmax |  7.23670197
Step    300: eval        WeightedCategoryAccuracy |  0.10821643

Step    400: Ran 100 train steps in 44.16 secs
Step    400: train CrossEntropyLossWithLogSoftmax |  7.04190826
Step    400: eval  CrossEntropyLossWithLogSoftmax |  6.91076279
Step    400: eval        WeightedCategoryAccuracy |  0.15454544

Step    500: Ran 100 train steps in 49.

In [17]:
import jax
jax.default_backend()

'tpu'

In [18]:
training_loop.run(15000) # Bigger Transformer


Step      1: Total number of trainable weights: 32547072
Step      1: Ran 1 train steps in 211.16 secs
Step      1: train CrossEntropyLossWithLogSoftmax |  10.36063194
Step      1: eval  CrossEntropyLossWithLogSoftmax |  10.35689640
Step      1: eval        WeightedCategoryAccuracy |  0.00000000
Step      1: eval  CrossEntropyLossWithLogSoftmax |  10.36632061
Step      1: eval        WeightedCategoryAccuracy |  0.00000000

Step    100: Ran 99 train steps in 77.94 secs
Step    100: train CrossEntropyLossWithLogSoftmax |  8.76878548
Step    100: eval  CrossEntropyLossWithLogSoftmax |  7.48929882
Step    100: eval        WeightedCategoryAccuracy |  0.10610933
Step    100: eval  CrossEntropyLossWithLogSoftmax |  8.00431538
Step    100: eval        WeightedCategoryAccuracy |  0.05882353

Step    200: Ran 100 train steps in 82.94 secs
Step    200: train CrossEntropyLossWithLogSoftmax |  7.74584150
Step    200: eval  CrossEntropyLossWithLogSoftmax |  7.75866127
Step    200: eval        Weigh

In [None]:
# If Model directory not already present with the trained model weights, uncomment and run the previous line and this line
# In case Basic Transformer Training session crashes
# training_loop.load_checkpoint(directory=output_dir, filename="model.pkl.gz")

In [None]:
# Basic Transformer
# model.init_from_file("./Model/model.pkl.gz")

In [None]:
# Bigger Transformer
model.init_from_file("./Model-1/model.pkl.gz")

#### 1.3 Testing (Trax)

##### Greedy Search

At every step of prediction, we feed in the input sequence and the predicted translated part upto that point. Based on this, the model predicts the next token id given the previous were the ones predicted till that point. We have `temperature=0.0`, so it will not sample the next token instead it will choose the one with maximum probability. 

In [20]:
def next_symbol(model, input_tokens, cur_output_tokens, temperature):
    token_length = len(cur_output_tokens)
    padded_length = 60
    padded = cur_output_tokens + [0] * (padded_length - token_length) 
    padded_with_batch = np.expand_dims(padded, axis=0)
    output, _ = model((input_tokens, padded_with_batch))   
    log_probs = output[0, token_length, :]
    symbol = int(tl.logsoftmax_sample(log_probs, temperature))
    return symbol, float(log_probs[symbol])

In [21]:
def sampling_decode(input_sentence, model = None, temperature=0.0):
    input_tokens = tokenize(input_sentence, sp_en_bpe)
    cur_output_tokens = []
    cur_output = 0  
    EOS = 1
    while cur_output != EOS: 
        cur_output, log_prob = next_symbol(model, input_tokens, cur_output_tokens, temperature)
        cur_output_tokens.append(cur_output) 
    sentence = detokenize(cur_output_tokens, sp_ben_bpe)
    return cur_output_tokens, log_prob, sentence

In [22]:
def greedy_decode_test(sentence, model=None):  
    _,_, translated_sentence = sampling_decode(sentence, model)   
    return translated_sentence

In [23]:
sentence = 'I love you.'
translated_sentence = greedy_decode_test(sentence, model)
print("English: ", sentence)
print("Bengali: ", translated_sentence)

English:  I love you.
Bengali:  আমি কি?


##### Minimum Bayes-Risk Decoding

We sample multiple sentence(with non-zero `temperature`). Then, we calculate the similarity score(`rougel_similarity`) of all of the sampled sentences with every other. Then we take the average of it to be the score of each sentence. The one with the highest score is selected.

In [24]:
def generate_samples(sentence, n_samples, model=None, temperature=0.6):
    samples, log_probs = [], []
    for _ in range(n_samples):
        sample, logp, _ = sampling_decode(sentence, model, temperature)
        samples.append(sample)
        log_probs.append(logp)
    return samples, log_probs

We use ROUGE score as the similarity metric for two sentences.
$$ROUGE\enspace score= 2* \frac{(precision * recall)}{(precision + recall)}$$

In [25]:
def rouge1_similarity(system, reference):
    sys_counter = Counter(system)
    ref_counter = Counter(reference)
    overlap = 0
    for token in sys_counter:
        token_count_sys = sys_counter[token]
        token_count_ref = ref_counter[token]
        overlap += min(token_count_ref, token_count_sys)
    precision = overlap / sum(sys_counter.values())
    recall = overlap / sum(ref_counter.values())
    if precision + recall != 0:
        rouge1_score = 2 * ((precision * recall)/(precision + recall))
    else:
        rouge1_score = 0     
    return rouge1_score

In [26]:
def weighted_avg_overlap(samples, log_probs):
    scores = {}
    for index_candidate, candidate in enumerate(samples):    
        overlap, weight_sum = 0.0, 0.0
        for index_sample, (sample, logp) in enumerate(zip(samples, log_probs)):           
            if index_candidate == index_sample:
                continue
            sample_p = float(np.exp(logp))
            weight_sum += sample_p
            sample_overlap = rouge1_similarity(candidate, sample)
            overlap += sample_p * sample_overlap
        score = overlap / weight_sum
        scores[index_candidate] = score
    return scores

In [27]:
def mbr_decode(sentence, n_samples, model=None, temperature=0.6):
    samples, log_probs = generate_samples(sentence, n_samples, model, temperature)
    scores = weighted_avg_overlap(samples, log_probs)
    max_score_key = max(scores, key=scores.get)
    translated_sentence = detokenize(samples[max_score_key], sp_ben_bpe)
    return (translated_sentence, max_score_key, scores)

In [31]:
sentence = "I love you."
translated_sentence = mbr_decode(sentence, 4, model, 1)
print("English: ", sentence)
print("Bengali: ", translated_sentence[0])

English:  I love you.
Bengali:  তখন আমাকে খুবই মধ্যে কথা যাবেন।


In [32]:
sentence = "research has shown that exercise also helps in removing stress."
translated_sentence = mbr_decode(sentence, 4, model, 1)
print("English: ", sentence)
print("Bengali: ", translated_sentence[0])

English:  research has shown that exercise also helps in removing stress.
Bengali:  আমাদের কন্যা থেকে শুটিং করে দিয়েছেন।


### 2. LSTM Based Models

#### 2.1 LSTM with Attention (Trax)

We prepare the inputs for `AttentionQKV` layer.

In [10]:
def prepare_attention_input(encoder_activations, decoder_activations, inputs):
    keys = encoder_activations
    values = encoder_activations
    queries = decoder_activations
    mask = 1-fnp.equal(inputs, 0)
    mask = fnp.reshape(mask, (mask.shape[0], 1, 1, mask.shape[1]))
    # Mask is of dimension [batch_dim, attention_heads, decoder_dim(since, if a position is not padded all the elements in decoder_dim contributes, else none), padded_length]
    mask = mask + fnp.zeros((1, 1, decoder_activations.shape[1], 1))
    return queries, keys, values, mask

We implement the full architechture below

In [11]:
def LSTMAttn(input_vocab_size=32000,
            target_vocab_size=32000,
            d_model=512,
            n_encoder_layers=3,
            n_decoder_layers=3,
            n_attention_heads=2,
            attention_dropout=0.3,
            mode='train'):
    
    input_encoder = tl.Serial(tl.Embedding(vocab_size=input_vocab_size, d_feature=d_model),
                              [tl.LSTM(d_model) for _ in range(n_encoder_layers)])
    pre_attention_decoder = pre_attention_decoder = tl.Serial(tl.ShiftRight(mode=mode), # Teacher Forcing
                                                              tl.Embedding(vocab_size=target_vocab_size, d_feature=d_model),
                                                              tl.LSTM(d_model))
    model = tl.Serial(
        tl.Select([0, 1, 0, 1]),
        tl.Parallel(input_encoder, pre_attention_decoder),
        tl.Fn('PrepareAttentionInput', prepare_attention_input, n_out=4),
        tl.Residual(tl.AttentionQKV(d_model, n_heads=n_attention_heads, dropout=attention_dropout, mode=mode)),
        tl.Select([0, 2]),
        [tl.LSTM(d_model) for _ in range(n_decoder_layers)],
        tl.Dense(target_vocab_size),
        tl.LogSoftmax())
    return model

In [12]:
model = LSTMAttn()
model

Serial_in2_out2[
  Select[0,1,0,1]_in2_out4
  Parallel_in2_out2[
    Serial[
      Embedding_32000_512
      LSTM_512
      LSTM_512
      LSTM_512
    ]
    Serial[
      Serial[
        ShiftRight(1)
      ]
      Embedding_32000_512
      LSTM_512
    ]
  ]
  PrepareAttentionInput_in3_out4
  Serial_in4_out2[
    Branch_in4_out3[
      None
      Serial_in4_out2[
        _in4_out4
        Serial_in4_out2[
          Parallel_in3_out3[
            Dense_512
            Dense_512
            Dense_512
          ]
          PureAttention_in4_out2
          Dense_512
        ]
        _in2_out2
      ]
    ]
    Add_in2
  ]
  Select[0,2]_in3_out2
  LSTM_512
  LSTM_512
  LSTM_512
  Dense_32000
  LogSoftmax
]

#### 2.2 Training (Trax)

Converting our data generator to be fed to the training.

In [13]:
np.random.seed(43)
train_generator = trax.data.inputs.add_loss_weights(train_data_gen, id_to_mask= sp_en_bpe.pad_id())
train_dev_generator = trax.data.inputs.add_loss_weights(train_dev_data_gen, id_to_mask= sp_en_bpe.pad_id())
test_val_generator = trax.data.inputs.add_loss_weights(test_val_data_gen, id_to_mask= sp_en_bpe.pad_id())

Setting up `train_task` on which our model will be trained. We use:
- `loss function = CrossEntropyLossWithLogSotmax`
- `optimizer = Adam with learning rate 0.01`
- `learning rate schedule = warm up and square root decay`.

In [14]:
train_task = training.TrainTask(labeled_data= train_generator, 
                                loss_layer= tl.CrossEntropyLossWithLogSoftmax(),
                                optimizer= trax.optimizers.Adam(0.01),
                                lr_schedule= trax.lr.warmup_and_rsqrt_decay(1000, 0.01),
                                n_steps_per_checkpoint= 100,
                                n_steps_per_permanent_checkpoint= 1000)

Setting up `eval_task` for evaluating our model performance on `train_dev_data` and `test_val_data`. We monitor quite a few metrics on our evaluation datasets `train_dev_data` and `test_val_data` which are:
- `CrossEntropyLossWithLogSoftmax`
- `WeightedCategoryAccuracy`.

In [15]:
eval_train_dev_task = training.EvalTask(labeled_data=train_dev_generator,
                              metrics=[tl.CrossEntropyLossWithLogSoftmax(), tl.WeightedCategoryAccuracy()])

In [16]:
eval_test_val_task = training.EvalTask(labeled_data=test_val_generator,
                              metrics=[tl.CrossEntropyLossWithLogSoftmax(), tl.WeightedCategoryAccuracy()])

Setting up the `training_loop` for training our model.

In [17]:
jax.default_backend()

'tpu'

In [18]:
output_dir = './Model-2'
training_loop = training.Loop(model,
                              train_task,
                              eval_tasks=[eval_train_dev_task, eval_test_val_task],
                              output_dir=output_dir)

In [None]:
training_loop.run(15000)

Couldn't even start training as RAM usage exceeds in Google Colab and it keeps on crashing.

In [None]:
# If Model directory not already present with the trained model weights, uncomment and run the previous line and this line
# In case LSTM Attention Training session crashes
# training_loop.load_checkpoint(directory=output_dir, filename="model.pkl.gz")