#### Mounting Google Drive

In [None]:
# Colab-specific: `google.colab` exposes the Drive helper used below.
from google.colab import drive
# Mount Google Drive at /content/drive so files stored there are reachable as local paths.
drive.mount('/content/drive')

In [None]:
cd "/content/drive/MyDrive/IASNLP"

### Importing Necessary Libraries

In [None]:
!pip install sentencepiece
!pip install trax

In [4]:
import numpy as np
import pandas as pd

import sentencepiece as spm
from tensorflow.keras.preprocessing.sequence import pad_sequences

import trax
from trax.data import inputs
from trax.fastmath import numpy as fnp
from trax import layers as tl
from trax.supervised import training
from trax.models import Transformer

import matplotlib.pyplot as plt 

from functools import reduce
import os

### Setting up Data Generator

In [5]:
# Parallel corpora: keep only the source ('src') and target ('tgt') columns of each CSV.
train_data = pd.read_csv("train_data.csv")[['src', 'tgt']]
train_dev_data = pd.read_csv("train_dev.csv")[['src', 'tgt']]

# One SentencePiece processor per side of the corpus, each loaded from a model file
# in the current working directory.
sp_en_bpe = spm.SentencePieceProcessor()
sp_ben_bpe = spm.SentencePieceProcessor()
sp_en_bpe.load('eng_bpe.model')
sp_ben_bpe.load('ben_bpe.model');  # trailing ';' keeps the cell from echoing the return value

In [6]:
def tokenize(sentence, sp_model):
    """Encode ``sentence`` into token ids and terminate it with the EOS id.

    Args:
        sentence: text to encode.
        sp_model: tokenizer object providing ``encode_as_ids`` and ``eos_id``.

    Returns:
        A new list: the ids produced by ``sp_model.encode_as_ids(sentence)``
        followed by ``sp_model.eos_id()``.
    """
    # The EOS id is always the final element, so an empty encoding yields [eos].
    return [*sp_model.encode_as_ids(sentence), sp_model.eos_id()]
def detokenize(tokenized, sp_model):
    """Decode a sequence of token ids back into text, stopping at the first EOS.

    Args:
        tokenized: token ids; either an array-like exposing ``tolist()`` (e.g. a
            NumPy array) or any plain iterable of ints.
        sp_model: tokenizer object providing ``eos_id`` and ``DecodeIdsWithCheck``.

    Returns:
        The result of ``sp_model.DecodeIdsWithCheck`` applied to the ids that
        precede the first EOS id. If the sequence contains no EOS (for example
        because it was truncated to a maximum length), every id is decoded
        instead of raising ``ValueError``.
    """
    integers = tokenized.tolist() if hasattr(tokenized, "tolist") else list(tokenized)
    eos = sp_model.eos_id()
    # Everything from the first EOS onward (EOS itself plus any padding) is dropped.
    if eos in integers:
        integers = integers[:integers.index(eos)]
    return sp_model.DecodeIdsWithCheck(integers)
def data_generator(batch_size, src, tgt, maxlen=60, shuffle=False, verbose=False):
    """Yield an endless stream of fixed-width (source, target) batches.

    Examples are consumed in order (or in a freshly shuffled order on every pass
    when ``shuffle`` is true). When the data runs out in the middle of a batch,
    the generator wraps around to the start, so every batch holds exactly
    ``batch_size`` rows.

    Args:
        batch_size: number of sentence pairs per batch.
        src: indexable collection of source token-id sequences.
        tgt: indexable collection of target token-id sequences, aligned with
            ``src`` by position; must have at least ``len(src)`` entries.
        maxlen: width of every batch; sequences are padded or truncated at the
            end ('post') to this length by ``pad_sequences``.
        shuffle: if true, visit examples in a random order drawn from
            ``np.random`` and reshuffle at the start of each pass.
        verbose: if true, print the running example index after each batch.

    Yields:
        ``(batch_src, batch_tgt)``: the two arrays returned by ``pad_sequences``
        for the buffered source and target sequences.

    Raises:
        ValueError: on first iteration, if ``src`` is empty or ``tgt`` is
            shorter than ``src``.
    """
    num_lines = len(src)
    # Fail fast with a clear message instead of an IndexError deep inside the loop.
    if num_lines == 0:
        raise ValueError("data_generator needs at least one example in src")
    if len(tgt) < num_lines:
        raise ValueError("tgt has fewer examples than src")

    lines_index = list(range(num_lines))
    if shuffle:
        np.random.shuffle(lines_index)

    index = 0
    while True:
        buffer_src = []
        buffer_tgt = []
        for _ in range(batch_size):
            if index >= num_lines:
                # End of a pass: restart, with a new order when shuffling.
                index = 0
                if shuffle:
                    np.random.shuffle(lines_index)
            line = lines_index[index]
            buffer_src.append(src[line])
            buffer_tgt.append(tgt[line])
            index += 1
        batch_src = pad_sequences(buffer_src, maxlen=maxlen, padding='post', truncating='post')
        batch_tgt = pad_sequences(buffer_tgt, maxlen=maxlen, padding='post', truncating='post')
        if verbose:
            print("index=", index)
        yield (batch_src, batch_tgt)

In [7]:
# Encode every sentence once, up front. Iterating each column directly yields the same
# values in the same order as a positional .iloc[i] lookup per row, without the per-row
# indexing overhead.
src_train_data_enc = [tokenize(sentence, sp_en_bpe) for sentence in train_data['src']]
tgt_train_data_enc = [tokenize(sentence, sp_ben_bpe) for sentence in train_data['tgt']]
src_train_dev_data_enc = [tokenize(sentence, sp_en_bpe) for sentence in train_dev_data['src']]
tgt_train_dev_data_enc = [tokenize(sentence, sp_ben_bpe) for sentence in train_dev_data['tgt']]

In [8]:
# Endless batch streams: 128 sentence pairs per training batch, 32 per evaluation batch.
train_data_gen = data_generator(
    batch_size=128,
    src=src_train_data_enc,
    tgt=tgt_train_data_enc,
)
train_dev_data_gen = data_generator(
    batch_size=32,
    src=src_train_dev_data_enc,
    tgt=tgt_train_dev_data_enc,
)

### Model

#### Basic Transformer Model

Now, we are ready to train our first model. We will start by training a basic transformer model from scratch with `embedding dimension = 256`, `dense layer units = 512`, `number of heads = 4`, `number of encoder layers = number of decoder layers = 3` and `maximum number of tokens = 60`.

In [9]:
# Encoder-decoder Transformer, constructed in 'train' mode.
# NOTE(review): the 16000-entry vocabularies presumably match the two SentencePiece
# models loaded earlier — confirm against how they were trained.
model = Transformer(
    input_vocab_size=16000,
    output_vocab_size=16000,
    d_model=256,
    d_ff=512,
    dropout=0.1,
    n_heads=4,
    n_encoder_layers=3,
    n_decoder_layers=3,
    max_len=60,
    mode='train',
)
model

Serial_in2_out2[
  Select[0,1,1]_in2_out3
  Branch_out2[
    []
    Serial[
      PaddingMask(0)
    ]
  ]
  Serial_in2_out2[
    Embedding_16000_256
    Dropout
    PositionalEncoding
    Serial_in2_out2[
      Branch_in2_out3[
        None
        Serial_in2_out2[
          LayerNorm
          Serial_in2_out2[
            _in2_out2
            Serial_in2_out2[
              Select[0,0,0]_out3
              Serial_in4_out2[
                _in4_out4
                Serial_in4_out2[
                  Parallel_in3_out3[
                    Dense_256
                    Dense_256
                    Dense_256
                  ]
                  PureAttention_in4_out2
                  Dense_256
                ]
                _in2_out2
              ]
            ]
            _in2_out2
          ]
          Dropout
        ]
      ]
      Add_in2
    ]
    Serial[
      Branch_out2[
        None
        Serial[
          LayerNorm
          Dense_512
          Serial[
            Re

#### Training

In [10]:
np.random.seed(43)
# add_loss_weights appends a weights array that is 0 wherever the *target* equals
# id_to_mask, so those positions do not contribute to the loss or the metrics.
# The batches are padded by pad_sequences, whose fill value defaults to 0, and the model
# itself treats id 0 as padding (PaddingMask(0)). Mask 0 explicitly instead of asking the
# source-side tokenizer for pad_id(): that value is -1 when a SentencePiece model was
# trained without a pad piece, in which case no padding would be masked at all.
train_generator = trax.data.inputs.add_loss_weights(train_data_gen, id_to_mask=0)
train_dev_generator = trax.data.inputs.add_loss_weights(train_dev_data_gen, id_to_mask=0)

In [11]:
# Training task: cross-entropy loss on the weighted batches, Adam optimizer, and a
# warm-up + reciprocal-square-root-decay schedule (1000 warm-up steps, peak value 0.01).
# NOTE(review): the schedule presumably supersedes the 0.01 passed to Adam — confirm.
train_task = training.TrainTask(
    labeled_data=train_generator,
    loss_layer=tl.CrossEntropyLossWithLogSoftmax(),
    optimizer=trax.optimizers.Adam(0.01),
    lr_schedule=trax.lr.warmup_and_rsqrt_decay(1000, 0.01),
    n_steps_per_checkpoint=100,
    n_steps_per_permanent_checkpoint=1000,
)



In [12]:
# Evaluation task on the held-out generator: reports cross-entropy loss and
# weighted category accuracy.
eval_task = training.EvalTask(
    labeled_data=train_dev_generator,
    metrics=[
        tl.CrossEntropyLossWithLogSoftmax(),
        tl.WeightedCategoryAccuracy(),
    ],
)

In [None]:
# Directory handed to the training loop, relative to the current working directory.
output_dir = './Model'

# Tie the model, the training task and the evaluation task together.
training_loop = training.Loop(
    model,
    train_task,
    eval_tasks=[eval_task],
    output_dir=output_dir,
)

In [None]:
# Run 5000 training steps.
training_loop.run(n_steps=5000)


Step      1: Total number of trainable weights: 16289408
Step      1: Ran 1 train steps in 213.24 secs
Step      1: train CrossEntropyLossWithLogSoftmax |  9.67185497
Step      1: eval  CrossEntropyLossWithLogSoftmax |  9.65214634
Step      1: eval        WeightedCategoryAccuracy |  0.00000000
