In [None]:
import sentencepiece as spm
import pandas as pd
import torch
from torch import nn, Tensor
from torch.nn import functional as F
from transformers import (
    T5ForConditionalGeneration, 
    T5Config,
    AdamW,
    get_cosine_with_hard_restarts_schedule_with_warmup
)
import pytorch_lightning as pl
import time
from datetime import datetime
import math

# Run on the first CUDA GPU when torch can see one; otherwise fall back to the CPU.
device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
print(f'device = {device}')

In [None]:
# Parallel corpus: one sentence per line, source (.bo) and target (.en) files line-aligned.
srcDataPath, tgtDataPath = '../data/train.bo', '../data/train.en'

# Trained SentencePiece model files for the source and target languages.
srcTokenizerPath, tgtTokenizerPath = '../preProcessing/bo.model', '../preProcessing/en.model'

## Load data

In [None]:
# Read the two line-aligned corpus files in lockstep and collect [src, tgt] pairs.
# The `with` block guarantees both files are closed, even if reading fails.
dataMatrix = []

with open(srcDataPath, 'r', encoding = 'utf-8') as srcFile, \
     open(tgtDataPath, 'r', encoding = 'utf-8') as tgtFile:
    for srcLine, tgtLine in zip(srcFile, tgtFile):
        srcLine = srcLine.strip()
        tgtLine = tgtLine.strip()
        # Reading stops at the first pair in which either side is blank
        # (or when either file ends); anything after that point is ignored.
        if not srcLine or not tgtLine:
            break
        dataMatrix.append([srcLine, tgtLine])

# Create pandas dataframe
df = pd.DataFrame(dataMatrix, columns = ['src', 'tgt'])
df

In [None]:
# Plain Python lists of the source / target sentences, in the same row order as `df`.
srcTextsAll, tgtTextsAll = (df[column].tolist() for column in ('src', 'tgt'))

## Tokenizers for Tibetan and English

The code cell below loads the pre-trained Google SentencePiece tokenizers and runs a quick encode/decode sanity check for each language.

In [None]:
# Load tokenizers that are already trained
# (one SentencePiece model per language, read from the model files configured earlier)
srcTokenizer = spm.SentencePieceProcessor(model_file=srcTokenizerPath)
tgtTokenizer = spm.SentencePieceProcessor(model_file=tgtTokenizerPath)

# Verify for Tibetan
# 1) text -> subword pieces (out_type=str), 2) batch of texts -> ids (out_type=int),
# 3) ids -> text, 4) pieces -> text, 5) vocabulary size.
# NOTE(review): the hard-coded ids below presumably belong to this particular trained model — confirm if the model is retrained.
print(srcTokenizer.encode(['ངའི་མིང་ལ་བསྟན་སྒྲོལ་མ་ཟེར་'], out_type=str))
print(srcTokenizer.encode(['ངའི་མིང་ལ་བསྟན་སྒྲོལ་མ་ཟེར་', 'བཀ྄ྲ་ཤིས་བདེ་ལེགས།'], out_type=int))
print(srcTokenizer.decode([4149, 306, 6, 245, 4660, 748]))
print(srcTokenizer.decode(['▁ངའི་', 'མིང་', 'ལ་', 'བསྟན་', 'སྒྲོལ་མ་', 'ཟེར་']))
print('Vocab size of Tibetan Tokenizer:', srcTokenizer.get_piece_size())

# Verify for English
# Same round trip: text -> pieces, batch of texts -> ids, batch of id lists -> texts, vocabulary size.
print(tgtTokenizer.encode(["My name isn't Tenzin Dolma Gyalpo"], out_type=str))
print(tgtTokenizer.encode(['My name is Tenzin Dolma Gyalpo', 'Hello'], out_type=int))
print(tgtTokenizer.decode([[8804, 181, 13, 5520, 15172, 17895], [888, 21492]]))
print('Vocab size of English Tokenizer:', tgtTokenizer.get_piece_size())

In [None]:
# Look up the ids of the special pieces <s>, </s> and <pad> (in that order) in each vocabulary.
src_bos_id, src_eos_id, src_pad_id = [
    srcTokenizer.piece_to_id(piece) for piece in ('<s>', '</s>', '<pad>')
]
tgt_bos_id, tgt_eos_id, tgt_pad_id = [
    tgtTokenizer.piece_to_id(piece) for piece in ('<s>', '</s>', '<pad>')
]

print(src_bos_id, src_eos_id, src_pad_id, tgt_bos_id, tgt_eos_id, tgt_pad_id)

All token-id vectors in a batch must have the same length, so we define several helper functions for truncation and padding.

In [None]:
def truncate(sentvec, maxlen, enable_bos_eos, **kwargs): 
    '''
    Truncate a sentence vector to maxlen by deleting the trailing ids,
    optionally wrapping the result in <s> ... </s>.

    Args
    -- sentvec. List. Vector of tokenization of a sentence. It is never modified.
    -- maxlen. Int. The max length of the returned vector, counting <s> and </s>
       when they are enabled.
    -- enable_bos_eos. Bool. Indicate whether to wrap a sentence with <s> and </s>
    -- kwargs['bos_id']. Int. The id for <s>. Required when enable_bos_eos is True.
    -- kwargs['eos_id']. Int. The id for </s>. Required when enable_bos_eos is True.

    Returns
    -- A new list. When enable_bos_eos is True and maxlen < 2 the result is just
       [bos_id, eos_id], which is the shortest wrapped sentence possible.

    Raises
    -- KeyError if enable_bos_eos is True and bos_id / eos_id is not supplied.
    '''

    ## For a transformer model, the target sentences have to be wrapped by <s> and </s>, but the source sentences don't have to

    if enable_bos_eos: 
        bos_id = kwargs['bos_id']
        eos_id = kwargs['eos_id']
        budget = maxlen - 2    # Need to reserve two positions for <s></s>
    else: 
        budget = maxlen

    # A negative budget would make the slice below drop ids from the END of the
    # sentence (sentvec[:-1]) instead of truncating it, so clamp it at zero.
    budget = max(budget, 0)

    # Slicing already yields a fresh list and is a no-op for short sentences.
    newvec = list(sentvec[:budget])

    # Return the new vector
    if enable_bos_eos: 
        return [bos_id] + newvec + [eos_id]
    return newvec

In [None]:
def pad_and_get_attention_mask(sentvec, tolen, pad_id): 
    '''
    Right-pad a token list with `pad_id` up to length `tolen` and build its attention mask.

    The mask holds 1 for every original token and 0 for every <pad> that was appended.
    A list that is already `tolen` long or longer is returned as-is (the same object,
    not a copy) together with an all-ones mask of its own length.
    '''
    n_tokens = len(sentvec)
    n_missing = tolen - n_tokens
    token_mask = [1] * n_tokens

    # Nothing to append: the sentence already fills (or exceeds) the requested length
    if n_missing <= 0: 
        return sentvec, token_mask

    return sentvec + [pad_id] * n_missing, token_mask + [0] * n_missing

In [None]:
def trim(sentvec, tolen, pad_id, enable_bos_eos, **kwargs): 
    '''
    Bring a sentence vector to length `tolen`: truncate first, then pad.

    Extra keyword arguments (bos_id / eos_id) are forwarded to `truncate`.
    Returns a tuple (ids, attention_mask).
    '''
    clipped = truncate(sentvec, tolen, enable_bos_eos, **kwargs)
    return pad_and_get_attention_mask(clipped, tolen, pad_id)

Show some examples to verify that our `trim()` function works. 

In [None]:
# Truncation only: five ids are cut down to four, without <s></s> and without padding.
ids, attention_mask = trim(
    [100, 200, 300, 400, 500],
    tolen = 4,
    pad_id = tgt_pad_id,
    enable_bos_eos = False,
)
print(ids, attention_mask)

In [None]:
# Padding only: five ids are extended to nine with four <pad> ids (mask 0 on the pads).
ids, attention_mask = trim(
    [100, 200, 300, 400, 500],
    tolen = 9,
    pad_id = tgt_pad_id,
    enable_bos_eos = False,
)
print(ids, attention_mask)

In [None]:
# Truncation with <s></s>: only two ids survive because two of the four slots go to <s> and </s>.
ids, attention_mask = trim(
    [100, 200, 300, 400, 500],
    tolen = 4,
    pad_id = tgt_pad_id,
    enable_bos_eos = True,
    bos_id = tgt_bos_id,
    eos_id = tgt_eos_id,
)
print(ids, attention_mask)

In [None]:
# Padding with <s></s>: the wrapped sentence has seven ids, so two <pad> ids are appended.
ids, attention_mask = trim(
    [100, 200, 300, 400, 500],
    tolen = 9,
    pad_id = tgt_pad_id,
    enable_bos_eos = True,
    bos_id = tgt_bos_id,
    eos_id = tgt_eos_id,
)
print(ids, attention_mask)

## Batch iterator

Each call of `__next__()` returns one batch as a dictionary of torch tensors holding the source/target token ids and their attention masks.

In [None]:
class MyBatchIterator: 
    '''
    Iterate over the sentence pairs in [start_idx, end_idx) in consecutive batches.

    Every batch is tokenized on the fly, padded to the longest sentence of that
    batch, and returned as a dict of torch tensors placed on the module-level
    `device`. Target sentences are wrapped in <s> ... </s>; source sentences are not.
    '''
    def __init__(self, srcTexts, tgtTexts, 
                 srcTokenizer, tgtTokenizer,
                 start_idx, end_idx, batch_size, 
                 src_pad_id, tgt_pad_id, 
                 src_bos_id = None, tgt_bos_id = None, 
                 src_eos_id = None, tgt_eos_id = None
                ): 
        # A non-positive batch size would never advance the cursor (endless
        # empty batches) and would break __len__, so reject it up front.
        if batch_size <= 0: 
            raise ValueError(f'batch_size must be positive, got {batch_size}')

        self.srcTexts = srcTexts
        self.tgtTexts = tgtTexts
        self.srcTokenizer = srcTokenizer 
        self.tgtTokenizer = tgtTokenizer
        self.start_idx = start_idx    # Starting index of original dataset, inclusive
        self.end_idx = end_idx    # Ending index of original dataset, exclusive 
        self.batch_size = batch_size    # batch_size specified by user
        self.src_pad_id = src_pad_id
        self.tgt_pad_id = tgt_pad_id
        self.src_bos_id = src_bos_id
        self.tgt_bos_id = tgt_bos_id 
        self.src_eos_id = src_eos_id
        self.tgt_eos_id = tgt_eos_id 
        # Position of the next batch. Set here as well as in __iter__ so that
        # next() works on a fresh instance without calling iter() first.
        self.curr_idx = start_idx


    # Tokenize a list of texts, add <s></s> if requested, and pad to the longest one
    # Return a tuple (list of [ids], list of [masks])
    def tokenize_batch_and_trim(self, text_batch, tokenizer, pad_id, enable_bos_eos, **kwargs):
        ids_batch = []

        for text in text_batch: 
            ids = list(tokenizer.encode(text))
            # Nothing is truncated here: sentences are only wrapped, then padded below
            if enable_bos_eos: 
                ids = [kwargs['bos_id']] + ids + [kwargs['eos_id']]
            ids_batch.append(ids)

        # Longest sentence in this batch (0 for an empty batch)
        maxlen = max((len(ids) for ids in ids_batch), default = 0)

        # Pad to the current maxlen in the batch 
        res_ids, res_attention_mask = [], []
        for ids in ids_batch: 
            padded_ids, attention_mask = pad_and_get_attention_mask(ids, maxlen, pad_id)
            res_ids.append(padded_ids)
            res_attention_mask.append(attention_mask)

        return res_ids, res_attention_mask


    def __iter__(self): 
        # Rewind so that the same object can be iterated again (e.g. once per epoch)
        self.curr_idx = self.start_idx 
        return self 


    def __next__(self): 
        if self.curr_idx >= self.end_idx: 
            raise StopIteration 

        # The last batch may be shorter than batch_size
        head = self.curr_idx
        tail = min(head + self.batch_size, self.end_idx)
        self.curr_idx = tail

        # Get source and target texts 
        src_texts = self.srcTexts[head:tail]
        tgt_texts = self.tgtTexts[head:tail]

        # Tokenize
        src_ids, src_mask = self.tokenize_batch_and_trim(src_texts, self.srcTokenizer, self.src_pad_id, enable_bos_eos = False)
        tgt_ids, tgt_mask = self.tokenize_batch_and_trim(tgt_texts, self.tgtTokenizer, self.tgt_pad_id, enable_bos_eos = True, bos_id = self.tgt_bos_id, eos_id = self.tgt_eos_id)

        # Return the results as dictionaries of torch tensors 
        return {
            'src_ids': torch.LongTensor(src_ids).to(device),
            'src_mask': torch.FloatTensor(src_mask).to(device),
            'tgt_ids': torch.LongTensor(tgt_ids).to(device),
            'tgt_mask': torch.FloatTensor(tgt_mask).to(device),
        }

    def __len__(self):
        # Number of batches per pass; never negative, even for an empty or inverted range
        return max(0, math.ceil((self.end_idx - self.start_idx) / self.batch_size))
        

Here is an example of how the batch iterator works.

In [None]:
# Demo: iterate over the ten sentence pairs with indices 475..484 in batches of 8.
# Assuming the corpus has at least 485 pairs, this yields one batch of 8 and one of 2.
mbi = MyBatchIterator(
    srcTextsAll, tgtTextsAll, 
    srcTokenizer, tgtTokenizer,
    start_idx = 475, end_idx = 485, batch_size = 8, 
    src_pad_id = src_pad_id, tgt_pad_id = tgt_pad_id, 
    src_bos_id = src_bos_id, tgt_bos_id = tgt_bos_id, 
    src_eos_id = src_eos_id, tgt_eos_id = tgt_eos_id
)

# iter() rewinds the iterator to start_idx and returns the very same object
mbi = iter(mbi)

# Number of batches: ceil(10 / 8) = 2
print('length of iterator:', len(mbi))

# For every batch, print the tensor shapes (ids and mask shapes should match)
# and the first sentence of the batch as ids and attention mask.
for idx, batch in enumerate(mbi): 
    print(f"batch index: {idx}, src size: {batch['src_ids'].size()} = {batch['src_mask'].size()}; tgt size: {batch['tgt_ids'].size()} = {batch['tgt_mask'].size()}")
    print(f"sample src ids: {batch['src_ids'][0]}")
    print(f"sample src mask: {batch['src_mask'][0]}")
    print(f"sample tgt ids: {batch['tgt_ids'][0]}")
    print(f"sample tgt mask: {batch['tgt_mask'][0]}")
    print('='*50)

## Helper classes and functions

We define a `Timer` class that reports the time taken so far and estimates the time remaining in an epoch.

In [None]:
class Timer:
    '''
    Stopwatch that extrapolates the remaining time of a job made of equal-sized units.

    The clock starts when the object is created. The file imports the class with
    `from datetime import datetime`, so it is `datetime.now()` that must be called
    (`datetime.datetime.now()` raises AttributeError with that import).
    '''
    def __init__(self, num_total_units):
        # num_total_units: How many units of tasks need to be done
        self.start = datetime.now()
        self.num_total_units = num_total_units

    def remains(self, num_done_units):
        '''
        Return a progress string "Time taken MM:SS, Estimated time left MM:SS".

        num_done_units: How many units of tasks are done. The estimate assumes the
        remaining units take as long, on average, as the finished ones. When no
        unit is done yet there is nothing to extrapolate from, so the estimate is
        shown as "--:--" instead of dividing by zero.
        '''
        time_taken = datetime.now() - self.start
        sec_taken = int(time_taken.total_seconds())
        taken_text = f"Time taken {sec_taken // 60:02d}:{sec_taken % 60:02d}"

        if num_done_units <= 0:
            return f"{taken_text}, Estimated time left --:--"

        time_left = (self.num_total_units - num_done_units) * time_taken / num_done_units
        # Clamp at zero so that overshooting num_total_units never prints a negative time
        sec_left = max(0, int(time_left.total_seconds()))
        return f"{taken_text}, Estimated time left {sec_left // 60:02d}:{sec_left % 60:02d}"

## Instantiate: model, optimizer, scheduler, hyperparameters

After reading the documentation for `PretrainedConfig` and `PreTrainedModel`, I understood that hyperparameters should be passed as `**kwargs` when calling `from_pretrained()`. To see which hyperparameters are available to configure, run:

```
T5model = T5ForConditionalGeneration.from_pretrained('t5-small')
T5model.config_class().to_dict()
```

In [None]:
# Hyperparameters for training and generation, read by the cells below.
hparams = dict(
    num_epochs = 50,    # the comma after this entry was missing, which made the whole cell a SyntaxError
    train_batch_size = 8, 
    val_batch_size = 1, 
    train_percentage = 0.95, 
    # --------------------------------------------------
    weight_decay = 1e-4, 
    warmup_steps = 4000, 
    dropout = 0.2,
    target_lr = 1e-4,     # max learning rate achieved by scheduler 
    adam_betas = (0.9, 0.98), 
    # adam_eps = 1e-9, 
    max_length = 100,    # max length of sequence to be generated 
)

# Start from the pretrained 't5-small' checkpoint; the keyword arguments are
# passed along to from_pretrained() as configuration values.
# NOTE(review): the special-token ids come from the custom English SentencePiece
# model, not from T5's own vocabulary — confirm that this is intended.
T5model = T5ForConditionalGeneration.from_pretrained(
    't5-small',
    # special-token ids of the target (English) tokenizer
    pad_token_id = tgt_pad_id,
    bos_token_id = tgt_bos_id,
    eos_token_id = tgt_eos_id,
    # regularisation and generation length from hparams
    dropout_rate = hparams['dropout'],
    max_length = hparams['max_length'],
    return_dict = True,
)

# Split the model parameters into two optimizer groups: biases and layer-norm
# weights get no weight decay, every other parameter uses hparams['weight_decay'].
# (This is module-level code, so the dict is `hparams`; `self.hparams` raised NameError.)
optimizer_grouped_parameters = [
    {
        # parameters with weight decay 
        'params': [param for name, param in T5model.named_parameters() if ('bias' not in name and 'layer_norm.weight' not in name)], 
        'weight_decay': hparams['weight_decay'], 
    }, 
    {
        # parameters without weight decay
        'params': [param for name, param in T5model.named_parameters() if ('bias' in name or 'layer_norm.weight' in name)], 
        'weight_decay': 0.0, 
    }
]

# AdamW over the two parameter groups; `lr` is the peak rate the scheduler warms up to.
optimizer = AdamW(
    optimizer_grouped_parameters,
    betas = hparams['adam_betas'],
    lr = hparams['target_lr'],
)

# Total number of optimizer steps = epochs x batches per epoch.
# NOTE(review): batches per epoch is computed from ALL sentence pairs, although
# hparams['train_percentage'] suggests only a part is used for training — confirm.
steps_per_epoch = math.ceil(len(srcTextsAll) / hparams['train_batch_size'])
total_training_steps = hparams['num_epochs'] * steps_per_epoch

# Linear warm-up followed by a cosine schedule with 3 hard restarts.
scheduler = get_cosine_with_hard_restarts_schedule_with_warmup(
    optimizer,
    num_warmup_steps = hparams['warmup_steps'],
    num_training_steps = total_training_steps,
    num_cycles = 3,
)