In [1]:
import sentencepiece as spm
import pandas as pd
from tokenizers import SentencePieceBPETokenizer
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import SparseAdam
from transformers import (
    T5Model, 
    T5ForConditionalGeneration, 
    AdamW,
    get_linear_schedule_with_warmup
)
import pytorch_lightning as pl
import time
from datetime import datetime
import textwrap

device = torch.device(
    'cuda:0' if torch.cuda.is_available() else 'cpu'
)
print(f'device = {device}')

device = cuda:0


In [2]:
boDataForTokenizerPath = '../data/all_bo.txt'
enDataForTokenizerPath = '../data/all_en.txt'

boDataPath = '../data/train.bo'
enDataPath = '../data/train.en'

boTokenizerPath = '../preProcessing/bo.model'
enTokenizerPath = '../preProcessing/en.model'

## Load data 


In [3]:
boFile = open(boDataPath, 'r', encoding = 'utf-8')
enFile = open(enDataPath, 'r', encoding = 'utf-8')

dataMatrix = []

while True: 
    boLine = boFile.readline().strip()
    enLine = enFile.readline().strip()
    if not boLine or not enLine: 
        break 
    dataMatrix.append([boLine, enLine])
  
# Create pandas dataframe 
df = pd.DataFrame(dataMatrix, columns = ['bo', 'en'])
df

Unnamed: 0,bo,en
0,རྒྱལ་པོ་ཞེས་བྱ་བས་རྒྱལ་སྲིད་འབྱོར་པ་རྒྱས་པ་བདེ...,under his rule the kingdom prospered and thriv...
1,དེས་དཔུང་གི་ཚོགས་ཡན་ལག་བཞི་པ་གླང་པོ་ཆེ་པའི་ཚོག...,he called up the four branches of his armed fo...
2,སུམ་ཅུ་རྩ་གསུམ་པའི་ལྷ་རྣམས་ཀྱི་ཁ་དོག་གི་མཐུ་བས...,bathed in a vast light more luminous than the ...
3,མ་མ་བརྒྱད་པོ་པང་ན་འཚོ་བའི་མ་མ་གཉིས་དང་ནུ་མ་སྣུ...,was entrusted to eight nursemaids two to cuddl...
4,རྒྱལ་པོ་རྒྱལ་རིགས་སྤྱི་བོར་དབང་བསྐུར་བ་ལྗོངས་ཀ...,he trained in and mastered those arts and skil...
...,...,...
106861,མད་གལ་གྱི་བུ་དེ་བཞིན་གཤེགས་པ་དགྲ་བཅོམ་པ་ཡང་དག་...,maudgalyayana the thusgone worthy perfect budd...
106862,བཅོམ་ལྡན་འདས་ཀྱིས་དེ་སྐད་ཅེས་བཀའ་སྩལ་པ་དང་་ཚེ་...,when the blessed one had spoken venerable maha...
106863,འཕགས་པ་བཅོམ་ལྡན་འདས་ཀྱི་ཡེ་ཤེས་རྒྱས་པའི་མདོ་སྡ...,this completes the great vehicle sutra the pre...
106864,རྒྱ་གར་གྱི་མཁན་པོ་པྲཛྙ་བར་མ་དང་་ལོཙྪ་བ་བན་དེ་ཡ...,this was translated by the indian preceptor pr...


In [4]:
boTextsAll = df['bo'].tolist()
enTextsAll = df['en'].tolist()

## Tokenizers for Tibetan and English

The code cell below uses Google SentencePiece tokenizer, but we cannot yet figure out how to truncate and pad the tokenizations to the same length, nor the special characters. Save the code but we will not use it for now. 

In [5]:
'''
## Ignore this cell
'''

# Load tokenizers that are already trained
boTokenizer = spm.SentencePieceProcessor(model_file=boTokenizerPath)
enTokenizer = spm.SentencePieceProcessor(model_file=enTokenizerPath)

# Verify for Tibetan
print(boTokenizer.encode(['ངའི་མིང་ལ་བསྟན་སྒྲོལ་མ་ཟེར་'], out_type=str))
print(boTokenizer.encode(['ངའི་མིང་ལ་བསྟན་སྒྲོལ་མ་ཟེར་', 'བཀ྄ྲ་ཤིས་བདེ་ལེགས།'], out_type=int))
print(boTokenizer.decode([4149, 306, 6, 245, 4660, 748]))
print(boTokenizer.decode(['▁ངའི་', 'མིང་', 'ལ་', 'བསྟན་', 'སྒྲོལ་མ་', 'ཟེར་']))
print('Vocab size of Tibetan Tokenizer:', boTokenizer.get_piece_size())

# Verify for English
print(enTokenizer.encode(["My name isn't Tenzin Dolma Gyalpo"], out_type=str))
print(enTokenizer.encode(['My name is Tenzin Dolma Gyalpo', 'Hello'], out_type=int))
print(enTokenizer.decode([[8803, 180, 12, 5519, 15171, 17894], [887, 21491]]))
print('Vocab size of English Tokenizer:', enTokenizer.get_piece_size())

[['▁ངའི་', 'མིང་ལ་', 'བསྟན་', 'སྒྲོལ་མ་', 'ཟེར་']]
[[3644, 18002, 530, 6257, 2154], [4, 3333, 0, 6081, 3, 6750, 1030, 2261, 1961, 0]]
ཆོས་སྟོན་ཏོ་་རང་གི་ལ་ཡོད་པའི་ ཨིན་ཡུལ་དང་ལྡན་པའི་
ངའི་མིང་ལ་བསྟན་སྒྲོལ་མ་ཟེར་
Vocab size of Tibetan Tokenizer: 32000
[['▁My', '▁name', '▁is', 'n', "'", 't', '▁Tenzin', '▁Dolma', '▁Gyalpo']]
[[8803, 180, 12, 5519, 15171, 17894], [887, 21491]]
['My name is Tenzin Dolma Gyalpo', 'Hello']
Vocab size of English Tokenizer: 25000


Instead, we use huggingface tokenizer now. The bad news is we can no longer find the right API. 

In [6]:
boTokenizer = SentencePieceBPETokenizer()
boTokenizer.train([boDataForTokenizerPath], vocab_size = 32000, special_tokens = ['<s>', '<pad>', '</s>', '<unk>', '<mask>'])

enTokenizer = SentencePieceBPETokenizer()
enTokenizer.train([enDataForTokenizerPath], vocab_size = 32000, special_tokens = ['<s>', '<pad>', '</s>', '<unk>', '<mask>'])

print('Tibetan tokenizer vocab size:', boTokenizer.get_vocab_size())
print('English tokenizer vocab size:', enTokenizer.get_vocab_size())

Tibetan tokenizer vocab size: 32000
English tokenizer vocab size: 32000


In [7]:
# Verify for Tibetan
outputs = boTokenizer.encode_batch(['ངའི་མིང་ལ་བསྟན་སྒྲོལ་མ་ཟེར་', 'ངའི་མིང་ལ་བསྟན་སྒྲོལ་མ་ཟེར་', 'བཀ྄ྲ་ཤིས་བདེ་ལེགས།'])

for output in outputs: 
    print('ids:', output.ids)
    print('token:', output.tokens)
    print('mask:', output.attention_mask)

ids: [6068, 753, 8639, 4042, 16876]
token: ['▁ངའི་', 'མིང་', 'ལ་བསྟན་', 'སྒྲོལ་', 'མ་ཟེར་']
mask: [1, 1, 1, 1, 1]
ids: [6068, 753, 8639, 4042, 16876]
token: ['▁ངའི་', 'མིང་', 'ལ་བསྟན་', 'སྒྲོལ་', 'མ་ཟེར་']
mask: [1, 1, 1, 1, 1]
ids: [10148, 1225, 2234, 162]
token: ['▁བཀྲ་ཤིས་', 'བདེ་', 'ལེ', 'གས']
mask: [1, 1, 1, 1]


In [8]:
# Verify for English
outputs = enTokenizer.encode_batch(["My name isn't Tenzin Dolma Gyalpo", 'My name is Tenzin Dolma Gyalpo', 'Hello'])

for output in outputs: 
    print('ids:', output.ids)
    print('token:', output.tokens)
    print('mask:', output.attention_mask)

ids: [20490, 1222, 169, 78, 12, 84, 16640, 27027, 221, 12146, 4043]
token: ['▁My', '▁name', '▁is', 'n', "'", 't', '▁Tenzin', '▁Dol', 'ma', '▁Gyal', 'po']
mask: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
ids: [20490, 1222, 169, 16640, 27027, 221, 12146, 4043]
token: ['▁My', '▁name', '▁is', '▁Tenzin', '▁Dol', 'ma', '▁Gyal', 'po']
mask: [1, 1, 1, 1, 1, 1, 1, 1]
ids: [961, 786, 79]
token: ['▁H', 'ell', 'o']
mask: [1, 1, 1]


## Pytorch `Dataset`

In [9]:
class MyDataset(Dataset): 
    def __init__(self, boTexts, enTexts, boTokenizer, enTokenizer, boMaxLen, enMaxLen): 
        super().__init__()
        self.boTexts = boTexts
        self.enTexts = enTexts
        self.boTokenizer = boTokenizer
        self.enTokenizer = enTokenizer
        
        # Enable padding and truncation 
        self.boTokenizer.enable_padding(length = boMaxLen)
        self.boTokenizer.enable_truncation(max_length = boMaxLen)
        self.enTokenizer.enable_padding(length = enMaxLen)
        self.enTokenizer.enable_truncation(max_length = enMaxLen)
        
    ''' Return the size of dataset '''
    def __len__(self): 
        return len(self.boTexts)
    
    '''
    -- The routine for querying one data entry 
    -- The index of must be specified as an argument
    -- Return a dictionary 
    '''
    def __getitem__(self, idx): 
        # Apply tokenizer
        boOutputs = self.boTokenizer.encode(self.boTexts[idx])
        enOutputs = self.enTokenizer.encode(self.enTexts[idx])
        
        # Get numerical tokens 
        boEncoding = boOutputs.ids
        enEncoding = enOutputs.ids
        
        # Get attention mask 
        boMask = boOutputs.attention_mask
        enMask = enOutputs.attention_mask
        
        return {
            'source_ids': torch.tensor(boEncoding), 
            'source_mask': torch.tensor(boMask), 
            'target_ids': torch.tensor(enEncoding), 
            'target_mask': torch.tensor(enMask)
        }

## Define model class

In [10]:
class T5FineTuner(pl.LightningModule): 
    ''' Part 1: Define the architecture of model in init '''
    def __init__(self, hparams):
        super(T5FineTuner, self).__init__()
        self.model = T5ForConditionalGeneration.from_pretrained(
            hparams['pretrainedModelName'], 
            return_dict = True    # I set return_dict true so that outputs  are presented as dictionaries
        )
        self.boTokenizer = hparams['boTokenizer']
        self.enTokenizer = hparams['enTokenizer']
        self.hparams = hparams
        self.scheduler_is_created = False
        
        
    ''' Part 2: Define the forward propagation '''
    def forward(self, input_ids, attention_mask = None, decoder_input_ids = None, decoder_attention_mask = None, labels = None):  
        return self.model(
            input_ids, 
            attention_mask = attention_mask, 
            decoder_input_ids = decoder_input_ids, 
            decoder_attention_mask = decoder_attention_mask, 
            labels = labels
        )
    
    
    ''' Part 3: Configure optimizer and scheduler '''
    def configure_optimizers(self): 
        # Optimizer
        # I have no idea why to configure parameter this way 
        optimizer_grouped_parameters = [
            {
                # parameter with weight decay 
                'params': [param for name, param in model.named_parameters() if ('bias' not in name and 'LayerNorm.weight' not in name)], 
                'weight_decay': self.hparams['weight_decay'], 
            }, 
            {
                'params': [param for name, param in model.named_parameters() if ('bias' in name or 'LayerNorm.weight' in name)], 
                'weight_decay': 0.0, 
            }
        ]
        
        self.optimizer = AdamW(optimizer_grouped_parameters, lr = self.hparams['learning_rate'])
        
        # Scheduler
        # To create a scheduler with linear decay, we need to manually compute the number of training steps and pass it as an argument for the schduler 
        train_size = int(self.hparams['train_percentage'] * len(boTextsAll))
        batch_size = self.hparams['batch_size']
        num_processor = max(1, self.hparams['num_gpu'])
        num_epoch = self.hparams['num_train_epochs']
        total_training_steps = train_size // (batch_size * num_processor) * num_epoch
        
        # Create a scheduler for adjusting learning rate 
        self.lr_scheduler = get_linear_schedule_with_warmup(
            optimizer = self.optimizer, 
            num_warmup_steps = self.hparams['warmup_steps'], 
            num_training_steps = total_training_steps
        )
        
        self.lr_dict = {
            'scheduler': self.lr_scheduler, # The LR schduler
            'interval': 'step', # The unit of the scheduler's step size
            'frequency': 1, # The frequency of the scheduler
        }
        
        # Do constant rate this time
        return [self.optimizer]# , [self.lr_dict]

    
    ''' Part 4.1: Training logic '''
    def training_step(self, batch, batch_idx):         
        loss = self._step(batch)
        self.log('train_loss', loss)
        # For monitoring purpose, log learning rate 
        for param_group in self.optimizer.param_groups:
            if param_group['lr']:
                self.log('learning_rate*e-4', param_group['lr'] * 1e4)
        return loss
    
    
    def _step(self, batch): 
        labels = batch['target_ids'] 
        labels[labels[:, ] == 0] = -100    # Change the pad id from 0 to -100, but I do not know why the example chooses to do so. I will comment it out for now
        
        outputs = self(
            input_ids = batch['source_ids'], 
            attention_mask = batch['source_mask'], 
            labels = labels, 
            decoder_attention_mask = batch['target_mask']
        )
        
        return outputs.loss

    
    ''' Part 4.2: Validation logic '''
    def validation_step(self, batch, batch_idx):        
        loss = self._step(batch)
        self.log('val_loss', loss)
        
        
    ''' Part 4.3: Test logic '''
    def test_step(self, batch, batch_idx): 
        loss = self._step(batch)
        self.log('test_loss', loss)
    
    
    ''' Part 5: Data loaders '''
    def _get_dataloader(self, start_idx, end_idx): 
        dataset = MyDataset(
            boTexts = boTextsAll[start_idx:end_idx], 
            enTexts = enTextsAll[start_idx:end_idx], 
            boTokenizer = self.hparams['boTokenizer'], 
            enTokenizer = self.hparams['enTokenizer'], 
            boMaxLen = self.hparams['max_input_len'], 
            enMaxLen = self.hparams['max_output_len']
        )
        
        return DataLoader(dataset, batch_size = hparams['batch_size'])
    
    
    def train_dataloader(self): 
        start_idx = 0
        end_idx = int(self.hparams['train_percentage'] * len(boTextsAll))
        return self._get_dataloader(start_idx, end_idx)
    
    
    def val_dataloader(self): 
        start_idx = int(self.hparams['train_percentage'] * len(boTextsAll))
        end_idx = int((self.hparams['train_percentage'] + self.hparams['val_percentage']) * len(boTextsAll))
        return self._get_dataloader(start_idx, end_idx)
    
    
    def test_dataloader(self): 
        start_idx = int((self.hparams['train_percentage'] + self.hparams['val_percentage']) * len(boTextsAll))
        end_idx = len(boTextsAll)
        return self._get_dataloader(start_idx, end_idx)

In [11]:
hparams = {
    'boTokenizer': boTokenizer,
    'enTokenizer': enTokenizer,
    'pretrainedModelName': 't5-small', 
    'train_percentage': 0.95, 
    'val_percentage': 0.04, 
    'learning_rate': 1e-4, 
    'max_input_len': 100, 
    'max_output_len': 100, 
    'batch_size': 8, 
    'num_train_epochs': 10, 
    'num_gpu': 1, 
    'weight_decay': 0, 
    'warmup_steps': 0,  # For scheduler 
}

## Training

In [12]:
torch.cuda.empty_cache()

train_params = dict(
    gpus = hparams['num_gpu'], 
    max_epochs = hparams['num_train_epochs'], 
    progress_bar_refresh_rate = 20, 
)

model = T5FineTuner(hparams)

trainer = pl.Trainer(**train_params)

trainer.fit(model)

# Save model for later use
now = datetime.now()
trainer.save_checkpoint('04_t5simple_bo_en_' + now.strftime("%Y-%m-%d--%H=%M=%S") + '.ckpt')

trainer.test()

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 60 M  


HBox(children=(HTML(value='Validation sanity check'), FloatProgress(value=1.0, bar_style='info', layout=Layout…

HBox(children=(HTML(value='Training'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), max…

HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…




HBox(children=(HTML(value='Testing'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), max=…

--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'learning_rate*e-4': 1.0,
 'test_loss': tensor(5.2341, device='cuda:0'),
 'train_loss': tensor(5.3494, device='cuda:0'),
 'val_loss': tensor(5.3414, device='cuda:0')}
--------------------------------------------------------------------------------



[{'train_loss': 5.3494038581848145,
  'learning_rate*e-4': 1.0,
  'val_loss': 5.341426372528076,
  'test_loss': 5.234098434448242}]

## Testing

In [13]:
# Load a previously saved model

torch.cuda.empty_cache()

modelLoaded = T5FineTuner.load_from_checkpoint(checkpoint_path='__04_t5simple_bo_en_2020-12-15--07=12=57.ckpt').to(device)

In [14]:
start_idx = 0
end_idx = 8

testset = MyDataset(
    boTexts = boTextsAll[start_idx:end_idx], 
    enTexts = enTextsAll[start_idx:end_idx], 
    boTokenizer = hparams['boTokenizer'], 
    enTokenizer = hparams['enTokenizer'], 
    boMaxLen = hparams['max_input_len'], 
    enMaxLen = hparams['max_output_len']
)

test_dataloader = DataLoader(testset, batch_size = hparams['batch_size'])
testit = iter(test_dataloader)

# Take one batch from testset 
batch = next(testit)

# Generate target ids
outs = modelLoaded.model.generate(
    batch['source_ids'].cuda(), 
    attention_mask = batch['source_mask'].cuda(), 
    use_cache = True, 
    decoder_attention_mask = batch['target_mask'], 
    max_length = hparams['max_output_len'], 
    num_beams = 4, 
    repetition_penalty = 2.5, 
    length_penalty = 0.6, 
    early_stopping = True, 
)

pred_texts = [enTokenizer.decode(ids) for ids in outs.tolist()]
source_texts = [boTokenizer.decode(ids) for ids in batch['source_ids'].tolist()]
target_texts = [enTokenizer.decode(ids) for ids in batch['target_ids'].tolist()]

for i in range(len(pred_texts)): 
    lines = textwrap.wrap("Tibetan Text:\n%s\n" % source_texts[i], width=100)
    print("\n".join(lines))
    print("\nActual translation: %s" % target_texts[i])
    print("\nPredicted translation: %s" % pred_texts[i])
    print('=' * 50 + '\n')

Tibetan Text: རྒྱལ་པོ་ཞེས་བྱ་བས་རྒྱལ་སྲིད་འབྱོར་པ་རྒྱས་པ་བདེ་བ་ལོ་ལེགས་པ་སྐྱེ་བོ་དང་མི་མང་པོས་གང་བ་བ
ྱེད་དུ་བཅུག་གོ་

Actual translation: under his rule the kingdom prospered and thrived crops were bountiful and the land teemed with animals and people

Predicted translation: monks in this way the thusgone one correctly understands the knowledge of the path that leads to cessation as related to knowledge of what is impossible to be a tathagata an arhat a totally and completely awakened buddha possessed of insight and perfect conduct a sugata a knower of the world a tamer of persons a charioteer an unsurpassed one a teacher of humans and gods a teacher of humans and gods a teacher of humans and gods a teacher of humans and gods a teacher of humans and gods a teacher of humans and gods a teacher of

Tibetan Text: དེས་དཔུང་གི་ཚོགས་ཡན་ལག་བཞི་པ་གླང་པོ་ཆེ་པའི་ཚོགས་དང་རྟ་པའི་ཚོགས་དང་ཤིང་རྟ་པའི་ཚོགས་དང་ད
པུང་བུ་ཆུང་གི་ཚོགས་གོ་བསྐོན་ཏེ་ཡུལ་མ་ག་དཧའི་རྒྱལ་པོའི་ཁ་བ་མ་གཏོགས་པ་བཅོམ་ནས་ཕྱིར་ལྡོག་པར་བྱ

In [None]:
# %tensorboard --logdir lightning_logs/