# Build a transformer from scratch

Link for tutorial: https://lionbridge.ai/articles/transformers-in-nlp-creating-a-translator-model-from-scratch/

In [230]:
import torch
from torch import nn, Tensor
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader
import sentencepiece as spm
import pandas as pd
from typing import Optional
import math

# Device selection. 
# FORCE_CPU pins the notebook to the CPU even when CUDA is available (the GPU 
# is training another model for now). Set it to False to use 'cuda:0' whenever 
# CUDA is available. 
FORCE_CPU = True

if torch.cuda.is_available() and not FORCE_CPU: 
    device = torch.device('cuda:0')
else: 
    device = torch.device('cpu')
print(f'device = {device}')

device = cpu


In [231]:
# Parallel corpus files: the loading cell reads them in lockstep and pairs line i of one with line i of the other
boDataPath = '../data/train.bo'
enDataPath = '../data/train.en'

# SentencePiece model files of the already-trained tokenizers, one per language
boTokenizerPath = '../preProcessing/bo.model'
enTokenizerPath = '../preProcessing/en.model'

## Load data 


In [232]:
# Read the two corpus files in lockstep: line i of each file forms one 
# [bo, en] sentence pair. The `with` statement closes both handles when the 
# loop ends, including when it ends with an exception. 
dataMatrix = []

with open(boDataPath, 'r', encoding = 'utf-8') as boFile, \
     open(enDataPath, 'r', encoding = 'utf-8') as enFile: 
    for boLine, enLine in zip(boFile, enFile): 
        boLine = boLine.strip()
        enLine = enLine.strip()
        # Reading stops at the first blank line on either side (or at the end of the shorter file)
        if not boLine or not enLine: 
            break 
        dataMatrix.append([boLine, enLine])
  
# Create pandas dataframe 
df = pd.DataFrame(dataMatrix, columns = ['bo', 'en'])
df

Unnamed: 0,bo,en
0,རྒྱལ་པོ་ཞེས་བྱ་བས་རྒྱལ་སྲིད་འབྱོར་པ་རྒྱས་པ་བདེ...,under his rule the kingdom prospered and thriv...
1,དེས་དཔུང་གི་ཚོགས་ཡན་ལག་བཞི་པ་གླང་པོ་ཆེ་པའི་ཚོག...,he called up the four branches of his armed fo...
2,སུམ་ཅུ་རྩ་གསུམ་པའི་ལྷ་རྣམས་ཀྱི་ཁ་དོག་གི་མཐུ་བས...,bathed in a vast light more luminous than the ...
3,མ་མ་བརྒྱད་པོ་པང་ན་འཚོ་བའི་མ་མ་གཉིས་དང་ནུ་མ་སྣུ...,was entrusted to eight nursemaids two to cuddl...
4,རྒྱལ་པོ་རྒྱལ་རིགས་སྤྱི་བོར་དབང་བསྐུར་བ་ལྗོངས་ཀ...,he trained in and mastered those arts and skil...
...,...,...
106861,མད་གལ་གྱི་བུ་དེ་བཞིན་གཤེགས་པ་དགྲ་བཅོམ་པ་ཡང་དག་...,maudgalyayana the thusgone worthy perfect budd...
106862,བཅོམ་ལྡན་འདས་ཀྱིས་དེ་སྐད་ཅེས་བཀའ་སྩལ་པ་དང་་ཚེ་...,when the blessed one had spoken venerable maha...
106863,འཕགས་པ་བཅོམ་ལྡན་འདས་ཀྱི་ཡེ་ཤེས་རྒྱས་པའི་མདོ་སྡ...,this completes the great vehicle sutra the pre...
106864,རྒྱ་གར་གྱི་མཁན་པོ་པྲཛྙ་བར་མ་དང་་ལོཙྪ་བ་བན་དེ་ཡ...,this was translated by the indian preceptor pr...


In [233]:
# One plain Python list of sentences per language column of `df`
boTextsAll, enTextsAll = (df[column].tolist() for column in ('bo', 'en'))

## Tokenizers for Tibetan and English

The code cell below loads the pre-trained Google SentencePiece tokenizers and runs a quick encode/decode check on each of them.

In [234]:
# Load tokenizers that are already trained
boTokenizer = spm.SentencePieceProcessor(model_file=boTokenizerPath)
enTokenizer = spm.SentencePieceProcessor(model_file=enTokenizerPath)

# Verify for Tibetan
print(boTokenizer.encode(['ངའི་མིང་ལ་བསྟན་སྒྲོལ་མ་ཟེར་'], out_type=str))
print(boTokenizer.encode(['ངའི་མིང་ལ་བསྟན་སྒྲོལ་མ་ཟེར་', 'བཀ྄ྲ་ཤིས་བདེ་ལེགས།'], out_type=int))
print(boTokenizer.decode([4149, 306, 6, 245, 4660, 748]))
print(boTokenizer.decode(['▁ངའི་', 'མིང་', 'ལ་', 'བསྟན་', 'སྒྲོལ་མ་', 'ཟེར་']))
print('Vocab size of Tibetan Tokenizer:', boTokenizer.get_piece_size())

# Verify for English
print(enTokenizer.encode(["My name isn't Tenzin Dolma Gyalpo"], out_type=str))
print(enTokenizer.encode(['My name is Tenzin Dolma Gyalpo', 'Hello'], out_type=int))
print(enTokenizer.decode([[8804, 181, 13, 5520, 15172, 17895], [888, 21492]]))
print('Vocab size of English Tokenizer:', enTokenizer.get_piece_size())

[['▁ངའི་', 'མིང་ལ་', 'བསྟན་', 'སྒྲོལ་མ་', 'ཟེར་']]
[[3645, 18003, 531, 6258, 2155], [5, 3334, 0, 6082, 4, 6751, 1031, 2262, 1962, 0]]
བྲག་སྐུ་དང་ དེའི་ཚེ་མུ་སྟེགས་ཅན་ལོངས་སྤྱོད་
ངའི་མིང་ལ་བསྟན་སྒྲོལ་མ་ཟེར་
Vocab size of Tibetan Tokenizer: 32000
[['▁My', '▁name', '▁is', 'n', "'", 't', '▁Tenzin', '▁Dolma', '▁Gyalpo']]
[[8804, 181, 13, 5520, 15172, 17895], [888, 21492]]
['My name is Tenzin Dolma Gyalpo', 'Hello']
Vocab size of English Tokenizer: 25000


We need to get the ids for our special tokens `<s>`, `</s>`, `<pad>`. 

In [235]:
# Ids of the special tokens in each vocabulary, looked up in the order <s>, </s>, <pad>
bo_bos_id, bo_eos_id, bo_pad_id = (
    boTokenizer.piece_to_id(piece) for piece in ('<s>', '</s>', '<pad>')
)
en_bos_id, en_eos_id, en_pad_id = (
    enTokenizer.piece_to_id(piece) for piece in ('<s>', '</s>', '<pad>')
)

print(bo_bos_id, bo_eos_id, bo_pad_id, en_bos_id, en_eos_id, en_pad_id)

1 2 3 1 2 3


All tokenized sentences must have the same length, so we define several helper functions for truncation and padding.

In [236]:
def truncate(sentvec, maxlen, enable_bos_eos, **kwargs): 
    '''
    Truncate a sentence vector to at most maxlen ids by deleting the trailing ids. 
    Args
    -- sentvec. List. Vector of tokenization of a sentence. It is never modified 
    -- maxlen. Int. The max length of the returned vector, <s> and </s> included. 
       Must be >= 2 when enable_bos_eos is True and >= 0 otherwise 
    -- enable_bos_eos. Bool. Indicate whether to wrap a sentence with <s> and </s> 
    -- kwargs['bos_id']. Int. The id for <s>. Only read when enable_bos_eos is True 
    -- kwargs['eos_id']. Int. The id for </s>. Only read when enable_bos_eos is True 
    Returns
    -- A new list with at most maxlen ids 
    Raises
    -- ValueError if maxlen is too small to hold the result 
    -- KeyError if enable_bos_eos is True and bos_id or eos_id is not given 
    '''
    
    ## For a transformer model, the target sentences have to be wrapped by <s> and </s>, but the source sentences don't have to 
    
    # A negative slice bound would drop ids from the END of the sentence and 
    # return more than maxlen ids, so reject such values up front 
    min_len = 2 if enable_bos_eos else 0
    if maxlen < min_len: 
        raise ValueError(f'maxlen must be >= {min_len}, got {maxlen}')
    
    if not enable_bos_eos: 
        return list(sentvec[:maxlen])
    
    bos_id = kwargs['bos_id']
    eos_id = kwargs['eos_id']
    
    # Two positions are reserved for <s></s> 
    return [bos_id] + list(sentvec[:maxlen - 2]) + [eos_id]

In [237]:
def pad_and_get_attention_mask(sentvec, maxlen, pad_id): 
    ''' 
    Right-pad a sentence with pad_id up to maxlen. 
    Return the tuple (ids, attention mask); the mask holds 1 for every original id and 0 for every <pad>. 
    A sentence that already has maxlen or more ids is returned as is, with an all-ones mask. 
    '''
    
    n_tokens = len(sentvec)
    n_pad = maxlen - n_tokens
    
    # Nothing to add: hand back the very same list 
    if n_pad <= 0: 
        return sentvec, [1] * n_tokens
    
    padded = sentvec + [pad_id] * n_pad
    mask = [1] * n_tokens + [0] * n_pad
    return padded, mask

In [238]:
def trim(sentvec, maxlen, pad_id, enable_bos_eos, **kwargs): 
    '''Cut a sentence down with truncate(), then pad the result up to maxlen. Return the tuple (ids, attention mask)'''
    
    truncated = truncate(sentvec, maxlen, enable_bos_eos, **kwargs)
    return pad_and_get_attention_mask(truncated, maxlen, pad_id)

Show some examples to verify that our `trim()` function works. 

In [239]:
# Case 1: sentence longer than maxlen, no <s></s>. The trailing ids are cut and no padding is added
ids, attention_mask = trim([100, 200, 300, 400, 500], maxlen = 4, pad_id = en_pad_id, enable_bos_eos = False)
print(ids, attention_mask)

[100, 200, 300, 400] [1, 1, 1, 1]


In [240]:
# Case 2: sentence shorter than maxlen, no <s></s>. The ids are kept and <pad> fills the remaining positions (mask 0 there)
ids, attention_mask = trim([100, 200, 300, 400, 500], maxlen = 9, pad_id = en_pad_id, enable_bos_eos = False)
print(ids, attention_mask)

[100, 200, 300, 400, 500, 3, 3, 3, 3] [1, 1, 1, 1, 1, 0, 0, 0, 0]


In [241]:
# Case 3: sentence longer than maxlen, wrapped by <s></s>. Only maxlen - 2 ids survive because two positions are reserved
ids, attention_mask = trim([100, 200, 300, 400, 500], maxlen = 4, pad_id = en_pad_id, enable_bos_eos = True, bos_id = en_bos_id, eos_id = en_eos_id)
print(ids, attention_mask)

[1, 100, 200, 2] [1, 1, 1, 1]


In [242]:
# Case 4: sentence shorter than maxlen, wrapped by <s></s>. <pad> is appended after </s>
ids, attention_mask = trim([100, 200, 300, 400, 500], maxlen = 9, pad_id = en_pad_id, enable_bos_eos = True, bos_id = en_bos_id, eos_id = en_eos_id)
print(ids, attention_mask)

[1, 100, 200, 300, 400, 500, 2, 3, 3] [1, 1, 1, 1, 1, 1, 1, 0, 0]


## Define `Dataset` and `DataLoader`

In [247]:
class MyDataset(Dataset): 
    '''
    Parallel dataset of Tibetan (source) and English (target) sentences. 
    Each entry is tokenized on access, then truncated and padded with trim(). 
    Source sentences are not wrapped by <s></s>; target sentences are. 
    Args
    -- boTexts. List of str. Source sentences 
    -- enTexts. List of str. Target sentences, aligned with boTexts by index 
    -- boTokenizer. Tokenizer for the source sentences 
    -- enTokenizer. Tokenizer for the target sentences 
    -- boMaxLen. Int. Length of every source id vector 
    -- enMaxLen. Int. Length of every target id vector, <s> and </s> included 
    Raises
    -- ValueError if boTexts and enTexts do not have the same length 
    '''
    def __init__(self, boTexts, enTexts, boTokenizer, enTokenizer, boMaxLen, enMaxLen): 
        super().__init__()
        
        # Entry i of one list is the translation of entry i of the other, 
        # so lists of different sizes cannot be paired up 
        if len(boTexts) != len(enTexts): 
            raise ValueError(
                f'boTexts and enTexts must have the same length, got {len(boTexts)} and {len(enTexts)}'
            )
        
        self.boTexts = boTexts
        self.enTexts = enTexts
        self.boTokenizer = boTokenizer
        self.enTokenizer = enTokenizer
        self.boMaxLen = boMaxLen
        self.enMaxLen = enMaxLen
        
        # Special-token ids are read from the tokenizers given to this instance, 
        # so the ids always belong to the vocabularies that produce the token ids 
        self.boPadId = boTokenizer.piece_to_id('<pad>')
        self.enPadId = enTokenizer.piece_to_id('<pad>')
        self.enBosId = enTokenizer.piece_to_id('<s>')
        self.enEosId = enTokenizer.piece_to_id('</s>')
        
    def __len__(self): 
        ''' Return the size of dataset '''
        return len(self.boTexts)
    
    def __getitem__(self, idx): 
        '''
        -- The routine for querying one data entry 
        -- The index of the entry must be specified as an argument
        -- Return a dictionary with the tensors source_ids, source_mask, target_ids and target_mask 
        '''
        # Apply tokenizer
        boOutputs = self.boTokenizer.encode(self.boTexts[idx])
        enOutputs = self.enTokenizer.encode(self.enTexts[idx])
        
        # Truncation and padding 
        boIds, boMask = trim(
            boOutputs, 
            maxlen = self.boMaxLen, 
            pad_id = self.boPadId, 
            enable_bos_eos = False
        )
        
        enIds, enMask = trim(
            enOutputs, 
            maxlen = self.enMaxLen, 
            pad_id = self.enPadId, 
            enable_bos_eos = True, 
            bos_id = self.enBosId, 
            eos_id = self.enEosId
        )
        
        return {
            'source_ids': torch.tensor(boIds), 
            'source_mask': torch.tensor(boMask), 
            'target_ids': torch.tensor(enIds), 
            'target_mask': torch.tensor(enMask)
        }

In [None]:
def get_dataloader(start_idx, end_idx, batch_size, shuffle = False, boMaxLen = 100, enMaxLen = 100): 
    '''
    Build a DataLoader over the sentence pairs whose index lies in [start_idx, end_idx). 
    Args
    -- start_idx. Int. Index of the first sentence pair (inclusive) 
    -- end_idx. Int. Index after the last sentence pair (exclusive) 
    -- batch_size. Int. Number of sentence pairs per batch 
    -- shuffle. Bool. Whether to reshuffle the pairs at every epoch 
    -- boMaxLen. Int. Length of every source id vector 
    -- enMaxLen. Int. Length of every target id vector, <s> and </s> included 
    Returns
    -- A DataLoader that yields the dictionaries built by MyDataset, batched 
    '''
    # Inclusive, exclusive 
    # NOTE(review): the default max lengths equal hparams['max_len']; longer 
    # sequences have no precomputed row in PositionalEncoding -- confirm the intended values 
    dataset = MyDataset(
        boTextsAll[start_idx:end_idx], 
        enTextsAll[start_idx:end_idx], 
        boTokenizer, 
        enTokenizer, 
        boMaxLen, 
        enMaxLen
    )
    
    # NOTE(review): default collation stacks the per-sentence tensors along a new 
    # first dimension, while MyTransformer.forward compares size(1) of src and tgt 
    # as the batch size -- transpose the id tensors before feeding the model 
    return DataLoader(dataset, batch_size = batch_size, shuffle = shuffle)
    
    

## Define model class

In [243]:
class PositionalEncoding(nn.Module): 
    '''
    Add a fixed sinusoidal position signal to a sequence of embeddings, then apply dropout. 
    The table `pe` is computed once in the constructor and registered as a buffer, 
    so it follows the module across devices but is not a trainable parameter. 
    Args
    -- hparams['dropout']. Float. Dropout probability applied to the output 
    -- hparams['d_model']. Int. Size of the last dimension of the input. May be even or odd 
    -- hparams['max_len']. Int. Number of positions stored in the table 
    '''
    def __init__(self, hparams): 
        super().__init__()
        self.dropout = nn.Dropout(p = hparams['dropout'])
        self.d_model = hparams['d_model']
        max_len = hparams['max_len']
        
        # pe[pos, i] holds the encoding of position pos in dimension i 
        pe = torch.zeros(max_len, self.d_model)
        position = torch.arange(0, max_len).unsqueeze(1)    # Column vector (max_len, 1)
        # One frequency per pair of dimensions: 10000^(-2k / d_model), computed through exp/log 
        div_term = torch.exp(
            torch.arange(0, self.d_model, 2).float() * (
                -math.log(10000.0) / self.d_model
            )
        )
        pe[:, 0::2] = torch.sin(position * div_term)    # even dimensions
        # For odd d_model there is one fewer odd dimension than even ones, so drop the last frequency 
        pe[:, 1::2] = torch.cos(position * div_term[:self.d_model // 2])    # odd dimensions
        # (max_len, d_model) -> (max_len, 1, d_model): the middle axis of size 1 broadcasts in forward() 
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)
        
    def forward(self, x): 
        '''
        Scale x by sqrt(d_model) and add the encoding of positions 0 .. x.size(0) - 1. 
        Dimension 0 of x indexes the position; the last dimension must have size d_model. 
        Raises RuntimeError if x has more positions than the table holds. 
        '''
        seq_len = x.size(0)
        if seq_len > self.pe.size(0): 
            raise RuntimeError(
                f'Sequence length {seq_len} exceeds max_len {self.pe.size(0)} of the positional encoding'
            )
        
        # The input is multiplied by sqrt(d_model) before the position signal is added 
        x = x * math.sqrt(self.d_model)
        x = x + self.pe[:seq_len, :]
        return self.dropout(x)


In [244]:
class MyTransformer(nn.Module): 
    '''
    Encoder-decoder transformer assembled from the torch.nn building blocks. 
    Source and target ids go through separate embeddings and one shared PositionalEncoding; 
    a final linear layer maps the decoder output to scores over the target vocabulary. 
    All settings are read from the `hparams` dictionary. 
    '''
    def __init__(self, hparams) -> None: 
        super().__init__()
        
        width = hparams['d_model']
        heads = hparams['nhead']
        
        # Settings shared by every encoder layer and every decoder layer 
        layer_kwargs = dict(
            d_model = width, 
            nhead = heads, 
            dim_feedforward = hparams['dim_feedforward'], 
            dropout = hparams['dropout'], 
            activation = hparams['activation'], 
        )
        
        # Source side: embedding + positional encoding + encoder stack with a final LayerNorm 
        self.source_embedding = nn.Embedding(hparams['source_vocab_length'], width)
        self.pos_encoder = PositionalEncoding(hparams)
        self.encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(**layer_kwargs), 
            num_layers = hparams['num_encoder_layers'], 
            norm = nn.LayerNorm(width), 
        )
        
        # Target side: embedding + decoder stack with a final LayerNorm 
        self.target_embedding = nn.Embedding(hparams['target_vocab_length'], width)
        self.decoder = nn.TransformerDecoder(
            nn.TransformerDecoderLayer(**layer_kwargs), 
            num_layers = hparams['num_decoder_layers'], 
            norm = nn.LayerNorm(width), 
        )
        
        # Projection from the model width to one score per target vocabulary entry 
        self.out = nn.Linear(width, hparams['target_vocab_length'])
        
        self._reset_parameters()
        self.d_model = width
        self.nhead = heads
        
        
    def forward(self, src: Tensor, tgt: Tensor,
                src_mask: Optional[Tensor] = None, 
                tgt_mask: Optional[Tensor] = None, 
                memory_mask: Optional[Tensor] = None, 
                src_key_padding_mask: Optional[Tensor] = None, 
                tgt_key_padding_mask: Optional[Tensor] = None, 
                memory_key_padding_mask: Optional[Tensor] = None
               ) -> Tensor: 
        '''
        Encode src, decode tgt against the encoder memory and return the output of the final linear layer. 
        Dimension 1 of src and tgt is compared as the batch size. 
        The optional masks are passed through unchanged to the encoder and the decoder. 
        Raises RuntimeError if src and tgt disagree on dimension 1. 
        '''
        if src.size(1) != tgt.size(1): 
            raise RuntimeError('The batch number of src and tgt must be equal')
            
        src_features = self.pos_encoder(self.source_embedding(src))
        memory = self.encoder(
            src_features, 
            mask = src_mask, 
            src_key_padding_mask = src_key_padding_mask
        )
        
        tgt_features = self.pos_encoder(self.target_embedding(tgt))
        decoded = self.decoder(
            tgt_features, 
            memory, 
            tgt_mask = tgt_mask, 
            memory_mask = memory_mask, 
            tgt_key_padding_mask = tgt_key_padding_mask, 
            memory_key_padding_mask = memory_key_padding_mask
        )
        return self.out(decoded)
        
    
    def _reset_parameters(self): 
        r'''Apply Xavier-uniform initialisation to every parameter with more than one dimension'''
        for weight in self.parameters(): 
            # Parameters with a single dimension keep the values their own layer gave them 
            if weight.dim() <= 1: 
                continue
            torch.nn.init.xavier_uniform_(weight)

In [245]:
# Hyperparameters read by PositionalEncoding and MyTransformer 
hparams = {
    'd_model': 512,    # Width of the embeddings and of every encoder / decoder layer 
    'dropout': 0.1, 
    'max_len': 100,    # Number of positions precomputed by PositionalEncoding 
    'nhead': 8,    # Passed as `nhead` to every encoder / decoder layer 
    'num_encoder_layers': 6, 
    'num_decoder_layers': 6, 
    'dim_feedforward': 2048, 
    'activation': 'relu', 
    'source_vocab_length': boTokenizer.get_piece_size(),    # Consider increase
    'target_vocab_length': enTokenizer.get_piece_size(),    # Consider increase 
}

## Instantiate model and optimizer

In [246]:
# Build the network, move it to the selected device, then hand its parameters to Adam 
model = MyTransformer(hparams)
model.to(device)

optim = torch.optim.Adam(
    params = model.parameters(), 
    lr = 1e-4, 
    betas = (0.9, 0.98), 
    eps = 1e-9, 
)

## Define the training routine

In [None]:
def train(train_iter, val_iter, model, optim, num_epochs, use_gpu = True): 
    train_losses = []
    val_losses = []
    
    for epoch in range(num_epochs): 
        train_loss = 0
        val_loss = 0
        
        # Flip to train mode 
        model.train()
        