# Import Torch functions and tokenizers

In [6]:
import torch
from torch import Tensor, nn
from torch.nn import TransformerEncoder, TransformerEncoderLayer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import dataset
from torch.utils.tensorboard import SummaryWriter

import regex as re
import os
import time
from tqdm import tqdm
import copy
import math

from model import TransformerModel
from utils import preProcessText, getTokenizer,try_gpu , word_piece_decoder, word_piece_encoder
from config import getConfig

import pickle

In [7]:
model_config, app_config = getConfig(small = True)
print(model_config)
print(app_config)

bptt=model_config["bptt"]
device = try_gpu(0)

{'emsize': 300, 'd_hid': 800, 'nlayers': 4, 'nhead': 4, 'dropout': 0.05, 'bptt': 32}
{'logs': 'tensorboard_logs', 'epochs': 10}


# Preprocessing Text

In [27]:
file_path = 'data/preprocessed_word_piece.txt'
if not os.path.exists(file_path):
    with open('data/ne_dedup.txt', 'r', encoding='utf-8') as f:
        text = f.read()
        print("Preprocessing file")
        text = preProcessText(text,tokenizer_type = 'word_piece')
    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(text)
else:
    print(f"Reading file  : {file_path}")
    with open(file_path, 'r', encoding='utf-8') as f:
        text = f.read()

Reading file  : data/preprocessed_word_piece.txt


In [10]:
def ret_t(text):
    text = re.sub(r'\u094D','a',text)
    text = re.sub(r'\u0941','u',text)
    text = re.sub(r'\u0942','e',text)
    text = re.sub(r'\u0901','i',text)
    text = re.sub(r'\u0902','c',text)
    text = re.sub(r'\u0943','r',text)
    text = re.sub(r'\u0947','l',text)
    text = re.sub(r'\u094b','o',text)
    text = re.sub(r'\u094c','p',text)
    text = re.sub(r'\u0948','k',text)
    
    return text

In [9]:
len(text.split('\n'))

319566

In [28]:
train_split = 10000

train_iter_first = text.split('\n')[:train_split]
test_iter = text.split('\n')[train_split:11000]

In [29]:
tokenizer,vocab = getTokenizer(tokenizer_type = 'word_piece')

In [30]:
# vocab = tokenizer.get_vocab()
len(vocab)

30000

In [31]:

# Try the encoder and decoder
l = tokenizer.encode(word_piece_encoder('महानायक राजेश हमाल अहिले चलचित्र क्षेत्रमा पातलिए ।')).tokens
l_ = tokenizer.encode(word_piece_encoder('महानायक राजेश हमाल अहिले चलचित्र क्षेत्रमा पातलिए ।')).ids

print("Encoded $tring: ",l)
print("Decoded $tring:",word_piece_decoder(tokenizer.decode(l_)))


Encoded $tring:  ['महान', '##ा', '##यक', 'राजlश', 'हमाल', 'अहिलl', 'चलचितaर', 'कaषlतaरमा', 'पात', '##लिए', '।']
Decoded $tring: महानायक राजेश हमाल अहिले चलचित्र क्षेत्रमा पातलिए ।


In [32]:

# Try the encoder and decoder
l = tokenizer.encode(word_piece_encoder('हातमा त्रिशुल जटा मुकुट शुशोभीत ब्रम्हा उत्पति हुनु ।')).tokens
l_ = tokenizer.encode(word_piece_encoder('हातमा त्रिशुल जटा मुकुट शुशोभीत ब्रम्हा उत्पति हुनु ।')).ids

print("Encoded $tring: ",l)
print("Encoded id$: ",l_)
print("Decoded $tring:",word_piece_decoder(tokenizer.decode(l_)))


Encoded $tring:  ['हातमा', '[UNK]', 'ज', '##टा', '[UNK]', '[UNK]', 'बaरमaहा', 'उतa', '##पति', '[UNK]', '।']
Encoded id$:  [4308, 1, 42, 307, 1, 1, 27723, 475, 826, 1, 77]
Decoded $tring: हातमा जटा ब्रम्हा उत्पति ।


#  some utility functions

In [15]:
def split_list(l):
    splitted_list = []
    z = 0
    for i,idx in enumerate(l):
        if idx == 220:
            splitted_list.append(l[z:i])
            z = i+1
    if z <= len(l)-1:
        splitted_list.append(l[z:])
    return splitted_list

def splits_to_token(splited_list):
    strings = [tokenizer.decode(l) for l in splited_list]
    
    return strings

# print(tokenizer.encode(' ').ids)

word_piece_decoder(tokenizer.decode([978,
 261,
 264,
624,
 261,
 263]))

k = split_list(l_)

In [16]:
splits_to_token(k)

['हातमा जटा बaरमaहा उतaपति ।']

In [33]:
def data_process(raw_text_iter: dataset.IterableDataset) -> Tensor:
    """Converts raw text into a flat Tensor."""
    data = [torch.tensor(tokenizer.encode(word_piece_encoder(item)).ids, dtype=torch.long)
            for item in raw_text_iter]
    return torch.cat(tuple(filter(lambda t: t.numel() > 0, data)))


def batchify(data: Tensor, bsz: int) -> Tensor:
    """Divides the data into bsz separate sequences, removing extra elements
    that wouldn't cleanly fit.
    Args:
        data: Tensor, shape [N]
        bsz: int, batch size
    Returns:
        Tensor of shape [N // bsz, bsz]
    """
    seq_len = data.size(0) // bsz
    data = data[:seq_len * bsz]
#     data = data.view(bsz, seq_len).t().contiguous()
    data = data.view(bsz,seq_len).t()
#     return data.to(device)
    return data


seq_length = 128
import math


def get_batch(source: Tensor, i: int) -> tuple[Tensor, Tensor]:
    """
    Args:
        source: Tensor, shape [full_seq_len, batch_size]
        i: int
    Returns:
        tuple (data, target), where data has shape [seq_len, batch_size] and
        target has shape [seq_len * batch_size]
    """
    seq_len = min(seq_length, len(source) - 1 - i)
    data = source[i:i+seq_len]
    target = source[i+1:i+1+seq_len].reshape(-1)
    #target = source[i+1:i+1+seq_len]
    return data, target

In [34]:
#Train and Test Split
train_data = data_process(train_iter_first)
test_data = data_process(test_iter)

In [35]:
len(train_data)

2733762

In [36]:
# train_data[:400]

In [37]:

# print(len(train_iter_second),len(test_iter))
torch.cuda.empty_cache() 

torch.cuda.memory_allocated() 

0

# Working with a dummy Sample

In [38]:
#Sample Data


text = ['आधिकारिक निर्णयको कारणले , वाणिज्य बिभागले , संयुक्त राज्य अमेरिकी समुद्री पानी निर्माताद्वारा संयुक्त राज्य']
#text = ['जनसंख्या']
sample_data = data_process(
    text)

In [39]:
sample_data

tensor([ 5522, 12438,  2404,     1,  3124, 13185,   177,     1,     1,   964,
         1956,     1,  1373,  5110,  1922,     1,   964])

In [40]:
sample_data = batchify(sample_data, 2)
print("Given word:", text[0])
sample_data

Given word: आधिकारिक निर्णयको कारणले , वाणिज्य बिभागले , संयुक्त राज्य अमेरिकी समुद्री पानी निर्माताद्वारा संयुक्त राज्य


tensor([[ 5522,     1],
        [12438,   964],
        [ 2404,  1956],
        [    1,     1],
        [ 3124,  1373],
        [13185,  5110],
        [  177,  1922],
        [    1,     1]])

# Model Definition

In [51]:

batched_train_data = batchify(train_data, bptt).to(device)  # shape [seq_len, batch_size]
batched_test_data = batchify(test_data, bptt).to(device)

In [52]:
def get_model(model_config, ntokens):
    emsize = model_config["emsize"]
    d_hid = model_config["d_hid"]
    nlayers = model_config["nlayers"]
    nhead = model_config["nhead"]
    dropout = model_config["dropout"]
    model = TransformerModel(ntokens, emsize,nhead, d_hid, nlayers, dropout)
    return model

In [44]:
ntokens = len(vocab)
model = get_model(model_config, ntokens).to(device)
torch.cuda.memory_allocated()

116425728

In [45]:
seq_length = 128

# Hyper-Parameter Tuning

In [46]:
criterion = nn.CrossEntropyLoss()
lr = 1  # learning rate
optimizer = torch.optim.SGD(model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)
softmax = nn.Softmax(dim=2)
#softmax = nn.LogSoftmax(dim=2)

In [47]:
def generate_square_subsequent_mask(sz: int) -> Tensor:
    """Generates an upper-triangular matrix of -inf, with zeros on diag."""
    return torch.triu(torch.ones(sz, sz) * float('-inf'), diagonal=1)

In [53]:
def train(model: nn.Module) -> None:
    global epoch
    global global_step
    model.train()  # turn on train mode
    total_loss = 0.
    src_mask = generate_square_subsequent_mask(bptt).to(device)

    num_batches = len(batched_train_data) // bptt
    progress_bar = tqdm(enumerate(range(0, batched_train_data.size(0) - 1, bptt)), total=num_batches, desc=f'Epoch {epoch}', ncols=80)
    for batch_idx, i in progress_bar:
        ### batch_idx -> (1, 2, 3, 4, ...)
        ### i -> (0, bptt, 2*bptt, ....)
        data, targets = get_batch(batched_train_data, i)
        batch_size = data.size(0)
        if batch_size != bptt:  # only on last batch
            src_mask = src_mask[:batch_size, :batch_size]
        output = model(data, src_mask)
        loss = criterion(output.view(-1, ntokens), targets)
        
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()

        total_loss += loss.item()

        ## calculate the postfix description for the progress bar
        cur_loss = total_loss / (batch_idx + 1)
        ppl = math.exp(cur_loss)
        
        progress_bar.set_postfix({"loss": cur_loss, "ppl" : ppl}, refresh=True)
        
        writer.add_scalar('loss/train loss', cur_loss, global_step)
        writer.flush()
        writer.add_scalar('ppl/train perplexity', ppl, global_step)
        writer.flush()
        global_step += 1

In [57]:
def evaluate(model: nn.Module, eval_data: Tensor) -> float:
    model.eval()  # turn on evaluation mode
    total_loss = 0.
    src_mask = generate_square_subsequent_mask(bptt).to(device)

    num_batches = len(eval_data) // bptt
    with torch.no_grad():
        progress_bar = tqdm(enumerate(range(0, eval_data.size(0) - 1, bptt)), total=num_batches, desc=f'Validation {epoch}', ncols=80)
        for batch_idx, i in progress_bar:
            data, targets = get_batch(eval_data, i)
            batch_size = data.size(0)
            if batch_size != bptt:
                src_mask = src_mask[:batch_size, :batch_size]
            output = model(data, src_mask)
            output_softmax = softmax(output)
            output_softmax_permuted = output_softmax.permute(1, 0, 2)
            indices = torch.argmax(output_softmax_permuted, dim=2)
            target_indices = targets.t()
            output_flat = output.view(-1, ntokens)
            total_loss += batch_size * criterion(output_flat, targets).item()
    
    eval_loss = total_loss / (len(eval_data) - 1)
    eval_ppl = math.exp(eval_loss)

    writer.add_scalar('loss/val loss', eval_loss, global_step)
    writer.flush()
    writer.add_scalar('ppl/val perplexity', eval_ppl, global_step)
    writer.flush()

    return eval_loss

# Training Data

In [None]:
best_model_path = 'models/best_model_wp.pt'

In [58]:
# Loop over epochs. Save the model if the validation loss is the best
# we've seen so far. Adjust the learning rate after each epoch.
best_val_loss = float('inf')
initial_epoch = 0
epochs = app_config["epochs"]
global_step = 0
best_model = None

# preload the model if exists to train more epochs

if os.path.exists(best_model_path):
    print(f"Preloading model {best_model_path}")
    state = torch.load(best_model_path)
    
    initial_epoch = state['epoch'] + 1
    model.load_state_dict(state['model_state_dict'])
    optimizer.load_state_dict(state['optimizer_state_dict'])
    global_step = state['global_step']
    best_val_loss = state['best_val_loss']
    
    print(initial_epoch, global_step, best_val_loss)

# initializing the tensorbaord log writer
writer = SummaryWriter(app_config["logs"])


for epoch in range(initial_epoch, epochs):
    train(model)
    eval_loss = evaluate(model, batched_test_data)

    # save the model if validation loss decreases

    if eval_loss < best_val_loss:
        print(f"eval perplexity : {math.exp(eval_loss)}")
        print("saving the model")
        best_val_loss = eval_loss
        best_model = copy.deepcopy(model)

        directory_path = 'models'
        # Create the directory if it doesn't exist
        if not os.path.exists(directory_path):
            os.makedirs(directory_path)
        torch.save({
                'epoch': epoch,
                'model_state_dict': best_model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'global_step': global_step, 
                'best_val_loss' : best_val_loss,
            }, os.path.join(directory_path, 'best_model_wp.pt'))

Epoch 0: 2670it [09:03,  4.91it/s, loss=5.34, ppl=208]                          
Validation 0: 261it [00:20, 12.57it/s]                                          


eval perplexity : 23193946812.60187
saving the model


Epoch 1: 2670it [09:04,  4.90it/s, loss=5.04, ppl=154]                          
Validation 1: 261it [00:20, 12.59it/s]                                          


eval perplexity : 17933978731.69122
saving the model


Epoch 2:  55%|█████▌    | 1469/2669 [04:59<04:04,  4.90it/s, loss=4.84, ppl=127]


KeyboardInterrupt: 

In [59]:
lnsoftmax = nn.LogSoftmax(dim=2)

In [60]:

def id_to_token(x):
    token_list = [tokenizer.id_to_token(id) for id in x]
    
    return token_list

In [61]:
id_to_token([10,15])

['i', 'r']

# sample Data Generation

In [67]:
def loadModel(best_model_path):
    global model
    if os.path.exists(best_model_path):
        print(f"Preloading model {best_model_path}")
        if torch.cuda.is_available():
            state = torch.load(best_model_path)
        else:
            state = torch.load(best_model_path, map_location=torch.device('cpu'))
        model.load_state_dict(state['model_state_dict'])
        return model
    else:
        raise Exception("Model Not Found")
        
loaded_model = loadModel(best_model_path)

Preloading model models/best_model_wp.pt


KeyError: 'model_state_dict'

In [64]:

def generator(model: nn.Module, gen_data: Tensor, no_words = 10):
    model.eval()

    src_mask = generate_square_subsequent_mask(bptt).to(device)
    pred_text = []
    for i in range(no_words):

        batch_size = gen_data.size(0)
        if batch_size != bptt:
            src_mask_ = src_mask[:batch_size, :batch_size]
        else:
            src_mask_ = src_mask[:,:]
        output_softmax = model(gen_data, src_mask_)
        output_softmax_permuted = output_softmax.permute(1, 0, 2)
        indices = torch.argmax(output_softmax_permuted, dim=2)

        pred_text.append(indices[0][-1])
        if(batch_size < 128):
            gen_data = torch.cat((gen_data[:,:],indices.t()[-1:][:]),0)
            batch_size= gen_data.size(0)
        else:
            gen_data = torch.cat((gen_data[1:,:],indices.t()[-1:][:]),0)
            batch_size= gen_data.size(0)
            
    return pred_text



def nonnaive_generator(model: nn.Module, gen_data: Tensor, no_words = 5,k=50):
    model.eval()

    src_mask = generate_square_subsequent_mask(bptt).to(device)
    pred_text = []
    for i in range(no_words):
        batch_size = gen_data.size(0)
        if batch_size != bptt:
            src_mask_ = src_mask[:batch_size, :batch_size]
        output_softmax = model(gen_data, src_mask_)
        output_softmax_permuted = output_softmax.permute(1, 0, 2)
        indices = torch.topk(output_softmax_permuted,k ,dim=2).indices.squeeze(0)
        
        values = torch.topk(softmax(output_softmax_permuted),k ,dim=2).values
        values = values/torch.sum(values,dim = 2,keepdims = True)
        
        ind_sampled = torch.distributions.Categorical(values.squeeze(0)).sample()
        next_index = indices[-1][ind_sampled[-1]]

        pred_text.append(next_index.item())
        if(batch_size < 128):
            gen_data = torch.cat((gen_data[:,:],next_index.unsqueeze(0).unsqueeze(0)),0)
            batch_size= gen_data.size(0)
        else:
            gen_data = torch.cat((gen_data[1:,:],next_index.unsqueeze(0).unsqueeze(0)),0)
            batch_size= gen_data.size(0)
            
    return pred_text

In [65]:
st = ['नेपालमा आधुनिक']
st_i = data_process(st)
st_i = st_i.unsqueeze(1).to(device)
st_i.shape,st_i

(torch.Size([2, 1]),
 tensor([[995],
         [  1]], device='cuda:0'))

In [66]:
z = nonnaive_generator(loaded_model, st_i,no_words = 40, k=10)
m = splits_to_token(split_list(z))
word_piece_decoder(' '.join(st)+ ' '.join(m))

'नेपालमा आधुनिकमा र मिलेर बनेको छ । नेपाल उद्योग वाणिज्य महासंघले र वाणिज्य तथा आपूर्तिका लागि समेत छलफल गरिएको छ । यस्तै विभिन्न समेत आयोजना गर्ने गरी सम्झौता गरिएको'