# Import Torch functions and tokenizers

In [1]:
import torch
from torch import Tensor, nn
from torch.nn import TransformerEncoder, TransformerEncoderLayer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import dataset
from torch.utils.tensorboard import SummaryWriter

import regex as re
import os
import time
from tqdm import tqdm
import copy
import math

from model import TransformerModel
from utils import preProcessText, getTokenizer,try_gpu , word_piece_decoder, word_piece_encoder
from config import getConfig

import pickle

In [47]:
model_config, app_config = getConfig(small = True)
print(model_config)
print(app_config)

bptt=model_config["bptt"]
device = try_gpu(0)

{'emsize': 300, 'd_hid': 800, 'nlayers': 4, 'nhead': 4, 'dropout': 0.05, 'bptt': 64}
{'logs': 'tensorboard_logs', 'epochs': 10}


# Preprocessing Text

In [3]:
# Load the word-piece preprocessed corpus into `text`. The preprocessed file acts
# as a cache: it is read when present, otherwise it is built from the raw corpus
# and written out for the next run.
file_path = 'data/preprocessed_word_piece.txt'
if os.path.exists(file_path):
    print(f"Reading file  : {file_path}")
    with open(file_path, 'r', encoding='utf-8') as cached:
        text = cached.read()
else:
    with open('data/ne_dedup.txt', 'r', encoding='utf-8') as raw:
        text = raw.read()
    print("Preprocessing file")
    text = preProcessText(text, tokenizer_type='word_piece')
    with open(file_path, 'w', encoding='utf-8') as cached:
        cached.write(text)

Reading file  : data/preprocessed_word_piece.txt


In [10]:
# def ret_t(text):
#     text = re.sub(r'\u094D','a',text)
#     text = re.sub(r'\u0941','u',text)
#     text = re.sub(r'\u0942','e',text)
#     text = re.sub(r'\u0901','i',text)
#     text = re.sub(r'\u0902','c',text)
#     text = re.sub(r'\u0943','r',text)
#     text = re.sub(r'\u0947','l',text)
#     text = re.sub(r'\u094b','o',text)
#     text = re.sub(r'\u094c','p',text)
#     text = re.sub(r'\u0948','k',text)
    
#     return text

In [56]:
# def ret_rev(text):
#     text = re.sub('a',r'\u094D',text)
#     text = re.sub('u',r'\u0941',text)
#     text = re.sub('e',r'\u0942',text)
#     text = re.sub('i',r'\u0901',text)
#     text = re.sub('c',r'\u0902',text)
#     text = re.sub('r',r'\u0943',text)
#     text = re.sub('l',r'\u0947',text)
#     text = re.sub('o',r'\u094b',text)
#     text = re.sub('p',r'\u094c',text)
#     text = re.sub('k',r'\u0948',text)
    
#     return text
    

In [57]:
# with open('wordpiece_text.txt', 'w', encoding='utf-8') as f:
#     f.write(text)

In [58]:
len(text.split('\n'))

341961

In [27]:
# Number of leading corpus lines used for training; every remaining line is held out.
train_split = 10000

# Split the corpus into lines once and carve both partitions from the same list.
corpus_lines = text.split('\n')
train_iter_first, test_iter = corpus_lines[:train_split], corpus_lines[train_split:]

# Run Here

In [5]:


#Uncomment the line below to look at the documentation of the tokenizer
#help(BertWordPieceTokenizer)

In [61]:
#Initialize the tokenizer
# tokenizer = BertWordPieceTokenizer()

#Uncomment to look at the argument of tokenizer.train
# help(tokenizer.train)

Help on method train in module tokenizers.implementations.bert_wordpiece:

train(files: Union[str, List[str]], vocab_size: int = 30000, min_frequency: int = 2, limit_alphabet: int = 1000, initial_alphabet: List[str] = [], special_tokens: List[Union[str, tokenizers.AddedToken]] = ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]'], show_progress: bool = True, wordpieces_prefix: str = '##') method of tokenizers.implementations.bert_wordpiece.BertWordPieceTokenizer instance
    Train the model using the given files



In [6]:

# Train the tokenizer with the given parameters
# tokenizer.train(files=file_path, vocab_size=30000, min_frequency=2)

NameError: name 'tokenizer' is not defined

In [9]:
tokenizer,vocab = getTokenizer(tokenizer_type = 'word_piece')

In [6]:

# Uncomment and save if running for the first time
# with open('tokenizer_wp.pickle','wb') as f:
#     pickle.dump(tokenizer,f)


#Load the tokenizer 
# with open('tokenizers/tokenizer_wp.pickle','rb') as f:
#     t = pickle.load(f)

# tokenizer = t

In [11]:
# vocab = tokenizer.get_vocab()
len(vocab)

30000

In [12]:

# Try the encoder and decoder
l = tokenizer.encode(word_piece_encoder('महानायक राजेश हमाल अहिले चलचित्र क्षेत्रमा पातलिए ।')).tokens
l_ = tokenizer.encode(word_piece_encoder('महानायक राजेश हमाल अहिले चलचित्र क्षेत्रमा पातलिए ।')).ids

print("Encoded $tring: ",l)
print("Decoded $tring:",word_piece_decoder(tokenizer.decode(l_)))


Encoded $tring:  ['महान', '##ा', '##यक', 'राजlश', 'हमाल', 'अहिलl', 'चलचितaर', 'कaषlतaरमा', 'पात', '##लिए', '।']
Decoded $tring: महानायक राजेश हमाल अहिले चलचित्र क्षेत्रमा पातलिए ।


In [13]:

# Try the encoder and decoder
l = tokenizer.encode(word_piece_encoder('हातमा त्रिशुल जटा मुकुट शुशोभीत ब्रम्हा उत्पति हुनु ।')).tokens
l_ = tokenizer.encode(word_piece_encoder('हातमा त्रिशुल जटा मुकुट शुशोभीत ब्रम्हा उत्पति हुनु ।')).ids

print("Encoded $tring: ",l)
print("Encoded id$: ",l_)
print("Decoded $tring:",word_piece_decoder(tokenizer.decode(l_)))


Encoded $tring:  ['हातमा', '[UNK]', 'ज', '##टा', '[UNK]', '[UNK]', 'बaरमaहा', 'उतa', '##पति', '[UNK]', '।']
Encoded id$:  [4308, 1, 42, 307, 1, 1, 27723, 475, 826, 1, 77]
Decoded $tring: हातमा जटा ब्रम्हा उत्पति ।


# Some utility functions

In [14]:
def split_list(l, sep=220):
    """Split a sequence of token ids into chunks at every occurrence of ``sep``.

    Args:
        l: sliceable sequence of token ids (e.g. a list of ints).
        sep: id that marks a chunk boundary; it is dropped from the output.
            Defaults to 220, the value that was previously hard-coded here
            (presumably the tokenizer's word-boundary id -- TODO confirm).

    Returns:
        List of slices of ``l``. Two adjacent separators (or a leading one)
        yield an empty chunk; a trailing separator does not add one.
    """
    chunks = []
    start = 0
    for pos, token_id in enumerate(l):
        if token_id == sep:
            chunks.append(l[start:pos])
            start = pos + 1
    # Keep whatever follows the last separator, unless nothing is left.
    if start < len(l):
        chunks.append(l[start:])
    return chunks

def splits_to_token(splited_list):
    """Decode each chunk of token ids with the module-level ``tokenizer``.

    Returns a list holding one ``tokenizer.decode`` result per chunk, in order.
    """
    decoded = []
    for chunk in splited_list:
        decoded.append(tokenizer.decode(chunk))
    return decoded

print(tokenizer.encode(' ').ids)

word_piece_decoder(tokenizer.decode([978,
 261,
 264,
624,
 261,
 263]))

k = split_list(l_)

In [17]:
splits_to_token(k)

['हातमा जटा बaरमaहा उतaपति ।']

In [42]:
def data_process(raw_text_iter: dataset.IterableDataset) -> Tensor:
    """Convert raw text into one flat 1-D tensor of token ids.

    Each item is passed through ``word_piece_encoder`` and encoded with the
    module-level ``tokenizer``; items that encode to no ids are skipped and the
    rest are concatenated in order.

    Args:
        raw_text_iter: iterable of text lines.

    Returns:
        ``torch.long`` tensor of shape [N]. When there is nothing to
        concatenate (empty input, or every item encodes to nothing) an empty
        tensor is returned instead of letting ``torch.cat`` raise.
    """
    encoded = (
        torch.tensor(tokenizer.encode(word_piece_encoder(item)).ids, dtype=torch.long)
        for item in raw_text_iter
    )
    non_empty = [ids for ids in encoded if ids.numel() > 0]
    if not non_empty:
        # torch.cat rejects an empty sequence of tensors.
        return torch.empty(0, dtype=torch.long)
    return torch.cat(non_empty)


def batchify(data: Tensor, bsz: int) -> Tensor:
    """Lay a flat token stream out as ``bsz`` parallel columns.

    Trailing elements that do not fill a complete row are dropped.

    Args:
        data: Tensor, shape [N]
        bsz: int, batch size (number of columns)
    Returns:
        Tensor of shape [N // bsz, bsz]. It is a transposed view, so it is not
        made contiguous and is not moved to another device here.
    """
    rows = data.size(0) // bsz
    usable = data.narrow(0, 0, rows * bsz)
    return usable.view(bsz, rows).t()


# Upper bound on the number of rows get_batch returns per window.
seq_length = 128
import math


def get_batch(source: Tensor, i: int) -> tuple[Tensor, Tensor]:
    """Cut an input window and its next-token targets out of ``source``.

    Args:
        source: Tensor, shape [full_seq_len, batch_size]
        i: int, index of the first row of the window
    Returns:
        tuple (data, target). ``data`` holds at most ``seq_length`` rows
        starting at ``i`` (fewer near the end of ``source``) and has shape
        [seq_len, batch_size]; ``target`` is the same window shifted down by
        one row and flattened to shape [seq_len * batch_size].
    """
    rows_left = len(source) - 1 - i
    window = seq_length if seq_length < rows_left else rows_left
    inputs = source[i:i + window]
    labels = source[i + 1:i + 1 + window].reshape(-1)
    return inputs, labels

In [28]:
#Train and Test Split
train_data = data_process(train_iter_first)
test_data = data_process(test_iter)

In [29]:
len(train_data)

2733762

In [86]:
# train_data[:400]

tensor([27894,  3615,  2007,   595,  5195,   254,  3125,   647, 14936,    77,
         6826,   314, 22142,    61,  2372,    77,  3698,   314,  7084,  1202,
            6,   474,   301,  1375,  3128,   791,   359, 11316, 14227,  2138,
         4187,  2952,  2292, 19127,   481,  9798,  1815,   301,  3128,  1645,
          204,  8468,   286, 13485,    98,   291,   274,   923,  2474,  6458,
          511,  1091,   105,  8223,  4890,  8223,  5428,  2073,  5378,   575,
          754,    61, 22889,   113,   271, 27953, 21023,    41,    77,  3081,
         1322,   604,   811, 17844,  2549,    77,   557,     6,  1158,   200,
          409,    77,  1419,  7676,  1976, 17205, 15512,  1109, 16103,  3500,
         4311,   609, 12234,  8023,  2573,   261,    77,  1419,  7676,  1976,
        19056, 27193,  7703, 19850,   108,  1223,   103, 14259,   241,  2622,
         5866,   323, 12700, 17657,  1109, 16103,  3500,  1067,     6,  2321,
         1930,    77,   701,  1998,     6,   591,    61,   629, 

In [87]:

# print(len(train_iter_second),len(test_iter))
torch.cuda.empty_cache() 

torch.cuda.memory_allocated() 

0

# Working with a dummy Sample

In [30]:
#Sample Data


text = ['आधिकारिक निर्णयको कारणले , वाणिज्य बिभागले , संयुक्त राज्य अमेरिकी समुद्री पानी निर्माताद्वारा संयुक्त राज्य']
#text = ['जनसंख्या']
sample_data = data_process(
    text)

In [31]:
sample_data

tensor([ 5522, 12438,  2404,     1,  3124, 13185,   177,     1,     1,   964,
         1956,     1,  1373,  5110,  1922,     1,   964])

In [32]:
sample_data = batchify(sample_data, 2)
print("Given word:", text[0])
sample_data

Given word: आधिकारिक निर्णयको कारणले , वाणिज्य बिभागले , संयुक्त राज्य अमेरिकी समुद्री पानी निर्माताद्वारा संयुक्त राज्य


tensor([[ 5522,     1],
        [12438,   964],
        [ 2404,  1956],
        [    1,     1],
        [ 3124,  1373],
        [13185,  5110],
        [  177,  1922],
        [    1,     1]])

In [46]:
# Shape check for get_batch on batched training data.
# NOTE(review): `train_data_batched` is not assigned in any visible cell (the
# batching cell further down creates `batched_train_data`); this cell relies on
# kernel state from elsewhere -- confirm where the name comes from.
j = get_batch(train_data_batched,2)
print(train_data_batched.shape,j[0].shape,j[1].shape)

torch.Size([170860, 16]) torch.Size([128, 16]) torch.Size([2048])


# Model Definition

In [74]:

# Lay the flat train/test token streams out as parallel columns and move them to `device`.
# `bptt` is passed as batchify's `bsz` argument, so here it sets the number of
# columns. NOTE(review): confirm this is intended rather than a separate batch size.
batched_train_data = batchify(train_data, bptt).to(device)  # shape [seq_len, batch_size]
eval_data = batchify(test_data, bptt).to(device)

In [75]:
def get_model(model_config, ntokens):
    """Build a ``TransformerModel`` from the hyper-parameters in ``model_config``.

    Reads the ``emsize``, ``nhead``, ``d_hid``, ``nlayers`` and ``dropout``
    entries and passes them, after ``ntokens``, to the constructor in that order.
    """
    hyper_params = [
        model_config[key]
        for key in ("emsize", "nhead", "d_hid", "nlayers", "dropout")
    ]
    return TransformerModel(ntokens, *hyper_params)

In [76]:
ntokens = len(vocab)
model = get_model(model_config, ntokens).to(device)
torch.cuda.memory_allocated()

1693978624

In [77]:
seq_length = 128

# Hyper-Parameter Tuning

In [58]:
criterion = nn.CrossEntropyLoss()
lr = 1  # learning rate
optimizer = torch.optim.SGD(model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)
softmax = nn.Softmax(dim=2)
#softmax = nn.LogSoftmax(dim=2)

In [59]:
def generate_square_subsequent_mask(sz: int) -> Tensor:
    """Return an ``sz`` x ``sz`` mask: -inf strictly above the diagonal, 0 elsewhere."""
    neg_inf = torch.full((sz, sz), float('-inf'))
    return neg_inf.triu(diagonal=1)

In [71]:
def train(model: nn.Module) -> None:
    """Run one training epoch over the module-level ``batched_train_data``.

    Relies on module-level state: ``bptt``, ``device``, ``criterion``,
    ``optimizer``, ``ntokens``, ``writer``, ``epoch`` (read for the progress-bar
    label) and ``global_step`` (incremented once per optimisation step and used
    as the TensorBoard x-axis).

    For every window the running mean loss and its perplexity are shown on the
    progress bar and logged to TensorBoard.

    NOTE(review): windows start every ``bptt`` rows and the mask is built for
    ``bptt`` rows, but the window length returned by ``get_batch`` is governed
    by the separate module-level ``seq_length``; confirm the two are meant to
    differ.
    """
    global global_step
    model.train()  # turn on train mode
    total_loss = 0.
    full_mask = generate_square_subsequent_mask(bptt).to(device)

    # One window per start row; len(starts) is the exact number of iterations,
    # so the progress-bar total matches what the loop actually does.
    starts = range(0, batched_train_data.size(0) - 1, bptt)
    progress_bar = tqdm(enumerate(starts), total=len(starts), desc=f'Epoch {epoch}', ncols=80)
    for batch_idx, i in progress_bar:
        ### batch_idx -> (0, 1, 2, 3, ...)
        ### i -> (0, bptt, 2*bptt, ....)
        data, targets = get_batch(batched_train_data, i)
        window = data.size(0)
        # Shrink the mask for a window that is not `bptt` rows long, without
        # overwriting the full-size mask.
        src_mask = full_mask if window == bptt else full_mask[:window, :window]
        output = model(data, src_mask)
        loss = criterion(output.view(-1, ntokens), targets)

        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()

        total_loss += loss.item()

        # Running mean over the epoch so far, reported on the bar and in TensorBoard.
        cur_loss = total_loss / (batch_idx + 1)
        ppl = math.exp(cur_loss)

        progress_bar.set_postfix({"loss": cur_loss, "ppl" : ppl}, refresh=True)

        writer.add_scalar('loss/train loss', cur_loss, global_step)
        writer.add_scalar('ppl/train perplexity', ppl, global_step)
        writer.flush()  # a single flush covers both scalars
        global_step += 1

In [72]:
def evaluate(model: nn.Module, eval_data: Tensor) -> float:
    """Compute the evaluation loss of ``model`` on ``eval_data`` and log it.

    Args:
        model: called as ``model(data, src_mask)``; its output is flattened to
            ``[-1, ntokens]`` for the loss.
        eval_data: batched evaluation tokens, indexed by row through ``get_batch``.

    Returns:
        Sum of per-window losses, each weighted by the number of rows in its
        window, divided by ``len(eval_data) - 1``.

    Relies on module-level ``bptt``, ``device``, ``criterion``, ``ntokens``,
    ``writer``, ``epoch`` and ``global_step``. The loss and its perplexity are
    written to TensorBoard at the current ``global_step``.
    """
    model.eval()  # turn on evaluation mode
    total_loss = 0.
    full_mask = generate_square_subsequent_mask(bptt).to(device)

    # Iterate over the tensor that was passed in, not over a separate global.
    starts = range(0, eval_data.size(0) - 1, bptt)
    with torch.no_grad():
        progress_bar = tqdm(enumerate(starts), total=len(starts), desc=f'Validation {epoch}', ncols=80)
        for _, i in progress_bar:
            data, targets = get_batch(eval_data, i)
            window = data.size(0)
            src_mask = full_mask if window == bptt else full_mask[:window, :window]
            output = model(data, src_mask)
            output_flat = output.view(-1, ntokens)
            total_loss += window * criterion(output_flat, targets).item()

    eval_loss = total_loss / (len(eval_data) - 1)
    eval_ppl = math.exp(eval_loss)

    writer.add_scalar('loss/val loss', eval_loss, global_step)
    writer.add_scalar('ppl/val perplexity', eval_ppl, global_step)
    writer.flush()

    return eval_loss

# Training Data

In [78]:
# Loop over epochs. Save the model if the validation loss is the best
# we've seen so far. Adjust the learning rate after each epoch.
best_val_loss = float('inf')
initial_epoch = 0
epochs = app_config["epochs"]
global_step = 0
best_model = None

# Resume from the best checkpoint, if one exists, to train for more epochs.
best_model_path = 'models/best_model_wp.pt'
if os.path.exists(best_model_path):
    print(f"Preloading model {best_model_path}")
    # map_location lets a checkpoint saved on another device load onto `device`.
    state = torch.load(best_model_path, map_location=device)

    initial_epoch = state['epoch'] + 1
    model.load_state_dict(state['model_state_dict'])
    optimizer.load_state_dict(state['optimizer_state_dict'])
    global_step = state['global_step']
    best_val_loss = state['best_val_loss']

    print(initial_epoch, global_step, best_val_loss)

# Initialise the TensorBoard log writer.
writer = SummaryWriter(app_config["logs"])


for epoch in range(initial_epoch, epochs):
    train(model)
    # `eval_data` is the batched test split built in the batching cell.
    eval_loss = evaluate(model, eval_data)

    # Save the model if the validation loss decreased.
    if eval_loss < best_val_loss:
        print(f"eval perplexity : {math.exp(eval_loss)}")
        print("saving the model")
        best_val_loss = eval_loss
        best_model = copy.deepcopy(model)

        # Create the checkpoint directory if it doesn't exist.
        os.makedirs(os.path.dirname(best_model_path), exist_ok=True)
        torch.save({
                'epoch': epoch,
                'model_state_dict': best_model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'global_step': global_step,
                'best_val_loss' : best_val_loss,
            }, best_model_path)

    # Apply the per-epoch learning-rate decay configured on `scheduler`.
    # NOTE(review): the scheduler state is not stored in the checkpoint, so a
    # resumed run restarts the schedule from the optimizer's restored LR.
    scheduler.step()

Epoch 0:  24%|█▍    | 2585/10678 [04:30<13:52,  9.72it/s, loss=10.7, ppl=4.4e+4]

In [79]:
lnsoftmax = nn.LogSoftmax(dim=2)

In [109]:

def id_to_token(x):
    """Map each token id in ``x`` to its token via the module-level ``tokenizer``."""
    tokens = []
    for token_id in x:
        tokens.append(tokenizer.id_to_token(token_id))
    return tokens

In [110]:
id_to_token([10,15])

['i', 'r']

# Sample data generation

In [82]:

# Rebinds `best_model` to the current in-memory `model`, replacing the deep copy
# kept by the training loop. NOTE(review): confirm this override is intended.
best_model = model
def generator(model: nn.Module, gen_data: Tensor, no_words = 10, max_len: int = 128):
    """Greedily extend a prompt one token at a time.

    Args:
        model: called as ``model(tokens, mask)``; its output is indexed here as
            ``[position, batch, vocab]`` scores.
        gen_data: prompt token ids laid out as ``[seq_len, batch]``.
        no_words: number of tokens to generate.
        max_len: the model only ever sees the ``max_len`` most recent tokens.
            Defaults to 128, the window size that was previously hard-coded.

    Returns:
        List of ``no_words`` 0-dim tensors: the highest-scoring next token for
        batch column 0 at each step.

    Raises:
        ValueError: if ``max_len`` is smaller than 1.
    """
    if max_len < 1:
        raise ValueError("max_len must be at least 1")

    model.eval()
    pred_text = []
    # Inference only: avoid building an autograd graph for every step.
    with torch.no_grad():
        for _ in range(no_words):
            gen_data = gen_data[-max_len:]
            cur_len = gen_data.size(0)
            # Causal mask sized to the current context, so it always matches
            # the input length regardless of how long the context has grown.
            src_mask = torch.triu(
                torch.full((cur_len, cur_len), float('-inf'), device=gen_data.device),
                diagonal=1,
            )
            scores = model(gen_data, src_mask)
            next_ids = scores[-1].argmax(dim=-1)  # best token per batch column
            pred_text.append(next_ids[0])
            gen_data = torch.cat((gen_data, next_ids.unsqueeze(0)), 0)

    return pred_text



def nonnaive_generator(model: nn.Module, gen_data: Tensor, no_words = 5,k=50, max_len: int = 128):
    """Extend a prompt by sampling each next token from the top-``k`` candidates.

    Args:
        model: called as ``model(tokens, mask)``; its output is indexed here as
            ``[position, batch, vocab]`` scores.
        gen_data: prompt token ids of shape ``[seq_len, 1]`` (a single sequence;
            each sampled token is appended as one ``[1, 1]`` row).
        no_words: number of tokens to generate.
        k: number of highest-scoring candidates to sample from; capped at the
            vocabulary size.
        max_len: the model only ever sees the ``max_len`` most recent tokens.
            Defaults to 128, the window size that was previously hard-coded.

    Returns:
        List of ``no_words`` sampled token ids (Python ints).

    Raises:
        ValueError: if ``max_len`` is smaller than 1.
    """
    if max_len < 1:
        raise ValueError("max_len must be at least 1")

    model.eval()
    pred_text = []
    # Inference only: avoid building an autograd graph for every step.
    with torch.no_grad():
        for _ in range(no_words):
            gen_data = gen_data[-max_len:]
            cur_len = gen_data.size(0)
            # Build the causal mask for the current length on every step, so it
            # is always defined and always matches the input.
            src_mask = torch.triu(
                torch.full((cur_len, cur_len), float('-inf'), device=gen_data.device),
                diagonal=1,
            )
            scores = model(gen_data, src_mask)

            # Only the last position decides the next token.
            last_scores = scores[-1, 0]
            top_scores, top_ids = torch.topk(last_scores, min(k, last_scores.numel()))
            # Softmax over the top-k scores equals the full-vocabulary softmax
            # renormalised over those k entries.
            probs = torch.softmax(top_scores, dim=-1)
            choice = torch.distributions.Categorical(probs=probs).sample()
            next_index = top_ids[choice]

            pred_text.append(next_index.item())
            gen_data = torch.cat((gen_data, next_index.view(1, 1)), 0)

    return pred_text

In [96]:
st = ['नेपालमा आधुनिक']
st_i = data_process(st)
st_i = st_i.unsqueeze(1).to(device)
st_i.shape,st_i

In [94]:
z = nonnaive_generator(model, st_i,no_words = 40, k=10)

In [91]:
m = splits_to_token(split_list(z))

In [92]:
word_piece_decoder(' '.join(st)+ ' '.join(m))

In [88]:
m

In [120]:
v = [t.item() for t in s[0]] 
splits_to_token(split_list(v))

['',
 'लाईसेन्स',
 'दौडिन्छन्',
 'पुनर्वासबेलौरीमा',
 'अटो',
 'रिक्सा',
 '',
 'रञ्जित',
 'लामा',
 'पुनर्वास',
 '?',
 '२५',
 'भाद्र',
 '।',
 'कञ्चनपुरको',
 'पुनर्वासबेलौरीमा',
 'यतिखेर',
 'लाईसेन्स',
 'बिनैै',
 'अटो',
 'रिक्सा',
 'चल']

In [121]:
v_ = [t.item() for t in z_] 
splits_to_token(split_list(v_))

['ाउने',
 'गरेको',
 'छ',
 '।',
 'पुनर्वास',
 'नगरपालिकाको',
 'कार्यालय',
 'पुनर्वास',
 'नगरपाल']

In [127]:
# st = ['लामो समयसम्म प्रयोग गर्न सकिन्छ ।']
# st = ['तपाईंलाई कस्तो पुस्तकहरू मन']
st = ['नेपालमा आधुनिक']
st = ['हरेक सेपालीले']
st_i = data_process(st)
st_i = st_i.unsqueeze(1).to(device)

In [128]:
st_i.shape

torch.Size([4, 1])

In [129]:
z = generator(best_model, st_i,no_words =40)

i: 0
torch.Size([1, 4])
next word:  आफaनo
0 Gen_data:  tensor([[ 1442],
        [  405],
        [12812],
        [  177]], device='cuda:0') Pred_data:  tensor([[ 359, 4037,  177,  525]], device='cuda:0')
i: 1
torch.Size([1, 5])
next word:  घर
1 Gen_data:  tensor([[ 1442],
        [  405],
        [12812],
        [  177],
        [  525]], device='cuda:0') Pred_data:  tensor([[ 359, 4037,  177,  525,  628]], device='cuda:0')
i: 2
torch.Size([1, 6])
next word:  बनाउनl
2 Gen_data:  tensor([[ 1442],
        [  405],
        [12812],
        [  177],
        [  525],
        [  628]], device='cuda:0') Pred_data:  tensor([[ 359, 4037,  177,  525,  628, 1197]], device='cuda:0')
i: 3
torch.Size([1, 7])
next word:  र
3 Gen_data:  tensor([[ 1442],
        [  405],
        [12812],
        [  177],
        [  525],
        [  628],
        [ 1197]], device='cuda:0') Pred_data:  tensor([[ 359, 4037,  177,  525,  628, 1197,   61]], device='cuda:0')
i: 4
torch.Size([1, 8])
next word:  घर
4 Gen_dat

        [  628]], device='cuda:0') Pred_data:  tensor([[ 359, 4037,  177,  525,  628, 1197,   61,  628, 1197,  373,  286,   61,
          628, 1197,  373,  286,  373,  286,  373,  220,  287, 1275, 1310,  220,
          373, 1004, 1746,   41,   77,  345,    6,  628,  552]],
       device='cuda:0')
i: 30
torch.Size([1, 34])
next word:  र
30 Gen_data:  tensor([[ 1442],
        [  405],
        [12812],
        [  177],
        [  525],
        [  628],
        [ 1197],
        [   61],
        [  628],
        [ 1197],
        [  373],
        [  286],
        [   61],
        [  628],
        [ 1197],
        [  373],
        [  286],
        [  373],
        [  286],
        [  373],
        [  220],
        [  287],
        [ 1275],
        [ 1310],
        [  220],
        [  373],
        [ 1004],
        [ 1746],
        [   41],
        [   77],
        [  345],
        [    6],
        [  628],
        [  552]], device='cuda:0') Pred_data:  tensor([[ 359, 4037,  177,  525,  628, 1

next word:  र
46 Gen_data:  tensor([[ 1442],
        [  405],
        [12812],
        [  177],
        [  525],
        [  628],
        [ 1197],
        [   61],
        [  628],
        [ 1197],
        [  373],
        [  286],
        [   61],
        [  628],
        [ 1197],
        [  373],
        [  286],
        [  373],
        [  286],
        [  373],
        [  220],
        [  287],
        [ 1275],
        [ 1310],
        [  220],
        [  373],
        [ 1004],
        [ 1746],
        [   41],
        [   77],
        [  345],
        [    6],
        [  628],
        [  552],
        [   61],
        [  628],
        [  552],
        [  267],
        [ 1004],
        [ 9286],
        [   41],
        [   77],
        [  345],
        [    6],
        [  628],
        [  552],
        [  267],
        [ 1004],
        [10984],
        [  429]], device='cuda:0') Pred_data:  tensor([[  359,  4037,   177,   525,   628,  1197,    61,   628,  1197,   373,
           28

next word:  नसकlकo
60 Gen_data:  tensor([[ 1442],
        [  405],
        [12812],
        [  177],
        [  525],
        [  628],
        [ 1197],
        [   61],
        [  628],
        [ 1197],
        [  373],
        [  286],
        [   61],
        [  628],
        [ 1197],
        [  373],
        [  286],
        [  373],
        [  286],
        [  373],
        [  220],
        [  287],
        [ 1275],
        [ 1310],
        [  220],
        [  373],
        [ 1004],
        [ 1746],
        [   41],
        [   77],
        [  345],
        [    6],
        [  628],
        [  552],
        [   61],
        [  628],
        [  552],
        [  267],
        [ 1004],
        [ 9286],
        [   41],
        [   77],
        [  345],
        [    6],
        [  628],
        [  552],
        [  267],
        [ 1004],
        [10984],
        [  429],
        [   61],
        [  373],
        [ 1004],
        [ 3423],
        [ 2519],
        [  492],
        [  679]

torch.Size([1, 76])
next word:  बताए
72 Gen_data:  tensor([[ 1442],
        [  405],
        [12812],
        [  177],
        [  525],
        [  628],
        [ 1197],
        [   61],
        [  628],
        [ 1197],
        [  373],
        [  286],
        [   61],
        [  628],
        [ 1197],
        [  373],
        [  286],
        [  373],
        [  286],
        [  373],
        [  220],
        [  287],
        [ 1275],
        [ 1310],
        [  220],
        [  373],
        [ 1004],
        [ 1746],
        [   41],
        [   77],
        [  345],
        [    6],
        [  628],
        [  552],
        [   61],
        [  628],
        [  552],
        [  267],
        [ 1004],
        [ 9286],
        [   41],
        [   77],
        [  345],
        [    6],
        [  628],
        [  552],
        [  267],
        [ 1004],
        [10984],
        [  429],
        [   61],
        [  373],
        [ 1004],
        [ 3423],
        [ 2519],
        [  492

next word:  छ
82 Gen_data:  tensor([[ 1442],
        [  405],
        [12812],
        [  177],
        [  525],
        [  628],
        [ 1197],
        [   61],
        [  628],
        [ 1197],
        [  373],
        [  286],
        [   61],
        [  628],
        [ 1197],
        [  373],
        [  286],
        [  373],
        [  286],
        [  373],
        [  220],
        [  287],
        [ 1275],
        [ 1310],
        [  220],
        [  373],
        [ 1004],
        [ 1746],
        [   41],
        [   77],
        [  345],
        [    6],
        [  628],
        [  552],
        [   61],
        [  628],
        [  552],
        [  267],
        [ 1004],
        [ 9286],
        [   41],
        [   77],
        [  345],
        [    6],
        [  628],
        [  552],
        [  267],
        [ 1004],
        [10984],
        [  429],
        [   61],
        [  373],
        [ 1004],
        [ 3423],
        [ 2519],
        [  492],
        [  679],
   

        [  373]], device='cuda:0') Pred_data:  tensor([[  359,  4037,   177,   525,   628,  1197,    61,   628,  1197,   373,
           286,    61,   628,  1197,   373,   286,   373,   286,   373,   220,
           287,  1275,  1310,   220,   373,  1004,  1746,    41,    77,   345,
             6,   628,   552,    61,   628,   552,   267,  1004,  9286,    41,
            77,   345,     6,   628,   552,   267,  1004, 10984,   429,    61,
           373,  1004,  3423,  2519,   492,   679,    77,   492,   257,     6,
           373,  1004,  3423,  2519,   492,   679,    77,   492,   257,     6,
           373,  1004,  3423,  2519,   492,   679,    77,   492,   257,     6,
           373,   286,   373,   220,  1356,    41,    77,   492,   257,     6,
           373,   286]], device='cuda:0')
i: 89
torch.Size([1, 93])
next word:  काम
89 Gen_data:  tensor([[ 1442],
        [  405],
        [12812],
        [  177],
        [  525],
        [  628],
        [ 1197],
        [   61],
        

torch.Size([1, 102])
next word:  ?
98 Gen_data:  tensor([[ 1442],
        [  405],
        [12812],
        [  177],
        [  525],
        [  628],
        [ 1197],
        [   61],
        [  628],
        [ 1197],
        [  373],
        [  286],
        [   61],
        [  628],
        [ 1197],
        [  373],
        [  286],
        [  373],
        [  286],
        [  373],
        [  220],
        [  287],
        [ 1275],
        [ 1310],
        [  220],
        [  373],
        [ 1004],
        [ 1746],
        [   41],
        [   77],
        [  345],
        [    6],
        [  628],
        [  552],
        [   61],
        [  628],
        [  552],
        [  267],
        [ 1004],
        [ 9286],
        [   41],
        [   77],
        [  345],
        [    6],
        [  628],
        [  552],
        [  267],
        [ 1004],
        [10984],
        [  429],
        [   61],
        [  373],
        [ 1004],
        [ 3423],
        [ 2519],
        [  492],

106 Gen_data:  tensor([[ 1442],
        [  405],
        [12812],
        [  177],
        [  525],
        [  628],
        [ 1197],
        [   61],
        [  628],
        [ 1197],
        [  373],
        [  286],
        [   61],
        [  628],
        [ 1197],
        [  373],
        [  286],
        [  373],
        [  286],
        [  373],
        [  220],
        [  287],
        [ 1275],
        [ 1310],
        [  220],
        [  373],
        [ 1004],
        [ 1746],
        [   41],
        [   77],
        [  345],
        [    6],
        [  628],
        [  552],
        [   61],
        [  628],
        [  552],
        [  267],
        [ 1004],
        [ 9286],
        [   41],
        [   77],
        [  345],
        [    6],
        [  628],
        [  552],
        [  267],
        [ 1004],
        [10984],
        [  429],
        [   61],
        [  373],
        [ 1004],
        [ 3423],
        [ 2519],
        [  492],
        [  679],
        [   77],

        [ 9286]], device='cuda:0') Pred_data:  tensor([[  359,  4037,   177,   525,   628,  1197,    61,   628,  1197,   373,
           286,    61,   628,  1197,   373,   286,   373,   286,   373,   220,
           287,  1275,  1310,   220,   373,  1004,  1746,    41,    77,   345,
             6,   628,   552,    61,   628,   552,   267,  1004,  9286,    41,
            77,   345,     6,   628,   552,   267,  1004, 10984,   429,    61,
           373,  1004,  3423,  2519,   492,   679,    77,   492,   257,     6,
           373,  1004,  3423,  2519,   492,   679,    77,   492,   257,     6,
           373,  1004,  3423,  2519,   492,   679,    77,   492,   257,     6,
           373,   286,   373,   220,  1356,    41,    77,   492,   257,     6,
           373,   286,   373,  1004,  3423,  2519,   492,   679,    77,   492,
           257,     6,   373,  1004,  3423,  2519,   492,   679,    77,   492,
           257,     6,   373,   286,   373,  1004,  9286,   492]],
       device='cu

122 Gen_data:  tensor([[ 1442],
        [  405],
        [12812],
        [  177],
        [  525],
        [  628],
        [ 1197],
        [   61],
        [  628],
        [ 1197],
        [  373],
        [  286],
        [   61],
        [  628],
        [ 1197],
        [  373],
        [  286],
        [  373],
        [  286],
        [  373],
        [  220],
        [  287],
        [ 1275],
        [ 1310],
        [  220],
        [  373],
        [ 1004],
        [ 1746],
        [   41],
        [   77],
        [  345],
        [    6],
        [  628],
        [  552],
        [   61],
        [  628],
        [  552],
        [  267],
        [ 1004],
        [ 9286],
        [   41],
        [   77],
        [  345],
        [    6],
        [  628],
        [  552],
        [  267],
        [ 1004],
        [10984],
        [  429],
        [   61],
        [  373],
        [ 1004],
        [ 3423],
        [ 2519],
        [  492],
        [  679],
        [   77],

torch.Size([1, 128])
next word:  ?
130 Gen_data:  tensor([[ 1197],
        [   61],
        [  628],
        [ 1197],
        [  373],
        [  286],
        [   61],
        [  628],
        [ 1197],
        [  373],
        [  286],
        [  373],
        [  286],
        [  373],
        [  220],
        [  287],
        [ 1275],
        [ 1310],
        [  220],
        [  373],
        [ 1004],
        [ 1746],
        [   41],
        [   77],
        [  345],
        [    6],
        [  628],
        [  552],
        [   61],
        [  628],
        [  552],
        [  267],
        [ 1004],
        [ 9286],
        [   41],
        [   77],
        [  345],
        [    6],
        [  628],
        [  552],
        [  267],
        [ 1004],
        [10984],
        [  429],
        [   61],
        [  373],
        [ 1004],
        [ 3423],
        [ 2519],
        [  492],
        [  679],
        [   77],
        [  492],
        [  257],
        [    6],
        [  373]

next word:  र
135 Gen_data:  tensor([[  286],
        [   61],
        [  628],
        [ 1197],
        [  373],
        [  286],
        [  373],
        [  286],
        [  373],
        [  220],
        [  287],
        [ 1275],
        [ 1310],
        [  220],
        [  373],
        [ 1004],
        [ 1746],
        [   41],
        [   77],
        [  345],
        [    6],
        [  628],
        [  552],
        [   61],
        [  628],
        [  552],
        [  267],
        [ 1004],
        [ 9286],
        [   41],
        [   77],
        [  345],
        [    6],
        [  628],
        [  552],
        [  267],
        [ 1004],
        [10984],
        [  429],
        [   61],
        [  373],
        [ 1004],
        [ 3423],
        [ 2519],
        [  492],
        [  679],
        [   77],
        [  492],
        [  257],
        [    6],
        [  373],
        [ 1004],
        [ 3423],
        [ 2519],
        [  492],
        [  679],
        [   77],
  

142 Gen_data:  tensor([[  286],
        [  373],
        [  220],
        [  287],
        [ 1275],
        [ 1310],
        [  220],
        [  373],
        [ 1004],
        [ 1746],
        [   41],
        [   77],
        [  345],
        [    6],
        [  628],
        [  552],
        [   61],
        [  628],
        [  552],
        [  267],
        [ 1004],
        [ 9286],
        [   41],
        [   77],
        [  345],
        [    6],
        [  628],
        [  552],
        [  267],
        [ 1004],
        [10984],
        [  429],
        [   61],
        [  373],
        [ 1004],
        [ 3423],
        [ 2519],
        [  492],
        [  679],
        [   77],
        [  492],
        [  257],
        [    6],
        [  373],
        [ 1004],
        [ 3423],
        [ 2519],
        [  492],
        [  679],
        [   77],
        [  492],
        [  257],
        [    6],
        [  373],
        [ 1004],
        [ 3423],
        [ 2519],
        [  492],

next word:  उनलl
149 Gen_data:  tensor([[  373],
        [ 1004],
        [ 1746],
        [   41],
        [   77],
        [  345],
        [    6],
        [  628],
        [  552],
        [   61],
        [  628],
        [  552],
        [  267],
        [ 1004],
        [ 9286],
        [   41],
        [   77],
        [  345],
        [    6],
        [  628],
        [  552],
        [  267],
        [ 1004],
        [10984],
        [  429],
        [   61],
        [  373],
        [ 1004],
        [ 3423],
        [ 2519],
        [  492],
        [  679],
        [   77],
        [  492],
        [  257],
        [    6],
        [  373],
        [ 1004],
        [ 3423],
        [ 2519],
        [  492],
        [  679],
        [   77],
        [  492],
        [  257],
        [    6],
        [  373],
        [ 1004],
        [ 3423],
        [ 2519],
        [  492],
        [  679],
        [   77],
        [  492],
        [  257],
        [    6],
        [  373],

155 Gen_data:  tensor([[    6],
        [  628],
        [  552],
        [   61],
        [  628],
        [  552],
        [  267],
        [ 1004],
        [ 9286],
        [   41],
        [   77],
        [  345],
        [    6],
        [  628],
        [  552],
        [  267],
        [ 1004],
        [10984],
        [  429],
        [   61],
        [  373],
        [ 1004],
        [ 3423],
        [ 2519],
        [  492],
        [  679],
        [   77],
        [  492],
        [  257],
        [    6],
        [  373],
        [ 1004],
        [ 3423],
        [ 2519],
        [  492],
        [  679],
        [   77],
        [  492],
        [  257],
        [    6],
        [  373],
        [ 1004],
        [ 3423],
        [ 2519],
        [  492],
        [  679],
        [   77],
        [  492],
        [  257],
        [    6],
        [  373],
        [  286],
        [  373],
        [  220],
        [ 1356],
        [   41],
        [   77],
        [  492],

torch.Size([1, 128])
next word:  भइरहlकo
162 Gen_data:  tensor([[ 1004],
        [ 9286],
        [   41],
        [   77],
        [  345],
        [    6],
        [  628],
        [  552],
        [  267],
        [ 1004],
        [10984],
        [  429],
        [   61],
        [  373],
        [ 1004],
        [ 3423],
        [ 2519],
        [  492],
        [  679],
        [   77],
        [  492],
        [  257],
        [    6],
        [  373],
        [ 1004],
        [ 3423],
        [ 2519],
        [  492],
        [  679],
        [   77],
        [  492],
        [  257],
        [    6],
        [  373],
        [ 1004],
        [ 3423],
        [ 2519],
        [  492],
        [  679],
        [   77],
        [  492],
        [  257],
        [    6],
        [  373],
        [  286],
        [  373],
        [  220],
        [ 1356],
        [   41],
        [   77],
        [  492],
        [  257],
        [    6],
        [  373],
        [  286],
        [

next word:  काम
168 Gen_data:  tensor([[  628],
        [  552],
        [  267],
        [ 1004],
        [10984],
        [  429],
        [   61],
        [  373],
        [ 1004],
        [ 3423],
        [ 2519],
        [  492],
        [  679],
        [   77],
        [  492],
        [  257],
        [    6],
        [  373],
        [ 1004],
        [ 3423],
        [ 2519],
        [  492],
        [  679],
        [   77],
        [  492],
        [  257],
        [    6],
        [  373],
        [ 1004],
        [ 3423],
        [ 2519],
        [  492],
        [  679],
        [   77],
        [  492],
        [  257],
        [    6],
        [  373],
        [  286],
        [  373],
        [  220],
        [ 1356],
        [   41],
        [   77],
        [  492],
        [  257],
        [    6],
        [  373],
        [  286],
        [  373],
        [ 1004],
        [ 3423],
        [ 2519],
        [  492],
        [  679],
        [   77],
        [  492],


173 Gen_data:  tensor([[  429],
        [   61],
        [  373],
        [ 1004],
        [ 3423],
        [ 2519],
        [  492],
        [  679],
        [   77],
        [  492],
        [  257],
        [    6],
        [  373],
        [ 1004],
        [ 3423],
        [ 2519],
        [  492],
        [  679],
        [   77],
        [  492],
        [  257],
        [    6],
        [  373],
        [ 1004],
        [ 3423],
        [ 2519],
        [  492],
        [  679],
        [   77],
        [  492],
        [  257],
        [    6],
        [  373],
        [  286],
        [  373],
        [  220],
        [ 1356],
        [   41],
        [   77],
        [  492],
        [  257],
        [    6],
        [  373],
        [  286],
        [  373],
        [ 1004],
        [ 3423],
        [ 2519],
        [  492],
        [  679],
        [   77],
        [  492],
        [  257],
        [    6],
        [  373],
        [ 1004],
        [ 3423],
        [ 2519],

        [  373]], device='cuda:0') Pred_data:  tensor([[ 257,   77,  492,  257,    6,  492,  286, 3423, 2519,  492,  679,   77,
          492,  257,    6,  492,  286, 9286, 2519,  492,  679,   77,  492,  257,
            6,  492,  286,  373,  220, 1356,   41,   77,  492,  257,    6,  492,
          286,  373,  220, 9286, 2519,  492,  679,   77,  492,  257,    6,  373,
          286, 9286, 2519,  492,  679,   77,  492,  257,    6,  492,  286,  373,
          220, 9286,  492,  679,   77,  492,  257,    6,  492, 1004,  373,  220,
         1356,  492,  679,   77,  492,  257,    6,  492,  286, 9286, 6231,   61,
          373, 1004, 9286,  492,  679,   77,  492,  257,    6,  492,  286, 9286,
         2519,  492,  679,   77,  492,  257,    6,  492,  257,    6,  373,  286,
          373,  220, 1356,  492,   77,  492,  257,    6,  373,  286,  373,  220,
         1356,  492,   77,  492,  257,    6,  373,  286]], device='cuda:0')
i: 180
torch.Size([1, 128])
next word:  काम
180 Gen_data:  tensor([

torch.Size([1, 128])
next word:  भनl
186 Gen_data:  tensor([[ 1004],
        [ 3423],
        [ 2519],
        [  492],
        [  679],
        [   77],
        [  492],
        [  257],
        [    6],
        [  373],
        [ 1004],
        [ 3423],
        [ 2519],
        [  492],
        [  679],
        [   77],
        [  492],
        [  257],
        [    6],
        [  373],
        [  286],
        [  373],
        [  220],
        [ 1356],
        [   41],
        [   77],
        [  492],
        [  257],
        [    6],
        [  373],
        [  286],
        [  373],
        [ 1004],
        [ 3423],
        [ 2519],
        [  492],
        [  679],
        [   77],
        [  492],
        [  257],
        [    6],
        [  373],
        [ 1004],
        [ 3423],
        [ 2519],
        [  492],
        [  679],
        [   77],
        [  492],
        [  257],
        [    6],
        [  373],
        [  286],
        [  373],
        [ 1004],
        [ 928

next word:  भइरहlकo
192 Gen_data:  tensor([[  492],
        [  257],
        [    6],
        [  373],
        [ 1004],
        [ 3423],
        [ 2519],
        [  492],
        [  679],
        [   77],
        [  492],
        [  257],
        [    6],
        [  373],
        [  286],
        [  373],
        [  220],
        [ 1356],
        [   41],
        [   77],
        [  492],
        [  257],
        [    6],
        [  373],
        [  286],
        [  373],
        [ 1004],
        [ 3423],
        [ 2519],
        [  492],
        [  679],
        [   77],
        [  492],
        [  257],
        [    6],
        [  373],
        [ 1004],
        [ 3423],
        [ 2519],
        [  492],
        [  679],
        [   77],
        [  492],
        [  257],
        [    6],
        [  373],
        [  286],
        [  373],
        [ 1004],
        [ 9286],
        [  492],
        [  679],
        [   77],
        [  492],
        [  257],
        [    6],
        [  37

198 Gen_data:  tensor([[ 2519],
        [  492],
        [  679],
        [   77],
        [  492],
        [  257],
        [    6],
        [  373],
        [  286],
        [  373],
        [  220],
        [ 1356],
        [   41],
        [   77],
        [  492],
        [  257],
        [    6],
        [  373],
        [  286],
        [  373],
        [ 1004],
        [ 3423],
        [ 2519],
        [  492],
        [  679],
        [   77],
        [  492],
        [  257],
        [    6],
        [  373],
        [ 1004],
        [ 3423],
        [ 2519],
        [  492],
        [  679],
        [   77],
        [  492],
        [  257],
        [    6],
        [  373],
        [  286],
        [  373],
        [ 1004],
        [ 9286],
        [  492],
        [  679],
        [   77],
        [  492],
        [  257],
        [    6],
        [  373],
        [  286],
        [  373],
        [  220],
        [ 1356],
        [  492],
        [  679],
        [   77],

In [130]:
# Convert the collected generation outputs to plain Python scalars.
# The hasattr guard keeps this cell re-runnable: after the first run z_ already
# holds plain values (no .item()), and the unguarded comprehension would raise
# AttributeError on a second execution.
z_ = [t.item() if hasattr(t, "item") else t for t in z_]
# NOTE(review): split_list / splits_to_token are defined elsewhere in the
# notebook; presumably they group the ids and map each group back to text.
# The displayed result is a list of text segments.
m = splits_to_token(split_list(z_))
m

['आफaनo घर बनाउनl र घर बनाउनl काम गरaनl र घर बनाउनl काम गरaनl काम गरaनl काम',
 'गरaन सकaनl भएकालl',
 'काम अघि बढlकo छ । तर? घर निरaमाण र घर निरaमाण कारaय अघि बढाइएकo छ । तर? घर निरaमाण कारaय अघि नबढaनl र काम अघि बढaन नसकlकo उनलl बताए । उनलl भनl? काम अघि बढaन नसकlकo उनलl बताए । उनलl भनl? काम अघि बढaन नसकlकo उनलl बताए । उनलl भनl? काम गरaनl काम',
 'भइरहlकo छ । उनलl भनl? काम गरaनl काम अघि बढaन नसकlकo उनलl बताए । उनलl भनl? काम अघि बढaन नसकlकo उनलl बताए । उनलl भनl? काम गरaनl काम अघि बढाइएकo उनलl बताए । उनलl भनl? काम गरaनl काम',
 'भइरहlकo उनलl बताए । उनलl भनl? काम अघि नबढaनl र काम अघि बढाइएकo उनलl बताए । उनलl भनl? काम अघि बढaन नसकlकo उनलl बताए । उनलl भनl? उनलl भनl? काम गरaनl काम',
 'भइरहlकo छ । उनलl भनl? काम गरaनl काम',
 'भइरहlकo छ । उनलl भनl? काम गरaनl काम',
 'भइरहlकo छ । उनलl भनl? काम गरaनl काम',
 'भइरहlकo छ । उनलl भनl? काम गरaनl']

In [132]:
# Decode the prompt pieces (st) followed by the generated segments (m).
# The two parts are joined with a single space: plain concatenation glued the
# last prompt word onto the first generated word. Empty parts are skipped so
# no stray leading or trailing space is introduced.
prompt_text = ' '.join(st)
generated_text = ' '.join(m)
word_piece_decoder(' '.join(part for part in (prompt_text, generated_text) if part))

'हरेक सेपालीलेआफ्नो घर बनाउने र घर बनाउने काम गर्ने र घर बनाउने काम गर्ने काम गर्ने काम गर्न सक्ने भएकाले काम अघि बढेको छ । तर? घर निर्माण र घर निर्माण कार्य अघि बढाइएको छ । तर? घर निर्माण कार्य अघि नबढ्ने र काम अघि बढ्न नसकेको उनले बताए । उनले भने? काम अघि बढ्न नसकेको उनले बताए । उनले भने? काम अघि बढ्न नसकेको उनले बताए । उनले भने? काम गर्ने काम भइरहेको छ । उनले भने? काम गर्ने काम अघि बढ्न नसकेको उनले बताए । उनले भने? काम अघि बढ्न नसकेको उनले बताए । उनले भने? काम गर्ने काम अघि बढाइएको उनले बताए । उनले भने? काम गर्ने काम भइरहेको उनले बताए । उनले भने? काम अघि नबढ्ने र काम अघि बढाइएको उनले बताए । उनले भने? काम अघि बढ्न नसकेको उनले बताए । उनले भने? उनले भने? काम गर्ने काम भइरहेको छ । उनले भने? काम गर्ने काम भइरहेको छ । उनले भने? काम गर्ने काम भइरहेको छ । उनले भने? काम गर्ने काम भइरहेको छ । उनले भने? काम गर्ने'