In [2]:
import torch
from torchtext.vocab import build_vocab_from_iterator
from torchtext.data.utils import get_tokenizer

from torch.utils.data import dataset

from torch import Tensor, nn
from torch.nn import TransformerEncoder, TransformerEncoderLayer



In [3]:
def try_gpu(i=0):
    if torch.cuda.device_count() >= i+1:
        return torch.device(f'cuda:{i}')
    return torch.device('cpu')


device = try_gpu(0)

# Preprocessing Text

In [4]:
import regex as re


#Data path to ne_dedup.txt
datapath = '../data/ne_dedup.txt'

with open(datapath, 'r', encoding='utf-8') as f:
    text = f.read()
    text = re.sub(r'\s*[\u0964]\s*', r'\u0020\u0964\u0020', text)
    text = re.sub(r'\s*[\u003f]\s*', r'\u0020\u003f\u0020', text)
    text = re.sub(r'\s*[\u002c]\s*', r'\u0020\u003f\u0020', text)
    text = re.sub(r'\s*\n\s*','\n', text)
    #text = re.sub(r'\s*[\u0966-\u0976]+\s*','\u0020[\u0966-\u0976]\u0020', text)
    #text = re.sub(r'\s+?\s+', r'\u0020?\u0020', text)
    text = re.sub(r'[^\u0900-\u097F,?\s+]','', text)
    
    

In [9]:
c = re.sub(r'[^\u0900-\u097F,?]','','?नमस्तेnishant,')
#c = re.sub(r'\s*[\u002c]\s*','\u0020\u002c\u0020','?नमस्तेnishant,')
c


'?नमस्ते,'

In [193]:
hex(ord('०'))

'0x966'

In [10]:
len(text.split('\n'))

341961

In [5]:
train_split = 300000



train_iter_first = text.split('\n')[:train_split]
# train_iter_second = text.split('\n')[:35000]
test_iter = text.split('\n')[train_split:]

In [6]:
train_iter_first[:100]

['बर्दिबास नगरपालिकाको तेस्रो नगर परिषदबाट पारित आव२०७३ । ७४ को संशोधित र २०७४ । ७५ को प्रस्तावित नीति ? कार्यक्रम तथा बजेट',
 'अार्थिक वर्ष २०७५७६ काे नदिजन्य पदार्थकाे उत्खनन् गरी बिक्रि वितरण तथा अान्तरिक निकासी गर्ने कार्यकाे बाेलपत्र सम्बन्धी सुचना',
 'सक्षार सप्तरी अभियानमा सप्तरीबासी सम्पूर्ण सरोकारवालाहरुको सहयोग र सहभागिताकाो लागि अनुराोध छ ।  सामुदायिक अध्ययन केन्द्रहरूको नविकरण सम्बन्धमा । ',
 'काठमाडौं ? १२ कातिक । राष्ट्रपति विद्यादेवी भण्डारी मित्रराष्ट्र कतारको चार दिवसीय औपचारिक भ्रमणमा आज त्यसतर्फ प्रस्थान गरेकी छन् । राष्ट्रपति विद्यादेवी भण्डारी कतारका अमिर शेख हमाद बीन खालिदा अल थानीको मैत्रीपूर्ण निमन्त्रणामा चार दिवसीय औपचारिक',
 'काठमाडौँ ? २६ कात्तिक । सरकारले सङ्घ ? प्रदेश र स्थानीय तहमा कर्मचारी समायोजन गर्नका लागि कर्मचारी समायोजन अध्यादेश२०७५ ल्याउने तयारी गरेको छ । सरकारले यसअघि ल्याएको',
 'काठमाडौं ? २६ कातिक । महानायक राजेश हमाल अहिले चलचित्र क्षेत्रमा पातलिए पनि उनको सिने जगतमा नामै काफी छ । कुनै समय बलिउड सुपरस्टार अमिताभ वच्चनसँग',
 'काठमाडौं ? २६ काति

In [197]:
print(len(train_iter_first),len(train_iter_second),len(test_iter))

30000 35000 5000


In [7]:
import pickle


tokenizer = get_tokenizer(None)
max_tokens = 350000 #Three hundered And Fivety Thousand

vocab = build_vocab_from_iterator(
    map(tokenizer, train_iter_first), specials=['<unk>'],max_tokens = max_tokens)
vocab.set_default_index(vocab['<unk>'])


# Save for first time
# with open('transformer_vocab.pickle','wb') as f:
#     pickle.dump(vocab,f)

# Otherwise load
# with open('transformer_vocab.pickle','rb') as f:
#     vocab = pickle.load(f)

In [8]:
def data_process(raw_text_iter: dataset.IterableDataset) -> Tensor:
    """Converts raw text into a flat Tensor."""
    data = [torch.tensor(vocab(tokenizer(item)), dtype=torch.long)
            for item in raw_text_iter]
    return torch.cat(tuple(filter(lambda t: t.numel() > 0, data)))

In [19]:
vocab['<unk>']

0

In [200]:
#device = torch.device('cpu')

In [20]:
#Train and Test Split

train_data = data_process(train_iter_first)
test_data = data_process(test_iter)

In [21]:
train_data[:100]

tensor([ 20481,   1080,    490,    743,  29398,    893, 164058,      1,   3608,
            33,  12075,      3,    835,      1,   1382,     33,   2849,    321,
             2,    102,     14,    364,  20723,     81,   5409,   8406,  39570,
        213968,  11917,     30,   6163,    433,     14,  54456,   4008,     13,
        126991,  72680,    615,   2964,      0,   4002,   1726, 337340,    470,
         21393,    123,      3,      0,      9,      0,      4,      1,    876,
           326,  56619,  10647,    653,      1,     54,      2,    214,  20657,
             1,    338,   3191,   1271,  12882,   7646,    233,   8467,   1164,
          1424,     91,   5979,   3550,    659,      7,      1,    338,   3191,
          1271,   9711,  16964,   7317,  29596,  41555,  35047,   3849,  44092,
          6219,   9151,    233,   8467,   1164,    181,      2,    642,    418,
             1])

In [23]:
print(len(vocab))
print(len(train_iter_first),len(test_iter))
torch.cuda.empty_cache() 

torch.cuda.memory_allocated() 

350000
300000 41961


0

In [15]:
def batchify(data: Tensor, bsz: int) -> Tensor:
    """Divides the data into bsz separate sequences, removing extra elements
    that wouldn't cleanly fit.
    Args:
        data: Tensor, shape [N]
        bsz: int, batch size
    Returns:
        Tensor of shape [N // bsz, bsz]
    """
    seq_len = data.size(0) // bsz
    data = data[:seq_len * bsz]
    data = data.view(bsz, seq_len).t().contiguous()
    return data.to(device)

In [4]:
# batch_size = 35
# eval_batch_size = 10

# train_data = batchify(train_data, batch_size)  # shape [seq_len, batch_size]
# test_data = batchify(test_data, eval_batch_size)

# Working with a dummy Sample

In [17]:
#Sample Data


text = ['आधिकारिक निर्णयको कारणले , वाणिज्य बिभागले , संयुक्त राज्य अमेरिकी समुद्री पानी निर्माताद्वारा संयुक्त राज्य']
#text = ['जनसंख्या']
sample_data = data_process(
    text)

In [207]:
sample_data

tensor([ 2086,  5694,   568,     0,   897, 28361,     0,   357,   465,   410,
         6548,   293,     0,   357,   465])

In [18]:
sample_data = batchify(sample_data, 2)
print("Given word:", text[0])
sample_data

Given word: आधिकारिक निर्णयको कारणले , वाणिज्य बिभागले , संयुक्त राज्य अमेरिकी समुद्री पानी निर्माताद्वारा संयुक्त राज्य


tensor([[ 2086,   357],
        [ 5694,   465],
        [  568,   410],
        [    0,  6548],
        [  897,   293],
        [28361,     0],
        [    0,   357]], device='cuda:0')

In [209]:
train_data.shape

torch.Size([7540440])

In [37]:
bptt = 16

In [210]:
bptt = 16
train_data = batchify(train_data, bptt)  # shape [seq_len, batch_size]
test_data = batchify(test_data, bptt)


import math
def get_batch(source: Tensor, i: int) -> tuple[Tensor, Tensor]:
    """
    Args:
        source: Tensor, shape [full_seq_len, batch_size]
        i: int
    Returns:
        tuple (data, target), where data has shape [seq_len, batch_size] and
        target has shape [seq_len * batch_size]
    """
    seq_len = min(bptt, len(source) - 1 - i)
    data = source[i:i+seq_len]
    target = source[i+1:i+1+seq_len].reshape(-1)
    #target = source[i+1:i+1+seq_len]
    return data, target

# Model Definition

In [19]:
class PositionalEncoding(nn.Module):

    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2)
                             * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        pe[:, 0, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x: Tensor) -> Tensor:
        """
        Args:
            x: Tensor, shape [seq_len, batch_size, embedding_dim]
        """
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)





class TransformerModel(nn.Module):

    def __init__(self, ntoken: int, d_model: int, nhead: int, d_hid: int,
                 nlayers: int, dropout: float = 0.5):
        super().__init__()
        self.model_type = 'Transformer'
        self.pos_encoder = PositionalEncoding(d_model, dropout)
        encoder_layers = TransformerEncoderLayer(
            d_model, nhead, d_hid, dropout)
        self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)
        self.encoder = nn.Embedding(ntoken, d_model)
        self.d_model = d_model
        self.decoder = nn.Linear(d_model, ntoken)

        self.init_weights()

    def init_weights(self) -> None:
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.zero_()
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, src: Tensor, src_mask: Tensor) -> Tensor:
        """
        Args:
            src: Tensor, shape [seq_len, batch_size]
            src_mask: Tensor, shape [seq_len, seq_len]
        Returns:
            output Tensor of shape [seq_len, batch_size, ntoken]
        """
        src = self.encoder(src) * math.sqrt(self.d_model)
        src = self.pos_encoder(src)
        output = self.transformer_encoder(src, src_mask)
        output = self.decoder(output)
        return output

In [212]:
# torch.cuda.memory_stats()


In [213]:
torch.cuda.memory_allocated()

2599095296

In [214]:
#device = torch.device('cpu')

# Hyper-Parameter Tuning

In [21]:
import math

In [22]:
ntokens = len(vocab)  # size of vocabulary
emsize = 300  # embedding dimension
d_hid = 800  # dimension of the feedforward network model in nn.TransformerEncoder
nlayers = 4 # number of nn.TransformerEncoderLayer in nn.TransformerEncoder
nhead = 4 # number of heads in nn.MultiheadAttention
dropout = 0.05  # dropout probability
model = TransformerModel(ntokens, emsize,nhead, d_hid,
                         nlayers, dropout).to(device)

In [23]:
criterion = nn.CrossEntropyLoss()
lr = 1  # learning rate
optimizer = torch.optim.SGD(model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)

In [24]:
def generate_square_subsequent_mask(sz: int) -> Tensor:
    """Generates an upper-triangular matrix of -inf, with zeros on diag."""
    return torch.triu(torch.ones(sz, sz) * float('-inf'), diagonal=1)

In [218]:
def train(model: nn.Module) -> None:
    model.train()  # turn on train mode
    total_loss = 0.
    log_interval = 200
    start_time = time.time()
    src_mask = generate_square_subsequent_mask(bptt).to(device)

    num_batches = len(train_data) // bptt
    for batch, i in enumerate(range(0, train_data.size(0) - 1, bptt)):
        data, targets = get_batch(train_data, i)
        batch_size = data.size(0)
        if batch_size != bptt:  # only on last batch
            src_mask = src_mask[:batch_size, :batch_size]
        output = model(data, src_mask)
        #print(type(output))
        loss = criterion(output.view(-1, ntokens), targets)

        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()

        total_loss += loss.item()
        if batch % log_interval == 0 and batch > 0:
            lr = scheduler.get_last_lr()[0]
            ms_per_batch = (time.time() - start_time) * 1000 / log_interval
            cur_loss = total_loss / log_interval
            ppl = math.exp(cur_loss)
            print(f'| epoch {epoch:3d} | {batch:5d}/{num_batches:5d} batches | '
                  f'lr {lr:02.2f} | ms/batch {ms_per_batch:5.2f} | '
                  f'loss {cur_loss:5.2f} | ppl {ppl:8.2f}')
            total_loss = 0
            start_time = time.time()

In [40]:
softmax = nn.Softmax(dim=2)
#softmax = nn.LogSoftmax(dim=2)

In [220]:
def evaluate(model: nn.Module, eval_data: Tensor) -> float:
    model.eval()  # turn on evaluation mode
    total_loss = 0.
    src_mask = generate_square_subsequent_mask(bptt).to(device)
    with torch.no_grad():
        for i in range(0, eval_data.size(0) - 1, bptt):
            data, targets = get_batch(eval_data, i)
            batch_size = data.size(0)
            if batch_size != bptt:
                src_mask = src_mask[:batch_size, :batch_size]
            output = model(data, src_mask)
            output_softmax = softmax(output)
            output_softmax_permuted = output_softmax.permute(1, 0, 2)
            indices = torch.argmax(output_softmax_permuted, dim=2)
            target_indices = targets.t()
            # print(output)
            output_flat = output.view(-1, ntokens)
            total_loss += batch_size * criterion(output_flat, targets).item()

    return total_loss / (len(eval_data) - 1)


In [221]:
#            print('data')
#             print(list([vocab.lookup_tokens(list(index))
#                         for index in data.t()]))
#             print(indices)
#             print(list([vocab.lookup_tokens(list(index))
#                         for index in indices]))
#             print(len(targets))
#             print(list([vocab.lookup_tokens(list(index))
#                         for index in target_indices]))

In [222]:
        # target_indices = targets.t()
        # print(indices.shape)
        # print(indices)
        # temp_gen_data = [vocab.lookup_tokens(
        #    list(index)) for index in indices][0][i]
        # gen_data = [vocab.lookup_tokens(
        #    list(index)) for index in indices][0]
        # print(temp_text)
        # print([[vocab.lookup_tokens(list(index))
        #        for index in indices][0][i]])
#         temp_text = [[vocab.lookup_tokens(list(index))]
#                       for index in indices[0][i]]
        # print(temp_text)
#         temp_text = [' '.join(temp_text)]
        # print(temp_text)
        # gen_data = vocab.lookup_tokens((list(gen_data))) + list(temp_gen_data)
        # gen_data = torch.tensor(
        #    (list(gen_data[0])+list(indices[0][i]))).unsqueeze(0)
        # gen_data = torch.concat([gen_data, torch.tensor(indices[0][i])], dim=1)
        #gen_data = ' '.join(gen_data)
        #return temp_txt
#         gen_data = data_process(temp_text)
#         gen_data = batchify(gen_data, 1)
        #return gen_data

In [223]:
# import os
# os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"

# Training Data

In [81]:
# Loop over epochs. Save the model if the validation loss is the best
# we've seen so far. Adjust the learning rate after each epoch.
import time
import copy
best_val_loss = float('inf')
epochs = 8
best_model = None
for epoch in range(1, epochs + 1):
    epoch_start_time = time.time()
    train(model)
    eval_loss = evaluate(model, test_data)
    eval_ppl = math.exp(eval_loss)
    elapsed = time.time() - epoch_start_time
    print('-' * 89)
    print(f'| end of epoch {epoch:3d} | time: {elapsed:5.2f}s | '
         f'valid loss {eval_loss:5.2f} | valid ppl {eval_ppl:8.2f}')
    print('-' * 89)
    #best_model = copy.deepcopy(model)
    if eval_loss < best_val_loss:
       best_val_loss = eval_loss
       best_model = copy.deepcopy(model)

#     scheduler.step()
#save model
#torch.save(best_model.state_dict(),'best_model_3bigx3.pt')
torch.save(best_model.state_dict(),'best_model_3bigx3_corrected.pt')
torch.save(model.state_dict(),'training_model_3bigx3_corrected.pt')


| epoch   1 |   200/29454 batches | lr 1.00 | ms/batch 153.47 | loss 10.72 | ppl 45202.67
| epoch   1 |   400/29454 batches | lr 1.00 | ms/batch 150.94 | loss  9.61 | ppl 14900.30
| epoch   1 |   600/29454 batches | lr 1.00 | ms/batch 151.98 | loss  9.26 | ppl 10511.59
| epoch   1 |   800/29454 batches | lr 1.00 | ms/batch 152.24 | loss  9.10 | ppl  8980.99
| epoch   1 |  1000/29454 batches | lr 1.00 | ms/batch 152.16 | loss  8.98 | ppl  7973.21
| epoch   1 |  1200/29454 batches | lr 1.00 | ms/batch 152.26 | loss  8.79 | ppl  6593.27
| epoch   1 |  1400/29454 batches | lr 1.00 | ms/batch 152.27 | loss  8.70 | ppl  5998.29
| epoch   1 |  1600/29454 batches | lr 1.00 | ms/batch 153.06 | loss  8.64 | ppl  5667.70
| epoch   1 |  1800/29454 batches | lr 1.00 | ms/batch 152.68 | loss  8.64 | ppl  5674.85
| epoch   1 |  2000/29454 batches | lr 1.00 | ms/batch 152.50 | loss  8.53 | ppl  5072.71
| epoch   1 |  2200/29454 batches | lr 1.00 | ms/batch 152.49 | loss  8.50 | ppl  4928.34
| epoch   

| epoch   1 | 18600/29454 batches | lr 1.00 | ms/batch 152.56 | loss  7.58 | ppl  1962.75
| epoch   1 | 18800/29454 batches | lr 1.00 | ms/batch 152.65 | loss  7.57 | ppl  1929.49
| epoch   1 | 19000/29454 batches | lr 1.00 | ms/batch 152.69 | loss  7.34 | ppl  1545.26
| epoch   1 | 19200/29454 batches | lr 1.00 | ms/batch 152.71 | loss  7.27 | ppl  1434.26
| epoch   1 | 19400/29454 batches | lr 1.00 | ms/batch 152.70 | loss  7.35 | ppl  1554.65
| epoch   1 | 19600/29454 batches | lr 1.00 | ms/batch 152.58 | loss  7.21 | ppl  1352.80
| epoch   1 | 19800/29454 batches | lr 1.00 | ms/batch 152.56 | loss  7.27 | ppl  1441.17
| epoch   1 | 20000/29454 batches | lr 1.00 | ms/batch 152.54 | loss  7.34 | ppl  1536.96
| epoch   1 | 20200/29454 batches | lr 1.00 | ms/batch 152.53 | loss  7.37 | ppl  1585.04
| epoch   1 | 20400/29454 batches | lr 1.00 | ms/batch 152.59 | loss  7.28 | ppl  1449.97
| epoch   1 | 20600/29454 batches | lr 1.00 | ms/batch 152.59 | loss  7.38 | ppl  1609.06
| epoch   

| epoch   2 |  7000/29454 batches | lr 1.00 | ms/batch 152.86 | loss  7.10 | ppl  1207.73
| epoch   2 |  7200/29454 batches | lr 1.00 | ms/batch 152.75 | loss  6.98 | ppl  1080.15
| epoch   2 |  7400/29454 batches | lr 1.00 | ms/batch 152.82 | loss  7.01 | ppl  1103.26
| epoch   2 |  7600/29454 batches | lr 1.00 | ms/batch 152.70 | loss  7.19 | ppl  1319.76
| epoch   2 |  7800/29454 batches | lr 1.00 | ms/batch 152.82 | loss  7.08 | ppl  1193.51
| epoch   2 |  8000/29454 batches | lr 1.00 | ms/batch 152.81 | loss  7.35 | ppl  1554.22
| epoch   2 |  8200/29454 batches | lr 1.00 | ms/batch 152.85 | loss  7.01 | ppl  1108.28
| epoch   2 |  8400/29454 batches | lr 1.00 | ms/batch 152.74 | loss  6.89 | ppl   983.90
| epoch   2 |  8600/29454 batches | lr 1.00 | ms/batch 152.86 | loss  7.10 | ppl  1206.01
| epoch   2 |  8800/29454 batches | lr 1.00 | ms/batch 152.78 | loss  7.05 | ppl  1151.95
| epoch   2 |  9000/29454 batches | lr 1.00 | ms/batch 152.84 | loss  6.83 | ppl   929.74
| epoch   

| epoch   2 | 25400/29454 batches | lr 1.00 | ms/batch 152.73 | loss  6.81 | ppl   905.35
| epoch   2 | 25600/29454 batches | lr 1.00 | ms/batch 152.77 | loss  6.81 | ppl   905.63
| epoch   2 | 25800/29454 batches | lr 1.00 | ms/batch 152.71 | loss  6.75 | ppl   855.11
| epoch   2 | 26000/29454 batches | lr 1.00 | ms/batch 152.77 | loss  6.90 | ppl   992.94
| epoch   2 | 26200/29454 batches | lr 1.00 | ms/batch 152.98 | loss  6.66 | ppl   784.22
| epoch   2 | 26400/29454 batches | lr 1.00 | ms/batch 152.82 | loss  6.82 | ppl   920.46
| epoch   2 | 26600/29454 batches | lr 1.00 | ms/batch 152.71 | loss  6.75 | ppl   852.40
| epoch   2 | 26800/29454 batches | lr 1.00 | ms/batch 152.73 | loss  6.94 | ppl  1028.38
| epoch   2 | 27000/29454 batches | lr 1.00 | ms/batch 152.63 | loss  6.90 | ppl   995.50
| epoch   2 | 27200/29454 batches | lr 1.00 | ms/batch 152.71 | loss  6.80 | ppl   894.98
| epoch   2 | 27400/29454 batches | lr 1.00 | ms/batch 152.77 | loss  6.80 | ppl   901.30
| epoch   

| epoch   3 | 13800/29454 batches | lr 1.00 | ms/batch 152.77 | loss  6.74 | ppl   843.76
| epoch   3 | 14000/29454 batches | lr 1.00 | ms/batch 152.71 | loss  6.63 | ppl   755.16
| epoch   3 | 14200/29454 batches | lr 1.00 | ms/batch 152.77 | loss  6.62 | ppl   752.67
| epoch   3 | 14400/29454 batches | lr 1.00 | ms/batch 152.67 | loss  6.74 | ppl   845.77
| epoch   3 | 14600/29454 batches | lr 1.00 | ms/batch 152.74 | loss  6.60 | ppl   733.52
| epoch   3 | 14800/29454 batches | lr 1.00 | ms/batch 152.67 | loss  6.49 | ppl   661.01
| epoch   3 | 15000/29454 batches | lr 1.00 | ms/batch 152.65 | loss  6.48 | ppl   652.73
| epoch   3 | 15200/29454 batches | lr 1.00 | ms/batch 152.81 | loss  6.42 | ppl   610.94
| epoch   3 | 15400/29454 batches | lr 1.00 | ms/batch 152.64 | loss  6.45 | ppl   633.30
| epoch   3 | 15600/29454 batches | lr 1.00 | ms/batch 152.70 | loss  6.45 | ppl   634.47
| epoch   3 | 15800/29454 batches | lr 1.00 | ms/batch 152.76 | loss  6.42 | ppl   611.99
| epoch   

| epoch   4 |  2200/29454 batches | lr 1.00 | ms/batch 152.88 | loss  6.29 | ppl   540.75
| epoch   4 |  2400/29454 batches | lr 1.00 | ms/batch 152.72 | loss  6.46 | ppl   639.96
| epoch   4 |  2600/29454 batches | lr 1.00 | ms/batch 152.80 | loss  6.38 | ppl   591.34
| epoch   4 |  2800/29454 batches | lr 1.00 | ms/batch 152.71 | loss  6.30 | ppl   543.65
| epoch   4 |  3000/29454 batches | lr 1.00 | ms/batch 152.61 | loss  6.71 | ppl   817.46
| epoch   4 |  3200/29454 batches | lr 1.00 | ms/batch 152.73 | loss  6.46 | ppl   636.53
| epoch   4 |  3400/29454 batches | lr 1.00 | ms/batch 152.61 | loss  6.46 | ppl   637.17
| epoch   4 |  3600/29454 batches | lr 1.00 | ms/batch 152.66 | loss  6.44 | ppl   625.21
| epoch   4 |  3800/29454 batches | lr 1.00 | ms/batch 152.77 | loss  6.36 | ppl   578.93
| epoch   4 |  4000/29454 batches | lr 1.00 | ms/batch 152.73 | loss  6.36 | ppl   580.44
| epoch   4 |  4200/29454 batches | lr 1.00 | ms/batch 152.82 | loss  6.36 | ppl   579.66
| epoch   

| epoch   4 | 20600/29454 batches | lr 1.00 | ms/batch 152.02 | loss  6.37 | ppl   581.83
| epoch   4 | 20800/29454 batches | lr 1.00 | ms/batch 152.00 | loss  6.22 | ppl   501.98
| epoch   4 | 21000/29454 batches | lr 1.00 | ms/batch 152.05 | loss  6.21 | ppl   499.00
| epoch   4 | 21200/29454 batches | lr 1.00 | ms/batch 152.14 | loss  6.26 | ppl   521.30
| epoch   4 | 21400/29454 batches | lr 1.00 | ms/batch 152.18 | loss  6.27 | ppl   526.04
| epoch   4 | 21600/29454 batches | lr 1.00 | ms/batch 152.27 | loss  6.24 | ppl   512.16
| epoch   4 | 21800/29454 batches | lr 1.00 | ms/batch 152.14 | loss  6.32 | ppl   552.87
| epoch   4 | 22000/29454 batches | lr 1.00 | ms/batch 152.05 | loss  6.30 | ppl   542.56
| epoch   4 | 22200/29454 batches | lr 1.00 | ms/batch 152.09 | loss  6.21 | ppl   496.34
| epoch   4 | 22400/29454 batches | lr 1.00 | ms/batch 152.04 | loss  6.33 | ppl   562.03
| epoch   4 | 22600/29454 batches | lr 1.00 | ms/batch 152.13 | loss  6.34 | ppl   564.18
| epoch   

| epoch   5 |  9000/29454 batches | lr 1.00 | ms/batch 152.76 | loss  6.05 | ppl   422.98
| epoch   5 |  9200/29454 batches | lr 1.00 | ms/batch 152.71 | loss  6.25 | ppl   516.99
| epoch   5 |  9400/29454 batches | lr 1.00 | ms/batch 152.71 | loss  6.14 | ppl   463.03
| epoch   5 |  9600/29454 batches | lr 1.00 | ms/batch 152.68 | loss  6.27 | ppl   529.79
| epoch   5 |  9800/29454 batches | lr 1.00 | ms/batch 152.73 | loss  6.23 | ppl   509.60
| epoch   5 | 10000/29454 batches | lr 1.00 | ms/batch 152.77 | loss  6.25 | ppl   517.82
| epoch   5 | 10200/29454 batches | lr 1.00 | ms/batch 152.73 | loss  6.25 | ppl   516.58
| epoch   5 | 10400/29454 batches | lr 1.00 | ms/batch 152.94 | loss  6.09 | ppl   442.98
| epoch   5 | 10600/29454 batches | lr 1.00 | ms/batch 152.88 | loss  6.23 | ppl   509.48
| epoch   5 | 10800/29454 batches | lr 1.00 | ms/batch 152.76 | loss  6.01 | ppl   408.10
| epoch   5 | 11000/29454 batches | lr 1.00 | ms/batch 152.76 | loss  6.04 | ppl   418.27
| epoch   

| epoch   5 | 27400/29454 batches | lr 1.00 | ms/batch 152.91 | loss  6.12 | ppl   454.76
| epoch   5 | 27600/29454 batches | lr 1.00 | ms/batch 152.79 | loss  6.15 | ppl   470.80
| epoch   5 | 27800/29454 batches | lr 1.00 | ms/batch 152.86 | loss  6.19 | ppl   487.49
| epoch   5 | 28000/29454 batches | lr 1.00 | ms/batch 152.87 | loss  6.21 | ppl   499.33
| epoch   5 | 28200/29454 batches | lr 1.00 | ms/batch 152.92 | loss  6.15 | ppl   470.69
| epoch   5 | 28400/29454 batches | lr 1.00 | ms/batch 152.83 | loss  6.16 | ppl   475.12
| epoch   5 | 28600/29454 batches | lr 1.00 | ms/batch 152.91 | loss  5.98 | ppl   395.54
| epoch   5 | 28800/29454 batches | lr 1.00 | ms/batch 152.89 | loss  6.02 | ppl   410.08
| epoch   5 | 29000/29454 batches | lr 1.00 | ms/batch 152.88 | loss  6.11 | ppl   452.53
| epoch   5 | 29200/29454 batches | lr 1.00 | ms/batch 152.95 | loss  6.02 | ppl   410.76
| epoch   5 | 29400/29454 batches | lr 1.00 | ms/batch 152.89 | loss  6.00 | ppl   401.63
----------

| epoch   6 | 15800/29454 batches | lr 1.00 | ms/batch 152.94 | loss  5.85 | ppl   348.48
| epoch   6 | 16000/29454 batches | lr 1.00 | ms/batch 152.94 | loss  5.90 | ppl   366.75
| epoch   6 | 16200/29454 batches | lr 1.00 | ms/batch 152.95 | loss  5.93 | ppl   377.28
| epoch   6 | 16400/29454 batches | lr 1.00 | ms/batch 152.93 | loss  6.01 | ppl   407.54
| epoch   6 | 16600/29454 batches | lr 1.00 | ms/batch 152.93 | loss  5.87 | ppl   353.56
| epoch   6 | 16800/29454 batches | lr 1.00 | ms/batch 153.00 | loss  5.92 | ppl   373.95
| epoch   6 | 17000/29454 batches | lr 1.00 | ms/batch 153.00 | loss  5.93 | ppl   375.36
| epoch   6 | 17200/29454 batches | lr 1.00 | ms/batch 152.93 | loss  6.07 | ppl   434.14
| epoch   6 | 17400/29454 batches | lr 1.00 | ms/batch 152.98 | loss  5.84 | ppl   344.59
| epoch   6 | 17600/29454 batches | lr 1.00 | ms/batch 153.00 | loss  5.89 | ppl   360.13
| epoch   6 | 17800/29454 batches | lr 1.00 | ms/batch 152.95 | loss  6.06 | ppl   428.42
| epoch   

| epoch   7 |  4200/29454 batches | lr 1.00 | ms/batch 153.74 | loss  5.82 | ppl   337.34
| epoch   7 |  4400/29454 batches | lr 1.00 | ms/batch 152.92 | loss  5.79 | ppl   326.00
| epoch   7 |  4600/29454 batches | lr 1.00 | ms/batch 152.91 | loss  5.90 | ppl   364.31
| epoch   7 |  4800/29454 batches | lr 1.00 | ms/batch 152.92 | loss  5.82 | ppl   338.56
| epoch   7 |  5000/29454 batches | lr 1.00 | ms/batch 152.97 | loss  5.97 | ppl   391.07
| epoch   7 |  5200/29454 batches | lr 1.00 | ms/batch 152.97 | loss  5.89 | ppl   363.17
| epoch   7 |  5400/29454 batches | lr 1.00 | ms/batch 152.98 | loss  5.94 | ppl   381.70
| epoch   7 |  5600/29454 batches | lr 1.00 | ms/batch 152.97 | loss  5.66 | ppl   287.87
| epoch   7 |  5800/29454 batches | lr 1.00 | ms/batch 153.07 | loss  5.82 | ppl   338.03
| epoch   7 |  6000/29454 batches | lr 1.00 | ms/batch 153.00 | loss  6.07 | ppl   434.81
| epoch   7 |  6200/29454 batches | lr 1.00 | ms/batch 152.98 | loss  5.89 | ppl   360.33
| epoch   

| epoch   7 | 22600/29454 batches | lr 1.00 | ms/batch 152.88 | loss  5.87 | ppl   354.24
| epoch   7 | 22800/29454 batches | lr 1.00 | ms/batch 152.93 | loss  5.79 | ppl   328.57
| epoch   7 | 23000/29454 batches | lr 1.00 | ms/batch 152.84 | loss  5.90 | ppl   364.60
| epoch   7 | 23200/29454 batches | lr 1.00 | ms/batch 152.87 | loss  5.84 | ppl   343.07
| epoch   7 | 23400/29454 batches | lr 1.00 | ms/batch 152.89 | loss  5.75 | ppl   314.05
| epoch   7 | 23600/29454 batches | lr 1.00 | ms/batch 152.92 | loss  5.90 | ppl   365.63
| epoch   7 | 23800/29454 batches | lr 1.00 | ms/batch 153.13 | loss  5.81 | ppl   334.38
| epoch   7 | 24000/29454 batches | lr 1.00 | ms/batch 152.99 | loss  5.83 | ppl   339.18
| epoch   7 | 24200/29454 batches | lr 1.00 | ms/batch 153.04 | loss  5.60 | ppl   270.06
| epoch   7 | 24400/29454 batches | lr 1.00 | ms/batch 152.94 | loss  5.67 | ppl   290.14
| epoch   7 | 24600/29454 batches | lr 1.00 | ms/batch 152.98 | loss  5.81 | ppl   335.21
| epoch   

| epoch   8 | 11000/29454 batches | lr 1.00 | ms/batch 152.96 | loss  5.59 | ppl   266.67
| epoch   8 | 11200/29454 batches | lr 1.00 | ms/batch 152.99 | loss  5.63 | ppl   278.16
| epoch   8 | 11400/29454 batches | lr 1.00 | ms/batch 152.97 | loss  5.58 | ppl   263.84
| epoch   8 | 11600/29454 batches | lr 1.00 | ms/batch 153.02 | loss  5.55 | ppl   257.05
| epoch   8 | 11800/29454 batches | lr 1.00 | ms/batch 153.00 | loss  5.60 | ppl   270.91
| epoch   8 | 12000/29454 batches | lr 1.00 | ms/batch 152.96 | loss  5.79 | ppl   325.69
| epoch   8 | 12200/29454 batches | lr 1.00 | ms/batch 152.98 | loss  5.80 | ppl   331.34
| epoch   8 | 12400/29454 batches | lr 1.00 | ms/batch 153.06 | loss  5.79 | ppl   327.78
| epoch   8 | 12600/29454 batches | lr 1.00 | ms/batch 153.14 | loss  5.55 | ppl   258.40
| epoch   8 | 12800/29454 batches | lr 1.00 | ms/batch 153.23 | loss  5.67 | ppl   290.62
| epoch   8 | 13000/29454 batches | lr 1.00 | ms/batch 153.09 | loss  5.57 | ppl   262.38
| epoch   

| epoch   8 | 29400/29454 batches | lr 1.00 | ms/batch 152.94 | loss  5.57 | ppl   263.21
-----------------------------------------------------------------------------------------
| end of epoch   8 | time: 4676.89s | valid loss  6.35 | valid ppl   571.52
-----------------------------------------------------------------------------------------


In [82]:
torch.save(best_model.state_dict(),'best_model_3bigx3_corrected.pt')
torch.save(model.state_dict(),'training_model_3bigx3_corrected.pt')

In [39]:
lnsoftmax = nn.LogSoftmax(dim=2)

In [27]:
model.load_state_dict(torch.load('best_model_3bigx3_corrected.pt'))

<All keys matched successfully>

# Data Generation

In [30]:
import time
def generator(model: nn.Module, gen_data: Tensor, no_words = 10):
    model.eval()
    temp_text = text
    src_mask = generate_square_subsequent_mask(bptt).to(device)
    pred_text = []
    for i in range(no_words):
        print('i:', i)
        batch_size = gen_data.size(0)
        if batch_size != bptt:
            src_mask_ = src_mask[:batch_size, :batch_size]
        else:
            src_mask_ = src_mask[:,:]
        output_softmax = model(gen_data, src_mask_)
        output_softmax_permuted = output_softmax.permute(1, 0, 2)
        indices = torch.argmax(output_softmax_permuted, dim=2)
        #print(indices[0],indices[1])
        #for j in range(batch_size):
        print('next word: ', [vocab.lookup_tokens(list(index))
                                  for index in indices][0][-1])
        print(i,"Gen_data: ",gen_data,"Pred_data: ",indices)
        pred_text.append([vocab.lookup_tokens(list(index))
                                  for index in indices][0][-1])
        if(batch_size < 16):
            gen_data = torch.cat((gen_data[:,:],indices.t()[-1:][:]),0)
            batch_size= gen_data.size(0)
        else:
            gen_data = torch.cat((gen_data[1:,:],indices.t()[-1:][:]),0)
            batch_size= gen_data.size(0)
            
    return pred_text



In [72]:
st = ['म भारत भ्रमण गर्न']
st_i = data_process(st)

st_i = st_i.unsqueeze(1).to(device)

In [73]:
st_i.shape

torch.Size([4, 1])

In [74]:
st_i

tensor([[ 83],
        [301],
        [700],
        [ 10]], device='cuda:0')

In [60]:
lnsoftmax = nn.LogSoftmax(dim=2)

In [67]:



bptt = 35
def probability(model: nn.Module, sent: Tensor):
    model.eval()
    src_mask = generate_square_subsequent_mask(bptt).to(device)
    
    prob = 0
    for i in range(sent.shape[0]-1):
        print('i:', i)
        batch_size = i+1
        if batch_size != bptt:
            src_mask_ = src_mask[:batch_size, :batch_size]
        else:
            src_mask_ = src_mask[:,:]
        output_softmax = model(sent[:i+1,:], src_mask_)
        output_softmax_permuted = lnsoftmax(output_softmax.permute(1, 0, 2))
        
        print(output_softmax_permuted,output_softmax_permuted[0,i,sent[i+1,0]],output_softmax_permuted.max())
        #Index for maximum probability word
        indices = torch.argmax(output_softmax_permuted, dim=2)
        
        #Max probability word
        print('next word: ', [vocab.lookup_tokens(list(index))
                                  for index in indices][0][-1])
        
        prob+= output_softmax_permuted[0,i,sent[i+1,0]]
    return prob
    
    
    

In [68]:
probability(best_model,st_i)

i: 0
tensor([[[ -5.0913,  -6.2857,  -6.0924,  ..., -18.0961, -16.7388, -16.7699]]],
       device='cuda:0', grad_fn=<LogSoftmaxBackward0>) tensor(-8.7135, device='cuda:0', grad_fn=<SelectBackward0>) tensor(-3.5160, device='cuda:0', grad_fn=<MaxBackward1>)
next word:  नै
i: 1
tensor([[[ -5.0913,  -6.2857,  -6.0924,  ..., -18.0961, -16.7388, -16.7699],
         [ -6.0678,  -5.9991,  -4.4271,  ..., -20.0123, -16.5251, -18.4687]]],
       device='cuda:0', grad_fn=<LogSoftmaxBackward0>) tensor(-5.6694, device='cuda:0', grad_fn=<SelectBackward0>) tensor(-2.3711, device='cuda:0', grad_fn=<MaxBackward1>)
next word:  जान्छु
i: 2
tensor([[[ -5.0913,  -6.2857,  -6.0924,  ..., -18.0961, -16.7388, -16.7699],
         [ -6.0678,  -5.9991,  -4.4271,  ..., -20.0123, -16.5251, -18.4687],
         [ -6.4965,  -6.5571,  -4.5201,  ..., -19.2898, -16.4366, -16.8484]]],
       device='cuda:0', grad_fn=<LogSoftmaxBackward0>) tensor(-1.0052, device='cuda:0', grad_fn=<SelectBackward0>) tensor(-1.0052, device='

tensor(-15.3881, device='cuda:0', grad_fn=<AddBackward0>)

In [28]:
# import time
# def nonnaive_generator(model: nn.Module, gen_data: Tensor, no_words = 10):
#     model.eval()
#     temp_text = text
#     src_mask = generate_square_subsequent_mask(bptt).to(device)
#     pred_text = []
#     for i in range(no_words):
#         print('i:', i)
#         batch_size = gen_data.size(0)
#         if batch_size != bptt:
#             src_mask_ = src_mask[:batch_size, :batch_size]
#         output_softmax = model(gen_data, src_mask_)
#         output_softmax_permuted = output_softmax.permute(1, 0, 2)
#         #print(softmax(output_softmax_permuted))
#         indices = torch.topk(output_softmax_permuted,10 ,dim=2,sorted=True).indices.squeeze(0)
#         values = torch.topk(softmax(output_softmax_permuted),10 ,dim=2,sorted = True).values.squeeze(0)
#         values = values/torch.sum(values,dim = 1,keepdims = True)
#         values = torch.flip(values,dims = (1,))
#         #print(output_softmax_permuted[indices])
#         print(indices,values)
#         ind_sampled = torch.distributions.Categorical(values).sample()
# #         index = indices.squeeze(0)[ind_sampled.unsqueeze(0)]
#         print('is',ind_sampled)
#         next_index = indices[-1][ind_sampled[-1]]
#         print(indices[-1][ind_sampled[-1]])
        
# #     return indices
        
#         #print(indices[0],indices[1])
#         #for j in range(batch_size):
        
#         print('next word: ', [vocab.lookup_token(next_index)],'values: ',values.squeeze(0)[-1])
                                  
# #         print(i,"Gen_data: ",gen_data,"Pred_data: ",indices)


#         pred_text.append([vocab.lookup_token((next_index))][0])
#         if(batch_size <= 10):
#             gen_data = torch.cat((gen_data[:,:],next_index.unsqueeze(0).unsqueeze(0)),0)
#             batch_size= gen_data.size(0)
#         else:
#             gen_data = torch.cat((gen_data[1:,:],next_index.unsqueeze(0).unsqueeze(0)),0)
#             batch_size= gen_data.size(0)
            
#     return pred_text



def nonnaive_generator(model: nn.Module, gen_data: Tensor, no_words = 5,k=50):
    model.eval()
    temp_text = text
    src_mask = generate_square_subsequent_mask(bptt).to(device)
    pred_text = []
    for i in range(no_words):
        print('i:', i)
        batch_size = gen_data.size(0)
        if batch_size != bptt:
            src_mask_ = src_mask[:batch_size, :batch_size]
        output_softmax = model(gen_data, src_mask_)
        output_softmax_permuted = output_softmax.permute(1, 0, 2)
        indices = torch.topk(output_softmax_permuted,k ,dim=2).indices.squeeze(0)
        
        values = torch.topk(softmax(output_softmax_permuted),k ,dim=2).values
        values = values/torch.sum(values,dim = 2,keepdims = True)
#         values = softmax(values)
        
#         values = torch.flip(values,dims = (2,))

        
        ind_sampled = torch.distributions.Categorical(values.squeeze(0)).sample()
        next_index = indices[-1][ind_sampled[-1]]
        
        print('next word: ', vocab.lookup_token(next_index))

        print(i,"Values: ",values.squeeze(0)[-1],"Gen_data: ",gen_data,"possible tokens: ",indices[-1],"Pred_data: ",next_index)
        pred_text.append([vocab.lookup_token(next_index)][0])
        if(batch_size < 15):
            gen_data = torch.cat((gen_data[:,:],next_index.unsqueeze(0).unsqueeze(0)),0)
            batch_size= gen_data.size(0)
        else:
            gen_data = torch.cat((gen_data[1:,:],next_index.unsqueeze(0).unsqueeze(0)),0)
            batch_size= gen_data.size(0)
            
    return pred_text

In [32]:
#Load saved model

model.load_state_dict(torch.load('best_model_3bigx3_corrected.pt'))
model.to(device)
best_model = model

In [29]:

print(sample_data[:,-1].unsqueeze(1))
print(sample_data)
z = generator(best_model, sample_data[:,-1].unsqueeze(1),no_words = 50)

tensor([[ 357],
        [ 465],
        [ 410],
        [6548],
        [ 293],
        [   0],
        [ 357]], device='cuda:0')
tensor([[ 2086,   357],
        [ 5694,   465],
        [  568,   410],
        [    0,  6548],
        [  897,   293],
        [28361,     0],
        [    0,   357]], device='cuda:0')


NameError: name 'generator' is not defined

In [86]:
'संयुक्त राज्य अमेरिकी समुद्री पानी निर्माताद्वारा संयुक्त ' +' '.join(z)

'संयुक्त राज्य अमेरिकी समुद्री पानी निर्माताद्वारा संयुक्त राज्य अमेरिका ? चीन ? जापान ? जापान ? जापान ? भियतनाम ? जापान ? भियतनाम ? जापान ? भियतनाम ? जापान ? जापान ? जापान ? भियतनाम ? जापान ? भियतनाम ? जापान ? भियतनाम ? जापान ? जापान ? जापान ? भियतनाम ? जापान ? भियतनाम ? जापान'

In [87]:
evaluate(best_model, sample_data)

8.372248649597168

In [88]:
'''
आधिकारिक निर्णयको कारणले , वाणिज्य बिभागले , 
संयुक्त राज्य अमेरिकी समुद्री पानी निर्माताद्वारा संयुक्त

विकास गर्न प्रोत्साहित गरी सहकारी क्षेत्रले आर्थिक दृष्टिले सक्रिय
'''

'\nआधिकारिक निर्णयको कारणले , वाणिज्य बिभागले , \nसंयुक्त राज्य अमेरिकी समुद्री पानी निर्माताद्वारा संयुक्त\n\nविकास गर्न प्रोत्साहित गरी सहकारी क्षेत्रले आर्थिक दृष्टिले सक्रिय\n'

In [443]:
st = ['गर्ने']
st_i = data_process(st)

In [403]:
st_i = st_i.unsqueeze(1).to(device)

In [404]:
z_ = generator(best_model, st_i,no_words =50 )

i: 0
next word:  निर्णय
0 Gen_data:  tensor([[13]], device='cuda:0') Pred_data:  tensor([[163]], device='cuda:0')
i: 1
next word:  गरेको
1 Gen_data:  tensor([[ 13],
        [163]], device='cuda:0') Pred_data:  tensor([[163,  11]], device='cuda:0')
i: 2
next word:  छ
2 Gen_data:  tensor([[ 13],
        [163],
        [ 11]], device='cuda:0') Pred_data:  tensor([[163,  11,   4]], device='cuda:0')
i: 3
next word:  ।
3 Gen_data:  tensor([[ 13],
        [163],
        [ 11],
        [  4]], device='cuda:0') Pred_data:  tensor([[163,  11,   4,   1]], device='cuda:0')
i: 4
next word:  नेपाल
4 Gen_data:  tensor([[ 13],
        [163],
        [ 11],
        [  4],
        [  1]], device='cuda:0') Pred_data:  tensor([[163,  11,   4,   1,  22]], device='cuda:0')
i: 5
next word:  राष्ट्र
5 Gen_data:  tensor([[ 13],
        [163],
        [ 11],
        [  4],
        [  1],
        [ 22]], device='cuda:0') Pred_data:  tensor([[163,  11,   4,   1,  22, 337]], device='cuda:0')
i: 6
next word:  बैंकल

next word:  निष्काशन
36 Gen_data:  tensor([[ 126],
        [ 264],
        [  69],
        [  21],
        [   4],
        [   1],
        [ 513],
        [ 271],
        [  18],
        [ 126],
        [ 264],
        [  69],
        [ 264],
        [  44],
        [1227],
        [ 678]], device='cuda:0') Pred_data:  tensor([[ 264,   69,  156,    4,    1,   89,  271,   18,  270,  264,   69,  264,
           44, 1227,  678, 5228]], device='cuda:0')
i: 37
next word:  गरेको
37 Gen_data:  tensor([[ 264],
        [  69],
        [  21],
        [   4],
        [   1],
        [ 513],
        [ 271],
        [  18],
        [ 126],
        [ 264],
        [  69],
        [ 264],
        [  44],
        [1227],
        [ 678],
        [5228]], device='cuda:0') Pred_data:  tensor([[  44,  156,    4,    1,   89,  271,   18,  270,  264,   69,  264,   44,
         1227,  678, 5228,   11]], device='cuda:0')
i: 38
next word:  छ
38 Gen_data:  tensor([[  69],
        [  21],
        [   4],
       

In [405]:
' '.join(z_)

'निर्णय गरेको छ । नेपाल राष्ट्र बैंकले आगामी आर्थिक वर्षको मौद्रिक नीतिमा लघुवित्त वित्तीय संस्था लिमिटेडको खुद मुनाफा रु एक करोड ५० लाख रहेको छ । बैंकले रु एक करोड ५० लाख ५० हजार बराबरको शेयर निष्काशन गरेको छ । बैंकले रु एक अर्ब ५० करोड ५० लाख ५० हजार'

In [34]:
st = ['म भारत भ्रमण गर्न']
st_i = data_process(st)

In [35]:
st_i = st_i.unsqueeze(1).to(device)
st_i

tensor([[ 83],
        [301],
        [700],
        [ 10]], device='cuda:0')

In [36]:
z_ = generator(best_model, st_i,no_words = 50)

NameError: name 'bptt' is not defined

In [490]:
' '.join(st)+' ' +' '.join(z_)

'म भारत भ्रमण गर्न चाहन्छु । तर ? नेपालको पहिलो ठूलो चुनौती भनेको नेपालको आर्थिक विकासमा ठूलो टेवा पुगेको छ । नेपाल राष्ट्र बैंकले मौद्रिक नीतिमा लघुवित्त वित्तीय संस्था लिमिटेडको खुद मुनाफा रु एक करोड ५० लाख रहेको छ । बैंकले रु एक करोड ५० लाख ५० हजार बराबरको शेयर निष्काशन गरेको छ ।'

In [491]:
 x = torch.arange(1., 6.)
j = torch.topk(x, 3)

In [492]:
j.values

tensor([5., 4., 3.])

In [493]:
vocab['?']

2

In [41]:
z__ = nonnaive_generator(best_model, st_i,no_words = 100,k=50)

i: 0
next word:  पनि
0 Values:  tensor([0.3439, 0.0863, 0.0536, 0.0429, 0.0387, 0.0298, 0.0277, 0.0277, 0.0180,
        0.0177, 0.0165, 0.0141, 0.0130, 0.0129, 0.0127, 0.0116, 0.0109, 0.0105,
        0.0101, 0.0097, 0.0094, 0.0088, 0.0088, 0.0087, 0.0084, 0.0082, 0.0081,
        0.0081, 0.0076, 0.0070, 0.0067, 0.0062, 0.0061, 0.0061, 0.0060, 0.0059,
        0.0057, 0.0057, 0.0056, 0.0056, 0.0054, 0.0052, 0.0051, 0.0050, 0.0050,
        0.0049, 0.0047, 0.0045, 0.0045, 0.0044], device='cuda:0',
       grad_fn=<SelectBackward0>) Gen_data:  tensor([[ 83],
        [301],
        [700],
        [ 10]], device='cuda:0') possible tokens:  tensor([ 2067,  7752,  4235,   228,   831, 18871,   479,  3055,  2680,   215,
         9108,  8665,  3077,  9440,     3,  7966,     5,   239,  2084,    17,
         7677,  4713,   474, 17893,  6605, 10854,  3493, 15250,     0,   903,
         1591, 16053, 11937, 50041,   977,  1696,   162,   205,    66,  2214,
        18855, 12762,  4198,  2699,    22,   692,

next word:  भनेको
10 Values:  tensor([0.3117, 0.1025, 0.0660, 0.0485, 0.0408, 0.0372, 0.0312, 0.0261, 0.0240,
        0.0236, 0.0176, 0.0165, 0.0150, 0.0146, 0.0143, 0.0114, 0.0106, 0.0102,
        0.0101, 0.0097, 0.0097, 0.0085, 0.0073, 0.0072, 0.0071, 0.0062, 0.0061,
        0.0060, 0.0060, 0.0059, 0.0059, 0.0059, 0.0056, 0.0054, 0.0052, 0.0051,
        0.0047, 0.0044, 0.0043, 0.0043, 0.0042, 0.0041, 0.0041, 0.0039, 0.0038,
        0.0038, 0.0035, 0.0035, 0.0034, 0.0034], device='cuda:0',
       grad_fn=<SelectBackward0>) Gen_data:  tensor([[  83],
        [ 301],
        [ 700],
        [  10],
        [   5],
        [8665],
        [   1],
        [  87],
        [   3],
        [ 229],
        [7586],
        [ 236],
        [ 125],
        [ 374]], device='cuda:0') possible tokens:  tensor([  246,  2540,   323, 15257,    62,   267,  4649,    17,   301,     6,
          754,    21,  2323,     8,  9065,     3,  2883,   980,  8004, 43547,
           58,   678,  7586,    98, 13554, 

next word:  हासिल
21 Values:  tensor([0.0919, 0.0809, 0.0777, 0.0736, 0.0697, 0.0364, 0.0360, 0.0252, 0.0250,
        0.0222, 0.0220, 0.0204, 0.0203, 0.0199, 0.0199, 0.0198, 0.0197, 0.0184,
        0.0160, 0.0157, 0.0155, 0.0148, 0.0141, 0.0140, 0.0126, 0.0119, 0.0114,
        0.0114, 0.0110, 0.0108, 0.0105, 0.0102, 0.0099, 0.0089, 0.0081, 0.0075,
        0.0071, 0.0071, 0.0067, 0.0067, 0.0066, 0.0063, 0.0062, 0.0061, 0.0058,
        0.0057, 0.0057, 0.0056, 0.0056, 0.0054], device='cuda:0',
       grad_fn=<SelectBackward0>) Gen_data:  tensor([[ 7586],
        [  236],
        [  125],
        [  374],
        [  246],
        [  111],
        [    9],
        [  236],
        [  125],
        [  466],
        [  246],
        [  897],
        [  513],
        [  980],
        [31311]], device='cuda:0') possible tokens:  tensor([  213,  1011,     9,   254,    17,    53,  7612,  7662,   271,    58,
         3131,   597,   917,  3917,  1044,  2841,  3633,    18,   661,  1693,
          12

        [  246]], device='cuda:0') possible tokens:  tensor([    4,   998,  3309,  2206,   513,   980,   323,   271,  3097,    18,
          815,    22, 12850,  3460,    12, 11003,  2781,   423,    94,   714,
         5066,   897,  1382,   589, 41598, 20140,  1006,     3,  5832,  1288,
         3660,   236,    89,  3112,   996,    46,   678,   415,  3540,   103,
           58,   203,  5583,  6168,  2748,    17,  4881,  4886,     2,   264],
       device='cuda:0') Pred_data:  tensor(513, device='cuda:0')
i: 33
next word:  १
33 Values:  tensor([0.1795, 0.1270, 0.0532, 0.0531, 0.0415, 0.0356, 0.0337, 0.0294, 0.0281,
        0.0240, 0.0213, 0.0211, 0.0187, 0.0184, 0.0158, 0.0154, 0.0124, 0.0123,
        0.0112, 0.0109, 0.0108, 0.0103, 0.0102, 0.0101, 0.0099, 0.0095, 0.0094,
        0.0092, 0.0088, 0.0084, 0.0082, 0.0081, 0.0080, 0.0079, 0.0078, 0.0075,
        0.0074, 0.0073, 0.0072, 0.0069, 0.0069, 0.0066, 0.0065, 0.0065, 0.0065,
        0.0065, 0.0064, 0.0063, 0.0062, 0.0062], device='cu

        1.1527e-04, 1.1461e-04], device='cuda:0', grad_fn=<SelectBackward0>) Gen_data:  tensor([[ 2748],
        [ 6753],
        [  246],
        [  513],
        [   58],
        [   69],
        [ 4431],
        [   44],
        [ 8374],
        [ 1776],
        [18788],
        [  678],
        [ 3125],
        [  262],
        [   11]], device='cuda:0') possible tokens:  tensor([    4,    16,     8,    22,   513,   159,     3,   231,    65,    29,
          337,    61,   210,  2635,    35,   105,  1106,  2502,    30,    71,
         3674,   462,    18,   816,    89,    97,  1241,  3112, 12409,  1655,
          265,   153,   822,  4776,  2816,   818,   291,    21,  1606,    68,
           62,   980,  3759,  2843,   524,   678,   558,   496,   423,     5],
       device='cuda:0') Pred_data:  tensor(4, device='cuda:0')
i: 45
next word:  ।
45 Values:  tensor([9.6937e-01, 2.1711e-02, 4.0772e-03, 6.7251e-04, 4.5187e-04, 3.4932e-04,
        3.2714e-04, 2.3866e-04, 2.3153e-04, 2.0108e-04,

next word:  भएको
56 Values:  tensor([7.4029e-01, 7.9981e-02, 3.1543e-02, 1.4438e-02, 1.3618e-02, 1.0536e-02,
        8.5385e-03, 7.8832e-03, 6.2778e-03, 5.7469e-03, 5.3256e-03, 5.3111e-03,
        4.8914e-03, 4.7004e-03, 4.3523e-03, 4.1693e-03, 3.8385e-03, 3.7275e-03,
        3.6834e-03, 3.6422e-03, 3.4944e-03, 2.9902e-03, 2.7774e-03, 2.6015e-03,
        2.5795e-03, 2.0714e-03, 2.0087e-03, 1.9913e-03, 1.7327e-03, 1.6778e-03,
        1.3852e-03, 1.0177e-03, 9.0453e-04, 8.8655e-04, 8.2294e-04, 7.8421e-04,
        7.5661e-04, 7.5367e-04, 7.0646e-04, 6.9516e-04, 6.8818e-04, 6.1653e-04,
        5.2399e-04, 5.1665e-04, 5.1288e-04, 4.8366e-04, 4.3367e-04, 4.0527e-04,
        3.4771e-04, 3.3629e-04], device='cuda:0', grad_fn=<SelectBackward0>) Gen_data:  tensor([[ 3125],
        [  262],
        [   11],
        [    4],
        [    1],
        [  678],
        [ 3318],
        [ 7018],
        [ 1910],
        [  227],
        [    3],
        [  260],
        [14323],
        [ 2193],
     

next word:  ल्याए
68 Values:  tensor([0.6365, 0.0391, 0.0205, 0.0198, 0.0175, 0.0172, 0.0162, 0.0152, 0.0142,
        0.0138, 0.0120, 0.0110, 0.0104, 0.0092, 0.0089, 0.0089, 0.0087, 0.0083,
        0.0075, 0.0069, 0.0062, 0.0051, 0.0050, 0.0044, 0.0042, 0.0042, 0.0042,
        0.0041, 0.0038, 0.0037, 0.0035, 0.0034, 0.0033, 0.0032, 0.0031, 0.0031,
        0.0030, 0.0030, 0.0028, 0.0026, 0.0026, 0.0023, 0.0023, 0.0022, 0.0022,
        0.0022, 0.0022, 0.0022, 0.0022, 0.0021], device='cuda:0',
       grad_fn=<SelectBackward0>) Gen_data:  tensor([[14323],
        [ 2193],
        [  509],
        [    6],
        [    4],
        [    1],
        [12369],
        [ 5605],
        [  339],
        [   14],
        [   22],
        [  337],
        [  513],
        [   94],
        [ 4463]], device='cuda:0') possible tokens:  tensor([  757,  1201,     2,  4974, 23687,   835,    21,  3309, 18351,  1043,
          118,  2496, 12997,   729,   897,   242,    88,   199, 22008,  5205,
         189

next word:  तुलनामा
80 Values:  tensor([9.4482e-01, 1.2053e-02, 6.6274e-03, 3.2866e-03, 3.1914e-03, 2.6043e-03,
        2.5312e-03, 1.8535e-03, 1.7176e-03, 1.3528e-03, 1.3482e-03, 1.0704e-03,
        1.0227e-03, 9.5946e-04, 9.3655e-04, 7.7300e-04, 7.7123e-04, 7.3386e-04,
        7.1350e-04, 6.6492e-04, 6.5003e-04, 6.0777e-04, 5.9373e-04, 5.3126e-04,
        5.2171e-04, 4.7914e-04, 4.2010e-04, 3.7713e-04, 3.6596e-04, 3.6395e-04,
        3.6292e-04, 3.5992e-04, 3.5181e-04, 3.3892e-04, 3.3666e-04, 3.2222e-04,
        3.1961e-04, 3.1872e-04, 3.1148e-04, 2.9639e-04, 2.9494e-04, 2.9493e-04,
        2.9343e-04, 2.7763e-04, 2.7041e-04, 2.6462e-04, 2.6205e-04, 2.6005e-04,
        2.5875e-04, 2.5806e-04], device='cuda:0', grad_fn=<SelectBackward0>) Gen_data:  tensor([[ 513],
        [  94],
        [4463],
        [5205],
        [   5],
        [ 513],
        [ 980],
        [ 187],
        [  23],
        [   7],
        [   1],
        [1006],
        [ 312],
        [ 501],
        [5724]],

next word:  करोड
92 Values:  tensor([8.4516e-01, 7.4826e-02, 4.9852e-02, 1.1300e-02, 9.2837e-03, 2.2573e-03,
        1.4868e-03, 8.9165e-04, 8.0310e-04, 6.3388e-04, 4.6449e-04, 3.3633e-04,
        2.8213e-04, 2.7547e-04, 2.4723e-04, 1.9184e-04, 1.6415e-04, 1.3824e-04,
        1.3661e-04, 1.1872e-04, 9.7543e-05, 9.0137e-05, 8.7782e-05, 8.6907e-05,
        7.8225e-05, 5.9171e-05, 5.7156e-05, 4.5401e-05, 4.3945e-05, 4.1724e-05,
        3.9141e-05, 3.8078e-05, 3.5863e-05, 3.0754e-05, 2.9750e-05, 2.7744e-05,
        2.6959e-05, 2.4711e-05, 2.3598e-05, 2.2462e-05, 2.0973e-05, 2.0287e-05,
        1.9889e-05, 1.6018e-05, 1.5448e-05, 1.5216e-05, 1.4485e-05, 1.4122e-05,
        1.3759e-05, 1.3134e-05], device='cuda:0', grad_fn=<SelectBackward0>) Gen_data:  tensor([[ 312],
        [ 501],
        [5724],
        [1011],
        [2370],
        [4463],
        [ 757],
        [  16],
        [   1],
        [ 934],
        [4776],
        [ 271],
        [1224],
        [ 270],
        [2292]], de

In [503]:
' '.join(st)+' ' +' '.join(z__)

'म भारत भ्रमण गर्न पुगेँ । एक किसिमको सम्पत्ति शुद्धीकरण हुन पुग्यो । हामीले सबै भन्दा ठूलो लगानीका लागि चाहिने भएको कृषि तथा गरिबी निवारणका लागि राष्ट्रिय योजना आयोगले बनाएको सो कार्यक्रम अहिले ४ हजार ५ सय ५० रोपनी काम अघि बढाइएको उनको भनाई छ । त्यस्तै ३ हजार १ सय ५० जना परीक्षार्थीले सी प्लस ? ७ हजार ८ सय ३५ परीक्षार्थी उत्तिर्ण गरेका छन् । त्यस्तै ? ३९ हजार ३ सय ८१ ? ६२ हजार २ सय ३३ ? ६२ हजार ६७ र वित्त ७ दशमलव ६९ प्रतिशत तथा दोस्रो कम आर्थिक वृद्धिदर १७ दशमलव ६३ प्रतिशत थियो । त्यस्तै ? मौद्रिक'

In [504]:
vocab['।']

1

In [505]:
z__

['पुगेँ',
 '।',
 'एक',
 'किसिमको',
 'सम्पत्ति',
 'शुद्धीकरण',
 'हुन',
 'पुग्यो',
 '।',
 'हामीले',
 'सबै',
 'भन्दा',
 'ठूलो',
 'लगानीका',
 'लागि',
 'चाहिने',
 'भएको',
 'कृषि',
 'तथा',
 'गरिबी',
 'निवारणका',
 'लागि',
 'राष्ट्रिय',
 'योजना',
 'आयोगले',
 'बनाएको',
 'सो',
 'कार्यक्रम',
 'अहिले',
 '४',
 'हजार',
 '५',
 'सय',
 '५०',
 'रोपनी',
 'काम',
 'अघि',
 'बढाइएको',
 'उनको',
 'भनाई',
 'छ',
 '।',
 'त्यस्तै',
 '३',
 'हजार',
 '१',
 'सय',
 '५०',
 'जना',
 'परीक्षार्थीले',
 'सी',
 'प्लस',
 '?',
 '७',
 'हजार',
 '८',
 'सय',
 '३५',
 'परीक्षार्थी',
 'उत्तिर्ण',
 'गरेका',
 'छन्',
 '।',
 'त्यस्तै',
 '?',
 '३९',
 'हजार',
 '३',
 'सय',
 '८१',
 '?',
 '६२',
 'हजार',
 '२',
 'सय',
 '३३',
 '?',
 '६२',
 'हजार',
 '६७',
 'र',
 'वित्त',
 '७',
 'दशमलव',
 '६९',
 'प्रतिशत',
 'तथा',
 'दोस्रो',
 'कम',
 'आर्थिक',
 'वृद्धिदर',
 '१७',
 'दशमलव',
 '६३',
 'प्रतिशत',
 'थियो',
 '।',
 'त्यस्तै',
 '?',
 'मौद्रिक']