In [1]:
import torch
from torch import Tensor, nn
from torch.nn import TransformerEncoder, TransformerEncoderLayer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import dataset
from torch.utils.tensorboard import SummaryWriter

import regex as re
import os
import time
from tqdm import tqdm
import copy
import math

from model import TransformerModel
from utils import preProcessText, getTokenizer

In [2]:
def get_model(model_config, ntokens):
    """Instantiate a ``TransformerModel`` from a hyperparameter mapping.

    Args:
        model_config: mapping that provides the keys ``"emsize"``, ``"d_hid"``,
            ``"nlayers"``, ``"nhead"`` and ``"dropout"``.
        ntokens: first positional argument passed to ``TransformerModel``
            (the caller passes the vocabulary size).

    Returns:
        The newly constructed ``TransformerModel``.

    Raises:
        KeyError: if any of the required keys is missing from ``model_config``.
    """
    # Look the keys up in a fixed order so a missing key is reported deterministically.
    required_keys = ("emsize", "d_hid", "nlayers", "nhead", "dropout")
    hyper = {key: model_config[key] for key in required_keys}
    return TransformerModel(
        ntokens,
        hyper["emsize"],
        hyper["nhead"],
        hyper["d_hid"],
        hyper["nlayers"],
        hyper["dropout"],
    )

def loadModel(best_model_path):
    """Restore the module-level ``model`` and ``optimizer`` from a checkpoint file.

    The checkpoint must be a dict containing ``'model_state_dict'`` and
    ``'optimizer_state_dict'``. Other bookkeeping entries (epoch, step counters,
    best loss) are not needed for inference and are ignored.

    Args:
        best_model_path: path of the checkpoint file written with ``torch.save``.

    Returns:
        The module-level ``model`` object, with the checkpoint weights loaded
        in place (no new model is created).

    Raises:
        FileNotFoundError: if ``best_model_path`` does not exist.
    """
    if not os.path.exists(best_model_path):
        raise FileNotFoundError("Model Not Found")

    print(f"Preloading model {best_model_path}")
    # map_location="cpu" lets a checkpoint saved on a GPU machine be opened on a
    # CPU-only one; load_state_dict then copies the values onto whatever device
    # the existing parameters live on.
    # NOTE(security): torch.load unpickles the file — only open trusted checkpoints.
    state = torch.load(best_model_path, map_location="cpu")

    model.load_state_dict(state['model_state_dict'])
    optimizer.load_state_dict(state['optimizer_state_dict'])
    return model

In [3]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

# Sequence-length constant — presumably the chunk length used in training; TODO confirm.
bptt = 35
# Softmax applied along dimension 2 of its input.
softmax = nn.Softmax(dim=2)

# Hyperparameters read by get_model(); all five keys are required there.
# NOTE(review): presumably these must match the checkpoint being loaded — verify.
model_config = {
    "emsize" : 300, 
    "d_hid" : 400,
    "nlayers" : 2,
    "nhead" : 2, 
    "dropout" : 0.05,
}

# Application-level settings; neither key is read by the code visible in this section.
app_config = {
    "logs" : "tensorboard_logs",
    "epochs" : 25,
}

# getTokenizer() comes from the project's utils module and returns the tokenizer and vocabulary.
tokenizer, vocab = getTokenizer()
ntokens = len(vocab)
# loadModel() reads the module-level `model` and `optimizer`, so this cell must
# run before loadModel() is called.
model = get_model(model_config, ntokens).to(device)

criterion = nn.CrossEntropyLoss()
lr = 1  # learning rate
optimizer = torch.optim.SGD(model.parameters(), lr=lr)
# StepLR multiplies the learning rate by gamma (0.95) every step_size (here 1.0) scheduler steps.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)

cuda




In [4]:
# Checkpoint to restore; loadModel() raises if this file does not exist.
best_model_path = 'models/best_model_sample_test_corrected.pt'
# loadModel() loads the weights into the module-level `model` and returns that
# same object, so `loaded_model` and `model` refer to one model.
loaded_model = loadModel(best_model_path)

Preloading model models/best_model_sample_test_corrected.pt


In [5]:
def data_process(raw_text_iter: dataset.IterableDataset) -> Tensor:
    """Encode an iterable of text lines as one flat tensor of token ids.

    Each item is tokenized with the module-level ``tokenizer`` and mapped to
    ids with the module-level ``vocab``. Items that produce no tokens are
    dropped, and the remaining id tensors are concatenated in order.

    Args:
        raw_text_iter: iterable yielding one text item per element.

    Returns:
        1-D ``torch.long`` tensor holding the ids of all items back to back.
    """
    encoded_lines = []
    for line in raw_text_iter:
        token_ids = torch.tensor(vocab(tokenizer(line)), dtype=torch.long)
        encoded_lines.append(token_ids)

    # Skip empty tensors before concatenating.
    non_empty = tuple(ids for ids in encoded_lines if ids.numel() > 0)
    return torch.cat(non_empty)

def batchify(data: Tensor, batch_size: int) -> Tensor:
    """Reshape a flat token tensor into ``batch_size`` parallel columns.

    Trailing elements that do not fill a complete row are discarded. The
    result is moved to the module-level ``device``.

    Args:
        data: Tensor, shape [N]
        batch_size: int, number of columns (independent sequences)

    Returns:
        Tensor of shape [N // batch_size, batch_size]
    """
    rows = data.size(0) // batch_size
    trimmed = data[: rows * batch_size]
    # Lay each sequence out as a row first, then transpose so it becomes a column.
    columns = trimmed.view(batch_size, rows).t().contiguous()
    return columns.to(device)

def generate_square_subsequent_mask(sz: int) -> Tensor:
    """Build a ``sz`` x ``sz`` causal attention mask.

    Entries strictly above the main diagonal are ``-inf``; the diagonal and
    everything below it are ``0``.
    """
    all_neg_inf = torch.full((sz, sz), float('-inf'))
    # triu(diagonal=1) keeps the strict upper triangle and zeroes the rest.
    return all_neg_inf.triu(diagonal=1)
    
def generator(model: nn.Module, gen_data: Tensor, no_words: int = 10,
              top_k: int = 10, max_context: int = 10):
    """Generate ``no_words`` tokens after a prompt using top-k sampling.

    At every step the model is run on the current context, the ``top_k``
    highest scores at the last position are renormalised with a softmax, and
    the next token is sampled from that distribution. The sampled token is
    appended to the context. While the context holds at most ``max_context``
    tokens it grows; beyond that the oldest token is dropped for each new one.

    Args:
        model: language model called as ``model(tokens, causal_mask)``.
            Assumed to return scores shaped [seq_len, 1, ntokens] — TODO
            confirm against the project's model module.
        gen_data: prompt token ids, shape [seq_len, 1] (one sequence).
        no_words: number of tokens to generate.
        top_k: number of highest-scoring candidates to sample from
            (clamped to the vocabulary size).
        max_context: context length threshold after which the window slides.

    Returns:
        List with the ``no_words`` generated token strings, in order.

    Raises:
        ValueError: if ``gen_data`` is not a non-empty [seq_len, 1] tensor or
            ``top_k`` is smaller than 1.
    """
    if gen_data.dim() != 2 or gen_data.size(0) == 0 or gen_data.size(1) != 1:
        raise ValueError(
            f"gen_data must have shape [seq_len, 1] with seq_len >= 1, got {tuple(gen_data.shape)}"
        )
    if top_k < 1:
        raise ValueError(f"top_k must be >= 1, got {top_k}")

    model.eval()
    context = gen_data.to(device)
    pred_text = []
    causal_mask = None

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        for _ in range(no_words):
            seq_len = context.size(0)
            # Rebuild the mask only when the context length changed, so it
            # always matches the input regardless of prompt length.
            if causal_mask is None or causal_mask.size(0) != seq_len:
                causal_mask = generate_square_subsequent_mask(seq_len).to(device)

            # Only the prediction at the last position of the single sequence is needed.
            last_scores = model(context, causal_mask)[-1, 0]

            k = min(top_k, last_scores.size(-1))
            top_scores, top_ids = torch.topk(last_scores, k)
            # Softmax over the top-k scores equals the full softmax
            # renormalised over those same k entries.
            probs = torch.softmax(top_scores, dim=-1)
            choice = torch.distributions.Categorical(probs=probs).sample()
            next_index = top_ids[choice]
            # print('next word: ', [vocab.lookup_token(next_index)],'values: ',probs)
            pred_text.append(vocab.lookup_token(next_index.item()))

            # Grow the context until it exceeds max_context, then slide the window.
            window = context if seq_len <= max_context else context[1:]
            context = torch.cat((window, next_index.view(1, 1)), 0)

    return pred_text

In [7]:
# Prompt to continue. `text` stays a one-element list because data_process()
# iterates over its items.
text = ['आधिकारिक निर्णयको कारणले']
# text = ['आधिकारिक निर्णयको']
sample_data = data_process(text)
print(sample_data.size(), sample_data)

# Use a single column so the WHOLE prompt is the context: shape [n_tokens, 1].
# (Splitting into 3 columns and taking the last one kept only the final
# n_tokens // 3 tokens of the prompt, i.e. just the last word here.)
sample_data = batchify(sample_data, 1)

z = generator(loaded_model, sample_data, no_words = 10)
print(text[0] + ' ' + ' '.join(z))

torch.Size([3]) tensor([2086, 5937,  563])
आधिकारिक निर्णयको कारणले आयोजना गरिने <unk> <num> वटा सवारी साधन <unk> रहेको थियो
