# Import Torch functions and tokenizers

In [1]:
import torch
from torch import Tensor, nn
from torch.nn import TransformerEncoder, TransformerEncoderLayer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import dataset
from torch.utils.tensorboard import SummaryWriter

import regex as re
import os
import time
from tqdm import tqdm
import copy
import math

from model import TransformerModel
from utils import preProcessText, getTokenizer,try_gpu , word_piece_decoder, word_piece_encoder
from config import getConfig

import pickle

In [22]:
# Load the "small" model/app configuration and echo both dictionaries.
model_config, app_config = getConfig(small = True)
print(model_config, app_config, sep="\n")

# Compute device first, then the config value reused throughout the notebook.
device = try_gpu(0)
bptt = model_config["bptt"]

{'emsize': 300, 'd_hid': 800, 'nlayers': 4, 'nhead': 4, 'dropout': 0.05, 'bptt': 32}
{'logs': 'tensorboard_logs', 'epochs': 10}


# Preprocessing Text

In [2]:
# t1[:5000]

In [3]:

# Load the morpheme-segmented corpus and run the project's text preprocessing.
file_path = 'data/preprocessed_morph.txt'
if not os.path.exists(file_path):
    # Fail right here: every later cell needs `text`, and merely printing a
    # hint would surface much later as an unrelated NameError.
    raise FileNotFoundError(f"{file_path} not found. Run morpheme_datagen notebook")

print(f"Reading file  : {file_path}")
with open(file_path, 'r', encoding='utf-8') as f:
    text = f.read()
# Preprocess after the file handle has been released.
text = preProcessText(text,tokenizer_type = 'morpheme')

Reading file  : data/preprocessed_morph.txt


In [7]:
text[:500]

'बर्दिबास * नगरपालिका को * तेस्रो नगर परिषदबाट पारित आव २०७३ * । ७४ को संशोधित र * २०७४ * । ७५ को प्रस्तावित नीति ? * कार्यक्रम * तथा * बजेट\nअार्थिक * वर्ष * २०७५७६ * काे नदिजन्य * पदार्थ काे * उत्खनन् गरी बिक्रि * वितरण * तथा अान्तरिक निकासी गर्ने * कार्य काे * बाेलपत्र सम्बन्धी * सुचना\nसक्ष ार सप्तरी अभियानमा सप्तरीबासी * सम्पूर्ण * सरोकारवालाहरु को * सहयोग र * सहभागिता का ो लागि अनुराोध छ । * सामुदायिक * अध्ययन * केन्द्र हरूको * नविकरण सम्बन्धमा । \nकाठमाडौं * ? १२ कातिक । * राष्ट्रपति * विद्या'

In [9]:
len(text.split('\n'))

123563

In [4]:
# Line-based split: the first TRAIN_LINES lines train the model, the rest are
# held out for evaluation.
TRAIN_LINES = 100000
_corpus_lines = text.split('\n')  # split the large corpus once, not once per slice
train_iter_first = _corpus_lines[:TRAIN_LINES]
test_iter = _corpus_lines[TRAIN_LINES:]

# Run Here

In [5]:
# Load the morpheme tokenizer together with its prebuilt vocabulary.
#
# History (previously kept here as commented-out code): the vocabulary was
# built once from train_iter_first with build_vocab_from_iterator, using
# specials=['<unk>'] and max_tokens = 30000, '<unk>' was set as the default
# index, and the result was pickled to transformer_vocab_morpheme.pickle.
tokenizer,vocab = getTokenizer(tokenizer_type = 'morpheme')

In [8]:
#dir(vocab)

In [7]:
#vocab.get_itos()

In [6]:
list(vocab.get_stoi().items())[:20]

[('गराउन', 630),
 ('तामाकोशी', 7912),
 ('धानको', 5649),
 ('कम्तीमा', 1348),
 ('सञ्चार', 1107),
 ('छिटै', 6908),
 ('आणविक', 5099),
 ('इलाका', 2201),
 ('राजपत्रमा', 13387),
 ('वर्षीय', 625),
 ('सम्भावना', 482),
 ('विकको', 11980),
 ('युनियनका', 11859),
 ('दावी', 1923),
 ('ठूलै', 4034),
 ('गरेका', 31),
 ('मैतीदेवी', 28939),
 ('हाँसो', 7269),
 ('अपरेटर', 13993),
 ('पासो', 19681)]

In [86]:
len(vocab)

30000

# Some utility functions

In [11]:
def data_process(raw_text_iter: dataset.IterableDataset) -> Tensor:
    """Tokenize and numericalize text lines into one flat tensor of token ids.

    Uses the notebook-level ``tokenizer`` and ``vocab``.

    Args:
        raw_text_iter: iterable of strings, one entry per line of text.
    Returns:
        1-D ``torch.long`` tensor holding the ids of every line, concatenated
        in order. Lines that produce no tokens are skipped. When nothing
        produces any token, an empty 1-D tensor is returned.
    """
    chunks = []
    for item in raw_text_iter:
        ids = torch.tensor(vocab(tokenizer(item)), dtype=torch.long)
        if ids.numel() > 0:  # lines that tokenize to nothing contribute no ids
            chunks.append(ids)
    if not chunks:
        # torch.cat raises on an empty sequence, so handle "no data" explicitly.
        return torch.empty(0, dtype=torch.long)
    return torch.cat(chunks)


def batchify(data: Tensor, bsz: int) -> Tensor:
    """Fold a flat token stream into ``bsz`` parallel columns.

    Trailing elements that do not fill a complete row are dropped.

    Args:
        data: Tensor, shape [N]
        bsz: int, number of columns (batch size)
    Returns:
        Tensor of shape [N // bsz, bsz]. It is a transposed view (not made
        contiguous) and stays on the device of ``data``; callers move it.
    """
    rows = data.size(0) // bsz
    usable = data.narrow(0, 0, rows * bsz)
    return usable.view(bsz, rows).transpose(0, 1)


seq_length = 128
def get_batch(source: Tensor, i: int, max_len=None) -> tuple[Tensor, Tensor]:
    """Slice one (input, target) window out of a batchified tensor.

    Args:
        source: Tensor, shape [full_seq_len, batch_size]
        i: int, index of the first row of the window
        max_len: optional int, maximum number of rows in the window. Defaults
            to the notebook-level ``seq_length``. Callers that stride and mask
            with another length (e.g. ``bptt``) can pass it here so that the
            window matches.
    Returns:
        tuple (data, target), where data has shape [seq_len, batch_size] and
        target has shape [seq_len * batch_size]. target is data shifted one
        row ahead, flattened.
    """
    limit = seq_length if max_len is None else max_len
    # Leave one row of head-room so that every input row has a target row.
    seq_len = min(limit, len(source) - 1 - i)
    data = source[i:i+seq_len]
    target = source[i+1:i+1+seq_len].reshape(-1)
    return data, target

In [12]:
# Train and test split: turn each list of lines into one flat tensor of
# token ids (see data_process).
train_data = data_process(train_iter_first)
test_data = data_process(test_iter)

In [13]:
len(train_data)

31139407

In [14]:
train_data[:40]

tensor([14037,     1,   146,     4,     1,   572,   778,     0,   985,  1879,
         1520,     1,     2,  3696,     4, 11933,     5,     1,   813,     1,
            2,  1527,     4,  4469,   366,     3,     1,    71,     1,    20,
            1,   403, 25615,     1,    64,     1,  5413,     1,  1624,     0])

# Working with a dummy Sample

In [17]:
#Sample Data


# NOTE(review): this rebinds `text`, which until this cell held the whole
# corpus string. After it runs, cells that slice or split `text` see this
# one-element list instead.
text = ['आधिकारिक निर्णयको कारणले , वाणिज्य बिभागले , संयुक्त राज्य अमेरिकी समुद्री पानी निर्माताद्वारा संयुक्त राज्य']
#text = ['जनसंख्या']
# Encode the single sample sentence into a flat tensor of token ids.
sample_data = data_process(
    text)

In [18]:
sample_data

tensor([2365, 9109,  666,    0, 1027,    0,    0,  422,  388,  474, 7705,  315,
           0,  422,  388])

In [19]:
# Fold the sample ids into 2 columns; a trailing id that does not fill a
# complete row is dropped by batchify.
sample_data = batchify(sample_data, 2)
print("Given word:", text[0])
sample_data

Given word: आधिकारिक निर्णयको कारणले , वाणिज्य बिभागले , संयुक्त राज्य अमेरिकी समुद्री पानी निर्माताद्वारा संयुक्त राज्य


tensor([[2365,  422],
        [9109,  388],
        [ 666,  474],
        [   0, 7705],
        [1027,  315],
        [   0,    0],
        [   0,  422]])

# Model Definition

In [23]:
# Fold the flat id streams into columns and move them to the compute device.
# Note that `bptt` is passed as batchify's `bsz`, i.e. it sets the number of
# columns (batch size) here.
batched_train_data = batchify(train_data, bptt).to(device)  # shape [seq_len, batch_size]
batched_test_data = batchify(test_data, bptt).to(device)

In [24]:
def get_model(model_config, ntokens):
    """Instantiate ``TransformerModel`` from a hyper-parameter dictionary.

    Args:
        model_config: mapping providing "emsize", "d_hid", "nlayers",
            "nhead" and "dropout".
        ntokens: vocabulary size handed to the model as first argument.
    Returns:
        The constructed (not yet device-placed) model.
    """
    emsize, d_hid, nlayers, nhead, dropout = (
        model_config[key]
        for key in ("emsize", "d_hid", "nlayers", "nhead", "dropout")
    )
    # Constructor order differs from the lookup order above: nhead precedes d_hid.
    return TransformerModel(ntokens, emsize, nhead, d_hid, nlayers, dropout)

In [25]:
# Vocabulary size doubles as the size of the model's output layer input
# (it is passed to get_model and used to flatten logits in train/evaluate).
ntokens = len(vocab)
model = get_model(model_config, ntokens).to(device)
# Displayed as the cell result: bytes currently allocated by tensors on the
# current CUDA device.
torch.cuda.memory_allocated()

401597952

In [21]:
# Kept identical to the earlier batching cell (including the device move), so
# that running the notebook top to bottom does not replace the on-device
# tensors with CPU ones and trigger a device mismatch inside the model.
batched_train_data = batchify(train_data, bptt).to(device)  # shape [seq_len, batch_size]
batched_test_data = batchify(test_data, bptt).to(device)

In [107]:
seq_length = 128

# Hyper-Parameter Tuning

In [26]:
lr = 1  # learning rate

# Loss, optimizer and learning-rate schedule (decay factor 0.95 per scheduler step).
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1.0, gamma=0.95)

# Softmax over the last axis of a 3-D model output.
softmax = nn.Softmax(dim=2)
#softmax = nn.LogSoftmax(dim=2)

In [36]:
def generate_square_subsequent_mask(sz: int) -> Tensor:
    """Build the [sz, sz] additive causal attention mask.

    Entries strictly above the main diagonal are -inf; the diagonal and
    everything below it are zero.
    """
    blocked = torch.full((sz, sz), float('-inf'))
    return blocked.triu(diagonal=1)

In [37]:
def train(model: nn.Module) -> None:
    """Run one training epoch over ``batched_train_data``.

    Relies on notebook-level state: ``epoch`` (progress-bar label only),
    ``global_step`` (incremented once per batch), ``bptt`` (stride between
    windows), ``criterion``, ``optimizer``, ``ntokens``, ``device`` and the
    TensorBoard ``writer``.

    Args:
        model: the network to optimize; called as ``model(data, src_mask)``.
    """
    global epoch
    global global_step
    model.train()  # turn on train mode
    total_loss = 0.
    src_mask = generate_square_subsequent_mask(bptt).to(device)

    # Use the real number of iterations as the progress-bar total; the
    # floor division used before under-counts when there is a partial batch.
    starts = range(0, batched_train_data.size(0) - 1, bptt)
    progress_bar = tqdm(enumerate(starts), total=len(starts), desc=f'Epoch {epoch}', ncols=80)
    for batch_idx, i in progress_bar:
        ### batch_idx -> (0, 1, 2, 3, ...)
        ### i -> (0, bptt, 2*bptt, ....)
        data, targets = get_batch(batched_train_data, i)
        seq_len = data.size(0)
        if src_mask.size(0) != seq_len:
            # Rebuild rather than slice: slicing the bptt-sized mask can only
            # shrink it, so it could never match a window longer than bptt.
            src_mask = generate_square_subsequent_mask(seq_len).to(device)
        output = model(data, src_mask)
        loss = criterion(output.view(-1, ntokens), targets)

        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()

        total_loss += loss.item()

        ## running mean loss / perplexity shown on the progress bar and logged
        cur_loss = total_loss / (batch_idx + 1)
        ppl = math.exp(cur_loss)

        progress_bar.set_postfix({"loss": cur_loss, "ppl" : ppl}, refresh=True)

        writer.add_scalar('loss/train loss', cur_loss, global_step)
        writer.add_scalar('ppl/train perplexity', ppl, global_step)
        writer.flush()  # one flush covers both scalars of this step
        global_step += 1

In [38]:
def evaluate(model: nn.Module, eval_data: Tensor) -> float:
    """Compute the mean per-row loss of ``model`` on ``eval_data`` and log it.

    Relies on notebook-level state: ``epoch`` (progress-bar label),
    ``global_step`` (TensorBoard x-axis), ``bptt`` (stride between windows),
    ``criterion``, ``ntokens``, ``device`` and the TensorBoard ``writer``.

    Args:
        model: the network to evaluate; called as ``model(data, src_mask)``.
        eval_data: batchified tensor, shape [full_seq_len, batch_size].
    Returns:
        The evaluation loss: each window's loss weighted by its number of
        rows, summed, and divided by ``len(eval_data) - 1``.
    """
    model.eval()  # turn on evaluation mode
    total_loss = 0.
    src_mask = generate_square_subsequent_mask(bptt).to(device)

    starts = range(0, eval_data.size(0) - 1, bptt)
    with torch.no_grad():
        for i in tqdm(starts, desc=f'Validation {epoch}', ncols=80):
            data, targets = get_batch(eval_data, i)
            seq_len = data.size(0)
            if src_mask.size(0) != seq_len:
                # Rebuild rather than slice so the mask always matches the window.
                src_mask = generate_square_subsequent_mask(seq_len).to(device)
            output = model(data, src_mask)
            # Only the loss is needed here; the former softmax/argmax over the
            # whole vocabulary was computed for every batch and never used.
            output_flat = output.view(-1, ntokens)
            total_loss += seq_len * criterion(output_flat, targets).item()

    eval_loss = total_loss / (len(eval_data) - 1)
    eval_ppl = math.exp(eval_loss)

    writer.add_scalar('loss/val loss', eval_loss, global_step)
    writer.add_scalar('ppl/val perplexity', eval_ppl, global_step)
    writer.flush()

    return eval_loss

In [39]:
softmax = nn.Softmax(dim=2)
#softmax = nn.LogSoftmax(dim=2)

# Training Loop

In [40]:
best_model_path = 'models/best_model_mp.pt'

In [41]:
# Loop over epochs. Save the model if the validation loss is the best
# we've seen so far. Adjust the learning rate after each epoch.
best_val_loss = float('inf')
initial_epoch = 0
epochs = app_config["epochs"]
global_step = 0
best_model = None

# Preload the checkpoint, if one exists, to continue training for more epochs.
if os.path.exists(best_model_path):
    print(f"Preloading model {best_model_path}")
    # Same device handling as loadModel: remap to CPU when CUDA is unavailable
    # so a checkpoint written on a GPU machine can still be resumed.
    state = torch.load(
        best_model_path,
        map_location=None if torch.cuda.is_available() else torch.device('cpu'),
    )

    initial_epoch = state['epoch'] + 1
    model.load_state_dict(state['model_state_dict'])
    optimizer.load_state_dict(state['optimizer_state_dict'])
    # Checkpoints written before the scheduler state was stored lack this key.
    if 'scheduler_state_dict' in state:
        scheduler.load_state_dict(state['scheduler_state_dict'])
    global_step = state['global_step']
    best_val_loss = state['best_val_loss']

    print(initial_epoch, global_step, best_val_loss)

# Initialize the TensorBoard log writer used by train() and evaluate().
writer = SummaryWriter(app_config["logs"])


for epoch in range(initial_epoch, epochs):
    train(model)
    eval_loss = evaluate(model, batched_test_data)
    # Apply the per-epoch learning-rate decay; without this call the StepLR
    # schedule is created but never takes effect.
    scheduler.step()

    # Save the model if validation loss decreases.
    if eval_loss < best_val_loss:
        print(f"eval perplexity : {math.exp(eval_loss)}")
        print("saving the model")
        best_val_loss = eval_loss
        best_model = copy.deepcopy(model)

        # Write to the same path that is checked for resuming above, so the
        # checkpoint saved here is the one that gets picked up again.
        os.makedirs(os.path.dirname(best_model_path) or '.', exist_ok=True)
        torch.save({
                'epoch': epoch,
                'model_state_dict': best_model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'scheduler_state_dict': scheduler.state_dict(),
                'global_step': global_step,
                'best_val_loss' : best_val_loss,
            }, best_model_path)

Epoch 0:   4%|▎      | 1324/30409 [04:20<1:35:26,  5.08it/s, loss=6.34, ppl=569]


KeyboardInterrupt: 

In [131]:
lnsoftmax = nn.LogSoftmax(dim=2)

# Text Generation

In [None]:
def loadModel(best_model_path):
    """Load checkpoint weights into the notebook-level ``model`` and return it.

    Args:
        best_model_path: path of a checkpoint containing 'model_state_dict'.
    Returns:
        The global ``model`` after its state dict has been replaced.
    Raises:
        Exception: when no file exists at ``best_model_path``.
    """
    global model
    if not os.path.exists(best_model_path):
        raise Exception("Model Not Found")

    print(f"Preloading model {best_model_path}")
    # Without CUDA, remap stored tensors to the CPU; otherwise load as saved.
    remap = None if torch.cuda.is_available() else torch.device('cpu')
    state = torch.load(best_model_path, map_location=remap)
    model.load_state_dict(state['model_state_dict'])
    return model
        
# loadModel mutates and returns the notebook-level `model`, so `loaded_model`
# is the same object as `model`, now carrying the checkpoint weights.
loaded_model = loadModel(best_model_path)

In [42]:
# loaded_model = model
def generator(model: nn.Module, gen_data: Tensor, no_words = 10):
    """Greedily generate ``no_words`` tokens that continue ``gen_data``.

    At every step the model is run on the current context, the highest-scoring
    token at the last position is taken, and it is appended to the context.
    The context grows until it has 16 rows; from then on the oldest row is
    dropped for every token added.

    Relies on the notebook-level ``bptt``, ``device`` and ``vocab``.

    Args:
        model: network called as ``model(context, mask)``; its output is
            indexed as [position, batch, vocabulary].
        gen_data: integer tensor of token ids, shape [context_len, batch].
        no_words: number of tokens to generate.
    Returns:
        List of generated token strings (for batch element 0).
    """
    model.eval()
    src_mask = generate_square_subsequent_mask(bptt).to(device)
    pred_text = []
    # Inference only: do not record an autograd graph for every step.
    with torch.no_grad():
        for _ in range(no_words):
            seq_len = gen_data.size(0)
            if seq_len != bptt:
                src_mask_ = src_mask[:seq_len, :seq_len]
            else:
                src_mask_ = src_mask
            logits = model(gen_data, src_mask_)
            # Best token at the last position, for every batch column.
            next_ids = torch.argmax(logits[-1], dim=-1)
            pred_text.append(vocab.lookup_token(int(next_ids[0])))

            new_row = next_ids.unsqueeze(0)  # shape [1, batch]
            if seq_len < 16:
                gen_data = torch.cat((gen_data, new_row), 0)
            else:
                gen_data = torch.cat((gen_data[1:], new_row), 0)

    return pred_text





def nonnaive_generator(model: nn.Module, gen_data: Tensor, no_words = 5,k=50):
    """Generate ``no_words`` tokens by top-k sampling.

    At every step the ``k`` highest-scoring tokens at the last position are
    kept, their scores are turned into probabilities that sum to one, and the
    next token is sampled from them. The context grows until it has 15 rows;
    from then on the oldest row is dropped for every token added.

    Relies on the notebook-level ``bptt``, ``device`` and ``vocab``.

    Args:
        model: network called as ``model(context, mask)``; its output is
            indexed as [position, batch, vocabulary].
        gen_data: integer tensor of token ids, shape [context_len, 1]
            (a single sequence; only batch column 0 is used).
        no_words: number of tokens to generate.
        k: number of candidate tokens to sample from; clamped to the
            vocabulary size.
    Returns:
        List of generated token strings.
    """
    model.eval()
    src_mask = generate_square_subsequent_mask(bptt).to(device)
    pred_text = []
    # Inference only: do not record an autograd graph for every step.
    with torch.no_grad():
        for _ in range(no_words):
            seq_len = gen_data.size(0)
            # Always bind the mask: previously it was only assigned when the
            # context length differed from bptt, so a bptt-long prompt failed.
            if seq_len != bptt:
                src_mask_ = src_mask[:seq_len, :seq_len]
            else:
                src_mask_ = src_mask
            logits = model(gen_data, src_mask_)

            # Only the last position decides the next token.
            last_logits = logits[-1, 0]
            top = torch.topk(last_logits, min(k, last_logits.size(-1)))
            # Softmax over the k kept scores equals the full softmax
            # renormalized over those k entries.
            probs = torch.softmax(top.values, dim=-1)
            choice = torch.distributions.Categorical(probs).sample()
            next_index = top.indices[choice]

            pred_text.append(vocab.lookup_token(int(next_index)))

            new_row = next_index.view(1, 1)
            if seq_len < 15:
                gen_data = torch.cat((gen_data, new_row), 0)
            else:
                gen_data = torch.cat((gen_data[1:], new_row), 0)

    return pred_text

In [43]:
# NOTE(review): morfessor is presumably imported so that the pickled
# segmentation model below can be unpickled — confirm. `math` is already
# imported in the first cell.
import morfessor
import math

# Security note: pickle.load can execute arbitrary code; only load model
# files from a trusted source.
with open('models/morfessor_model.p','rb') as f:
    models = pickle.load(f)

In [44]:
def convert_to_morph(l):
    """Segment each string with ``models[0]`` and mark word boundaries with ' * '.

    The Viterbi segments of a string are joined with '-', then a fixed
    sequence of substitutions is applied in order. In the result, pieces are
    separated by single spaces and original spaces appear as ' * '.

    Args:
        l: iterable of strings.
    Returns:
        List with one converted string per input string.
    """
    # Order matters: each substitution works on the output of the previous one.
    rewrites = (
        (r'[ ]+', r' '),
        (r'- -', r'*'),
        (r'-[ ]+', r'*'),
        (r'[ ]+-', r'*'),
        (r' ', r' * '),
        (r'\*', r' * '),
        (r'  ', r' '),
        (r'-', r' '),
    )
    converted = []
    for sentence in l:
        marked = '-'.join(models[0].viterbi_segment(sentence)[0])
        for pattern, replacement in rewrites:
            marked = re.sub(pattern, replacement, marked)
        converted.append(marked)
    return converted

def revert_sentence(text):
    """Return ``text`` with every '*' boundary marker replaced by a space.

    Existing spaces are left untouched.
    """
    return text.replace('*', ' ')

In [45]:
# st = ['लामो समयसम्म प्रयोग गर्न सकिन्छ ।']
# st = ['तपाईंलाई कस्तो पुस्तकहरू मन']
st = ['नेपालमा आधुनिक']
# Bring the prompt into the corpus format (segments with ' * ' boundaries),
# then encode it as token ids.
st = convert_to_morph(st)
st_i = data_process(st)
# Add a second axis of size 1 so the ids form a single column, and move the
# tensor to the compute device.
st_i = st_i.unsqueeze(1).to(device)

In [47]:
z = nonnaive_generator(loaded_model, st_i,no_words = 100)

In [54]:
st[0]+ revert_sentence(''.join(z))

'नेपालमा * आधुनिक  <unk> भएका <unk>मात्र <unk> भएकोथियो समेत हो। विद्यार्थी <unk> <unk>?<unk>  ?हामीपनि ? ?<unk>? ?<unk> ? ?<unk> <unk> ?<unk> ? ?त्यो <unk>र निर्माण ?<unk> ? र  मन्त्रालयले रहेकोछ।उनले१वर्ष <unk> <unk> <unk> <unk> ? <unk> ? ? ?हामी ? '