In [1]:
import torch
from torchtext.vocab import build_vocab_from_iterator
from torchtext.data.utils import get_tokenizer

from torch.utils.data import dataset

from torch import Tensor, nn
from torch.nn import TransformerEncoder, TransformerEncoderLayer

import regex as re
import os
import time
from tqdm import tqdm

In [2]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [3]:
print("\u0964", "\u003f", "\u002c", "\u0900", "\u097F", "\u2020")

। ? , ऀ ॿ †


In [4]:
file_path = './modified_ne_dedup.txt'
if not os.path.exists(file_path):
    with open('./ne_dedup.txt', 'r', encoding='utf-8') as f:
        text = f.read()
        # put space in beteen the | -> devanagari danda to make it a separate word.
        text = re.sub(r'\s*[\u0964]\s*', r'\u0020\u0964\u0020', text)
        # put space around the question mark ?  to make it a separate word
        text = re.sub(r'\s*[\u003f]\s*', r'\u0020\u003f\u0020', text)
        # put space in between comma(,)
        text = re.sub(r'\s*[\u002c]\s*', r'\u0020\u002c\u0020', text)
        # remove space around the new line character
        text = re.sub(r'\s*\n\s*','\n', text)
        # replace any non-devangari string with a blank
        text = re.sub(r'[^\u0900-\u097F,?\s+]','', text) 
    with open('./modified_ne_dedup.txt', 'w', encoding='utf-8') as f:
        f.write(text)
else:
    print(f"Reading file  : {file_path}")
    with open('./modified_ne_dedup.txt', 'r', encoding='utf-8') as f:
        text = f.read()

Reading file  : ./modified_ne_dedup.txt


In [5]:
len(text.split('\n'))

341961

In [6]:
train_split = 30000
test_data_length = 10000

train_iter_first = text.split('\n')[:train_split]
test_iter = text.split('\n')[train_split:train_split+test_data_length]

In [7]:
type(train_iter_first), type(train_iter_first[0])

(list, str)

In [8]:
tokenizer = get_tokenizer(None)
vocab = build_vocab_from_iterator(
    map(tokenizer, train_iter_first), specials=['<unk>']
        )
vocab.set_default_index(vocab['<unk>'])

In [9]:
type(vocab.vocab.get_stoi())
for key, value in list(vocab.vocab.get_stoi().items())[:2]:
    print(f'{key}: {value}')

९९१६५: 342568
९८७०: 342566


In [10]:
def data_process(raw_text_iter: dataset.IterableDataset) -> Tensor:
    """Converts raw text into a flat Tensor."""
    # obtain the data in tensor format for each line
    data = [torch.tensor(vocab(tokenizer(item)), dtype=torch.long)
            for item in raw_text_iter]
    # concatenate all the lines
    return torch.cat(tuple(filter(lambda t: t.numel() > 0, data)))

In [11]:
print(test_iter[0])
print(tokenizer(test_iter[0]))
print(vocab(tokenizer(test_iter[0])))

    भित्रको कामको जानकारी को पार्ट टाइम छोटो समयको बिहान पार्ट टाइम लामो समयको पार्ट टाइमको जानकारी
['भित्रको', 'कामको', 'जानकारी', 'को', 'पार्ट', 'टाइम', 'छोटो', 'समयको', 'बिहान', 'पार्ट', 'टाइम', 'लामो', 'समयको', 'पार्ट', 'टाइमको', 'जानकारी']
[5282, 1266, 66, 32, 22985, 6753, 1678, 1818, 398, 22985, 6753, 252, 1818, 22985, 52799, 66]


In [12]:
train_data = data_process(train_iter_first)
test_data = data_process(test_iter)

In [13]:
train_data.shape, test_data.shape

(torch.Size([6406801]), torch.Size([2102501]))

In [14]:
print(len(vocab))

342571


In [15]:
def batchify(data: Tensor, batch_size: int) -> Tensor:
    """Divides the data into batch_size separate sequences, removing extra elements
    that wouldn't cleanly fit.
    Args:
        data: Tensor, shape [N]
        batch_size: int, batch size
    Returns:
        Tensor of shape [N // bsz, bsz]
    """
    seq_len = data.size(0) // batch_size
    data = data[:seq_len * batch_size]
    data = data.view(batch_size, seq_len).t().contiguous()
    return data.to(device)

In [17]:
text = ['आधिकारिक निर्णयको कारणले', 'वाणिज्य बिभागले', 'संयुक्त राज्य अमेरिकी समुद्री पानी निर्माताद्वारा संयुक्त राज्य']
sample_data = data_process(text)
print(sample_data.size(), sample_data)

sample_data = batchify(sample_data, 3)
sample_data

torch.Size([13]) tensor([ 2087,  5695,   569,   898, 28362,   358,   466,   411,  6549,   294,
            0,   358,   466])


tensor([[ 2087, 28362,  6549],
        [ 5695,   358,   294],
        [  569,   466,     0],
        [  898,   411,   358]], device='cuda:0')

In [18]:
train_data.size()

torch.Size([6406801])

In [19]:
bptt = 35
batched_train_data = batchify(train_data, bptt).to(device)  # shape [seq_len, batch_size]
batched_test_data = batchify(test_data, bptt).to(device)


import math
def get_batch(source: Tensor, i: int):
    """
    Args:
        source: Tensor, shape [full_seq_len, batch_size]
        i: int
    Returns:
        tuple (data, target), where data has shape [seq_len, batch_size] and
        target has shape [seq_len * batch_size]
    """
    seq_len = min(bptt, len(source) - 1 - i)
    data = source[i:i+seq_len]
    target = source[i+1:i+1+seq_len].reshape(-1)
    #target = source[i+1:i+1+seq_len]
    return data, target

In [20]:
class PositionalEncoding(nn.Module):

    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2)
                             * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        pe[:, 0, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x: Tensor) -> Tensor:
        """
        Args:
            x: Tensor, shape [seq_len, batch_size, embedding_dim]
        """
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)

class TransformerModel(nn.Module):

    def __init__(self, ntoken: int, d_model: int, nhead: int, d_hid: int,
                 nlayers: int, dropout: float = 0.5):
        super().__init__()
        self.model_type = 'Transformer'
        self.pos_encoder = PositionalEncoding(d_model, dropout)
        encoder_layers = TransformerEncoderLayer(
            d_model, nhead, d_hid, dropout)
        self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)
        self.encoder = nn.Embedding(ntoken, d_model)
        self.d_model = d_model
        self.decoder = nn.Linear(d_model, ntoken)

        self.init_weights()

    def init_weights(self) -> None:
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.zero_()
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, src: Tensor, src_mask: Tensor) -> Tensor:
        """
        Args:
            src: Tensor, shape [seq_len, batch_size]
            src_mask: Tensor, shape [seq_len, seq_len]
        Returns:
            output Tensor of shape [seq_len, batch_size, ntoken]
        """
        src = self.encoder(src) * math.sqrt(self.d_model)
        src = self.pos_encoder(src)
        output = self.transformer_encoder(src, src_mask)
        output = self.decoder(output)
        return output

In [21]:
ntokens = len(vocab)  # size of vocabulary
emsize = 300  # embedding dimension
d_hid = 400  # dimension of the feedforward network model in nn.TransformerEncoder
nlayers = 2  # number of nn.TransformerEncoderLayer in nn.TransformerEncoder
nhead = 2 # number of heads in nn.MultiheadAttention
dropout = 0.05  # dropout probability
model = TransformerModel(ntokens, emsize,nhead, d_hid,
                         nlayers, dropout).to(device)





In [22]:
criterion = nn.CrossEntropyLoss()
lr = 1  # learning rate
optimizer = torch.optim.SGD(model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)

In [23]:
def generate_square_subsequent_mask(sz: int) -> Tensor:
    """Generates an upper-triangular matrix of -inf, with zeros on diag."""
    return torch.triu(torch.ones(sz, sz) * float('-inf'), diagonal=1)

In [24]:
def train(model: nn.Module) -> None:
    model.train()  # turn on train mode
    total_loss = 0.
    src_mask = generate_square_subsequent_mask(bptt).to(device)

    num_batches = len(batched_train_data) // bptt
    progress_bar = tqdm(enumerate(range(0, batched_train_data.size(0) - 1, bptt)), total=num_batches, desc=f'Epoch {epoch}')
    for batch_idx, i in progress_bar:
        ### batch_idx -> (1, 2, 3, 4, ...)
        ### i -> (0, bptt, 2*bptt, ....)
        data, targets = get_batch(batched_train_data, i)
        batch_size = data.size(0)
        if batch_size != bptt:  # only on last batch
            src_mask = src_mask[:batch_size, :batch_size]
        output = model(data, src_mask)
        loss = criterion(output.view(-1, ntokens), targets)

        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()

        total_loss += loss.item()

        ## calculate the postfix description for the progress bar
        cur_loss = total_loss / (batch_idx + 1)
        ppl = math.exp(cur_loss)
        
        progress_bar.set_postfix({"loss": cur_loss, "ppl" : ppl}, refresh=True)

In [25]:
softmax = nn.Softmax(dim=2)
# softmax = nn.LogSoftmax(dim=2)

In [26]:
def evaluate(model: nn.Module, eval_data: Tensor) -> float:
    model.eval()  # turn on evaluation mode
    total_loss = 0.
    src_mask = generate_square_subsequent_mask(bptt).to(device)

    num_batches = len(eval_data) // bptt
    with torch.no_grad():
        progress_bar = tqdm(enumerate(range(0, eval_data.size(0) - 1, bptt)), total=num_batches, desc=f'Validation {epoch}')
        for batch_idx, i in progress_bar:
            data, targets = get_batch(eval_data, i)
            batch_size = data.size(0)
            if batch_size != bptt:
                src_mask = src_mask[:batch_size, :batch_size]
            output = model(data, src_mask)
            output_softmax = softmax(output)
            output_softmax_permuted = output_softmax.permute(1, 0, 2)
            indices = torch.argmax(output_softmax_permuted, dim=2)
            target_indices = targets.t()
            output_flat = output.view(-1, ntokens)
            total_loss += batch_size * criterion(output_flat, targets).item()
    
    return total_loss / (len(eval_data) - 1)


In [27]:
# Loop over epochs. Save the model if the validation loss is the best
# we've seen so far. Adjust the learning rate after each epoch.
import copy
best_val_loss = float('inf')
epochs = 10
best_model = None
print(len(vocab.get_stoi()))

# preload the model if exists to train more epochs
best_model_path = 'models/best_model_sample_test_corrected.pt'
if os.path.exists(best_model_path):
    print(f"loading the model {best_model_path}")
    model.load_state_dict(torch.load(best_model_path))
    model.to(device)
    
for epoch in range(1, epochs + 1):
    epoch_start_time = time.time()
    train(model)
    eval_loss = evaluate(model, batched_test_data)
    eval_ppl = math.exp(eval_loss)
    elapsed = time.time() - epoch_start_time
    print('-' * 89)
    print(f'| end of epoch {epoch:3d} | time: {elapsed:5.2f}s | '
         f'valid loss {eval_loss:5.2f} | valid ppl {eval_ppl:8.2f}')
    print('-' * 89)
    #best_model = copy.deepcopy(model)
    if eval_loss < best_val_loss:
       best_val_loss = eval_loss
       best_model = copy.deepcopy(model)

342571


Epoch 1:   4%|▎      | 209/5230 [00:42<16:51,  4.97it/s, loss=10.4, ppl=3.43e+4]


KeyboardInterrupt: 

In [None]:
directory_path = 'models'
# Create the directory if it doesn't exist
if not os.path.exists(directory_path):
    os.makedirs(directory_path)
torch.save(best_model.state_dict(), os.path.join(directory_path, 'best_model_sample_test_corrected.pt'))

In [None]:
import time
def generator(model: nn.Module, gen_data: Tensor, no_words = 10):
    model.eval()
    temp_text = text
    src_mask = generate_square_subsequent_mask(bptt).to(device)
    pred_text = []
    for i in range(no_words):
        print('i:', i)
        batch_size = gen_data.size(0)
        if batch_size != bptt:
            src_mask_ = src_mask[:batch_size, :batch_size]
        output_softmax = model(gen_data, src_mask_)
        output_softmax_permuted = output_softmax.permute(1, 0, 2)
        #print(softmax(output_softmax_permuted))
        indices = torch.topk(output_softmax_permuted,10 ,dim=2).indices.squeeze(0)
        values = torch.topk(softmax(output_softmax_permuted),10 ,dim=2).values
        values = values/torch.sum(values,dim = 2,keepdims = True)
        #print(output_softmax_permuted[indices])
        # print(indices,values)
        ind_sampled = torch.distributions.Categorical(values.squeeze(0)).sample()
#         index = indices.squeeze(0)[ind_sampled.unsqueeze(0)]
        # print('is',ind_sampled)
        next_index = indices[-1][ind_sampled[-1]]
        # print(indices[-1][ind_sampled[-1]])
        
#     return indices
        
        #print(indices[0],indices[1])
        #for j in range(batch_size):
        
        print('next word: ', [vocab.lookup_token(next_index)],'values: ',values.squeeze(0)[-1])
                                  
#         print(i,"Gen_data: ",gen_data,"Pred_data: ",indices)


        pred_text.append([vocab.lookup_token((next_index))][0])
        if(batch_size <= 10):
            gen_data = torch.cat((gen_data[:,:],next_index.unsqueeze(0).unsqueeze(0)),0)
            batch_size= gen_data.size(0)
        else:
            gen_data = torch.cat((gen_data[1:,:],next_index.unsqueeze(0).unsqueeze(0)),0)
            batch_size= gen_data.size(0)
            
    return pred_text



In [None]:
#Load saved model
#model.load_state_dict(torch.load('models/best_model_3prex1005.pt'))
model.load_state_dict(torch.load('models/best_model_sample_test_corrected.pt'))
model.to(device)
best_model = model

In [65]:
text = ['आधिकारिक निर्णयको कारणले', 'वाणिज्य बिभागले', 'संयुक्त राज्य अमेरिकी समुद्री पानी निर्माताद्वारा संयुक्त राज्य']
text = ['यो सकियो। म काठमाडौं घुम्दै छु']
text = ['म काठमाडौं घुम्दै']
sample_data = data_process(text)
print(sample_data.size(), sample_data)

sample_data = batchify(sample_data, 3)
sample_data

torch.Size([3]) tensor([68, 53,  0])


tensor([[68, 53,  0]], device='cuda:0')

In [66]:
# print(sample_data[:,-1].unsqueeze(1))
# print(sample_data)
z = generator(best_model, sample_data[:,-1].unsqueeze(1),no_words = 50)

i: 0
next word:  ['गाउँपालिका'] values:  tensor([0.2206, 0.1376, 0.1331, 0.1140, 0.1128, 0.1114, 0.0491, 0.0444, 0.0400,
        0.0370], device='cuda:0', grad_fn=<SelectBackward0>)
i: 1
next word:  ['३'] values:  tensor([0.2736, 0.2708, 0.1952, 0.0521, 0.0502, 0.0492, 0.0366, 0.0311, 0.0275,
        0.0137], device='cuda:0', grad_fn=<SelectBackward0>)
i: 2
next word:  ['मा'] values:  tensor([0.2808, 0.2520, 0.1427, 0.0669, 0.0667, 0.0568, 0.0361, 0.0356, 0.0355,
        0.0271], device='cuda:0', grad_fn=<SelectBackward0>)
i: 3
next word:  ['भएको'] values:  tensor([0.4773, 0.1172, 0.0760, 0.0676, 0.0570, 0.0497, 0.0441, 0.0374, 0.0372,
        0.0365], device='cuda:0', grad_fn=<SelectBackward0>)
i: 4
next word:  ['हो'] values:  tensor([0.5777, 0.1994, 0.0839, 0.0303, 0.0251, 0.0250, 0.0213, 0.0133, 0.0124,
        0.0117], device='cuda:0', grad_fn=<SelectBackward0>)
i: 5
next word:  ['।'] values:  tensor([0.8751, 0.0477, 0.0418, 0.0124, 0.0070, 0.0051, 0.0032, 0.0028, 0.0027,
        0

In [67]:
text[0] +' '.join(z)

'म काठमाडौं घुम्दैगाउँपालिका ३ मा भएको हो । कार्तिक २७ , २०७५ । काठमाडौं । सो क्षेत्रमा ठूलो रहेको छ , नेपाल र भारतीय नेपाल र राष्ट्रिय निकुञ्ज , आर्थिक , आर्थिक वर्षमा १० , २ को लागि तीन दिन आज १० हजार ३ सय दिनमा १ सय करोड ४ लाख पर्यटक'

In [51]:
evaluate(best_model, sample_data)

Validation: 0it [00:00, ?it/s]


9.053659439086914

In [52]:
'''
आधिकारिक निर्णयको कारणले , वाणिज्य बिभागले , 
संयुक्त राज्य अमेरिकी समुद्री पानी निर्माताद्वारा संयुक्त

विकास गर्न प्रोत्साहित गरी सहकारी क्षेत्रले आर्थिक दृष्टिले सक्रिय
'''

'\nआधिकारिक निर्णयको कारणले , वाणिज्य बिभागले , \nसंयुक्त राज्य अमेरिकी समुद्री पानी निर्माताद्वारा संयुक्त\n\nविकास गर्न प्रोत्साहित गरी सहकारी क्षेत्रले आर्थिक दृष्टिले सक्रिय\n'

In [53]:
st = ['गर्ने']
st_i = data_process(st)

In [54]:
st_i = st_i.unsqueeze(1).to(device)

In [55]:
z_ = generator(best_model, st_i,no_words =10 )

i: 0
next word:  ['तयारी'] values:  tensor([0.2037, 0.1642, 0.1544, 0.0870, 0.0816, 0.0761, 0.0721, 0.0630, 0.0538,
        0.0439], device='cuda:0', grad_fn=<SelectBackward0>)
i: 1
next word:  ['गरेको'] values:  tensor([0.7071, 0.0723, 0.0510, 0.0422, 0.0361, 0.0231, 0.0203, 0.0192, 0.0168,
        0.0120], device='cuda:0', grad_fn=<SelectBackward0>)
i: 2
next word:  ['छ'] values:  tensor([0.7326, 0.1537, 0.0495, 0.0210, 0.0087, 0.0082, 0.0074, 0.0071, 0.0060,
        0.0056], device='cuda:0', grad_fn=<SelectBackward0>)
i: 3
next word:  ['।'] values:  tensor([9.2223e-01, 4.0236e-02, 2.5793e-02, 4.5705e-03, 2.0539e-03, 1.4398e-03,
        9.9501e-04, 9.9269e-04, 8.5499e-04, 8.3322e-04], device='cuda:0',
       grad_fn=<SelectBackward0>)
i: 4
next word:  ['यस'] values:  tensor([0.2811, 0.1637, 0.1249, 0.1007, 0.0757, 0.0526, 0.0514, 0.0512, 0.0496,
        0.0491], device='cuda:0', grad_fn=<SelectBackward0>)
i: 5
next word:  ['पटक'] values:  tensor([0.4641, 0.1059, 0.0876, 0.0739, 0.073

In [56]:
' '.join(z_)

'तयारी गरेको छ । यस पटक पटक पटक , रौतहट'

In [57]:
z_

['तयारी', 'गरेको', 'छ', '।', 'यस', 'पटक', 'पटक', 'पटक', ',', 'रौतहट']

In [58]:
vocab.get_itos()[22]

'नेपाल'