# A neural probabilistic language model
Step 1: get the corpus and split into paragraphs, into sentence, into words

Step 2: get the training data, i.e, certain no. of paragraphs (say 95%)

Step 3: get the word frequency in hash_map for each words in training data

Step 4: prepare a vocabulary. in which keep the word whose frequency is greater than some cut-off frequency

Step 5: create word-to-index and index-to-word mapping for each words [One-Hot Encoding]\

Step 6: prepare the training and testing data with one-hot encoding

Step 7: build a neural probabilistic language model

Step 8: create training, testing and train loop and train the model

Step 9: evaluate the model with own input

Step 10: Use language generation metrics to evaluate the model

### Import Required packages

In [2]:
import numpy as np
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
import multiprocessing

import time

In [4]:
print("GPU is", "available" if torch.cuda.is_available() else "NOT AVAILABLE")

GPU is available


In [5]:
!nvidia-smi

Wed Dec 21 16:44:38 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.141.03   Driver Version: 470.141.03   CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  Off  | 00000000:01:00.0  On |                  N/A |
|  0%   34C    P8    19W / 170W |    867MiB / 12045MiB |     37%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [6]:
torch.__version__

'1.13.1'

In [7]:
torch.manual_seed(42)
torch.cuda.manual_seed(42)

In [8]:
!nvidia-sim

/bin/bash: nvidia-sim: command not found


In [9]:
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
DEVICE

'cuda'

### Import data file corpus and explore

In [10]:
oscar_dedup_file_path = "./ne_dedup.txt"

In [11]:
with open(oscar_dedup_file_path) as f:
  lines = f.readlines()

In [12]:
print(type(lines), len(lines))
lines[100:105]

<class 'list'> 1490594


['सरसफाइको प्रतिक भनेर चिनिने हरियो रंङले गाउँ हरियाली बनेको छ । सुकिदह गाउँपालिका वडा नं. ३ मा करिब ७ सय घरहरु रहेका छन । ती घरमध्ये ८७ प्रतिशत घरमा हरियो रङ लगाइसकेको वडा सचिब कमलराज वलीले जानकारी दिए ।\n',
 'गाउँको फरक परिचान बनाउनको लागि यो अभियानको सुरुवात गरेको सुकिदह ३ का वडाअध्यक्ष जलेशकुमार केसीले बताए । पूर्ण सरसफाइमा सहयोग पु¥याउने उद्देश्यका साथ यो अभियानको सुरुवात गरिएको हो । गाउँका पुरै घर एउटै रङ लगाउँदा निकै सुन्दर देखिएको स्थानीयहरुले बताएका छन । उनीहरु वडापालिकामा एउटै रङ लगाउने कुरामा सहमत भएका थिए ।\n',
 'सरकारकै इशारामा चल्ने किसिमको कर्मचारी संयन्त्र बनाउन आफूहरुले छलफल गरिरहेको संकेत गर्दै उनले कर्मचारीहरु सिंहदरबारबाट प्रदेश र गाउँपालिकामा जान नमान्ने नियति छिट्टै अन्त्य हुने बताए ।\n',
 'पोखरा । नेपाल कम्युनिष्ट पार्टी (नेकपा) अध्यक्ष पुष्पकमल दाहालले कर्मचारीकै कारण सरकारले जनता सामु गरेका बाचा पुरा गर्न नसकेको बताएका छन् ।\n',
 'प्रेस सेन्टर र प्रेस चौतारी गण्डकी प्रदेशले पोखरामा बुधबार आयोजना गरेको पत्रकार सम्मेलनमा दाहालले राजनीतिक परिवर्तनका लागि दलहरु सफल

In [13]:
len(lines)

1490594

In [14]:
lines[10000]

'– नियमित रुपमा कक्षाहरुमा भाग लिने, सबै गृहकार्यहरु समयमा गर्ने, नोटहरु राम्रोसँग तयार गर्ने, स्कुल कलेजको हरेक कार्यमा संलग्न हुने। एकटक पढिसकेको कुरा फेरी दोहोर्याएर पढ्नलाई पर्याप्त समय छुट्टाउनुहोस् जसकारण परिक्षाको समयमा आतिनु नपरोस। अन्तिम समयमा आएर पढ्ने बानी हटाउनुहोस्।\n'

In [15]:
words = lines[10000].split()
print(words)

['–', 'नियमित', 'रुपमा', 'कक्षाहरुमा', 'भाग', 'लिने,', 'सबै', 'गृहकार्यहरु', 'समयमा', 'गर्ने,', 'नोटहरु', 'राम्रोसँग', 'तयार', 'गर्ने,', 'स्कुल', 'कलेजको', 'हरेक', 'कार्यमा', 'संलग्न', 'हुने।', 'एकटक', 'पढिसकेको', 'कुरा', 'फेरी', 'दोहोर्याएर', 'पढ्नलाई', 'पर्याप्त', 'समय', 'छुट्टाउनुहोस्', 'जसकारण', 'परिक्षाको', 'समयमा', 'आतिनु', 'नपरोस।', 'अन्तिम', 'समयमा', 'आएर', 'पढ्ने', 'बानी', 'हटाउनुहोस्।']


In [16]:
oscar_paras = []
for sentence in tqdm(lines):
  words = sentence.split()
  oscar_paras.append(words) 

100%|██████████| 1490594/1490594 [00:09<00:00, 165093.84it/s]


In [17]:
print(oscar_paras[10000])

['–', 'नियमित', 'रुपमा', 'कक्षाहरुमा', 'भाग', 'लिने,', 'सबै', 'गृहकार्यहरु', 'समयमा', 'गर्ने,', 'नोटहरु', 'राम्रोसँग', 'तयार', 'गर्ने,', 'स्कुल', 'कलेजको', 'हरेक', 'कार्यमा', 'संलग्न', 'हुने।', 'एकटक', 'पढिसकेको', 'कुरा', 'फेरी', 'दोहोर्याएर', 'पढ्नलाई', 'पर्याप्त', 'समय', 'छुट्टाउनुहोस्', 'जसकारण', 'परिक्षाको', 'समयमा', 'आतिनु', 'नपरोस।', 'अन्तिम', 'समयमा', 'आएर', 'पढ्ने', 'बानी', 'हटाउनुहोस्।']


In [18]:
len(oscar_paras)

1490594

In [19]:
train_size = int(len(oscar_paras) * 0.95)
train_size

1416064

In [20]:
len(oscar_paras[6993])

60

In [21]:
# create the corpus from training dataset
train_corpus = []
test_corpus = []
for idx, sentence in tqdm(enumerate(oscar_paras)):
  words = []
  for word in sentence:
    words.append(word)
  if(idx < train_size):
    train_corpus.append(words)
  else:
    test_corpus.append(words)


1490594it [00:10, 149035.97it/s]


In [22]:
print(train_corpus[0] == oscar_paras[0])
print(test_corpus[0] == oscar_paras[len(train_corpus)])

True
True


In [23]:
len(train_corpus) + len(test_corpus) == len(oscar_paras)

True

In [24]:
words_term_frequency_train = {}
for sentence in tqdm(train_corpus):
  for word in sentence:
    words_term_frequency_train[word] = words_term_frequency_train.get(word, 0) + 1

100%|██████████| 1416064/1416064 [00:15<00:00, 90939.43it/s]


# Analyze on word frequency in the corpus

In [25]:
# get all the words with frequency less than cutoff_frequency
cutoff_word_frequency = 100
exceed_cutoff_word_count = 0
total_words = len(words_term_frequency_train)
for word, freq in tqdm(words_term_frequency_train.items()):
    if (freq > cutoff_word_frequency):
        exceed_cutoff_word_count += 1
total_words, exceed_cutoff_word_count, total_words - exceed_cutoff_word_count


100%|██████████| 2130172/2130172 [00:00<00:00, 4341773.79it/s]


(2130172, 40544, 2089628)

### Vocabulary Preparation

In [26]:
# keep all the words which have frequency higher that the cutoff_word_frequency in the vocab
UNK_SYMBOL = "<UNK>"
vocab = set([UNK_SYMBOL])
vocab

{'<UNK>'}

In [27]:
print(type(train_corpus))
print(train_corpus[0:2])

<class 'list'>
[['बर्दिबास', 'नगरपालिकाको', 'तेस्रो', 'नगर', 'परिषदबाट', 'पारित', 'आ.व.२०७३।७४', 'को', 'संशोधित', 'र', '२०७४।७५', 'को', 'प्रस्तावित', 'नीति,', 'कार्यक्रम', 'तथा', 'बजेट'], ['अार्थिक', 'वर्ष', '२०७५/७६', 'काे', 'नदिजन्य', 'पदार्थकाे', 'उत्खनन्', 'गरी', 'बिक्रि', 'वितरण', 'तथा', 'अान्तरिक', 'निकासी', 'गर्ने', 'कार्यकाे', 'बाेलपत्र', 'सम्बन्धी', 'सुचना']]


In [28]:
# create vocabulary
for sentence in tqdm(train_corpus):
    for word in sentence:
        if words_term_frequency_train.get(word, 0) >= cutoff_word_frequency:
            vocab.add(word)

100%|██████████| 1416064/1416064 [00:16<00:00, 87101.79it/s]


In [29]:
len(vocab), type(vocab), list(vocab)[0:5]

(40832, set, ['कीर्ति', 'फ्रान्सको', 'रहरले', 'बेजोड', 'तट'])

### One Hot Encoding and Data Loader

### Procedure
Step 1 : Create the batch generator for the corpus sentences using "generate_batch" function
Step 2 : Create the one-hot generator for the batch sentences using "one-hot generator" function
Step 3 : 

In [30]:
# create word-to-id and id-to-word mapping
word_to_id_mappings = {}
id_to_word_mappings = {}
for idx, word in enumerate(vocab):
    word_to_id_mappings[word] = idx
    id_to_word_mappings[idx] = word

# return the id of word if found in vocab else, return id for unknown word
def get_id_of_word(word):
    return word_to_id_mappings.get(word, word_to_id_mappings[UNK_SYMBOL])

In [31]:
def oneHotEncoding(sentence):
    """
    Take an input sentence and generate one hot vectors of certain context size = 3 in this particular case
    Arguments:
        sentence: any sentences
    
    Output : 
        tensor of shape (x, 3) where 3 is the size of one hot vector and x is the number of one hot vectors
    """
    batch_list = []
    for i, word in enumerate(sentence):
        if i+2 >= len(sentence):
            break
        context_words = [get_id_of_word(word), get_id_of_word(sentence[i + 1]), get_id_of_word(sentence[i + 2])]
        batch_list.append(context_words)
    return torch.LongTensor(batch_list)


In [32]:
def concatenate_batch_corpus(batch_corpus):
    """ 
    Concatenate the sentences presented in the form of list in batch_corpus
    Arguments:
        batch_corpus : list of sentences of certain batch size
    Output:
        string : concatenation of all sentences in the batch_corpus into a single sentence
    """
    string = ""
    for sentence in batch_corpus:
        sen = ' '.join(sentence)
        sen += " "
        string += sen
    return string
    

In [33]:
# generate batch for train corpus
def generate_batch(corpus, batch_size=32):
    """ 
    Generate batch of sentences from the corpus
    Arguments:
        corpus : list of sentences where sentences is the list of words. (Corpus is 2 dimensional list)
        batch_size : number of sentences to be yielded for a batch
    """
    for i in range(0, len(corpus), batch_size):
        yield corpus[i:i + batch_size]

In [34]:
def one_hot_generator(generator):
    """ 
    Generate one hot vector for a batch of sentences from the corpus
    Arguments:
        generator : a function to generate the batch of input sentences
        
    Output : 
        batch_one_hot : batch of one-hot vectors for the input batch of sentences
    """
    for idx, batch in enumerate(generator):
        batch_concatenated_string = concatenate_batch_corpus(batch)
        batch_one_hot = oneHotEncoding(batch_concatenated_string)
        yield batch_one_hot

In [35]:
train_generator = generate_batch(train_corpus, batch_size=5)
train_one_hot_generator = one_hot_generator(train_generator)

for i, one_hot in enumerate(train_one_hot_generator):
    print(one_hot)
    print(one_hot[:, 0:2])
    print(one_hot[:, 2])
    if i > -1:
        break

tensor([[14384,  6318,  5620],
        [ 6318,  5620,  4888],
        [ 5620,  4888,  5620],
        ...,
        [ 9600, 34232, 39777],
        [34232, 39777,  5620],
        [39777,  5620,  5620]])
tensor([[14384,  6318],
        [ 6318,  5620],
        [ 5620,  4888],
        ...,
        [ 9600, 34232],
        [34232, 39777],
        [39777,  5620]])
tensor([ 5620,  4888,  5620, 14384,  9600, 12336,  5620,   722, 30190,  6318,
        33168,  9600, 28879,  5620, 39777,  9600, 39777,  5620,  5620,  6440,
        23495, 12336,  5620,  6318,  5620,  5620,   722, 30190,  6318,  5620,
        33168,  6318,  5620, 34984,  4888, 14384,  9600, 38683,  5620, 33168,
         9600,  6318,  5620,  6440,  5620, 16331, 39907, 30478, 39907, 11086,
         1562, 22731, 20917, 15529, 22731, 25914,  5620, 39777,  5620,  5620,
        12336,  4541,  4330,  5620, 22728,  5620,  6440,  5620,  6318,  5620,
        11086,  1562, 22731, 25914, 15529, 22731,  8424,  5620, 39777,  5620,
         5620, 331

In [36]:
len(vocab), len(oscar_paras), train_size, len(lines) - train_size

(40832, 1490594, 1416064, 74530)

# Model Development

In [37]:
class TrigramNNmodel(nn.Module):
  def __init__(self, vocab_size, embedding_dim = 300, context_size = 2):
    super(TrigramNNmodel, self).__init__()
    self.context_size = context_size
    self.embedding_dim = embedding_dim
    
    self.embeddings = nn.Embedding(vocab_size, embedding_dim)
    self.linear1 = nn.Linear(context_size * embedding_dim, 124)
    self.linear2 = nn.Linear(124, vocab_size, bias = False)

  def forward(self, inputs):
    # compute x' : concatenation of x1 and x2 embeddings
    embeds = self.embeddings(inputs).view((-1, self.context_size * self.embedding_dim))
    # compute h: tanh(W_1. x' + b)
    out = torch.tanh(self.linear1(embeds))
    # compute W_2.h
    out = self.linear2(out)
    # compute y: log_softmax(W_2.h)
    log_prods = F.log_softmax(out, dim = 1)
    # return log probabilites
    # log_prods : BATCH_SIZE x len(vocab)
    return log_prods

### Model Summary

In [38]:
VOCAB_SIZE = len(vocab)
EMBEDDING_DIM = 300
CONTEXT_SIZE = 2
model = TrigramNNmodel(len(vocab), EMBEDDING_DIM, CONTEXT_SIZE)

In [40]:
from pytorch_model_summary import summary
print(summary(model, torch.zeros(807, 2).long()))

-----------------------------------------------------------------------
      Layer (type)        Output Shape         Param #     Tr. Param #
       Embedding-1       [807, 2, 300]      12,249,600      12,249,600
          Linear-2          [807, 124]          74,524          74,524
          Linear-3        [807, 40832]       5,063,168       5,063,168
Total params: 17,387,292
Trainable params: 17,387,292
Non-trainable params: 0
-----------------------------------------------------------------------


### Training and evaluation loop

In [41]:
def get_accuracy_from_log_probs(log_probs, labels):
  probs = torch.exp(log_probs)
  predicted_label = torch.argmax(probs, dim = 1)
  acc = (predicted_label == labels).float().mean()
  return acc

In [78]:
def evaluation_loop(model, loss_fn, one_hot_generator, device):
    model.eval()
    
    mean_acc, mean_loss = 0, 0
    count = 0

    start_time = time.time()
    with torch.inference_mode():
        for iteration, data_tensor in enumerate(one_hot_generator):
            context_tensor = data_tensor[:, 0:2]
            target_tensor = data_tensor[:, 2]
            context_tensor, target_tensor = context_tensor.to(device), target_tensor.to(device)
            log_probs = model(context_tensor)

            mean_loss += loss_fn(log_probs, target_tensor).item()
            mean_acc += get_accuracy_from_log_probs(log_probs, target_tensor)
            count += 1

            if(iteration % 500 == 0):
                print(f"Evaluation iteration : {iteration} completed; Evaluation Accuracy : {mean_acc / count}; Evaluation Loss : {mean_loss / count}; Time Taken : {time.time() - start_time}")
                start_time = time.time()
    
    return mean_loss / count, mean_acc / count

In [79]:
def training_loop(model, loss_fn, optimizer, one_hot_generator, device):
    model.train()

    train_acc, train_loss = 0, 0
    count = 0

    start_time = time.time()
    for iteration, data_tensor in enumerate(one_hot_generator):
        # get X, y data
        context_tensor = data_tensor[:, 0:2]
        target_tensor = data_tensor[:, 2]
        context_tensor, target_tensor = context_tensor.to(device), target_tensor.to(device)

        # Forward propagation
        log_probs = model(context_tensor)

        # calculate the loss and accuracy
        acc = get_accuracy_from_log_probs(log_probs, target_tensor)
        loss = loss_fn(log_probs, target_tensor)
        train_loss += loss.item()
        train_acc += get_accuracy_from_log_probs(log_probs, target_tensor)

        # optimizer zero grad
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        count += 1
        if(iteration % 500 == 0):
            print(f"Training Iteration : {iteration} completed; Training accuracy : {train_acc / count}; Training Loss : {train_loss / count}; Time Taken : {time.time() - start_time}")
            start_time = time.time()

    return train_loss / count, train_acc / count
        

In [80]:
def train(model, train_one_hot_generator, test_one_hot_generator, optimizer, loss_fn, epochs, device):
  results = {
    "train_loss" : [],
    "train_acc" : [], 
    "val_loss" : [], 
    "val_acc" : []
  }
  
  for epoch in tqdm(range(epochs)):
    print(f"\n------------------- Epoch: {epoch+1} -------------------\n")
    train_loss, train_acc = training_loop(model = model, loss_fn=loss_fn, optimizer=optimizer, one_hot_generator=train_one_hot_generator, device = device)
    val_loss, val_acc = evaluation_loop(model, loss_fn=loss_fn, one_hot_generator=test_one_hot_generator, device = device)
    
    print(
      f"Epoch: {epoch+1} | "
      f"train_loss: {train_loss:.4f} | "
      f"train_acc: {train_acc:.4f} | "
      f"val_loss: {val_loss:.4f} | "
      f"val_acc: {val_acc:.4f}"
    )

    results["train_loss"].append(train_loss)
    results["train_acc"].append(train_acc)
    results["val_loss"].append(val_loss)
    results["val_acc"].append(val_acc)
  
  return results


In [83]:
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
EPOCHS = 2


# create the generator for the train and test data
train_generator = generate_batch(train_corpus, batch_size=4)
train_one_hot_generator = one_hot_generator(train_generator)

test_generator = generate_batch(test_corpus, batch_size=4)
test_one_hot_generator = one_hot_generator(test_generator)

# create model
VOCAB_SIZE = len(vocab)
EMBEDDING_DIM = 300
CONTEXT_SIZE = 2
nplm_model = TrigramNNmodel(len(vocab), EMBEDDING_DIM, CONTEXT_SIZE).to(DEVICE)

# optimizer and loss_fn
optimizer = torch.optim.Adam(params=nplm_model.parameters(), lr=0.001)
loss_function = nn.NLLLoss()

In [84]:
results = train(model = nplm_model, 
                train_one_hot_generator=train_one_hot_generator, 
                test_one_hot_generator=test_one_hot_generator, 
                optimizer=optimizer, 
                loss_fn=loss_function, 
                epochs=EPOCHS, 
                device=DEVICE)

  0%|          | 0/2 [00:00<?, ?it/s]


------------------- Epoch: 1 -------------------

Training Iteration : 0 completed; Training accuracy : 0.0; Training Loss : 10.63282585144043; Time Taken : 0.023766517639160156


  0%|          | 0/2 [00:12<?, ?it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 1.67 GiB (GPU 0; 11.76 GiB total capacity; 6.23 GiB already allocated; 1.28 GiB free; 8.99 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF