# Neural Network Language Model

Language modeling (LM) is the use of various statistical and probabilistic techniques to determine the probability  of a given sequence of words occurring in a sentence. Thus, it may be defined as a probability distribution over the vocabulary words.



---
### Importing the required libraries 

In [22]:
import re
import torch
import wandb
import numpy as np
import torch.nn as nn
from tqdm import tqdm
import torch.optim as optim
import gensim.downloader as api
from collections import Counter
import torch.nn.functional as F
from collections import Counter
from nltk.tokenize import word_tokenize
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split

In [23]:
config = {
    'batch_size': 512  ,
    'embedding_dim': 300,
    'hidden_dim': 3000,
    'lr': 2*10e-6,
    'num_epochs': 15,
}

### Initializing the wandb project

In [24]:
wandb.login()

run = wandb.init(project='nnlm', config=config,
                 id=f'nnlm__LR_{config["lr"]}_E_{config["num_epochs"]}_BS_{config["batch_size"]}_HD_{config["hidden_dim"]}_ED_{config["embedding_dim"]}')



VBox(children=(Label(value='0.003 MB of 0.015 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.168414…

0,1
Training Accuracy,▅▃█▆▂▁▇▃
Training Loss,█▇▄▄▄▄▃▁
Training Perplexity,█▆▃▃▃▃▂▁
Validation Accuracy,▁▃▅▅▅█▇█
Validation Loss,█▄▂▁▁▁▁▂
Validation Perplexity,█▃▂▁▁▁▁▂

0,1
Training Accuracy,16.49485
Training Loss,4.05606
Training Perplexity,57.74614
Validation Accuracy,19.40299
Validation Loss,5.24244
Validation Perplexity,189.13149


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016668978399987586, max=1.0…

### Function to preprocess and clean the data

In [25]:
def preprocess(text):
    """
    Preprocesses the text
    """
    text = text.lower()
    text = re.sub(r"[^a-zA-Z;.]+", ' ', text)
    text = text.replace('  ', ' ')
    text = text.replace(';', ' ; ')
    text = text.replace('.', ' . ')
    text = text.replace('  ', ' ')
    tokens = word_tokenize(text)
    return text, tokens

### Loading the dataset

In [26]:
corpus_data = open('Dataset/Auguste_Maquet.txt', 'r').read()
print(corpus_data[:100])

Chapter I.
The Shade of Cardinal Richelieu.


In a splendid chamber of the Palais Royal, formerly st


In [27]:
cleaned_corpus, tokens = preprocess(corpus_data)

# print(cleaned_corpus[:500], '\n') 
# print(tokens[:50])
print("Number of tokens: ", len(tokens))
print("Number of . and ; in the corpus: ", tokens.count('.')+tokens.count(';'))

Number of tokens:  1014605
Number of . and ; in the corpus:  58690


### Creating the train and test splits

In [44]:
"""
Get the sentences from the corpus and split them into train, validation and test sets
"""

sentences = []
current_sentence = []
for token in tokens:
    if token == '.' or token == ';':
        # if(len(current_sentence) > 5):
        current_sentence.append("<EOS>")
        sentences.append(current_sentence)
        current_sentence = ["<PAD>", "<PAD>", "<PAD>", "<PAD>", "<PAD>"]
    else:
        current_sentence.append(token)

print('Number of sentences in the corpus: ', len(sentences))

with open('Dataset/sentences.txt', 'w') as f:
    for sentence in sentences:
        f.write(' '.join(sentence) + '\n')

train_sentences, test_valid_sentences = train_test_split(
    sentences, test_size=0.3, random_state=42)
valid_sentences, test_sentences = train_test_split(
    test_valid_sentences, test_size=0.6667, random_state=42)

print('Number of training sentences: ', len(train_sentences))
print('Number of validation sentences: ', len(valid_sentences))
print('Number of testing sentences: ', len(test_sentences))

Number of sentences in the corpus:  58690
Number of training sentences:  41083
Number of validation sentences:  5868
Number of testing sentences:  11739


### Creating the vocabulary

In [29]:

vocab_counter = Counter([word for sentence in train_sentences for word in sentence])

# remove the words that appear only once
# vocab_counter = Counter({word: freq for word, freq in vocab_counter.items() if freq > 1})

vocab = ["<PAD>", "<UNK>", "<EOS>"] + [word for word, freq in vocab_counter.items()]

word2idx = {word: idx for idx, word in enumerate(vocab)}
idx2word = {idx: word for word, idx in word2idx.items()}

print("Vocabulary size: ", len(vocab))
print("Most common words: ", vocab_counter.most_common(10))
print("Least common words: ", vocab_counter.most_common()[-10:])


# save the vocab as npy file
np.save('Dataset/vocab.npy', vocab)

Vocabulary size:  18472
Most common words:  [('<PAD>', 205410), ('<EOS>', 41083), ('the', 39613), ('to', 18798), ('of', 17867), ('and', 17443), ('a', 13312), ('i', 11447), ('you', 11292), ('he', 9693)]
Least common words:  [('hops', 1), ('gin', 1), ('rotund', 1), ('canonical', 1), ('lamorici', 1), ('ney', 1), ('changarnier', 1), ('bedeau', 1), ('supplicatingly', 1), ('jaw', 1)]


### Importing the pre-trained word embeddings

In [30]:

print(list(api.info()['models'].keys()))
if not 'word_vectors' in locals():  # so that it doesn't load again if it was already loaded
    word_vectors = api.load("glove-wiki-gigaword-300")

['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis']


### Creating the embedding matrix

In [31]:
"""
Create the embedding matrix for the vocab
"""

embedding_matrix = []
num_unknown_words = 0
for word in vocab:
    if word in word_vectors:
        # embedding_matrix.append(word_vectors[word])
        # print("Word {} found in w2v dict".format(word))
        if len(word_vectors[word]) != config["embedding_dim"]:
            print("Word {} size {} does not match embedding dim {}".format(
                word, len(word_vectors[word]), config["embedding_dim"]))
        else:
            embedding_matrix.append(word_vectors[word])
    else:
        # print("Word {} not in w2v dict".format(word))
        num_unknown_words += 1
        embedding_matrix.append([0]*config["embedding_dim"])

print("Number of unknown words: ", num_unknown_words)

Number of unknown words:  1126


In [32]:
print("Embedding matrix size: ", len(embedding_matrix))
print("Embedding size: ", len(embedding_matrix[0]))
print("Embedding matrix first row: ", embedding_matrix[56])

Embedding matrix size:  18472
Embedding size:  300
Embedding matrix first row:  [ 2.6926e-01  3.8534e-01 -6.5080e-01  7.9847e-02  1.4793e-01 -7.8472e-02
  5.3576e-02 -6.6062e-02  4.2256e-01 -1.3911e+00  3.5039e-01 -3.0858e-01
  4.1294e-01  5.1396e-02 -1.8945e-01 -1.3053e-01  2.2396e-01  3.2059e-01
 -1.4995e-01  3.9264e-02  5.9915e-02 -2.0883e-01  5.2932e-01 -4.0698e-01
 -2.8060e-01 -2.5013e-01  3.9860e-01 -1.5108e-01  3.2029e-01  8.5095e-02
 -1.2038e-01 -5.5589e-01  3.0680e-01 -1.4087e-01 -1.2590e+00  1.1816e-01
  1.4410e-01  5.7897e-01 -4.7504e-01 -3.6376e-01  1.9966e-01  3.7050e-01
 -1.9161e-02  1.8794e-01  5.0041e-02  2.5054e-01  9.8075e-03 -7.6851e-02
 -2.2045e-02 -3.8040e-01  2.4530e-01 -7.3199e-02  8.3152e-02 -4.3383e-01
 -2.0633e-01  4.8931e-01 -5.0676e-01  3.8061e-01 -2.2221e-01  3.8953e-02
  4.3099e-01 -4.9454e-04 -3.6134e-01  1.3887e-01  8.0764e-02 -2.7260e-01
 -3.2459e-02  1.7780e-01  4.4219e-02  3.7202e-01 -1.0937e-01 -1.2273e-01
  5.8154e-02  3.3571e-01  2.5524e-01 -4.0259

In [33]:
embedding_matrix = np.array(embedding_matrix) # since converting a list directly to tensor is very slow
embedding_matrix = torch.tensor(embedding_matrix)
print("Embedding matrix shape: ", embedding_matrix.shape)

# save the embedding matrix locally as npy file
np.save('Dataset/embedding_matrix.npy', embedding_matrix)

Embedding matrix shape:  torch.Size([18472, 300])


### Creating the data and the data loaders

In [34]:

class NNLM_Dataset(Dataset):
    def __init__(self, sentences, word2idx):
        self.data = []
        for sentence in sentences:
            for i in range(len(sentence)-5):
                context = sentence[i:i+5]
                target = sentence[i+5]
                context_idxs = [word2idx[word] if word in word2idx  else word2idx["<UNK>"]  for word in context]
                target_idx = word2idx[target] if target in word2idx else word2idx["<UNK>"]
                self.data.append((context_idxs, target_idx))
                
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        return self.data[idx]

train_dataset = NNLM_Dataset(train_sentences, word2idx)
train_loader = DataLoader(train_dataset, batch_size=config["batch_size"], shuffle=True)

print("Number of training samples: ", len(train_dataset))
print("Sample training data: ", train_dataset[57])
print("Number of batches in the training set: ", len(train_loader))

valid_dataset = NNLM_Dataset(valid_sentences, word2idx)
valid_loader = DataLoader(valid_dataset, batch_size=config["batch_size"], shuffle=False)

test_dataset = NNLM_Dataset(test_sentences, word2idx)
test_loader = DataLoader(test_dataset, batch_size=config["batch_size"], shuffle=False)

Number of training samples:  708705
Sample training data:  ([45, 46, 6, 47, 48], 49)
Number of batches in the training set:  1385


### Defining the model

In [35]:
class NNLM(nn.Module):
    def __init__(self, embedding_matrix, vocab_size, embedding_dim = config["embedding_dim"], hidden_dim = config["hidden_dim"]):
        super(NNLM, self).__init__()
        self.embeddings = nn.Embedding.from_pretrained(embedding_matrix, freeze=True)
        self.fc1 = nn.Linear(5*embedding_dim, hidden_dim)
        self.dropout = nn.Dropout(0.2)
        
        # self.fc3 = nn.Linear(hidden_dim,1000)
        # self.fc4 = nn.Linear(1000,7500)
        # self.fc2 = nn.Linear(7500, vocab_size)

        self.fc2 = nn.Linear(hidden_dim, vocab_size)
        self.bn1 = nn.BatchNorm1d(5*embedding_dim)
        

    def forward(self, x):
        x = self.embeddings(x)        
        x= x.view(x.size(0), -1)

        # adding batch normalization
        # x = self.bn1(x)

        x=x.float()
        x = F.relu(self.fc1(x))
        x = self.dropout(x)

        x = self.fc2(x)
        return x

model = NNLM(embedding_matrix, len(vocab), config['embedding_dim'], config['hidden_dim'])


### Training the model

In [36]:

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
print("Device: ", device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=config["lr"])

for epoch in range(config["num_epochs"]):
    model.train()
    for context_idxs, target_idx in tqdm(train_loader):
    
        context_idxs, target_idx = np.squeeze(context_idxs), np.squeeze(target_idx) # remove the extra dimension
        context_idxs, target_idx = np.array(context_idxs), np.array(target_idx) # convert to numpy array
        context_idxs, target_idx = torch.from_numpy(context_idxs), torch.from_numpy(target_idx) # convert to torch tensor
        context_idxs, target_idx = context_idxs.to(device), target_idx.to(device)   # move to device
        context_idxs = context_idxs.transpose(1,0)  # transpose to get the batch size first

        optimizer.zero_grad()   # zero the gradients
        log_probs = model(context_idxs) # forward pass
        loss = criterion(log_probs, target_idx) # calculate the loss
        loss.backward() # backprop
        optimizer.step()    # update the weights

    training_loss = loss.item()
    training_acc = (torch.argmax(log_probs, dim=1) == target_idx).sum().item()/len(target_idx)* 100
    training_perplexity = np.exp(training_loss)
    print (f"Epoch {epoch+1}/{config['num_epochs']} - Training loss: {training_loss}, Training accuracy: {training_acc}, Training perplexity: {training_perplexity}")
    
    # Validation
    model.eval()
    valid_loss = 0
    with torch.no_grad():
        for context_idxs, target_idx in valid_loader:
            context_idxs, target_idx = np.squeeze(context_idxs), np.squeeze(target_idx)
            context_idxs, target_idx = np.array(context_idxs), np.array(target_idx)
            context_idxs, target_idx = torch.from_numpy(context_idxs), torch.from_numpy(target_idx)
            context_idxs, target_idx = context_idxs.to(device), target_idx.to(device)
            context_idxs = context_idxs.transpose(1,0)
            
            log_probs = model(context_idxs)
            loss = criterion(log_probs, target_idx)
            valid_loss += loss.item()

    
    validation_loss = valid_loss/len(valid_loader)
    validation_acc = (torch.argmax(log_probs, dim=1) == target_idx).sum().item()/len(target_idx)* 100
    validation_perplexity = np.exp(validation_loss)
    print (f"Epoch {epoch+1}/{config['num_epochs']} - Validation loss: {validation_loss}, Validation accuracy: {validation_acc}, Validation perplexity: {validation_perplexity}")
    
    wandb.log({"Training Loss": training_loss, "Training Accuracy": training_acc, "Training Perplexity": training_perplexity,
                "Validation Loss": validation_loss, "Validation Accuracy": validation_acc, "Validation Perplexity": validation_perplexity})

Device:  cuda


100%|██████████| 1385/1385 [02:42<00:00,  8.54it/s]


Epoch 1/16 - Training loss: 6.172402858734131, Training accuracy: 10.309278350515463, Training perplexity: 479.3365013159206
Epoch 1/16 - Validation loss: 6.304048556907504, Validation accuracy: 12.5, Validation perplexity: 546.7811095114712


100%|██████████| 1385/1385 [02:42<00:00,  8.54it/s]


Epoch 2/16 - Training loss: 6.160462856292725, Training accuracy: 13.402061855670103, Training perplexity: 473.6472547189058
Epoch 2/16 - Validation loss: 5.991466802709243, Validation accuracy: 25.0, Validation perplexity: 400.00090224152194


100%|██████████| 1385/1385 [02:41<00:00,  8.59it/s]


Epoch 3/16 - Training loss: 5.527929782867432, Training accuracy: 15.463917525773196, Training perplexity: 251.62245832543053
Epoch 3/16 - Validation loss: 5.805563396098567, Validation accuracy: 25.0, Validation perplexity: 332.14226829807683


100%|██████████| 1385/1385 [02:36<00:00,  8.85it/s]


Epoch 4/16 - Training loss: 4.8345794677734375, Training accuracy: 17.525773195876287, Training perplexity: 125.78567515284
Epoch 4/16 - Validation loss: 5.668761669420729, Validation accuracy: 25.0, Validation perplexity: 289.67559804554736


100%|██████████| 1385/1385 [02:20<00:00,  9.87it/s]


Epoch 5/16 - Training loss: 5.08284854888916, Training accuracy: 21.649484536082475, Training perplexity: 161.2326815857154
Epoch 5/16 - Validation loss: 5.563753733447954, Validation accuracy: 25.0, Validation perplexity: 260.7999748055362


100%|██████████| 1385/1385 [02:35<00:00,  8.90it/s]


Epoch 6/16 - Training loss: 5.500262260437012, Training accuracy: 10.309278350515463, Training perplexity: 244.75611369306617
Epoch 6/16 - Validation loss: 5.489057424021702, Validation accuracy: 20.833333333333336, Validation perplexity: 242.02896861704


100%|██████████| 1385/1385 [02:42<00:00,  8.55it/s]


Epoch 7/16 - Training loss: 4.887378692626953, Training accuracy: 17.525773195876287, Training perplexity: 132.6055182472701
Epoch 7/16 - Validation loss: 5.438819950702143, Validation accuracy: 16.666666666666664, Validation perplexity: 230.1704107080131


100%|██████████| 1385/1385 [02:42<00:00,  8.50it/s]


Epoch 8/16 - Training loss: 4.944179058074951, Training accuracy: 19.587628865979383, Training perplexity: 140.35557976264357
Epoch 8/16 - Validation loss: 5.404058535893758, Validation accuracy: 16.666666666666664, Validation perplexity: 222.30682803237855


100%|██████████| 1385/1385 [02:41<00:00,  8.58it/s]


Epoch 9/16 - Training loss: 4.635165691375732, Training accuracy: 14.432989690721648, Training perplexity: 103.04499024702416
Epoch 9/16 - Validation loss: 5.379032763780332, Validation accuracy: 16.666666666666664, Validation perplexity: 216.81246510410972


100%|██████████| 1385/1385 [02:41<00:00,  8.58it/s]


Epoch 10/16 - Training loss: 4.796623229980469, Training accuracy: 20.618556701030926, Training perplexity: 121.10079677055536
Epoch 10/16 - Validation loss: 5.363888801312914, Validation accuracy: 16.666666666666664, Validation perplexity: 213.5538020834785


100%|██████████| 1385/1385 [02:40<00:00,  8.61it/s]


Epoch 11/16 - Training loss: 4.867167949676514, Training accuracy: 19.587628865979383, Training perplexity: 129.9523636286379
Epoch 11/16 - Validation loss: 5.352298816045125, Validation accuracy: 16.666666666666664, Validation perplexity: 211.09300451330014


100%|██████████| 1385/1385 [02:39<00:00,  8.68it/s]


Epoch 12/16 - Training loss: 4.950023651123047, Training accuracy: 14.432989690721648, Training perplexity: 141.1783029074049
Epoch 12/16 - Validation loss: 5.3453649633071, Validation accuracy: 16.666666666666664, Validation perplexity: 209.63437949552886


100%|██████████| 1385/1385 [02:43<00:00,  8.48it/s]


Epoch 13/16 - Training loss: 4.943075180053711, Training accuracy: 15.463917525773196, Training perplexity: 140.2007298064411
Epoch 13/16 - Validation loss: 5.34230072591819, Validation accuracy: 16.666666666666664, Validation perplexity: 208.9929931737125


100%|██████████| 1385/1385 [02:42<00:00,  8.53it/s]


Epoch 14/16 - Training loss: 4.698666095733643, Training accuracy: 12.371134020618557, Training perplexity: 109.80061122077294
Epoch 14/16 - Validation loss: 5.344267188333998, Validation accuracy: 12.5, Validation perplexity: 209.4043743902394


100%|██████████| 1385/1385 [02:38<00:00,  8.76it/s]


Epoch 15/16 - Training loss: 4.4259138107299805, Training accuracy: 17.525773195876287, Training perplexity: 83.589157000682
Epoch 15/16 - Validation loss: 5.347511950661154, Validation accuracy: 12.5, Validation perplexity: 210.0849453638158


100%|██████████| 1385/1385 [02:32<00:00,  9.10it/s]


Epoch 16/16 - Training loss: 4.7526984214782715, Training accuracy: 12.371134020618557, Training perplexity: 115.89660083411142
Epoch 16/16 - Validation loss: 5.354659933669894, Validation accuracy: 12.5, Validation perplexity: 211.59200879880473


### Perplexity calculation function

In [45]:
"""
Function to calculate the perplexity of the model
"""

def calculate_perplexity(model, data_loader, criterion):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for context_idxs, target_idx in data_loader:
            context_idxs, target_idx = np.squeeze(context_idxs), np.squeeze(target_idx)
            context_idxs, target_idx = np.array(context_idxs), np.array(target_idx)
            context_idxs, target_idx = torch.from_numpy(context_idxs), torch.from_numpy(target_idx)

            context_idxs, target_idx = context_idxs.to(device), target_idx.to(device)
            context_idxs = context_idxs.transpose(1,0)
            log_probs = model(context_idxs)
            loss = criterion(log_probs, target_idx)
            total_loss += loss.item()
    return np.exp(total_loss/len(data_loader)), total_loss/len(data_loader)

### Calculating the perplexity on the train set

In [46]:
# Calculate the perplexity on the train set

perplexity_train, loss_train = calculate_perplexity(model, train_loader, criterion)
print("Perplexity on the train set: ", perplexity_train)
print("Loss on the train set: ", loss_train)

Perplexity on the train set:  87.71125950604727
Loss on the train set:  4.474050277751275


### Calculating the perplexity on the validation set

In [47]:
# Calculate the perplexity on the validation set
perplexity_valid, loss_valid = calculate_perplexity(model, valid_loader, criterion)
print("Perplexity on the validation set: ", perplexity_valid)
print("Loss on the validation set: ", loss_valid)

Perplexity on the validation set:  211.59200879880473
Loss on the validation set:  5.354659933669894


### Calculating the perplexity on the test set

In [48]:
# Calculate the perplexity on the test set
perplexity_test, loss_test = calculate_perplexity(model, test_loader, criterion)
print("Perplexity on the test set: ", perplexity_test)
print("Loss on the test set: ", loss_test)

Perplexity on the test set:  211.75566290861588
Loss on the test set:  5.355433076544653


### Calculating and saving sentence wise perplexity

In [59]:
# report sentence wise perplexity on the test set
def calculate_sentence_perplexity(model, data_loader, criterion):
    model.eval()
    sentence_perplexity = []
    cnt = 0
    with torch.no_grad():
        for context_idxs, target_idx in data_loader:
            context_idxs, target_idx = np.squeeze(context_idxs), np.squeeze(target_idx)
            context_idxs, target_idx = np.array(context_idxs), np.array(target_idx)
            context_idxs, target_idx = torch.from_numpy(context_idxs), torch.from_numpy(target_idx)

            context_idxs, target_idx = context_idxs.to(device), target_idx.to(device)
            context_idxs = context_idxs.transpose(1,0)
            log_probs = model(context_idxs)
            loss = criterion(log_probs, target_idx)
            cnt += 1
            
            temp_str = str(cnt) + "\t:\t" + str(np.exp(loss.item()))
            sentence_perplexity.append(temp_str)
    return sentence_perplexity

sentence_perplexity_test = calculate_sentence_perplexity(model, test_loader, criterion)
sentence_perplexity_train = calculate_sentence_perplexity(model, train_loader, criterion)

# store the sentence wise perplexity in a file
with open('Results/2021114016_LM1_test_perplexity.txt', 'w') as f:
    for perplexity in sentence_perplexity_test:
        f.write(str(perplexity) + '\n')

with open('Results/2021114016_LM1_train_perplexity.txt', 'w') as f:
    for perplexity in sentence_perplexity_train:
        f.write(str(perplexity) + '\n')


### Saving the parameters along with the results in a file

In [41]:
"""
Saving the parameters along with the results in a file
"""
with open('results.txt', 'a') as f:
    f.write(f"Learning rate : {config['lr']}\n")
    f.write(f"Number of epochs : {config['num_epochs']}\n")
    f.write(f"Batch size : {config['batch_size']}\n")
    f.write(f"Hidden dimension : {config['hidden_dim']}\n")
    f.write(f"Embedding dimension : {config['embedding_dim']}\n")
    f.write(f"Perplexity on the train set: {perplexity_train}\n")
    f.write(f"Perplexity on the validation set: {perplexity_valid}\n")
    f.write(f"Perplexity on the test set: {perplexity_test}\n")
    f.write(f"Loss on the train set: {loss_train}\n")
    f.write(f"Loss on the validation set: {loss_valid}\n")
    f.write(f"Loss on the test set: {loss_test}\n")
    f.write("--------------------------------------------------\n")
    f.write("\n\n")


In [42]:

torch.save(model.state_dict(), 'nnlm.pt')

316 : 5*10^-6
<br>
286 : 10^-5
<br>
275 : 8*10^-6
<br>
