In [1]:
# https://www.analyticsvidhya.com/blog/2020/08/build-a-natural-language-generation-nlg-system-using-pytorch/
# perplexity values 4000 ke around aa rahi thi
import re
import random
import nltk
import math
nltk.download('punkt')
import numpy as np
import pandas as pd
# for progress bars
from tqdm import tqdm
import torch
import torch.nn as nn
import torch.nn.functional as F
import joblib

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# HYPER PARAMETERS

# sequence length for the sequence-target pair
seq_len = 5

# batch size
batch_size = 16

# number of epochs
epochs = 5

# learning rate
lr = 0.001

# clip
clip = 1

# threshold -> for handling unknown words. If the frequency comes out to be less than threshold, then we will replace the token with unk
threshold = 2

In [3]:
# read file
file_name = '../input/dataset/intro-to-nlp-assign3/europarl-corpus/train.europarl'
f = open(file_name)
text = f.read()
f.close()

In [4]:
# data cleaning
def clean_data(text):
    text = text.lower()
    sentences = nltk.tokenize.sent_tokenize(text)

    # keeping only alphabetical characters and apostrophe in our text
    sentences = [re.sub("[^a-z' ]", " ", sent) for sent in sentences]
    
    # frequency count
    freq = {}
    for sent in sentences:
        for word in sent.split():
            if freq.get(word) == None:
                freq[word] = 1
            else:
                freq[word] += 1
                
    pure_sentences = []
    for sent in sentences:
        pure = ''
        for word in sent.split():
            if freq[word] >= threshold:
                pure += word
            else:
                pure += 'unk'
                
            pure += ' '
        pure_sentences.append(pure)
    
    return pure_sentences

sentences = clean_data(text)

In [5]:
# creating sequence-target pair and also storing all the distinct words
seq = []
dist_words = set()
for string in sentences:
    # we will take seq_len+1 and then divide this into input_sequence and output_sequence     
    sent = string.split()
    if len(sent) >= seq_len:
        for i in range(seq_len,len(sent)):
            seq.append(" ".join(sent[i-seq_len:i+1]))
        
    for word in sent:
        dist_words.add(word)
        
# vocabulary size
vocab_size = len(dist_words)
    
inp = []
out = []
for sq in seq:
    inp.append(" ".join(sq.split()[:-1]))  # from first word to last second word 
    out.append(" ".join(sq.split()[1:]))   # from second word to last word

    
# creating word_to_index and index_to_word dictionary
word_to_index = {}
index_to_word = {}
for cnt,word in enumerate(dist_words):
    word_to_index[word] = cnt
    index_to_word[cnt] = word

In [6]:
joblib.dump(word_to_index,'word_to_index_q1.pkl')
joblib.dump(index_to_word,'index_to_word_q1.pkl')

['index_to_word_q1.pkl']

In [7]:
print(vocab_size, len(inp))
print(seq[0]+'\n'+inp[0]+'\n'+out[0])

9330 407107
resumption of the session i declare
resumption of the session i
of the session i declare


In [8]:
# As we have got the index for each word, now convert inp and out to their corresponding word's indices
inp_index = []
out_index = []
for i in range(len(inp)):
    inp_index.append([word_to_index[word] for word in inp[i].split()])
    out_index.append([word_to_index[word] for word in out[i].split()])
    
inp_index = np.array(inp_index)
out_index = np.array(out_index)

In [9]:
# creating batches
def make_batch(inp, out, batch_size):
    ind = 0
    for n in range(batch_size, len(inp), batch_size):
        x = inp[ind:n,:]
        y = out[ind:n,:]
        ind = n
        # yield is used to return from a function without destroying the states of its local variable 
        # and when the function is called, the execution starts from the last yield statement.         
        yield x, y

In [10]:
# Declaring out Model
class WordLSTM(nn.Module):
    
    def __init__(self, n_hidden=256, n_layers=4, drop_prob=0.3, lr=0.001):
        super().__init__()

        self.drop_prob = drop_prob
        self.n_layers = n_layers
        self.n_hidden = n_hidden
        self.lr = lr
        
        self.emb_layer = nn.Embedding(vocab_size, 200)

        ## define the LSTM
        self.lstm = nn.LSTM(200, n_hidden, n_layers, 
                            dropout=drop_prob, batch_first=True)
        
        ## define a dropout layer
        self.dropout = nn.Dropout(drop_prob)
        
        ## define the fully-connected layer
        self.fc = nn.Linear(n_hidden, vocab_size)      
    
    def forward(self, x, hidden):
        ''' Forward pass through the network. 
            These inputs are x, and the hidden/cell state `hidden`. '''

        ## pass input through embedding layer
        embedded = self.emb_layer(x)     
        
        ## Get the outputs and the new hidden state from the lstm
        lstm_output, hidden = self.lstm(embedded, hidden)
        
        ## pass through a dropout layer
        out = self.dropout(lstm_output)
        
        #out = out.contiguous().view(-1, self.n_hidden) 
        out = out.reshape(-1, self.n_hidden) 

        ## put "out" through the fully-connected layer
        out = self.fc(out)

        # return the final output and the hidden state
        return out, hidden
    
    
    def init_hidden(self, batch_size):
        ''' initializes hidden state '''
        # Create two new tensors with sizes n_layers x batch_size x n_hidden,
        # initialized to zero, for hidden state and cell state of LSTM
        weight = next(self.parameters()).data

        # if GPU is available
        if (torch.cuda.is_available()):
            hidden = (weight.new(self.n_layers, batch_size, self.n_hidden).zero_().cuda(),
                      weight.new(self.n_layers, batch_size, self.n_hidden).zero_().cuda())
        
        # if GPU is not available
        else:
            hidden = (weight.new(self.n_layers, batch_size, self.n_hidden).zero_(),
                    weight.new(self.n_layers, batch_size, self.n_hidden).zero_())
        
        return hidden

In [11]:
# instantiate the model
net = WordLSTM()
# push the model to GPU (avoid it if you are not using the GPU)
net.cuda()
print(net)

WordLSTM(
  (emb_layer): Embedding(9330, 200)
  (lstm): LSTM(200, 256, num_layers=4, batch_first=True, dropout=0.3)
  (dropout): Dropout(p=0.3, inplace=False)
  (fc): Linear(in_features=256, out_features=9330, bias=True)
)


In [12]:
def train(net, epochs, batch_size, lr, clip):
    opt = torch.optim.Adam(net.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()
    net.cuda()
    
    net.train()
    for e in range(epochs):
        # initialize hidden state
        h = net.init_hidden(batch_size)
        for x, y in make_batch(inp_index, out_index, batch_size):
            inputs, targets = torch.from_numpy(x).cuda(), torch.from_numpy(y).cuda()
            # detach hidden states
            h = tuple([each.data for each in h])
            net.zero_grad()
            output, h = net(inputs, h)
            loss = criterion(output, targets.view(-1))
            loss.backward()
            # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
            nn.utils.clip_grad_norm_(net.parameters(), clip)
            opt.step() 
        
        print('Epoch {} done!'.format(e+1))

In [13]:
# train the model & save it
train(net, epochs, batch_size, lr, clip)
torch.save(net.state_dict(),'q1_english.pt')

Epoch 1 done!
Epoch 2 done!
Epoch 3 done!
Epoch 4 done!
Epoch 5 done!


In [14]:
def perplexity(sentence):
    # push to GPU
    net.cuda()
    net.eval()

    # batch size is 1
    h = net.init_hidden(1)
    
    sentence = sentence.lower()
    sentence = re.sub("[^a-z']", " ", sentence)
    indices = []
    for token in sentence.split():
        if word_to_index.get(token) == None:
            indices.append(word_to_index['unk'])
        else:
            indices.append(word_to_index[token])

    length = len(indices)
    lst = np.array([indices])

    # tensor inputs
    inputs = torch.from_numpy(lst).cuda()

    # get the output of the model
    out, h = net(inputs, h)

    # get the token probabilities
    p = F.softmax(out, dim=1).data
    p = p.cpu().numpy()

    logarithm_sum = 0.0
    for i,ind in enumerate(indices):
        logarithm_sum += (math.log(p[i][ind]))/length
        
    # we already took division by length, so no need to divide here by length     
    return math.exp(-logarithm_sum)
    

In [15]:
p = perplexity('the results are acceptable.')
print(p)

8988.339084997478


In [16]:
def calc_perplexity_and_write(output_path,dataset_path):
    f = open(dataset_path)
    text = f.read()
    f.close()
    
    # text cleaning 
    text = text.lower()
    sentences = nltk.tokenize.sent_tokenize(text)
            
    avg = 0.0
    perplexity_score = []
    N = len(sentences)
    for sent in sentences:
        try:
            p = perplexity(sent)
        except:
            p = 1000.0
        perplexity_score.append(p)
        avg += p
        
    avg /= N
    # thing to be written in the file     
    to_write = ''
    to_write += str(avg) + '\n'
    
    for i,sent in enumerate(sentences):
        to_write += sent
        to_write += '     '
        to_write += str(perplexity_score[i])
        to_write += '\n'
        
    file = open(output_path, 'w')
    file.write(to_write)
    file.close()

In [17]:
calc_perplexity_and_write('2019101056_LM_train.txt','../input/dataset/intro-to-nlp-assign3/europarl-corpus/train.europarl')

In [18]:
calc_perplexity_and_write('2019101056_LM_test.txt','../input/dataset/intro-to-nlp-assign3/europarl-corpus/test.europarl')

In [20]:
model = torch.load('../input/new-models/q1_english.pt')
net.load_state_dict(model)
net.eval()
p = perplexity('the results are acceptable.')
print(p)

8988.339084997478
