In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os
import sys

In [3]:
with open('/content/drive/MyDrive/europarl-corpus/train.europarl', 'r') as f:
    europarl_train = f.readlines()
with open('/content/drive/MyDrive/europarl-corpus/dev.europarl', 'r') as f:
    europarl_validation = f.readlines()
with open('/content/drive/MyDrive/europarl-corpus/test.europarl', 'r') as f:
    europarl_test = f.readlines()


In [4]:
#print length of corpus
print('train:', len(europarl_train))  
print('valid:', len(europarl_validation))

train: 20000
valid: 500


In [5]:
import re
def preprocess(text):
    #the function prpocess a single line
    #it takes the single line and returns a list of tokens for that particular line
    #make text lower
    cleaned_text = text.lower()
    # cleaned_text = text
    #remove non-ASCII characters
    #cleaned_text = re.sub(r'[^\x00-\x7F]+',' ', cleaned_text)
    # remove URLS
    cleaned_text = re.sub(r"http\S+", "<URL>", cleaned_text)
    # remove HTs
    cleaned_text = re.sub(r"#[A-Za-z0-9_]+", "<HASHTAG>", cleaned_text)
    # remove Mentions
    cleaned_text = re.sub(r"@[A-Za-z0-9_]+", "<MENTION>", cleaned_text)
    #replace percentage quantities with tags
    cleaned_text = re.sub(r'(\d+(\.\d+)?%)',"<PERCENT>",cleaned_text)
    #replace numbers with tags
    cleaned_text = re.sub("^\d+\s|\s\d+\s|\s\d+$", " <NUM> ", cleaned_text)
    #hypenated words are accounted for by joining them/merging them together
    cleaned_text = re.sub(r'\w+(?:-\w+)+', '', cleaned_text)
    # Substitue for punctuations
    cleaned_text = re.sub(r"(\'t)"," not",cleaned_text)
    cleaned_text = re.sub(r'(i\'m)',"i am",cleaned_text)
    cleaned_text = re.sub(r'(ain\'t)',"am not",cleaned_text)
    cleaned_text = re.sub(r'(\'ll)'," will",cleaned_text)
    cleaned_text = re.sub(r'(\'ve)'," have",cleaned_text)
    cleaned_text = re.sub(r'(\'re)'," are",cleaned_text)
    cleaned_text = re.sub(r'(\'s)'," is",cleaned_text)
    cleaned_text = re.sub(r'(\'re)'," are",cleaned_text)
    #removing repetetive spam
    cleaned_text = re.sub('\!\!+', '!', cleaned_text)
    cleaned_text = re.sub('\*\*+', '*', cleaned_text)
    cleaned_text = re.sub('\>\>+', '>', cleaned_text)
    cleaned_text = re.sub('\<\<+', '<', cleaned_text)
    cleaned_text = re.sub('\?\?+', '?', cleaned_text)
    cleaned_text = re.sub('\!\!+', '!', cleaned_text)
    cleaned_text = re.sub('\.\.+', '.', cleaned_text)
    cleaned_text = re.sub('\,\,+', ',', cleaned_text)
    #matching punctuation characters at end of sentences and padding them
    cleaned_text = re.sub('([;:.,!?()])', r' \1 ', cleaned_text)
    #removing multiple spaces finally
    cleaned_text = re.sub('\s{2,}', ' ', cleaned_text)
    #remove trailing white spaces
    cleaned_text = re.sub(r'\s+$', '', cleaned_text) #important to get rid of empty tokens at the end of list
    #tokenization based on spaces for each line
    # spaces = r"\s+"
    # tokenized_sent = re.split(spaces, cleaned_text)
    return cleaned_text

In [6]:
#preprocess the corpus
europarl_train_clean = [preprocess(line) for line in europarl_train]
europarl_validation_clean = [preprocess(line) for line in europarl_validation]
europarl_validation_clean[:10]

['it is this tendency which presents a problem right now , much more than the scale of the epidemic in quantitative terms .',
 'the truth is , at the present time , there is quite simply no adequate explanation for it , hence the debate which has been initiated on the possible existence of alternative disease transmission routes of which we are , as yet , unaware .',
 'medical and scientific uncertainty still prevails .',
 'in these circumstances , the precautionary principle must be adopted to the full .',
 'all possible resources must be used in order to assess the bse situation in the various countries , particularly including the development and systematic use of fast screening tests .',
 'the development of bse in france raises issues that are not purely medical in the narrowest sense .',
 'this is nothing new .',
 'from the very beginning , it has not been possible to give any explanation for the emergence of this disease and its transmission to human beings , and its spread inte

In [7]:
europarl_test_clean = [preprocess(line) for line in europarl_test]

In [8]:
def create_UNK_corpus(data):
    freq_dict = {}
    for line in data:
        for word in line.split():
            if word in freq_dict:
                freq_dict[word] += 1
            else:
                freq_dict[word] = 1
    freq_dict['<UNK>'] = 0
    #replace all words with frequency less than 3 with <UNK> in freq_dict
    word_list = []
    for word in freq_dict:
        if freq_dict[word] == 1:
            freq_dict["<UNK>"] += freq_dict[word]
            #remove the word from the freq_dict
            word_list.append(word)
    for word in word_list:
        del freq_dict[word]
    #for each sentence in corpus if word not in freq_dict replace with <UNK>
    temp = []
    for line in data:
        for word in line.split():
            if word not in freq_dict:
                line = line.replace(word, "<UNK>")
        temp.append(line)
    return temp
europarl_train_clean = create_UNK_corpus(europarl_train_clean)
europarl_train_clean[:10]


['resumption of the session',
 'i declare resumed the session of the european parliament adjourned on friday <NUM> december 1999 , and i would like once again to wish you a happy new year in the hope that you enjoyed a pleasant festive period .',
 'although , as you will have seen , the <UNK> <UNK> <UNK> failed to materialise , still the people in a number of countries suffered a series of natural disasters that truly were dreadful .',
 'you have requested a debate on this subject in the course of the next few days , during this .',
 "in the meantime , i should like to observe a minute' s silence , as a number of members have requested , on behalf of all the victims concerned , particularly those of the terrible storms , in the various countries of the european union .",
 "please rise , then , for this minute' s silence .",
 " ( the house rose and observed a minute' s silence )",
 'madam president , on a point of order .',
 'you will be aware from the press and television that there ha

In [9]:
import torch
import os
import numpy as np

class Dictionary(object):
    def __init__(self):
        self.word2idx = {}
        self.idx2word = []

    def add_word(self, word):
        if word not in self.word2idx:
            self.idx2word.append(word)
            self.word2idx[word] = len(self.idx2word) - 1
        return self.word2idx[word]

    def __len__(self):
        return len(self.idx2word)

class Corpus(object):
    def __init__(self):
        self.dictionary = Dictionary()
    def get_data(self, train_data, batch_size):
        tokens = 0
        for line in train_data:
            #tokenize the line
            words = ['<START>'] + line.split() + ['<END>']
            tokens += len(words)
            for word in words:
                self.dictionary.add_word(word)

        #tokenizing the content
        ids = torch.LongTensor(tokens) #contains the ids of the words
        token = 0
        for line in train_data:
            words = ['<START>'] + line.split() + ['<END>']
            for word in words:
                ids[token] = self.dictionary.word2idx[word]
                token += 1
        num_batches = ids.size(0) // batch_size
        ids = ids[:num_batches*batch_size]
        ##return reshaped tensor using .view()
        ## the -1-> situation that you don't know how many rows you want but are sure of the number of columns
        return ids.view(batch_size, -1) 

In [10]:
import torch
import torch.nn as nn
import numpy as np
from torch.nn.utils import clip_grad_norm_

# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [11]:
# https://towardsdatascience.com/pytorch-basics-how-to-train-your-neural-net-intro-to-rnn-cb6ebc594677
# Hyper-parameters
embed_size = 128
hidden_size = 1024
num_layers = 2
num_epochs = 2
num_samples = 1000     # number of words to be sampled
batch_size = 30
seq_length = 15 #number of elements in each batch from the dimension
learning_rate = 0.002

#load the corpus (dictionary it is)
corpus = Corpus()
ids = corpus.get_data(europarl_train_clean, batch_size)
vocab_size = len(corpus.dictionary)
output_size = vocab_size
num_batches = ids.size(1) // seq_length
print(vocab_size)
val_ids = corpus.get_data(europarl_validation_clean,batch_size)
val_ids
#print word2idx
#print(corpus.dictionary.word2idx)

9408


tensor([[   0,  119,  124,  ...,  348,   17,  119],
        [  19,   96, 8268,  ..., 1062,   37,    5],
        [   0,    6,   67,  ...,   19,   17,    2],
        ...,
        [ 330,  326, 1294,  ...,   30,  245, 2840],
        [  18,   26,  732,  ...,    2,    3,    9],
        [  10,   17,  313,  ...,  126,  954,  201]])

In [12]:
class LSTMModel(nn.Module):
    def __init__(self, vocab_size, output_size, embed_size, hidden_size, num_layers):
        super(LSTMModel, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True)
        self.linear = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)
    def forward(self, x, hidden):
        # Embed word ids to vectors
        x = self.embed(x)
        
        # Forward propagate LSTM
        out, (h, c) = self.lstm(x, hidden)
        
        # Reshape output to (batch_size*sequence_length, hidden_size)
        out = out.reshape(out.size(0)*out.size(1), out.size(2))
        
        # Decode hidden states of all time steps
        out = self.linear(out)
        # out = self.softmax(out)
        return out, (h, c)
    def init_hidden(self, batch_size):
        # Create two new tensors with sizes n_layers x batch_size x hidden_size,
        # initialized to zero, for hidden state and cell state of LSTM
        return (torch.zeros(num_layers, batch_size, hidden_size).to(device),
                torch.zeros(num_layers, batch_size, hidden_size).to(device))
model = LSTMModel(vocab_size, output_size, embed_size, hidden_size, num_layers)
model = model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
def detach (states):
    return [state.detach() for state in states]

In [13]:
num_epochs = 3
for epoch in range(num_epochs):
    # Set initial hidden and cell states 
    hidden = model.init_hidden(batch_size)
    states = hidden
    for i in range(0, ids.size(1) - seq_length, seq_length):
        # Get mini-batch inputs and targets
        inputs = ids[:, i:i+seq_length].to(device)
        targets = ids[:, (i+1):(i+1)+seq_length].to(device)
        # Forward pass
        states = detach(states)
        outputs, states = model(inputs, states)
        # print(targets.shape)
        # print(outputs.shape)
        # print(outputs)
        loss = criterion(outputs, targets.reshape(-1))
        
        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()
        step = (i+1) // seq_length
        if step % 100 == 0:
            print ('Epoch [{}/{}], Step[{}/{}], Loss: {:.4f}, Perplexity: {:5.2f}'
                   .format(epoch+1, num_epochs, step, num_batches, loss.item(), np.exp(loss.item())))



Epoch [1/3], Step[0/1331], Loss: 9.1515, Perplexity: 9428.30
Epoch [1/3], Step[100/1331], Loss: 5.5918, Perplexity: 268.23
Epoch [1/3], Step[200/1331], Loss: 5.8972, Perplexity: 364.03
Epoch [1/3], Step[300/1331], Loss: 5.1784, Perplexity: 177.39
Epoch [1/3], Step[400/1331], Loss: 5.1839, Perplexity: 178.37
Epoch [1/3], Step[500/1331], Loss: 5.2222, Perplexity: 185.34
Epoch [1/3], Step[600/1331], Loss: 5.0215, Perplexity: 151.63
Epoch [1/3], Step[700/1331], Loss: 5.1533, Perplexity: 172.99
Epoch [1/3], Step[800/1331], Loss: 4.9195, Perplexity: 136.93
Epoch [1/3], Step[900/1331], Loss: 5.1278, Perplexity: 168.65
Epoch [1/3], Step[1000/1331], Loss: 5.1840, Perplexity: 178.40
Epoch [1/3], Step[1100/1331], Loss: 4.8878, Perplexity: 132.66
Epoch [1/3], Step[1200/1331], Loss: 5.0884, Perplexity: 162.13
Epoch [1/3], Step[1300/1331], Loss: 5.0556, Perplexity: 156.90
Epoch [2/3], Step[0/1331], Loss: 5.9643, Perplexity: 389.26
Epoch [2/3], Step[100/1331], Loss: 4.6807, Perplexity: 107.85
Epoch [

In [14]:
#saving best model checkpoint
torch.save(model.state_dict(), 'eng_model.ckpt')

In [15]:

#calculate perplexity of the input sentence using a model
import torch.nn.functional as F

def evaluate(model, sentence, vocab_size, device):
    model.eval()
    with torch.no_grad():
        sentence = preprocess(sentence)
        sentence = sentence.split()

        #replace words in sentence with UNK
        sentence = [word if word in corpus.dictionary.word2idx else '<UNK>' for word in sentence]
        #convert the sentence to ids
        ids = torch.tensor([[corpus.dictionary.word2idx[word] for word in sentence]])
        ids = ids.to(device)
        # Set initial hidden and cell states
        states = (torch.zeros(num_layers, 1, hidden_size).to(device),
                    torch.zeros(num_layers, 1, hidden_size).to(device))
        # Forward pass
        for i in range(len(sentence)):
            states = detach(states)
            _, states = model(ids[:, i:i+1], states)
        # Decode hidden states of all time steps
        out = states[0][0].squeeze(0)
        # for i in range(len(sentence)):
        #     print('{}: {}'.format(sentence[i], F.softmax(out[i], dim=0)))
    
        #print probability of each word in the sentence
        # for i in range(len(sentence)):
        #     print('{}: {}'.format(sentence[i], out[i].item()))
        #predict
        #print perplexity score of the sentence
        score = torch.exp(torch.log(torch.sum(torch.exp(out)))/len(sentence))
        return score.item()
  
evaluate(model, 'What is more, the movement of fish is like CO2 or like capital, it is not European, but world-wide.', vocab_size, device)


1.3778657913208008