In [1]:
# https://www.guru99.com/seq2seq-model.html
from __future__ import unicode_literals, print_function, division
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import nltk
import numpy as np
import pandas as pd
import string
import spacy
import os
import re
import random
import joblib
from nltk.translate.bleu_score import corpus_bleu
from nltk.translate.bleu_score import sentence_bleu

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
# Declaring our model
class WordLSTM(nn.Module):
    """Word-level LSTM language model: embedding -> stacked LSTM -> dropout -> linear.

    Args:
        n_hidden: size of the LSTM hidden state.
        n_layers: number of stacked LSTM layers.
        drop_prob: dropout probability, used between LSTM layers and on the LSTM output.
        lr: learning rate; only stored on the instance, not used by this class.
        n_vocab: vocabulary size. When ``None`` (the default) the module-level
            ``vocab_size`` variable is read at construction time.
        embed_dim: width of the embedding vectors.
    """

    def __init__(self, n_hidden=256, n_layers=4, drop_prob=0.3, lr=0.001,
                 n_vocab=None, embed_dim=200):
        super().__init__()

        # Fall back to the notebook-level global so existing `WordLSTM()` calls keep working.
        if n_vocab is None:
            n_vocab = vocab_size

        self.drop_prob = drop_prob
        self.n_layers = n_layers
        self.n_hidden = n_hidden
        self.lr = lr

        # Attribute names are kept unchanged: they are the keys of the saved state_dict.
        self.emb_layer = nn.Embedding(n_vocab, embed_dim)

        ## define the LSTM
        self.lstm = nn.LSTM(embed_dim, n_hidden, n_layers,
                            dropout=drop_prob, batch_first=True)

        ## define a dropout layer
        self.dropout = nn.Dropout(drop_prob)

        ## define the fully-connected layer
        self.fc = nn.Linear(n_hidden, n_vocab)

    def forward(self, x, hidden):
        """Run one forward pass.

        Args:
            x: integer token ids; the LSTM is batch-first, so the leading
               dimension is the batch.
            hidden: ``(h, c)`` tuple as returned by :meth:`init_hidden` or a
               previous call.

        Returns:
            ``(out, hidden)`` where ``out`` has one row of vocabulary scores per
            input position (all positions flattened into the first dimension)
            and ``hidden`` is the updated ``(h, c)`` tuple.
        """
        ## pass input through embedding layer
        embedded = self.emb_layer(x)

        ## Get the outputs and the new hidden state from the lstm
        lstm_output, hidden = self.lstm(embedded, hidden)

        ## pass through a dropout layer
        out = self.dropout(lstm_output)

        # flatten (batch, seq) so the linear layer scores every position
        out = out.reshape(-1, self.n_hidden)

        ## put "out" through the fully-connected layer
        out = self.fc(out)

        # return the final output and the hidden state
        return out, hidden

    def init_hidden(self, batch_size):
        """Return zero-initialised ``(h, c)`` tensors of shape
        ``(n_layers, batch_size, n_hidden)``.

        The tensors are created with the dtype and device of the model's own
        parameters, so they match the model wherever it lives (the previous
        version moved them to CUDA whenever a GPU was present, even for a model
        kept on the CPU).
        """
        weight = next(self.parameters()).detach()
        shape = (self.n_layers, batch_size, self.n_hidden)
        return (weight.new_zeros(shape), weight.new_zeros(shape))

# Vocabulary size of the pretrained French language model; WordLSTM() reads this
# module-level value when it is constructed.
vocab_size = 17706
french = WordLSTM()
# Move the model to the selected device. Unlike an unconditional .cuda(), this
# also works on machines without a GPU and matches map_location below.
french.to(device)

# NOTE(review): torch.load unpickles the file - only load checkpoints from a trusted source.
model = torch.load('../input/lang-models/q1_french.pt', map_location=device)
french.load_state_dict(model)
french.eval()

In [3]:
# Vocabulary size of the pretrained English language model; WordLSTM() reads this
# module-level value when it is constructed.
vocab_size = 9330
english = WordLSTM()
# Move the model to the selected device. Unlike an unconditional .cuda(), this
# also works on machines without a GPU and matches map_location below.
english.to(device)

# NOTE(review): torch.load unpickles the file - only load checkpoints from a trusted source.
model = torch.load('../input/lang-models/q1_english.pt', map_location=device)
english.load_state_dict(model)
english.eval()

In [4]:
# HYPER PARAMETERS

# tokens seen fewer than this many times are replaced by 'unk'
threshold = 2

# number of passes over the training pairs
epochs = 5

# maximum number of tokens kept per sentence
max_length = 25

# teacher-forcing ratio
# NOTE(review): this global is not passed to the model in the visible code;
# Seq2Seq.forward uses its own default of 0.5.
teacher_forcing_ratio = 0.5

# token ids the decoder uses as start-of-sentence / end-of-sentence markers
SOS_token, EOS_token = 0, 1

In [5]:
# Download the small spaCy pipelines for English and French (shell commands run by the notebook).
# NOTE(review): the downloads are unpinned and are repeated every time this cell runs.
!python3 -m spacy download en_core_web_sm
!python3 -m spacy download fr_core_news_sm

# Load both pipelines; the visible cells use only their `.tokenizer`.
spacy_fr = spacy.load('fr_core_news_sm')
spacy_en = spacy.load('en_core_web_sm')

In [18]:
# Read the raw English and French training corpora.
# `with` closes each handle even if reading fails, and the explicit encoding
# keeps accented characters decoding the same way regardless of the platform
# default (assumes the corpus files are UTF-8 - TODO confirm).
english_file_name = '../input/dataset/intro-to-nlp-assign3/ted-talks-corpus/train.en'
with open(english_file_name, encoding='utf-8') as f:
    en_text = f.read()

french_file_name = '../input/dataset/intro-to-nlp-assign3/ted-talks-corpus/train.fr'
with open(french_file_name, encoding='utf-8') as f:
    fr_text = f.read()

In [19]:
# Lower-case both corpora, split them into sentences and tokenize every sentence.
# NOTE(review): the two corpora are sentence-split independently and then paired
# by position with zip(); confirm the sentences really line up (zip also stops
# silently at the shorter list).
en_text = en_text.lower()
fr_text = fr_text.lower()
en_sentences = nltk.tokenize.sent_tokenize(en_text)
fr_sentences = nltk.tokenize.sent_tokenize(fr_text)

source, target = [], []
for en_sent, fr_sent in zip(en_sentences, fr_sentences):
    en_tokens = [tok.text for tok in spacy_en.tokenizer(en_sent)]
    fr_tokens = [tok.text for tok in spacy_fr.tokenizer(fr_sent)]
    source.append(en_tokens)
    target.append(fr_tokens)

In [20]:
# Count token frequencies per language, then rebuild every sentence without
# punctuation, replacing tokens seen fewer than `threshold` times by 'unk'.
def _token_counts(sentences):
    """Return a dict mapping each token to its number of occurrences."""
    counts = {}
    for sentence in sentences:
        for tok in sentence:
            counts[tok] = counts.get(tok, 0) + 1
    return counts


def _drop_punct_and_mask_rare(sentence, counts):
    """Drop punctuation tokens and map tokens rarer than `threshold` to 'unk'."""
    # `in string.punctuation` is a substring test against the punctuation string.
    return [tok if counts[tok] >= threshold else 'unk'
            for tok in sentence
            if tok not in string.punctuation]


en_freq = _token_counts(source)
fr_freq = _token_counts(target)

en_sentences = []
fr_sentences = []
for en_lst, fr_lst in zip(source, target):
    en_sentences.append(_drop_punct_and_mask_rare(en_lst, en_freq))
    fr_sentences.append(_drop_punct_and_mask_rare(fr_lst, fr_freq))

In [21]:
# Truncate every sentence to `max_length` tokens, collect the two vocabularies
# and build the token <-> index lookup tables.
# No 'sos'/'eos' tokens are appended to the sentences here; they are only
# reserved in the vocabularies.
#
# The reserved tokens get fixed ids ('sos' -> 0, 'eos' -> 1, 'unk' -> 2) and the
# remaining tokens are sorted. Enumerating the raw set instead gave ids that
# depended on set iteration order: they were unrelated to SOS_token / EOS_token
# (0 and 1) and could change between kernel sessions, which would make a saved
# model incompatible with a rebuilt vocabulary.
pads = ['sos', 'eos', 'unk']


def _index_vocab(vocab, reserved):
    """Return (token2index, index2token) with `reserved` first and the rest sorted."""
    ordered = list(reserved) + sorted(vocab.difference(reserved))
    token2index = {tok: idx for idx, tok in enumerate(ordered)}
    index2token = dict(enumerate(ordered))
    return token2index, index2token


# here we are storing words in src and trg
src = []
trg = []
for en_sent, fr_sent in zip(en_sentences, fr_sentences):
    src.append(en_sent[:max_length])
    trg.append(fr_sent[:max_length])

en_vocab = set(pads)
for en_lst in src:
    en_vocab.update(en_lst)

fr_vocab = set(pads)
for fr_lst in trg:
    fr_vocab.update(fr_lst)

en_token2index, en_index2token = _index_vocab(en_vocab, pads)
fr_token2index, fr_index2token = _index_vocab(fr_vocab, pads)

In [22]:
# Map every token of the truncated sentences to its vocabulary index.
# `pairs` holds one [source_ids, target_ids] entry per sentence pair and shares
# its inner lists with `source` / `target`.
source = [[en_token2index[en_token] for en_token in en_sent] for en_sent in src]
target = [[fr_token2index[fr_token] for fr_token in fr_sent] for fr_sent in trg]
pairs = [[en_lst, fr_lst] for en_lst, fr_lst in zip(source, target)]

In [23]:
# sanity check: the three collections should have the same number of entries
print(*map(len, (src, trg, pairs)))

In [12]:
# ENCODER DECODER SEQ2SEQ
class Encoder(nn.Module):
    """GRU encoder over embedded source tokens, processed with batch size 1.

    Args:
        input_dim: size of the source vocabulary.
        hidden_dim: size of the GRU hidden state.
        embbed_dim: width of the embedding vectors.
        num_layers: number of stacked GRU layers.
    """

    def __init__(self, input_dim, hidden_dim, embbed_dim, num_layers):
        super(Encoder, self).__init__()

        # set the encoder input dimension, embed dimension, hidden dimension and number of layers
        self.input_dim = input_dim
        self.embbed_dim = embbed_dim
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers

        # embedding: token id -> vector of size embbed_dim
        self.embedding = nn.Embedding(input_dim, self.embbed_dim)
        # GRU: embbed_dim inputs, hidden_dim state, num_layers stacked layers
        self.gru = nn.GRU(self.embbed_dim, self.hidden_dim, num_layers=self.num_layers)

    def forward(self, src, hidden=None):
        """Encode one token or a whole sentence.

        Args:
            src: tensor of token ids. A single id behaves exactly as before;
                several ids are treated as consecutive time steps of one
                sentence (previously they were flattened into a single,
                wrongly sized step).
            hidden: optional GRU state from a previous call. ``None`` starts
                from a zero state, which is what the GRU does when no state is
                given.

        Returns:
            ``(outputs, hidden)`` as returned by the GRU.
        """
        # (steps, batch=1, embbed_dim): one row per token id in `src`
        embedded = self.embedding(src).view(-1, 1, self.embbed_dim)
        outputs, hidden = self.gru(embedded, hidden)
        return outputs, hidden

class Decoder(nn.Module):
    """Single-step GRU decoder that returns log-probabilities over the target vocabulary.

    Layers: embedding -> ReLU -> GRU -> linear -> log-softmax.
    """

    def __init__(self, output_dim, hidden_dim, embbed_dim, num_layers):
        super().__init__()

        # keep the configuration on the instance
        self.embbed_dim = embbed_dim
        self.hidden_dim = hidden_dim
        self.output_dim = output_dim
        self.num_layers = num_layers

        # layers are created in the same order as before so a seeded
        # initialisation stays reproducible
        self.embedding = nn.Embedding(output_dim, embbed_dim)
        self.gru = nn.GRU(embbed_dim, hidden_dim, num_layers=num_layers)
        self.out = nn.Linear(hidden_dim, output_dim)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        """Decode one time step.

        Args:
            input: token id(s) for this step; reshaped to (1, batch_size).
            hidden: current GRU hidden state.

        Returns:
            ``(prediction, hidden)``: log-probabilities for the step and the
            updated hidden state.
        """
        step_ids = input.view(1, -1)
        activated = F.relu(self.embedding(step_ids))
        gru_out, hidden = self.gru(activated, hidden)
        # gru_out[0] is the only time step
        scores = self.out(gru_out[0])
        return self.softmax(scores), hidden

class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that translates one sentence at a time.

    Args:
        encoder: module exposing ``embedding`` and ``gru`` sub-modules (as the
            ``Encoder`` class in this file does).
        decoder: module called as ``decoder(input, hidden)`` and exposing
            ``output_dim``.
        device: device on which intermediate tensors are created.
        MAX_LENGTH: kept for interface compatibility; stored but not used by
            ``forward``.
    """

    def __init__(self, encoder, decoder, device, MAX_LENGTH=max_length):
        super().__init__()

        # initialize the encoder and decoder
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        self.max_length = MAX_LENGTH

    def forward(self, source, target, teacher_forcing_ratio=0.5):
        """Decode up to ``len(target)`` steps for one source sentence.

        Args:
            source: source token ids, one per row.
            target: target token ids of shape (target_length, batch_size); used
                for its shape and, when teacher forcing triggers, as the next
                decoder input.
            teacher_forcing_ratio: probability, per step, of feeding the gold
                token instead of the model's own prediction.

        Returns:
            Tensor of shape (target_length, batch_size, vocab_size) with the
            decoder's log-probabilities; rows after an early stop stay zero.

        Raises:
            ValueError: if ``source`` contains no tokens.
        """
        if source.size(0) == 0:
            # previously this surfaced as an unbound `encoder_hidden` variable
            raise ValueError("Seq2Seq.forward needs at least one source token")

        batch_size = target.shape[1]
        target_length = target.shape[0]
        vocab_size = self.decoder.output_dim

        # holds the predicted outputs for every decoding step
        outputs = torch.zeros(target_length, batch_size, vocab_size, device=self.device)

        # Run the whole sentence through the encoder's layers in a single GRU
        # call so the recurrent state is carried from token to token; encoding
        # each token from a fresh state would make the result depend on the last
        # token only.
        embedded = self.encoder.embedding(source.view(-1, 1))
        _, encoder_hidden = self.encoder.gru(embedded)

        # use the encoder's final hidden state as the decoder's initial state
        decoder_hidden = encoder_hidden.to(self.device)

        # start decoding from the SOS token
        decoder_input = torch.tensor([SOS_token], device=self.device)

        for t in range(target_length):
            decoder_output, decoder_hidden = self.decoder(decoder_input, decoder_hidden)
            outputs[t] = decoder_output
            teacher_force = random.random() < teacher_forcing_ratio
            if teacher_force:
                # feed the gold token for this step
                decoder_input = target[t]
            else:
                # feed the model's own best guess; detach so gradients do not
                # flow through the sampled token id
                _, topi = decoder_output.topk(1)
                decoder_input = topi.detach()
                if decoder_input.item() == EOS_token:
                    break

        return outputs

In [13]:
def calcError(model, input_tensor, target_tensor, model_optimizer, criterion):
    """Run one optimisation step on a single sentence pair.

    Args:
        model: callable as ``model(input_tensor, target_tensor)`` returning one
            row of scores per target step.
        input_tensor: source token ids.
        target_tensor: target token ids, indexable per step.
        model_optimizer: optimizer over the model's parameters.
        criterion: loss applied to ``(output[step], target_tensor[step])``.

    Returns:
        The summed per-step loss divided by the number of steps, as a float.
        Returns ``0.0`` without updating the model when there are no steps
        (previously the integer ``0`` reached ``.backward()`` and raised).
    """
    model_optimizer.zero_grad()

    output = model(input_tensor, target_tensor)
    num_iter = output.size(0)
    if num_iter == 0:
        return 0.0

    # accumulate the loss of every predicted step against the expected token
    loss = 0
    for ot in range(num_iter):
        loss += criterion(output[ot], target_tensor[ot])

    loss.backward()
    model_optimizer.step()
    return loss.item() / num_iter


def trainModel(model, pairs, num_iteration):
    """Train `model` for one pass over the first `num_iteration` pairs.

    Args:
        model: the seq2seq model to train (put into training mode here).
        pairs: list of ``[source_ids, target_ids]`` entries.
        num_iteration: number of pairs to visit, starting from the first.

    Returns:
        The same `model` object, after training.

    Pairs with an empty source or target are skipped. Any other failure on a
    pair is reported and that pair is skipped, so one bad sentence does not
    abort the run. The average printed every 300 iterations covers only the
    pairs that were actually trained on; a failing pair no longer re-adds the
    previous pair's loss.
    """
    model.train()
    optimizer = optim.SGD(model.parameters(), lr=0.01)
    criterion = nn.NLLLoss()
    report_every = 300
    window_loss = 0.0
    window_count = 0

    for step in range(1, num_iteration + 1):
        src_ids, trg_ids = pairs[step - 1]

        if len(src_ids) > 0 and len(trg_ids) > 0:
            input_tensor = torch.tensor(src_ids, dtype=torch.long, device=device).view(-1, 1)
            target_tensor = torch.tensor(trg_ids, dtype=torch.long, device=device).view(-1, 1)
            try:
                window_loss += calcError(model, input_tensor, target_tensor, optimizer, criterion)
                window_count += 1
            except Exception as err:
                # best effort: report and move on instead of hiding the failure
                print('Skipping pair %d: %s' % (step - 1, err))

        if step % report_every == 0:
            avarage_loss = window_loss / window_count if window_count else float('nan')
            window_loss = 0.0
            window_count = 0
            print('Iteration Number: %d     Avg Loss: %.4f' % (step, avarage_loss))

    return model

In [14]:
# Model sizes are taken from the vocabularies built above.
input_size = len(en_vocab)
output_size = len(fr_vocab)
print('Input : {} Output : {}'.format(input_size, output_size))

embed_size = 256
hidden_size = 512
num_layers = 1
# total number of sentences; one training step is run per sentence pair
num_iteration = len(pairs)

# create the encoder-decoder model
encoder = Encoder(input_size, hidden_size, embed_size, num_layers)
decoder = Decoder(output_size, hidden_size, embed_size, num_layers)

model = Seq2Seq(encoder, decoder, device).to(device)

# NOTE(review): these two assignments do not change the encoder. state_dict()
# returns a dictionary, and assigning to a key of that returned dictionary only
# rebinds the dictionary entry; the module's parameters are untouched. Both
# lines also target the encoder, so the second would override the first.
# NOTE(review): with the sizes used here a GRU weight_ih_l0 would be
# (3 * hidden_size, embed_size) while an LSTM weight_hh is
# (4 * n_hidden, n_hidden), so a real copy would need compatible shapes -
# confirm what transfer was intended.
encoder.state_dict()['gru.weight_ih_l0'] = english.state_dict()['lstm.weight_hh_l3']
encoder.state_dict()['gru.weight_ih_l0'] = french.state_dict()['lstm.weight_hh_l3']

# print the model architecture
print(encoder)
print(decoder)

# one trainModel call is one pass over all pairs
for ep in range(epochs):
    model = trainModel(model, pairs, num_iteration)
    print('Epoch {} done'.format(ep+1))

In [15]:
# Save only the trained parameters (the state_dict), not the module object itself.
torch.save(model.state_dict(), 'q2.1_train_french.pt')

In [24]:
def french_sentence(input_sentence):
    """Translate a sequence of English token ids into French tokens.

    Args:
        input_sentence: sequence of source-vocabulary indices.

    Returns:
        List of predicted French tokens. Decoding stops at the first predicted
        EOS id, which is reported as ``'<EOS>'``. At most
        ``len(input_sentence)`` tokens are produced, because the placeholder
        target has that length. An empty input yields an empty list.
    """
    if len(input_sentence) == 0:
        return []

    input_tensor = torch.tensor(input_sentence, dtype=torch.long, device=device).view(-1, 1)
    # Placeholder target: only its length matters, because a teacher forcing
    # ratio of 0.0 means the target values are never fed to the decoder.
    output_tensor = torch.zeros(len(input_sentence), 1, dtype=torch.long, device=device)

    # inference only: no need to record gradients
    with torch.no_grad():
        output = model(input_tensor, output_tensor, 0.0)

    decoded_words = []
    for ot in range(output.size(0)):
        _, topi = output[ot].topk(1)
        token_id = topi[0].item()

        if token_id == EOS_token:
            decoded_words.append('<EOS>')
            break
        decoded_words.append(fr_index2token[token_id])

    return decoded_words

In [25]:
def write_in_file(file_name, trg, pairs):
    """Translate every pair, score the translations with BLEU and write a report.

    Args:
        file_name: path of the text file to create.
        trg: list of reference sentences, each a list of tokens.
        pairs: list of ``[source_ids, target_ids]``; only the source ids are used.

    The file holds the corpus BLEU score on the first line, followed by one line
    per sentence: the translation, five spaces, and its sentence BLEU score.
    """
    corpus_references = []
    corpus_candidates = []
    sentence_score = []
    translations = []

    for i in range(len(trg)):
        # NLTK expects a *list of references* per hypothesis. Passing the token
        # list directly made it treat every token string as its own reference.
        references = [trg[i]]
        candidate = french_sentence(pairs[i][0])
        translations.append(candidate)
        try:
            score = sentence_bleu(references, candidate)
        except Exception:
            # best effort: a sentence that cannot be scored counts as 0.0
            score = 0.0
        sentence_score.append(score)
        corpus_references.append(references)
        corpus_candidates.append(candidate)

    corpus_score = corpus_bleu(corpus_references, corpus_candidates)

    lines = [str(corpus_score)]
    for tokens, score in zip(translations, sentence_score):
        lines.append(" ".join(tokens) + '     ' + str(score))

    # explicit encoding so accented French characters are written consistently
    with open(file_name, 'w', encoding='utf-8') as out_file:
        out_file.write('\n'.join(lines) + '\n')

In [26]:
# Score the model on the training sentences and write the report file.
write_in_file('2019101056_MT2_train.txt',trg,pairs)

In [None]:
def generate_for_test(english_filename, french_filename):
    """Build reference sentences and index pairs from a held-out corpus.

    Args:
        english_filename: path of the English text file.
        french_filename: path of the French text file.

    Returns:
        ``(trg, pairs)``: ``trg`` is a list of French token lists with
        out-of-vocabulary tokens replaced by ``'unk'``; ``pairs`` is a list of
        ``[english_ids, french_ids]`` using the training vocabularies.

    The files named by the arguments are read. The previous version ignored its
    parameters and opened the module-level training file names instead.
    """
    # assumes the corpus files are UTF-8 - TODO confirm
    with open(english_filename, encoding='utf-8') as f:
        en_text = f.read()
    with open(french_filename, encoding='utf-8') as f:
        fr_text = f.read()

    # tokenize french and english language sentences
    en_sentences = nltk.tokenize.sent_tokenize(en_text.lower())
    fr_sentences = nltk.tokenize.sent_tokenize(fr_text.lower())

    en_unk = en_token2index['unk']

    # NOTE(review): unlike the training preprocessing, punctuation is kept and
    # sentences are not truncated to max_length here - confirm this is intended.
    trg = []
    pairs = []
    for en_sent, fr_sent in zip(en_sentences, fr_sentences):
        en_lst = [en_token2index.get(tok.text, en_unk)
                  for tok in spacy_en.tokenizer(en_sent)]

        # unknown French tokens are replaced by 'unk' in both the token and id lists
        fr_tokens = [tok.text if tok.text in fr_token2index else 'unk'
                     for tok in spacy_fr.tokenizer(fr_sent)]
        fr_lst = [fr_token2index[token] for token in fr_tokens]

        trg.append(fr_tokens)
        pairs.append([en_lst, fr_lst])

    return trg, pairs

In [None]:
# Generate the report file for the test dataset.
# Note: this rebinds `trg` and `pairs`, replacing the training data held in those names.
english_filename = '../input/dataset/intro-to-nlp-assign3/ted-talks-corpus/test.en'
french_filename = '../input/dataset/intro-to-nlp-assign3/ted-talks-corpus/test.fr'
trg, pairs = generate_for_test(english_filename, french_filename)
write_in_file('2019101056_MT2_test.txt',trg,pairs)