In [1]:
import os
# Debugging aid: CUDA_LAUNCH_BLOCKING=1 asks the CUDA runtime to run kernels
# synchronously, so a device-side failure is reported at the call that caused it.
# It is set in this first cell, before `import torch` runs in the next one.
# NOTE(review): this presumably slows GPU execution; consider removing it once
# the training loop is stable.
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import torch.optim as optim
from torchtext.vocab import GloVe
from torchtext.data import Field, BucketIterator
from torchtext.data.metrics import bleu_score
import spacy

In [3]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cuda')

In [8]:
# Load the parallel English/Tamil corpora: one sentence per line, with trailing
# whitespace (including the newline) stripped from every line.

with open('trainen.txt', encoding='utf8') as f:
    eng_train = [line.rstrip() for line in f]

with open('trainta.txt', encoding='utf8') as f:
    tamil_train = [line.rstrip() for line in f]

with open('deven.txt', encoding='utf8') as f:
    eng_test = [line.rstrip() for line in f]

with open('devta.txt', encoding='utf8') as f:
    tamil_test = [line.rstrip() for line in f]

# Pre-trained 100-dimensional GloVe ("6B") word vectors for the English side.
embedding_glove = GloVe(name='6B', dim=100)

# spaCy English pipeline; its tokenizer is what tokenize_en relies on.
spacy_en = spacy.load('en_core_web_sm')

def tokenize_en(text):
    """Tokenize an English string with the spaCy tokenizer.

    Args:
        text: raw sentence.

    Returns:
        List with the text of every token, in order.
    """
    pieces = []
    for tok in spacy_en.tokenizer(text):
        pieces.append(tok.text)
    return pieces

# English field: spaCy tokenization plus lower-casing, with the literal strings
# 'sos' / 'eos' registered as the start- and end-of-sentence tokens.
ENG = Field(
    tokenize=tokenize_en,
    init_token='sos',
    eos_token='eos',
    lower=True,
)

# Run the field's preprocessing on every raw sentence of both splits.
processed_eng_train = [ENG.preprocess(sentence) for sentence in eng_train]
processed_eng_test = [ENG.preprocess(sentence) for sentence in eng_test]

# The vocabulary is built from the training split only and paired with GloVe vectors.
ENG.build_vocab(processed_eng_train, vectors=embedding_glove)

In [9]:
def preprocess(processed_eng, field=None):
    """Numericalize tokenized sentences and wrap each one in SOS/EOS ids.

    Args:
        processed_eng: iterable of tokenized sentences (each a list of token strings).
        field: object exposing ``init_token``, ``eos_token`` and ``vocab.stoi``.
            Defaults to the module-level ``ENG`` field.

    Returns:
        A list with one integer sequence per sentence:
        ``[sos_id, id(token_1), ..., id(token_n), eos_id]``.
    """
    if field is None:
        field = ENG

    stoi = field.vocab.stoi
    # Look the special ids up instead of hard-coding 2 and 3, so the function
    # stays correct if the field's special tokens are ever changed.
    sos_idx = stoi[field.init_token]
    eos_idx = stoi[field.eos_token]

    return [
        [sos_idx, *(stoi[token] for token in tokenized_sentence), eos_idx]
        for tokenized_sentence in processed_eng
    ]

# X_train and X_test are lists of lists: one integer sequence per English sentence,
# in the same order as processed_eng_train / processed_eng_test.
X_train = preprocess(processed_eng_train)
X_test = preprocess(processed_eng_test)

In [10]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Same thing for the Tamil sentences: a Keras Tokenizer fitted on the training
# split maps every known word to an integer id.
TAM = Tokenizer()
TAM.fit_on_texts(tamil_train)

# Append id 0 to every target sequence as the EOS marker.
# (Keras word indices start at 1, so 0 does not collide with a real word.)
# The sequences are built directly rather than with a side-effect comprehension
# assigned to `_`, which would shadow IPython's last-output variable.
Y_train = [sequence + [0] for sequence in TAM.texts_to_sequences(tamil_train)]
Y_test = [sequence + [0] for sequence in TAM.texts_to_sequences(tamil_test)]

Using TensorFlow backend.


In [11]:
# Vocabulary sizes. The target size reserves one extra slot for id 0,
# which is the value appended to the Tamil sequences as EOS.
source_vocab_size = len(ENG.vocab)
target_vocab_size = 1 + len(TAM.word_index)
print(source_vocab_size, target_vocab_size, sep='\n')

9736
18669


In [12]:
class Encoder(nn.Module):
    """Single-layer LSTM encoder that consumes one source token per call.

    The embedding table is created from the pre-trained vectors stored on the
    field's vocabulary (``ENG.vocab.vectors``); the LSTM's hidden size equals
    ``embed_size``.
    """

    def __init__(self, embed_size, ENG):
        """
        Args:
            embed_size: embedding dimension, also used as the LSTM hidden size.
            ENG: field-like object whose ``vocab.vectors`` holds the embedding matrix.
        """
        super().__init__()
        self.embed_size = embed_size
        pretrained_vectors = ENG.vocab.vectors
        self.embed = nn.Embedding.from_pretrained(pretrained_vectors)
        self.lstm = nn.LSTM(input_size=embed_size, hidden_size=embed_size)

    def forward(self, x, enc_hidden):
        """Encode a single token.

        Args:
            x: tensor holding one token id.
            enc_hidden: ``(h, c)`` LSTM state from the previous step.

        Returns:
            ``(enc_output, enc_hidden)`` exactly as produced by the LSTM.
        """
        # The LSTM expects (seq_len, batch, features); here both leading dims are 1.
        step_input = self.embed(x).view(1, 1, -1)
        return self.lstm(step_input, enc_hidden)

In [41]:
class Decoder(nn.Module):
    """Single-layer LSTM decoder that emits log-probabilities over the target vocabulary.

    One call processes one target token: embed -> LSTM step -> linear -> log-softmax.
    """

    def __init__(self, embed_size, n_classes):
        """
        Args:
            embed_size: embedding dimension, also used as the LSTM hidden size.
            n_classes: number of target ids (size of the output distribution).
        """
        super().__init__()
        self.embed_size = embed_size
        self.embed = nn.Embedding(num_embeddings=n_classes, embedding_dim=embed_size)
        self.lstm = nn.LSTM(input_size=embed_size, hidden_size=embed_size)
        self.fc = nn.Linear(in_features=embed_size, out_features=n_classes)
        self.logsoftmax = nn.LogSoftmax(dim=1)

    def forward(self, x, dec_hidden):
        """Decode a single step.

        Args:
            x: tensor holding one target token id.
            dec_hidden: ``(h, c)`` LSTM state from the previous step.

        Returns:
            ``(log_probs, dec_hidden)`` where ``log_probs`` has shape ``(1, n_classes)``.
        """
        step_input = self.embed(x).view(1, 1, -1)
        lstm_out, dec_hidden = self.lstm(step_input, dec_hidden)
        # lstm_out[0] drops the length-1 sequence dimension, leaving (1, embed_size).
        logits = self.fc(lstm_out[0])
        return self.logsoftmax(logits), dec_hidden

In [51]:
def loss_per_pair(source_sentence, target_sentence, enc_obj, dec_obj, enc_optimiser, dec_optimiser, loss_fn):
    """Run one optimisation step on a single (source, target) sentence pair.

    The source is fed through the encoder one token at a time. The decoder then
    starts from the SOS id and is fed its own greedy prediction at every step
    (no teacher forcing). Decoding stops after ``Tt`` steps or as soon as the
    decoder predicts the EOS id 0, whichever comes first.

    Args:
        source_sentence: 1-D long tensor of source token ids.
        target_sentence: 1-D long tensor of target token ids (EOS id 0 last).
        enc_obj: encoder module exposing ``embed_size``.
        dec_obj: decoder module exposing ``embed_size``.
        enc_optimiser: optimiser for the encoder parameters.
        dec_optimiser: optimiser for the decoder parameters.
        loss_fn: criterion applied to ``(log_probs, target_id)`` at each step.

    Returns:
        The summed step losses divided by the target length ``Tt`` (a float).
        Returns 0.0 without touching the models when the target is empty.
    """
    # Use the device the model lives on rather than a hard-coded CUDA tensor
    # type, so the same code runs on the CPU fallback as well.
    model_device = next(enc_obj.parameters()).device
    source_sentence = source_sentence.to(model_device)
    target_sentence = target_sentence.to(model_device)

    Ts = source_sentence.size(-1)
    Tt = target_sentence.size(-1)
    if Tt == 0:
        # Nothing to score: avoids calling .backward() on an int and dividing by zero.
        return 0.0

    enc_hidden = (
        torch.zeros(1, 1, enc_obj.embed_size, device=model_device),
        torch.zeros(1, 1, enc_obj.embed_size, device=model_device),
    )

    enc_optimiser.zero_grad()
    dec_optimiser.zero_grad()

    for i in range(Ts):
        _enc_output, enc_hidden = enc_obj(source_sentence[i], enc_hidden)

    # NOTE(review): 2 is the SOS id on the source side; on the target side it is
    # presumably an ordinary Tamil word id — confirm whether a dedicated start id is wanted.
    dec_input = torch.tensor([[2]], device=model_device)  # SOS token

    # Only the encoder's final hidden state is handed over; the cell state restarts at zero.
    dec_hidden = (enc_hidden[0], torch.zeros(1, 1, dec_obj.embed_size, device=model_device))

    loss_val = 0
    for i in range(Tt):
        dec_output, dec_hidden = dec_obj(dec_input, dec_hidden)
        _top_value, index = dec_output.topk(1)
        dec_input = index.squeeze().detach()

        # view(1) gives the (batch,) shape the criterion expects without a
        # host round-trip through .item().
        loss_val = loss_val + loss_fn(dec_output, target_sentence[i].view(1))

        if dec_input.item() == 0:  # the decoder predicted EOS
            break

    loss_val.backward()

    enc_optimiser.step()
    dec_optimiser.step()

    return loss_val.item() / Tt

In [15]:
def train_model(sources, targets, enc_obj, dec_obj, max_epochs=100, lr=0.001, momentum=0.9, tol=1e-3):
    """Train the encoder/decoder pair with per-sentence SGD updates.

    Every epoch visits the sentence pairs in a fresh random order and calls
    ``loss_per_pair`` once per pair. Training stops after ``max_epochs`` epochs,
    or earlier when the relative change of the epoch loss drops below ``tol``.

    Args:
        sources: list of source id sequences (lists of ints, possibly ragged).
        targets: list of target id sequences aligned with ``sources``.
        enc_obj: encoder module.
        dec_obj: decoder module.
        max_epochs: maximum number of passes over the data.
        lr: SGD learning rate used for both optimisers.
        momentum: SGD momentum used for both optimisers.
        tol: relative-change threshold of the convergence test.

    Raises:
        ValueError: if ``sources`` is empty.
    """
    num_sentences = len(sources)
    if num_sentences == 0:
        raise ValueError("train_model needs at least one sentence pair")

    loss_fn = nn.NLLLoss()
    enc_optimiser = optim.SGD(enc_obj.parameters(), lr=lr, momentum=momentum)
    dec_optimiser = optim.SGD(dec_obj.parameters(), lr=lr, momentum=momentum)

    # Build tensors on the model's device instead of a hard-coded CUDA type.
    model_device = next(enc_obj.parameters()).device

    # Report roughly every 10% of an epoch; max(1, ...) keeps the modulo valid
    # for data sets with fewer than ten sentences.
    report_every = max(1, int(num_sentences * 0.1))

    # Shuffle an index array rather than the data: converting ragged Python
    # lists with np.array() fails on recent NumPy and copies the corpus every epoch.
    order = np.arange(num_sentences)
    old_loss = np.inf

    for epoch in range(max_epochs):

        # new random visiting order at the start of every epoch
        np.random.shuffle(order)

        running_loss = 0.0

        for i, idx in enumerate(order):
            source_sentence = torch.tensor(sources[idx], dtype=torch.long, device=model_device)
            target_sentence = torch.tensor(targets[idx], dtype=torch.long, device=model_device)

            loss = loss_per_pair(source_sentence, target_sentence, enc_obj, dec_obj, enc_optimiser, dec_optimiser, loss_fn)
            running_loss += loss

            if i % report_every == 0:
                print("Epoch", epoch+1, ":", (i/report_every)*10,'% done')
                print("Current loss:", running_loss)

        # Relative-change test written as a product so that a zero epoch loss
        # cannot trigger a division by zero.
        if abs(running_loss - old_loss) < tol * running_loss:
            print('Converged')
            break

        old_loss = running_loss

    print("Finished Training")

In [52]:
# Must equal the dimensionality of the pre-trained GloVe vectors (dim=100):
# Encoder feeds those embeddings straight into an LSTM whose input size is embed_size.
embedding_size = 100
encoder = Encoder(embedding_size, ENG).to(device)
decoder = Decoder(embedding_size, target_vocab_size).to(device)
# Trains both models in place. This is the long-running cell; the saved output
# shows the run was interrupted by hand during the first epoch.
train_model(X_train, Y_train, encoder, decoder)

Epoch 1 : 0.0 % done
Current loss: 9.92596664428711


KeyboardInterrupt: 

In [47]:
def eval_bleu(enc_obj, dec_obj, source_sentence, target, target_vocab_dict, max_len=60):
    """Greedy-decode one source sentence and score it against its reference with BLEU.

    Args:
        enc_obj: trained encoder module exposing ``embed_size``.
        dec_obj: trained decoder module exposing ``embed_size``.
        source_sentence: 1-D long tensor of source token ids.
        target: reference translation as a list of target words.
        target_vocab_dict: mapping from target id to word, used to turn
            predicted ids into tokens.
        max_len: maximum number of decoding steps (defaults to 60).

    Returns:
        The value of ``bleu_score`` for the single (prediction, reference) pair.
    """
    model_device = next(enc_obj.parameters()).device
    source_sentence = source_sentence.to(model_device)

    with torch.no_grad():

        Ts = source_sentence.size(-1)
        enc_hidden = (
            torch.zeros(1, 1, enc_obj.embed_size, device=model_device),
            torch.zeros(1, 1, enc_obj.embed_size, device=model_device),
        )

        # Only the final encoder state is needed. The former fixed-size buffer
        # of per-step outputs was never read and overflowed for sources longer
        # than 60 tokens, so it is gone.
        for i in range(Ts):
            _enc_output, enc_hidden = enc_obj(source_sentence[i], enc_hidden)

        dec_input = torch.tensor([[2]], device=model_device)  # SOS

        dec_hidden = (enc_hidden[0], torch.zeros(1, 1, dec_obj.embed_size, device=model_device))

        predicted = []

        for _step in range(max_len):
            dec_output, dec_hidden = dec_obj(dec_input, dec_hidden)
            _top_value, index = dec_output.topk(1)
            token_id = index.item()
            if token_id == 0:  # EOS: stop without emitting a token
                break
            predicted.append(target_vocab_dict[token_id])

            dec_input = index.squeeze().detach()

    # bleu_score takes a corpus of candidates and, per candidate, a list of references.
    return bleu_score([predicted], [[target]])

In [39]:
def evaluate_model(enc_obj, dec_obj, source_test, target_test, target_vocab_dict):
    """Return the average sentence-level BLEU score over a test set.

    Args:
        enc_obj: trained encoder module.
        dec_obj: trained decoder module.
        source_test: list of source id sequences.
        target_test: list of target id sequences aligned with ``source_test``;
            the last id of each sequence (EOS) is dropped before scoring.
        target_vocab_dict: mapping from target id to word.

    Returns:
        Mean of ``eval_bleu`` over all sentences, or 0.0 for an empty test set.
    """
    num_sentences = len(source_test)
    if num_sentences == 0:
        # An empty test set would otherwise end in a division by zero.
        return 0.0

    # Build tensors on the model's device instead of a hard-coded CUDA type.
    model_device = next(enc_obj.parameters()).device

    total_bleu = 0.0
    for i, source_ids in enumerate(source_test):
        source_sentence = torch.tensor(source_ids, dtype=torch.long, device=model_device)
        reference = [target_vocab_dict[x] for x in target_test[i][:-1]]
        total_bleu += eval_bleu(enc_obj, dec_obj, source_sentence, reference, target_vocab_dict)

    return total_bleu / num_sentences

In [50]:
# Average sentence-level BLEU on the dev split. TAM.index_word supplies the
# id -> word mapping used to turn both references and predictions into tokens.
evaluate_model(encoder, decoder, X_test, Y_test, TAM.index_word)

0.0