In [1]:
import os
# Debugging aid: set before torch is imported (next cell) so the variable is
# already in the environment when CUDA gets initialised.
# NOTE(review): presumably used to make CUDA errors surface at the failing
# call; it likely slows GPU execution, so consider dropping it for real runs.
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import torch.optim as optim
from torchtext.vocab import GloVe
from torchtext.data import Field, BucketIterator
from torchtext.data.metrics import bleu_score
import spacy

In [3]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

device(type='cuda')

In [4]:
# Load the parallel corpus: one sentence per line, trailing whitespace stripped.

with open('trainen.txt', encoding='utf8') as f:
    eng_train = [line.rstrip() for line in f]

with open('trainta.txt', encoding='utf8') as f:
    tamil_train = [line.rstrip() for line in f]

# The "dev" files are used as the held-out test split in this notebook.
with open('deven.txt', encoding='utf8') as f:
    eng_test = [line.rstrip() for line in f]

with open('devta.txt', encoding='utf8') as f:
    tamil_test = [line.rstrip() for line in f]
    
# Pretrained GloVe vectors ('6B' set, 100 dimensions) for the English side;
# attached to the English vocabulary when it is built.
embedding_glove = GloVe(name='6B', dim=100)

# spaCy English pipeline; only its tokenizer is used (see tokenize_en).
spacy_en = spacy.load('en_core_web_sm')

def tokenize_en(text):
    """Split an English string into a list of token strings with spaCy's tokenizer."""
    tokens = []
    for tok in spacy_en.tokenizer(text):
        tokens.append(tok.text)
    return tokens

# Punctuation and symbols handed to the Field as stop words.
stop_words = [
    ',', '.', '?', '!', ')', '(', ':', ']', '[', '$', '#', '&', '%', '--',
]

# English field: spaCy tokenisation, lower-casing and stop-word removal, with
# the literal strings 'sos' / 'eos' registered as start / end tokens.
ENG = Field(
    tokenize=tokenize_en,
    init_token='sos',
    eos_token='eos',
    lower=True,
    stop_words=stop_words,
)

# Tokenise every sentence once, up front.
processed_eng_train = [ENG.preprocess(sentence) for sentence in eng_train]
processed_eng_test = [ENG.preprocess(sentence) for sentence in eng_test]

# The vocabulary comes from the training split only, with GloVe vectors attached.
ENG.build_vocab(processed_eng_train, vectors=embedding_glove)

In [5]:
def preprocess(processed_eng):
    """Numericalise tokenised English sentences.

    Every token is looked up in ``ENG.vocab.stoi``; the resulting ids are
    wrapped with id 2 (used as the SOS token) in front and id 3 (used as the
    EOS token) at the end.

    Args:
        processed_eng: iterable of token lists, one per sentence.

    Returns:
        A list holding one list of ints per input sentence.
    """
    stoi = ENG.vocab.stoi
    return [
        [2] + [stoi[token] for token in tokenized_sentence] + [3]
        for tokenized_sentence in processed_eng
    ]

# X_train and X_test are lists of lists with the integer sequences for a given sentence.
# Each sequence starts with id 2 and ends with id 3, as added by preprocess().
X_train = preprocess(processed_eng_train)
X_test = preprocess(processed_eng_test)

In [6]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Tamil side: word-level Keras tokenizer fitted on the training sentences only.
TAM = Tokenizer()
TAM.fit_on_texts(tamil_train)

# Convert sentences to id sequences and append id 0 as the EOS marker.
# The lists are built directly rather than via a side-effect comprehension
# assigned to `_`, which would overwrite IPython's "last output" variable.
# NOTE(review): relies on the Keras Tokenizer never assigning index 0 to a
# real word - confirm against the Keras documentation.
Y_train = [sequence + [0] for sequence in TAM.texts_to_sequences(tamil_train)]
Y_test = [sequence + [0] for sequence in TAM.texts_to_sequences(tamil_test)]

Using TensorFlow backend.


In [7]:
# Register the two special target tokens in the id -> word table.
# Id 0 is the EOS id appended to every target sequence.
TAM.index_word[0] = 'EOS'
# SOS takes the first id after the real words. The id is derived from
# word_index (never modified in this notebook) instead of len(index_word), so
# re-running this cell overwrites the same entry rather than adding another
# 'SOS' and silently growing the target vocabulary.
TAM.index_word[len(TAM.word_index) + 1] = 'SOS'

In [9]:
# Vocabulary sizes. target_vocab_size is later passed to the decoder as its
# number of output classes, and (target_vocab_size - 1) is used as the SOS id.
source_vocab_size = len(ENG.vocab)
target_vocab_size = len(TAM.index_word)
print(source_vocab_size, target_vocab_size, sep='\n')

9723
18670


In [18]:
# Pretrained Tamil embedding matrix: read from the .npy file and wrapped as a
# torch tensor in one step.
tamil_embedding = torch.from_numpy(np.load('tam_embed.npy'))

In [12]:
class Encoder(nn.Module):
    """Single-layer LSTM encoder over pretrained source-language embeddings."""

    def __init__(self, embed_size, hidden_size, ENG):
        """Build the embedding lookup and the LSTM.

        Args:
            embed_size: input size of the LSTM; should equal the width of
                ``ENG.vocab.vectors``.
            hidden_size: size of the LSTM hidden and cell states.
            ENG: object exposing the pretrained matrix as ``ENG.vocab.vectors``.
        """
        super().__init__()
        self.embed_size, self.hidden_size = embed_size, hidden_size
        pretrained_vectors = ENG.vocab.vectors
        # from_pretrained is called with its defaults, as in the original code.
        self.embed = nn.Embedding.from_pretrained(pretrained_vectors)
        self.lstm = nn.LSTM(input_size=embed_size, hidden_size=hidden_size)

    def forward(self, x, enc_hidden):
        """Encode one sentence.

        Args:
            x: 1-D tensor of Ts token ids.
            enc_hidden: ``(h0, c0)`` tuple for the LSTM.

        Returns:
            ``(enc_output, enc_hidden)`` exactly as produced by the LSTM; the
            embedded sentence is fed with a batch axis of size 1 at position 1.
        """
        lstm_input = self.embed(x).unsqueeze(1)  # Ts x 1 x embed_size
        return self.lstm(lstm_input, enc_hidden)

In [16]:
class AttnDecoder(nn.Module):
    """LSTM decoder with dot-product attention over the encoder outputs.

    At every step the previous hidden state is scored against each encoder
    output, the softmax-weighted sum of encoder outputs (the context) is
    concatenated with the embedding of the current input token, and the result
    is fed through one LSTM step followed by a linear + log-softmax layer.
    """

    def __init__(self, hidden_size, n_classes, embeddings):
        """Build the decoder layers.

        Args:
            hidden_size: LSTM hidden size; must equal the encoder's hidden size
                because hidden states are dotted with encoder outputs.
            n_classes: size of the output (target) vocabulary.
            embeddings: 2-D tensor of pretrained target embeddings
                (rows = token ids, columns = embedding width).
        """
        super(AttnDecoder, self).__init__()
        self.embed_size = embeddings.shape[1]
        self.hidden_size = hidden_size
        # Cast to float32 so the embeddings always match the dtype of the LSTM
        # and linear layers (e.g. a float64 .npy file would otherwise clash).
        # This is a no-op when the tensor is already float32.
        self.embed = nn.Embedding.from_pretrained(embeddings.float())
        self.lstm = nn.LSTM(self.embed_size + hidden_size, hidden_size)
        self.fc = nn.Linear(hidden_size, n_classes)
        self.logsoftmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden, enc_hidden_states):
        """Run one decoding step.

        Args:
            input: tensor holding a single token id.
            hidden: ``(h, c)`` tuple, each of size 1 x 1 x hidden_size.
            enc_hidden_states: Ts x hidden_size encoder outputs
                (Ts is the source length).

        Returns:
            ``(output, hidden)`` where output is 1 x n_classes log-probabilities
            and hidden is the updated ``(h, c)`` tuple.
        """
        embedded = self.embed(input).view(1, 1, -1)   # 1 x 1 x embed_size

        # Dot product of every encoder state with the current hidden state.
        # reshape/view(-1) keep the scores 1-D even when Ts == 1, where a bare
        # .squeeze() would collapse them to a 0-d tensor.
        query = hidden[0].reshape(-1, 1)                           # hidden_size x 1
        attn_scores = torch.mm(enc_hidden_states, query).view(-1)  # Ts

        # torch.softmax is used rather than the module-level alias `F`, so the
        # decoder keeps working even if `F` is rebound later in the notebook.
        attn_weights = torch.softmax(attn_scores, dim=0)

        # Weighted sum of the encoder states: 1 x 1 x hidden_size.
        context = torch.bmm(attn_weights.view(1, 1, -1), enc_hidden_states.unsqueeze(0))

        # 1 x (embed_size + hidden_size)
        new_input = torch.cat((embedded[0], context[0]), 1)

        output, hidden = self.lstm(new_input.unsqueeze(0), hidden)

        output = self.logsoftmax(self.fc(output[0]))

        return output, hidden

In [14]:
def loss_per_pair(source_sentence, target_sentence, enc_obj, dec_obj, enc_optimiser, dec_optimiser, loss_fn):
    """Run one optimisation step on a single (source, target) sentence pair.

    The source is encoded, then the decoder is unrolled greedily: its own
    previous prediction is fed back as the next input, starting from the SOS id
    (``target_vocab_size - 1``). The loss is accumulated against the reference
    token at each step, and decoding stops early once id 0 (EOS) is predicted.

    Args:
        source_sentence: 1-D long tensor of source token ids.
        target_sentence: 1-D long tensor of target token ids.
        enc_obj, dec_obj: encoder / decoder modules. ``enc_obj.hidden_size`` is read.
        enc_optimiser, dec_optimiser: optimisers for the two modules.
        loss_fn: criterion applied to (1 x n_classes log-probs, 1-element target).

    Returns:
        The accumulated loss divided by the target length, as a Python float
        (0.0 for an empty target, in which case no update is performed).
    """
    Tt = target_sentence.size(-1)
    if Tt == 0:
        # Nothing to predict: avoids calling .backward() on an int and
        # dividing by zero.
        return 0.0

    state_shape = (1, 1, enc_obj.hidden_size)
    enc_hidden = (torch.zeros(state_shape, device=device), torch.zeros(state_shape, device=device))

    enc_optimiser.zero_grad()
    dec_optimiser.zero_grad()

    loss_val = 0

    enc_output, enc_hidden = enc_obj(source_sentence, enc_hidden)  # Ts x 1 x hidden_size
    enc_hidden_states = enc_output.squeeze(1)                      # Ts x hidden_size, loop-invariant

    dec_input = torch.tensor([[target_vocab_size-1]], device=device)  # SOS token

    # The decoder starts from the encoder's final (h, c) state.
    dec_hidden = enc_hidden

    for i in range(Tt):
        dec_output, dec_hidden = dec_obj(dec_input, dec_hidden, enc_hidden_states)
        _, index = dec_output.topk(1)
        dec_input = index.squeeze().detach()

        # Slice the reference token straight from the tensor. This works on
        # CPU as well as GPU, unlike torch.cuda.LongTensor.
        target_word = target_sentence[i].view(1).to(dec_output.device)

        loss_val += loss_fn(dec_output, target_word)

        if dec_input.item() == 0:  # model predicted EOS
            break

    loss_val.backward()

    # Clip the L1 norm of the gradients to 0.5 for each module.
    nn.utils.clip_grad_norm_(enc_obj.parameters(), 0.5, 1)
    nn.utils.clip_grad_norm_(dec_obj.parameters(), 0.5, 1)

    enc_optimiser.step()
    dec_optimiser.step()

    return loss_val.item()/Tt

In [15]:
def train_model(sources, targets, enc_obj, dec_obj, max_epochs=200, lr=0.001, momentum=0.9, tol=1e-4):
    """Train the encoder/decoder pair one sentence at a time with SGD.

    Args:
        sources: list of source id sequences (lists of ints).
        targets: list of target id sequences, aligned with ``sources``.
        enc_obj, dec_obj: encoder / decoder modules, updated in place.
        max_epochs: maximum number of passes over the data.
        lr, momentum: SGD hyper-parameters used for both optimisers.
        tol: training stops once the relative change of the epoch loss
            falls below this value.

    Raises:
        ValueError: if there are no sentence pairs or the two lists differ
            in length.
    """
    num_sentences = len(sources)
    if num_sentences == 0:
        raise ValueError("train_model needs at least one sentence pair")
    if num_sentences != len(targets):
        raise ValueError("sources and targets must have the same length")

    loss_fn = nn.NLLLoss()
    enc_optimiser = optim.SGD(enc_obj.parameters(), lr=lr, momentum=momentum)
    dec_optimiser = optim.SGD(dec_obj.parameters(), lr=lr, momentum=momentum)

    old_loss = np.inf
    order = list(range(num_sentences))
    # Progress is reported roughly every 10% of an epoch. max(1, ...) prevents
    # a modulo-by-zero when there are fewer than 10 sentences.
    report_every = max(1, int(num_sentences*0.1))

    for epoch in range(max_epochs):

        # Shuffle the visiting order only. Indexing the Python lists through
        # the permutation avoids building a NumPy array from ragged sequences.
        np.random.shuffle(order)

        running_loss = 0.0

        for i, idx in enumerate(order):
            # device-aware construction keeps the CPU fallback working
            source_sentence = torch.tensor(sources[idx], dtype=torch.long, device=device)
            target_sentence = torch.tensor(targets[idx], dtype=torch.long, device=device)

            loss = loss_per_pair(source_sentence, target_sentence, enc_obj, dec_obj, enc_optimiser, dec_optimiser, loss_fn)
            running_loss += loss

            if i % report_every == 0:
                print("Epoch", epoch+1, ":", round(100.0*i/num_sentences, 1), '% done')
                print("Current loss:", running_loss)

        # Parameter checksums, printed to check which weights change between epochs.
        print('\nEpoch', epoch+1,"\n")
        print("Encoder obj lstm wts sum=", torch.sum(enc_obj.lstm.weight_ih_l0))
        print("Encoder obj lstm bias sum=", torch.sum(enc_obj.lstm.bias_ih_l0))
        print('Decoder obj emb wts sum=', torch.sum(dec_obj.embed.weight.data))
        print('Decoder obj lstm wts sum=', torch.sum(dec_obj.lstm.weight_ih_l0.data))
        print('Decoder obj lstm bias sum=', torch.sum(dec_obj.lstm.bias_ih_l0.data))
        print('Decoder obj linear wts sum=', torch.sum(dec_obj.fc.weight.data))
        print('Decoder obj linear bias sum=', torch.sum(dec_obj.fc.bias.data))

        # Relative-change test written as a product so a zero epoch loss
        # cannot raise ZeroDivisionError. It is never true on the first epoch,
        # where old_loss is inf.
        if abs(running_loss-old_loss) < tol * abs(running_loss):
            print('Converged')
            break

        old_loss = running_loss

    print("Finished Training")

In [19]:
# embedding_size matches the 100-d GloVe vectors loaded earlier; hidden_size is
# shared by both modules because the decoder starts from the encoder's final state.
embedding_size = 100
hidden_size = 150
encoder = Encoder(embedding_size, hidden_size, ENG).to(device)
decoder = AttnDecoder(hidden_size, target_vocab_size, tamil_embedding).to(device)
# Long-running cell: trains both modules in place (the recorded run was interrupted).
train_model(X_train, Y_train, encoder, decoder)

Epoch 1 : 0.0 % done
Current loss: 9.824352900187174


KeyboardInterrupt: 

In [16]:
# Persist the parameters of both modules (state dicts only, not the module objects).
torch.save(encoder.state_dict(), 'q4_encoder_weights.pt')
torch.save(decoder.state_dict(), 'q4_decoder_weights.pt')

In [43]:
def test_loss(sources, targets, enc_obj, dec_obj):
    """Compute the greedy-decoding NLL loss over a set of sentence pairs.

    Mirrors the decoding loop of ``loss_per_pair`` without gradient tracking or
    parameter updates. At every step the loss against the reference token is
    accumulated first, and decoding then stops if id 0 (EOS) was predicted.

    Args:
        sources: list of source id sequences (lists of ints).
        targets: list of target id sequences, aligned with ``sources``.
        enc_obj, dec_obj: encoder / decoder modules.

    Returns:
        The SUM over sentences of (accumulated loss / target length), as a float.
    """
    loss_fn = nn.NLLLoss()
    num_sentences = len(sources)
    total_loss = 0.0

    with torch.no_grad():
        for i in range(num_sentences):
            # device-aware construction keeps the CPU fallback working
            source_sentence = torch.tensor(sources[i], dtype=torch.long, device=device)
            target_sentence = torch.tensor(targets[i], dtype=torch.long, device=device)
            Tt = target_sentence.size(-1)
            if Tt == 0:
                continue  # nothing to score; also avoids dividing by zero

            state_shape = (1, 1, enc_obj.hidden_size)
            enc_hidden = (torch.zeros(state_shape, device=device), torch.zeros(state_shape, device=device))

            enc_output, enc_hidden = enc_obj(source_sentence, enc_hidden)
            enc_hidden_states = enc_output.squeeze(1)

            dec_input = torch.tensor([[target_vocab_size-1]], device=device)  # SOS token

            # first hidden state of decoder is made the final hidden state of the encoder
            dec_hidden = enc_hidden

            # Accumulated as a float so an early stop can never leave an int
            # on which .item() would fail.
            loss_val = 0.0

            for j in range(Tt):
                dec_output, dec_hidden = dec_obj(dec_input, dec_hidden, enc_hidden_states)
                _, index = dec_output.topk(1)
                dec_input = index.squeeze().detach()

                # Score this step BEFORE checking for EOS, as the training
                # loss does.
                target_word = target_sentence[j].view(1)
                loss_val += loss_fn(dec_output, target_word).item()

                if dec_input.item() == 0:
                    break

            total_loss += loss_val/Tt

    return total_loss

In [44]:
# this test loss is for 500 sentences
# NOTE: test_loss returns the sum of the per-sentence length-normalised losses,
# not their mean - divide by len(X_test) for a per-sentence figure.
test_loss(X_test, Y_test, encoder, decoder)

KeyboardInterrupt: 

In [45]:
def eval_bleu(enc_obj, dec_obj, source_sentence, target, target_vocab_dict, max_len=60, max_n=4):
    """Greedily translate one sentence and score it with BLEU.

    Args:
        enc_obj, dec_obj: encoder / decoder modules.
        source_sentence: 1-D long tensor of source token ids.
        target: reference translation as a list of token strings.
        target_vocab_dict: mapping from target id to token string.
        max_len: maximum number of decoding steps (previously hard-coded to 60).
        max_n: highest n-gram order passed to ``bleu_score``, with uniform
            weights. The default of 4 reproduces the previous call.
            NOTE(review): scratch cells in this notebook show bleu_score
            returning 0.0 for an identical 3-token pair with the defaults;
            a smaller max_n may be wanted for short sentences.

    Returns:
        The BLEU score of the predicted tokens against ``target``.
    """
    with torch.no_grad():

        state_shape = (1, 1, enc_obj.hidden_size)
        enc_hidden = (torch.zeros(state_shape, device=device), torch.zeros(state_shape, device=device))

        enc_output, enc_hidden = enc_obj(source_sentence, enc_hidden)
        enc_hidden_states = enc_output.squeeze(1)

        dec_input = torch.tensor([[target_vocab_size-1]], device=device)  # SOS

        # The decoder starts from the encoder's final (h, c) state.
        dec_hidden = enc_hidden
        predicted = []

        for step in range(max_len):
            dec_output, dec_hidden = dec_obj(dec_input, dec_hidden, enc_hidden_states)
            _, index = dec_output.topk(1)
            token_id = index.item()
            if token_id == 0:
                # EOS predicted: stop without adding it to the hypothesis
                break
            predicted.append(target_vocab_dict[token_id])

            dec_input = index.squeeze().detach()

    # One candidate sentence with a single reference.
    return bleu_score([predicted], [[target]], max_n=max_n, weights=[1.0 / max_n] * max_n)

In [26]:
def evaluate_model(enc_obj, dec_obj, source_test, target_test, target_vocab_dict):
    """Return the average sentence-level BLEU score over the given test data.

    Args:
        enc_obj, dec_obj: encoder / decoder modules.
        source_test: list of source id sequences (lists of ints).
        target_test: list of target id sequences. The last id of each sequence
            (the appended EOS) is dropped before scoring.
        target_vocab_dict: mapping from target id to token string.

    Returns:
        Mean of the per-sentence BLEU scores, or 0.0 when there are no sentences.
    """
    num_sentences = len(source_test)
    if num_sentences == 0:
        return 0.0  # avoid dividing by zero on an empty test set

    total_bleu = 0
    for i in range(num_sentences):
        # device-aware construction keeps the CPU fallback working
        source_sentence = torch.tensor(source_test[i], dtype=torch.long, device=device)
        target = [target_vocab_dict[x] for x in target_test[i][:-1]]
        total_bleu += eval_bleu(enc_obj, dec_obj, source_sentence, target, target_vocab_dict)

    return total_bleu/num_sentences

In [27]:
# Average sentence-level BLEU on the test split; TAM.index_word maps ids back to words.
evaluate_model(encoder, decoder, X_test, Y_test, TAM.index_word)

KeyboardInterrupt: 

In [14]:
# Shorthand for the id -> word table used by the BLEU experiments below.
# WARNING: this rebinds `F`, which was imported as torch.nn.functional at the top
# of the notebook; AttnDecoder.forward calls F.softmax and will fail if the
# decoder is run after this cell. Restart the kernel (or re-run the import cell)
# before training or evaluating again.
F = TAM.index_word

In [27]:
# Identical 3-token candidate and reference built from vocabulary ids 2-4:
# list1 is a corpus with one candidate sentence, list2 a corpus with one
# reference list for that candidate.
list1 = [[F[2],F[3],F[4]]]
list2 = [[[F[2],F[3],F[4]]]]

In [28]:
# Inspect the candidate corpus.
list1

[['நீங்கள்', 'ஒரு', 'இல்லை']]

In [29]:
# Inspect the reference corpus.
list2

[[['நீங்கள்', 'ஒரு', 'இல்லை']]]

In [31]:
# Candidate and reference are identical, yet the recorded score is 0.0.
# NOTE(review): presumably because the default 4-gram BLEU finds no 4-grams in
# a 3-token sentence - confirm against the torchtext bleu_score documentation.
bleu_score(list1, list2)

0.0

In [36]:
# Sanity check with an identical 4-token English pair (recorded score: 1.0).
l1 = [['The','brown','fox','jumped']]
l2 = [[['The','brown','fox','jumped']]]
bleu_score(l1,l2)

1.0

In [41]:
# Attempt at a unigram+bigram BLEU by zeroing the 3-/4-gram weights; the
# recorded score is still 0.0.
# NOTE(review): zero weights do not appear to switch off the higher-order
# n-gram precisions - max_n=2 with weights=[0.5, 0.5] may be what is wanted;
# confirm against the torchtext bleu_score documentation.
l1 = [['I','am','a','boy']]
l2 = [[['I','am','a','girl']]]
bleu_score(l1,l2, weights=(0.5,0.5,0,0))

0.0

In [42]:
from nltk.translate.bleu_score import sentence_bleu
# Cross-check with NLTK: the candidate matches the first reference exactly.
# The second reference previously read ['this', 'is' 'test']; without the comma
# Python joins the adjacent literals into the single token 'istest'.
reference = [['this', 'is', 'a', 'test'], ['this', 'is', 'test']]
candidate = ['this', 'is', 'a', 'test']
score = sentence_bleu(reference, candidate)
print(score)

1.0


In [43]:
# Same 3-token Tamil pair as before, in NLTK's layout: a single candidate
# sentence and a list of reference sentences.
cand = [F[2],F[3],F[4]]
ref = [[F[2],F[3],F[4]]]

In [44]:
# Inspect the candidate sentence.
cand

['நீங்கள்', 'ஒரு', 'இல்லை']

In [45]:
# Inspect the reference list.
ref

[['நீங்கள்', 'ஒரு', 'இல்லை']]

In [49]:
# Up-to-trigram BLEU (4-gram weight set to 0) on the identical 3-token pair;
# the recorded NLTK score is 1.0.
sentence_bleu(ref, cand, weights=(1/3, 1/3, 1/3, 0))

1.0