In [1]:
import os

# Ask CUDA to run kernels synchronously so that a device-side failure is
# reported at the Python line that triggered it. This cell runs before the
# cell that imports torch.
os.environ.update(CUDA_LAUNCH_BLOCKING='1')

In [2]:
import torch
import torchvision
import torchvision.transforms as transforms
import tarfile
import pandas as pd
import re
from torch.utils.data import Dataset, DataLoader, ConcatDataset, random_split
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt
import torch.optim as optim
from sklearn.metrics import confusion_matrix
from sklearn.decomposition import PCA
from io import StringIO
from PIL import Image
from torchtext.vocab import GloVe
from torchtext.data import Field, BucketIterator
from torchtext.data.metrics import bleu_score
import spacy

In [3]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
# The bare expression on the last line displays the chosen device.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cuda')

In [4]:
# class DatasetClass(Dataset):
    
#     def __init__(self, source, target):
        
#         self.source = source
#         self.target = target
#         self.size = len(source)
      
#     def __len__(self):
        
#         return self.size
    
#     def __getitem__(self, idx):
        
#         return self.source[idx], self.target[idx]
    
# def train_test_loader(source_train, target_train, source_test, target_test, num_workers=0, batch_size=32):

#     train_data = DatasetClass(source_train, target_train)
#     test_data = DatasetClass(source_test, target_test)

#     trainloader = DataLoader(train_data, batch_size=batch_size, shuffle=True, num_workers=num_workers)
#     testloader = DataLoader(test_data, batch_size=batch_size, shuffle=True, num_workers=num_workers)
    
#     return trainloader, testloader

In [5]:
class EncoderRNN(nn.Module):
    """Single-layer LSTM encoder that consumes one source token per call.

    The embedding table is created from the pretrained vectors attached to the
    source vocabulary (``ENG.vocab.vectors``) via
    ``nn.Embedding.from_pretrained``.
    """

    def __init__(self, input_size, hidden_size, ENG):
        """
        Args:
            input_size: Source vocabulary size. Not used by the layers (the
                pretrained vector table already determines the embedding
                shape); kept so existing callers keep working.
            hidden_size: Width of the LSTM hidden and cell states.
            ENG: Object exposing ``vocab.vectors``, the 2-D tensor of
                pretrained embeddings.
        """
        super().__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding.from_pretrained(ENG.vocab.vectors)
        # Size the LSTM input from the embedding width instead of assuming it
        # equals hidden_size, so the two can be chosen independently.
        self.lstm = nn.LSTM(self.embedding.embedding_dim, hidden_size)

    def forward(self, input, hidden):
        """Encode a single token index.

        Args:
            input: Tensor holding one token index.
            hidden: ``(h, c)`` tuple, each reshaped by the LSTM as (1, 1, hidden_size).

        Returns:
            ``(output, hidden)`` exactly as returned by ``nn.LSTM``.
        """
        # One time step, batch of one: (seq_len=1, batch=1, embedding_dim).
        embedded = self.embedding(input).view(1, 1, -1)
        return self.lstm(embedded, hidden)

    def initHidden(self):
        """Return zero-filled ``(h0, c0)`` on the same device as this module."""
        # Follow the module's own device rather than a notebook-level global,
        # so the states always match the weights they are used with.
        module_device = self.embedding.weight.device
        h0 = torch.zeros(1, 1, self.hidden_size, device=module_device)
        c0 = torch.zeros(1, 1, self.hidden_size, device=module_device)
        return (h0, c0)

In [6]:
class DecoderRNN(nn.Module):
    """Single-layer LSTM decoder that emits one target token distribution per call."""

    def __init__(self, hidden_size, output_size):
        """
        Args:
            hidden_size: Width of the token embedding and of the LSTM states.
            output_size: Number of rows in the embedding table and number of
                scores produced by the output layer.
        """
        super().__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(output_size, hidden_size)
        self.lstm = nn.LSTM(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        """Advance the decoder by one step.

        Args:
            input: Tensor holding the index of the previous token.
            hidden: ``(h, c)`` tuple passed to the LSTM.

        Returns:
            ``(log_probs, hidden)`` where ``log_probs`` has shape
            (1, output_size) and ``hidden`` is the updated LSTM state.
        """
        # Embed the token as a (1, 1, hidden_size) step and apply ReLU.
        token_vec = F.relu(self.embedding(input).view(1, 1, -1))
        lstm_out, next_hidden = self.lstm(token_vec, hidden)
        # lstm_out[0] drops the time axis, leaving (1, hidden_size).
        log_probs = self.softmax(self.out(lstm_out[0]))
        return log_probs, next_hidden

    def initHidden(self, enc_output):
        """Build the initial state: the given encoder output as h, zeros as c."""
        hidden_state = enc_output.to(device)
        cell_state = torch.zeros(1, 1, self.hidden_size, device=device)
        return (hidden_state, cell_state)

In [7]:
def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=60):
    """Run one optimisation step on a single (source, target) sentence pair.

    The source is fed to the encoder token by token; the decoder is then
    unrolled without teacher forcing (its own greedy prediction is the next
    input) and the summed loss is back-propagated through both networks.

    Args:
        input_tensor: 1-D tensor of source token indices.
        target_tensor: 1-D tensor of target token indices.
        encoder: Module providing ``initHidden()``, ``hidden_size`` and a
            ``forward(token, hidden)`` returning ``(output, hidden)``.
        decoder: Module providing ``initHidden(enc_output)`` and a
            ``forward(token, hidden)`` returning ``(log_probs, hidden)``.
        encoder_optimizer: Optimizer over the encoder parameters.
        decoder_optimizer: Optimizer over the decoder parameters.
        criterion: Loss taking ``(log_probs, target_index_tensor)``.
        max_length: Unused. Sources of any length are accepted; the argument
            is kept so existing calls remain valid.

    Returns:
        The summed loss divided by the target length, as a Python float.
        ``0.0`` is returned without updating anything when the target is empty.
    """
    input_length = input_tensor.size(-1)
    target_length = target_tensor.size(-1)

    # An empty target gives no loss term to back-propagate and would divide by
    # zero in the returned average.
    if target_length == 0:
        return 0.0

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    encoder_hidden = encoder.initHidden()

    # Only the last encoder output is needed to seed the decoder, so keep just
    # that instead of a fixed-size buffer (which overflowed for long sources).
    # A zero vector stands in when the source is empty.
    last_encoder_output = torch.zeros(encoder.hidden_size, device=input_tensor.device)
    for ei in range(input_length):
        # this line calls forward() in the encoder
        encoder_output, encoder_hidden = encoder(input_tensor[ei], encoder_hidden)
        last_encoder_output = encoder_output[0, 0]

    # Index 2 is fed to the decoder as its start-of-sentence token.
    decoder_input = torch.tensor([[2]], device=last_encoder_output.device)

    # The decoder starts from the final encoder output as hidden state; the
    # cell state is whatever decoder.initHidden() pairs it with.
    decoder_hidden = decoder.initHidden(last_encoder_output.view(1, 1, -1))

    loss = 0

    # Without teacher forcing: use the decoder's own prediction as next input.
    for di in range(target_length):
        decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
        topv, topi = decoder_output.topk(1)
        decoder_input = topi.squeeze().detach()  # detach from history as input

        # Build the size-1 target on the output's device. This works on CPU as
        # well as GPU, unlike torch.cuda.LongTensor.
        trg = target_tensor[di].reshape(1).to(decoder_output.device)

        loss += criterion(decoder_output, trg)

        # 0 is the EOS token in the target language: stop once it is predicted.
        if decoder_input.item() == 0:
            break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

In [8]:
def trainIters(encoder, decoder, sources, targets, device, learning_rate=0.001, max_epochs=100, tol=1e-3):
    """Train the encoder/decoder pair one sentence at a time with SGD.

    Each epoch visits every sentence pair once in a freshly shuffled order and
    accumulates the per-pair loss returned by ``train``. Training stops early
    when the relative change of that epoch total drops below ``tol``.

    Args:
        encoder: Encoder module passed through to ``train``.
        decoder: Decoder module passed through to ``train``.
        sources: List of lists, each holding the integer encoding of a source
            sentence.
        targets: List of lists, each holding the integer encoding of the
            matching target sentence.
        device: Device the per-sentence tensors are moved to.
        learning_rate: SGD learning rate for both optimizers.
        max_epochs: Upper bound on the number of epochs.
        tol: Relative change in epoch loss below which training is considered
            converged.
    """
    n_pairs = len(sources)
    if n_pairs == 0:
        # Nothing to iterate over; the convergence test would otherwise
        # divide by a zero epoch loss.
        print("Finished Training")
        return

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate, momentum=0.9)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate, momentum=0.9)

    criterion = nn.NLLLoss()

    old_loss = np.inf
    # Shuffle an index array rather than converting the ragged sentence lists
    # to NumPy arrays; ragged conversion is rejected by recent NumPy versions.
    order = np.arange(n_pairs)
    # Report roughly every 10% of an epoch; never 0, so small datasets work.
    report_every = max(1, n_pairs // 10)

    for epoch in range(max_epochs):

        # new visiting order at the start of every epoch
        np.random.shuffle(order)

        running_loss = 0.0

        for i, pair_index in enumerate(order):
            pair_index = int(pair_index)
            input_tensor = torch.LongTensor(sources[pair_index]).to(device)
            target_tensor = torch.LongTensor(targets[pair_index]).to(device)

            loss = train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion)
            running_loss += loss

            if i % report_every == 0:
                print("Epoch", epoch+1, ":", round(100.0 * i / n_pairs, 1), '% done')
                print("Current loss:", running_loss)

        # A zero epoch loss cannot improve further; otherwise compare the
        # relative change against the tolerance.
        if running_loss == 0 or abs(running_loss-old_loss)/running_loss < tol:
            print('Converged')
            break

        old_loss = running_loss

    print("Finished Training")

In [9]:
# Reading files: one sentence per line, English and Tamil files line-aligned.

def _read_stripped_lines(path):
    """Return every line of a UTF-8 text file with trailing whitespace removed."""
    with open(path, encoding='utf8') as handle:
        return [line.rstrip() for line in handle.readlines()]

eng_train = _read_stripped_lines('trainen.txt')
tamil_train = _read_stripped_lines('trainta.txt')
eng_test = _read_stripped_lines('deven.txt')
tamil_test = _read_stripped_lines('devta.txt')

# Pretrained 100-dimensional GloVe ("6B") vectors for the English side.
embedding_glove = GloVe(name='6B', dim=100)

spacy_en = spacy.load('en_core_web_sm')

def tokenize_en(text):
    """Split an English string into token strings with the spaCy tokenizer."""
    tokens = []
    for tok in spacy_en.tokenizer(text):
        tokens.append(tok.text)
    return tokens

# English field: lower-cased spaCy tokens, with 'sos' / 'eos' as the
# start and end tokens.
ENG = Field(tokenize=tokenize_en, init_token='sos', eos_token='eos', lower=True)
processed_eng_train = [ENG.preprocess(sentence) for sentence in eng_train]
processed_eng_test = [ENG.preprocess(sentence) for sentence in eng_test]

# The vocabulary is built from the training split only and carries the GloVe
# vectors for its entries.
ENG.build_vocab(processed_eng_train, vectors=embedding_glove)
embedding_trained = nn.Embedding.from_pretrained(ENG.vocab.vectors)

In [10]:
def preprocess(processed_eng, field=None):
    """Return the numericalized version of the tokenized sentences.

    Every sentence becomes a list of vocabulary indices, wrapped in the
    indices of the field's start and end tokens. The indices are looked up in
    the vocabulary instead of being hard-coded.

    Args:
        processed_eng: Iterable of tokenized sentences (lists of token strings).
        field: Object exposing ``vocab.stoi``, ``init_token`` and
            ``eos_token``. Defaults to the notebook-level ``ENG`` field.

    Returns:
        A list with one list of integer indices per input sentence.
    """
    if field is None:
        field = ENG

    stoi = field.vocab.stoi
    sos_index = stoi[field.init_token]  # first element is the SOS token
    eos_index = stoi[field.eos_token]   # last element is the EOS token

    return [
        [sos_index, *(stoi[token] for token in tokenized_sentence), eos_index]
        for tokenized_sentence in processed_eng
    ]

# Integer-encode both English splits: X_train and X_test are lists of lists,
# one integer sequence per sentence.
X_train, X_test = (
    preprocess(split) for split in (processed_eng_train, processed_eng_test)
)

In [11]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Same thing for Tamil sentences: the tokenizer is fitted on the training
# split only and then used to integer-encode both splits.
TAM = Tokenizer()
TAM.fit_on_texts(tamil_train)

# adding EOS token: index 0 closes every target sequence
Y_train = [sequence + [0] for sequence in TAM.texts_to_sequences(tamil_train)]
Y_test = [sequence + [0] for sequence in TAM.texts_to_sequences(tamil_test)]

Using TensorFlow backend.


In [12]:
# Vocabulary sizes used to dimension the models. The target side adds one
# slot on top of the tokenizer's word index, because index 0 is used as the
# EOS marker.
source_vocab_size = len(ENG.vocab)
target_vocab_size = 1 + len(TAM.word_index)
print(source_vocab_size, target_vocab_size, sep="\n")

9736
18669


In [13]:
# hidden_size equals the dimensionality of the GloVe vectors (dim=100) loaded
# for the source vocabulary.
hidden_size = 100
encoder = EncoderRNN(input_size=source_vocab_size, hidden_size=hidden_size, ENG=ENG).to(device)
decoder = DecoderRNN(hidden_size=hidden_size, output_size=target_vocab_size).to(device)
trainIters(encoder, decoder, sources=X_train, targets=Y_train, device=device)

Epoch 1 : 0.0 % done
Current loss: 9.71787166595459
Epoch 1 : 10.0 % done
Current loss: 10236.595142730424
Epoch 1 : 20.0 % done
Current loss: 19840.871817395455
Epoch 1 : 30.0 % done
Current loss: 29117.83951014723
Epoch 1 : 40.0 % done
Current loss: 39025.92999433921


KeyboardInterrupt: 

In [14]:
def eval_bleu(encoder, decoder, input_tensor, target, target_vocab_dict, max_length=60):
    """Greedy-decode one source sentence and return its BLEU score.

    Args:
        encoder: Module providing ``initHidden()``, ``hidden_size`` and a
            ``forward(token, hidden)`` returning ``(output, hidden)``.
        decoder: Module providing ``initHidden(enc_output)`` and a
            ``forward(token, hidden)`` returning ``(log_probs, hidden)``.
        input_tensor: 1-D tensor of source token indices.
        target: Reference translation as a list of target words.
        target_vocab_dict: Mapping from target index to word.
        max_length: Maximum number of words to generate. It no longer limits
            the length of the source sentence.

    Returns:
        The value of ``bleu_score`` for the decoded words against the single
        reference ``target``.
    """
    decoded_words = []

    with torch.no_grad():

        input_length = input_tensor.size()[0]
        encoder_hidden = encoder.initHidden()

        # Only the last encoder output seeds the decoder, so track just that
        # instead of a max_length-sized buffer that overflowed on long inputs.
        # A zero vector stands in when the source is empty.
        last_encoder_output = torch.zeros(encoder.hidden_size, device=input_tensor.device)
        for ei in range(input_length):
            encoder_output, encoder_hidden = encoder(input_tensor[ei], encoder_hidden)
            last_encoder_output = encoder_output[0, 0]

        # Index 2 is fed to the decoder as its start-of-sentence token.
        decoder_input = torch.tensor([[2]], device=last_encoder_output.device)

        decoder_hidden = decoder.initHidden(last_encoder_output.view(1, 1, -1))

        for di in range(max_length):
            decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
            topv, topi = decoder_output.topk(1)
            predicted_index = topi.item()

            # Index 0 is the target-side EOS: stop without emitting a word.
            if predicted_index == 0:
                break

            decoded_words.append(target_vocab_dict[predicted_index])
            decoder_input = topi.squeeze().detach()

    # One candidate sentence, with one reference for that candidate.
    return bleu_score([decoded_words], [[target]])

In [15]:
def evaluate_model(encoder, decoder, source_test, target_test, target_vocab_dict):
    """Return the average sentence-level BLEU score over the given test data.

    Args:
        encoder: Encoder module handed to ``eval_bleu``.
        decoder: Decoder module handed to ``eval_bleu``.
        source_test: List of integer-encoded source sentences.
        target_test: List of integer-encoded target sentences; the last
            element of each is dropped before the words are looked up.
        target_vocab_dict: Mapping from target index to word.

    Returns:
        The mean of the per-sentence BLEU scores, or ``0.0`` when
        ``source_test`` is empty.
    """
    n_sentences = len(source_test)
    if n_sentences == 0:
        # No sentences to score; avoid dividing by zero.
        return 0.0

    total_bleu = 0
    for i, source_ids in enumerate(source_test):
        input_tensor = torch.LongTensor(source_ids).to(device)
        # Drop the final element (the appended EOS index) before mapping the
        # reference back to words.
        target = [target_vocab_dict[x] for x in target_test[i][:-1]]
        total_bleu += eval_bleu(encoder, decoder, input_tensor, target, target_vocab_dict)

    return total_bleu / n_sentences

In [16]:
# Average BLEU on the dev split; TAM.index_word maps target indices to words.
evaluate_model(
    encoder,
    decoder,
    source_test=X_test,
    target_test=Y_test,
    target_vocab_dict=TAM.index_word,
)

0.0