In [1]:
import os
# Debugging aid: ask CUDA to run kernel launches synchronously so that a
# device-side error is reported at the operation that caused it. This is set
# here, in the first cell, so it is already in the environment when torch is
# imported in the next cell. NOTE(review): synchronous launches slow GPU
# execution; consider removing once debugging is finished.
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

In [2]:
import torch
import torchvision
import torchvision.transforms as transforms
import tarfile
import pandas as pd
import re
from torch.utils.data import Dataset, DataLoader, ConcatDataset, random_split
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt
import torch.optim as optim
from sklearn.metrics import confusion_matrix
from sklearn.decomposition import PCA
from io import StringIO
from PIL import Image
from torchtext.vocab import GloVe
from torchtext.data import Field, BucketIterator
from torchtext.data.metrics import bleu_score
import spacy

In [3]:
# Use the GPU when PyTorch reports that CUDA is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Bare expression so the notebook displays which device was selected.
device

device(type='cuda')

In [None]:
# class DatasetClass(Dataset):
    
#     def __init__(self, source, target):
        
#         self.source = source
#         self.target = target
#         self.size = len(source)
      
#     def __len__(self):
        
#         return self.size
    
#     def __getitem__(self, idx):
        
#         return self.source[idx], self.target[idx]
    
# def train_test_loader(source_train, target_train, source_test, target_test, num_workers=0, batch_size=32):

#     train_data = DatasetClass(source_train, target_train)
#     test_data = DatasetClass(source_test, target_test)

#     trainloader = DataLoader(train_data, batch_size=batch_size, shuffle=True, num_workers=num_workers)
#     testloader = DataLoader(test_data, batch_size=batch_size, shuffle=True, num_workers=num_workers)
    
#     return trainloader, testloader

In [4]:
class EncoderRNN(nn.Module):
    """Single-layer LSTM encoder on top of pretrained word embeddings.

    The encoder is driven one token at a time: every call to ``forward``
    consumes a single source-token index (sequence length 1, batch size 1)
    and returns the updated LSTM state, which the caller feeds back in on
    the next call.

    Args:
        input_size: Size of the source vocabulary. No layer is built from
            it (the embedding table comes from ``SRC``); the parameter is
            kept so existing call sites continue to work.
        hidden_size: Number of features in the LSTM hidden and cell state.
        SRC: Object exposing ``SRC.vocab.vectors``, a 2-D float tensor of
            pretrained embeddings with one row per vocabulary entry.
    """

    def __init__(self, input_size, hidden_size, SRC):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size
        # from_pretrained() keeps the embedding weights frozen by default.
        self.embedding = nn.Embedding.from_pretrained(SRC.vocab.vectors)
        # Size the LSTM input from the embedding itself rather than assuming
        # the embedding width happens to equal hidden_size.
        self.lstm = nn.LSTM(self.embedding.embedding_dim, hidden_size)

    def forward(self, input, hidden):
        """Encode one token.

        Args:
            input: Tensor holding a single source-token index.
            hidden: ``(h, c)`` LSTM state, each of shape
                ``(1, 1, hidden_size)``.

        Returns:
            ``(output, hidden)`` where ``output`` has shape
            ``(1, 1, hidden_size)`` and ``hidden`` is the new ``(h, c)``.
        """
        # Reshape to (seq_len=1, batch=1, embedding_dim) as nn.LSTM expects.
        embedded = self.embedding(input).view(1, 1, -1)
        output, hidden = self.lstm(embedded, hidden)
        return output, hidden

    def initHidden(self):
        """Return a zero ``(h, c)`` state on the same device as the LSTM."""
        # Follow the module's own weights instead of a notebook-level global
        # so the state always lives where the LSTM lives.
        reference = self.lstm.weight_hh_l0
        zeros = torch.zeros(1, 1, self.hidden_size,
                            device=reference.device, dtype=reference.dtype)
        return (zeros, zeros.clone())

In [5]:
class DecoderRNN(nn.Module):
    """Single-layer LSTM decoder that emits log-probabilities over the target vocabulary.

    Like the encoder, the decoder is stepped one token at a time (sequence
    length 1, batch size 1).

    Args:
        hidden_size: Width of the target embedding and of the LSTM state.
        output_size: Number of entries in the target vocabulary.
    """

    def __init__(self, hidden_size, output_size):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(output_size, hidden_size)
        self.lstm = nn.LSTM(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        """Decode one step.

        Args:
            input: Tensor holding a single target-token index.
            hidden: ``(h, c)`` LSTM state, each of shape
                ``(1, 1, hidden_size)``.

        Returns:
            ``(log_probs, hidden)`` where ``log_probs`` has shape
            ``(1, output_size)`` and ``hidden`` is the new ``(h, c)``.
        """
        # Reshape to (seq_len=1, batch=1, hidden_size) as nn.LSTM expects.
        output = self.embedding(input).view(1, 1, -1)
        output = F.relu(output)
        output, hidden = self.lstm(output, hidden)
        # output[0] drops the sequence axis -> (1, hidden_size).
        output = self.softmax(self.out(output[0]))
        return output, hidden

    def initHidden(self):
        """Return a zero ``(h, c)`` state on the same device as the LSTM.

        nn.LSTM takes its state as an ``(h, c)`` pair, so a pair is returned
        (matching ``EncoderRNN.initHidden``); a lone tensor could not be
        passed to ``forward``.
        """
        reference = self.lstm.weight_hh_l0
        zeros = torch.zeros(1, 1, self.hidden_size,
                            device=reference.device, dtype=reference.dtype)
        return (zeros, zeros.clone())

    def get_embedding(self, x):
        """Return the embedding of ``x`` with an extra leading axis."""
        return self.embedding(x).unsqueeze(0)

In [6]:
def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=60):
    """Run one optimisation step on a single (source, target) sentence pair.

    The source is fed through ``encoder`` token by token; the final encoder
    state seeds ``decoder``, which is then unrolled without teacher forcing
    (its own arg-max prediction becomes the next input). Decoding stops
    early once the decoder predicts index 0, which this notebook uses as the
    target-side EOS.

    Args:
        input_tensor: 1-D long tensor of source-token indices.
        target_tensor: 1-D long tensor of target-token indices.
        encoder: Module with ``initHidden()`` and a
            ``forward(token, hidden) -> (output, hidden)`` call.
        decoder: Module with a
            ``forward(token, hidden) -> (log_probs, hidden)`` call.
        encoder_optimizer: Optimizer over the encoder parameters.
        decoder_optimizer: Optimizer over the decoder parameters.
        criterion: Loss taking ``(log_probs, target_index)``.
        max_length: Unused. It previously sized a buffer of encoder outputs
            that was never read (and that raised IndexError for sources
            longer than ``max_length``); kept for call compatibility.

    Returns:
        The summed per-step loss divided by the full target length, as a
        Python float. Returns 0.0 without touching the models when the
        target is empty.
    """
    input_length = input_tensor.size(-1)
    target_length = target_tensor.size(-1)

    # Nothing to predict: avoid calling backward() on the integer 0 and
    # dividing by zero below.
    if target_length == 0:
        return 0.0

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    # Each call below goes through EncoderRNN.forward(); only the final
    # state is needed, the per-step outputs are discarded.
    encoder_hidden = encoder.initHidden()
    for ei in range(input_length):
        _, encoder_hidden = encoder(input_tensor[ei], encoder_hidden)

    # Index 2 is fed as the start-of-sequence token.
    # NOTE(review): the target sequences elsewhere in this notebook come
    # from a Keras Tokenizer, where index 2 would be an ordinary word --
    # confirm that reusing it as the start token is intended.
    decoder_input = torch.tensor([[2]], device=target_tensor.device)

    # The first hidden state of the decoder is the final encoder state.
    decoder_hidden = encoder_hidden

    loss = 0
    # Without teacher forcing: use the decoder's own prediction as next input.
    for di in range(target_length):
        decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
        _, topi = decoder_output.topk(1)
        decoder_input = topi.squeeze().detach()  # detach from history as input

        # Target index as a length-1 tensor on the same device as the data,
        # so the step works on CPU as well as on CUDA.
        trg = target_tensor[di].view(1)
        loss += criterion(decoder_output, trg)

        # 0 is the EOS token in the target language.
        if decoder_input.item() == 0:
            break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

In [7]:
def trainIters(encoder, decoder, sources, targets, device, learning_rate=0.01):
    """Train on every (source, target) pair once, one pair per SGD step.

    Args:
        encoder: Encoder module passed through to ``train``.
        decoder: Decoder module passed through to ``train``.
        sources: List of lists; each inner list holds the integer encoding
            of one source sentence.
        targets: List of lists, parallel to ``sources``, holding the integer
            encodings of the target sentences.
        device: Device the per-sentence tensors are created on.
        learning_rate: Learning rate for both SGD optimizers.

    Returns:
        The sum of the per-pair losses returned by ``train`` (0 when
        ``sources`` is empty). Progress is printed roughly every 10% of the
        data, and the final total is printed as well.
    """
    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    criterion = nn.NLLLoss()

    num_pairs = len(sources)
    # Report about ten times per pass regardless of dataset size (the
    # interval used to be hard-coded for one particular corpus length).
    report_every = max(1, num_pairs // 10)

    print_loss_total = 0
    for i in range(num_pairs):
        input_tensor = torch.tensor(sources[i], dtype=torch.long, device=device)
        target_tensor = torch.tensor(targets[i], dtype=torch.long, device=device)

        loss = train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss

        if i % report_every == 0:
            print(round(100 * i / num_pairs, 1), '% done')
            print("Current loss:", print_loss_total)

    print("Loss = ", print_loss_total)
    return print_loss_total

In [8]:
# Read the English/Tamil training and dev files, then build the English vocabulary

def _read_stripped_lines(path):
    """Return every line of the UTF-8 text file at `path` with trailing whitespace removed."""
    with open(path, encoding='utf8') as handle:
        return [line.rstrip() for line in handle]

eng_train = _read_stripped_lines('trainen.txt')
tamil_train = _read_stripped_lines('trainta.txt')
eng_test = _read_stripped_lines('deven.txt')
tamil_test = _read_stripped_lines('devta.txt')

# 100-dimensional GloVe vectors from the '6B' set
embedding_glove = GloVe(name='6B', dim=100)

spacy_en = spacy.load('en_core_web_sm')

def tokenize_en(text):
    """Split `text` into a list of token strings using the spaCy English tokenizer."""
    return [tok.text for tok in spacy_en.tokenizer(text)]

SRC = Field(tokenize=tokenize_en, init_token='<sos>', eos_token='<eos>', lower=True)

# Run the Field's preprocessing over every English line
processed_eng_train = [SRC.preprocess(sentence) for sentence in eng_train]
processed_eng_test = [SRC.preprocess(sentence) for sentence in eng_test]

# The vocabulary is built from the training split only, with GloVe vectors attached
SRC.build_vocab(processed_eng_train, vectors=embedding_glove)
embedding_trained = nn.Embedding.from_pretrained(SRC.vocab.vectors)

In [9]:
def preprocess(processed_eng):
    """Numericalize tokenized sentences using the SRC vocabulary.

    Each sentence (a list of tokens) becomes a list of integers: 2 (the SOS
    token), then the ``SRC.vocab.stoi`` index of every token in order, then
    3 (the EOS token).

    Returns a list with one integer sequence per input sentence.
    """
    return [
        [2, *(SRC.vocab.stoi[token] for token in tokenized_sentence), 3]
        for tokenized_sentence in processed_eng
    ]
    
# Integer-encode both English splits (train first, then test)
X_train, X_test = (
    preprocess(split) for split in (processed_eng_train, processed_eng_test)
)

In [10]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Same thing for Tamil sentences: fit a word-level tokenizer on the training
# split only, then integer-encode both splits with it.
tokenizer2 = Tokenizer()
tokenizer2.fit_on_texts(tamil_train)

# Append 0 to every sequence as the EOS token. New lists are built instead of
# appending in place inside a throwaway list comprehension.
y_tr_seq = [seq + [0] for seq in tokenizer2.texts_to_sequences(tamil_train)]
y_val_seq = [seq + [0] for seq in tokenizer2.texts_to_sequences(tamil_test)]

Using TensorFlow backend.


In [11]:
source_vocab_size = len(SRC.vocab)
# One extra slot on top of the tokenizer's words, for index 0 (used as EOS)
target_vocab_size = 1 + len(tokenizer2.word_index)
print(source_vocab_size, target_vocab_size, sep='\n')

9737
18669


In [12]:
# 100 is the same value as the GloVe embedding dimension requested when the
# vectors were loaded (dim=100).
hidden_size = 100
encoder1 = EncoderRNN(source_vocab_size, hidden_size, SRC).to(device)
decoder1 = DecoderRNN(hidden_size, target_vocab_size).to(device)
# One pass over every training pair; trainIters prints the running loss.
trainIters(encoder1, decoder1, X_train, y_tr_seq, device)

0.0 % done
Current loss: 9.837830352783204
10.0 % done
Current loss: 10637.475831454185
20.0 % done
Current loss: 22052.417893069407
30.0 % done
Current loss: 33588.00305823722
40.0 % done
Current loss: 44821.36450597464
50.0 % done
Current loss: 56506.69794175826
60.0 % done
Current loss: 68169.20414662533
70.0 % done
Current loss: 79854.48099445802
80.0 % done
Current loss: 91522.4564530828
90.0 % done
Current loss: 103236.43000924337
100.0 % done
Current loss: 114782.68873067149
Loss =  114809.6355010852


In [None]:
# Show only the first few (index, word) pairs; the full mapping has one entry
# per target-vocabulary word and would flood the cell output.
list(tokenizer2.index_word.items())[:10]

In [None]:
bleu_score([tem], [tem2])

In [None]:
tem = [tokenizer2.index_word[x] for x in y_val_seq[0][:-1]]

In [None]:
tem2 = [tokenizer2.index_word[x] for x in y_val_seq[1][:-1]]

In [None]:
# Sanity check: a candidate identical to its single reference.
# references_corpus is a list (per candidate) of lists of reference sentences,
# so the reference is nested one level deeper than the candidate. The sentence
# has only three tokens, so n-grams are limited to order 3 (the default
# max_n=4 finds no 4-grams and scores 0).
bleu_score([['How', 'are', 'you']], [[['How', 'are', 'you']]], max_n=3, weights=[1 / 3] * 3)