In [1]:
import torch
import torchvision
import torchvision.transforms as transforms
import tarfile
import pandas as pd
import os
import re
from torch.utils.data import Dataset, DataLoader, ConcatDataset, random_split
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt
import torch.optim as optim
from sklearn.metrics import confusion_matrix
from sklearn.decomposition import PCA
from io import StringIO
from PIL import Image
from torchtext.vocab import GloVe
from torchtext.data import Field, BucketIterator
from torchtext.data.metrics import bleu_score

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [3]:
class DatasetClass(Dataset):
    
    def __init__(self, source, target):
        
        self.source = source
        self.target = target
        self.size = len(source)
      
    def __len__(self):
        
        return self.size
    
    def __getitem__(self, idx):
        
        return self.source[idx], self.target[idx]

In [4]:
def train_test_loader(source_train, target_train, source_test, target_test, num_workers=0, batch_size=32):

    train_data = DatasetClass(source_train, target_train)
    test_data = DatasetClass(source_test, target_test)

    trainloader = DataLoader(train_data, batch_size=batch_size, shuffle=True, num_workers=num_workers)
    testloader = DataLoader(test_data, batch_size=batch_size, shuffle=True, num_workers=num_workers)
    
    return trainloader, testloader

In [5]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, 1, -1)
        output = embedded
        print(output.shape, hidden.shape)
        output, hidden = self.gru(output, hidden)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [6]:
class DecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        output = self.embedding(input).view(1, 1, -1)
        output = F.relu(output)
        output, hidden = self.gru(output, hidden)
        output = self.softmax(self.out(output[0]))
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [7]:
teacher_forcing_ratio = 0.5


def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=60):
    encoder_hidden = encoder.initHidden()

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(-1)
    print(input_tensor)
    print(input_tensor.shape)
    print(input_length)
    target_length = target_tensor.size(-1)

    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

    loss = 0

    for ei in range(input_length):
        print(input_tensor[ei].shape)
        encoder_output, encoder_hidden = encoder(input_tensor[ei], encoder_hidden)
        encoder_outputs[ei] = encoder_output[0, 0]

    decoder_input = torch.tensor([[0]], device=device)

    decoder_hidden = encoder_hidden

    use_teacher_forcing = True

    if use_teacher_forcing:
        # Teacher forcing: Feed the target as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
            loss += criterion(decoder_output, target_tensor[di])
            decoder_input = target_tensor[di]  # Teacher forcing

    else:
        # Without teacher forcing: use its own predictions as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
            topv, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach()  # detach from history as input

            loss += criterion(decoder_output, target_tensor[di])
            if decoder_input.item() == EOS_token:
                break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

In [8]:
def trainIters(encoder, decoder, iterator, device, learning_rate=0.01):
    
    print_loss_total = 0  

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)

    criterion = nn.NLLLoss()

    for i in iterator:
        input_tensor = i[0].type(torch.LongTensor).to(device)
        target_tensor = i[1].type(torch.LongTensor).to(device)

        loss = train(input_tensor, target_tensor, encoder,
                     decoder, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss
        
    print(print_loss_total)

In [9]:
#Reading files

with open('trainen.txt', encoding='utf8') as f:
    eng_train = list(map(lambda x: x.rstrip(), f.readlines()))
    
with open('trainta.txt', encoding='utf8') as f:
    tamil_train = list(map(lambda x: x.rstrip(), f.readlines()))
    
with open('deven.txt', encoding='utf8') as f:
    eng_test = list(map(lambda x: x.rstrip(), f.readlines()))
    
with open('devta.txt', encoding='utf8') as f:
    tamil_test = list(map(lambda x: x.rstrip(), f.readlines()))
    
embedding_glove = GloVe(name='6B', dim=100)

SRC = Field(tokenize='basic_english', init_token='<sos>', eos_token = '<eos>', lower=True)
processed_eng = list(map(lambda x: SRC.preprocess(x), eng_train))

SRC.build_vocab(processed_eng, vectors=embedding_glove)
embedding_trained = nn.Embedding.from_pretrained(SRC.vocab.vectors)

In [10]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

#Tokenize the sentences
tokenizer = Tokenizer()

#preparing vocabulary
tokenizer.fit_on_texts(eng_train)

#converting text into integer sequences
x_tr_seq  = tokenizer.texts_to_sequences(eng_train) 
x_val_seq = tokenizer.texts_to_sequences(eng_test)

#padding to prepare sequences of same length
x_tr_seq  = pad_sequences(x_tr_seq, maxlen=58)  # this is a 26217x58 integer representation of the source text
x_val_seq = pad_sequences(x_val_seq, maxlen=58)

sos_token = -2
eos_token = -1
x_tr_seq = pad_sequences(x_tr_seq, maxlen=59, padding='pre', value=sos_token)
x_tr_seq = pad_sequences(x_tr_seq, maxlen=60, padding='post', value=eos_token)
x_val_seq = pad_sequences(x_val_seq, maxlen=59, padding='pre', value=sos_token)
x_val_seq = pad_sequences(x_val_seq, maxlen=60, padding='post', value=eos_token)

Using TensorFlow backend.


In [11]:
# Same thing for Tamil sentences
tokenizer2 = Tokenizer()
tokenizer2.fit_on_texts(tamil_train)
y_tr_seq = tokenizer2.texts_to_sequences(tamil_train)
y_val_seq = tokenizer2.texts_to_sequences(tamil_test)

y_tr_seq = pad_sequences(y_tr_seq, maxlen=58)
y_val_seq = pad_sequences(y_val_seq, maxlen=58)

y_tr_seq = pad_sequences(y_tr_seq, maxlen=59, padding='pre', value=sos_token)
y_tr_seq = pad_sequences(y_tr_seq, maxlen=60, padding='post', value=eos_token)
y_val_seq = pad_sequences(y_val_seq, maxlen=59, padding='pre', value=sos_token)
y_val_seq = pad_sequences(y_val_seq, maxlen=60, padding='post', value=eos_token)

In [12]:
# to make sos_token=0, eos_token=1 and padding=2, each sentence is now a tensor of size [1,60]
X_train = [(torch.Tensor(i)+2.0).unsqueeze(0) for i in x_tr_seq]
Y_train = [(torch.Tensor(i)+2.0).unsqueeze(0) for i in y_tr_seq]
X_test = [(torch.Tensor(i)+2.0).unsqueeze(0) for i in x_val_seq]
Y_test = [(torch.Tensor(i)+2.0).unsqueeze(0) for i in y_val_seq]

In [13]:
trainloader, testloader = train_test_loader(X_train, Y_train, X_test, Y_test, batch_size=32)

In [14]:
source_vocab_size = len(tokenizer.word_index)+2
target_vocab_size = len(tokenizer2.word_index)+2
print(source_vocab_size)
print(target_vocab_size)

9883
18670


In [15]:
hidden_size = 256
encoder1 = EncoderRNN(source_vocab_size, hidden_size).to(device)
decoder1 = DecoderRNN(hidden_size, target_vocab_size).to(device)
trainIters(encoder1, decoder1, trainloader, device)

tensor([[[   0,    2,    2,  ..., 1854,  620,    1]],

        [[   0,    2,    2,  ...,   49, 2212,    1]],

        [[   0,    2,    2,  ...,  362,  109,    1]],

        ...,

        [[   0,    2,    2,  ..., 7522,   79,    1]],

        [[   0,    2,    2,  ...,    6, 1423,    1]],

        [[   0,    2,    2,  ...,  252,  225,    1]]], device='cuda:0')
torch.Size([32, 1, 60])
60
torch.Size([1, 60])
torch.Size([1, 1, 15360]) torch.Size([1, 1, 256])


RuntimeError: input.size(-1) must be equal to input_size. Expected 256, got 15360

In [16]:
X_train[0].shape

torch.Size([1, 60])

In [17]:
for i in trainloader:
    print(i[0].shape)
    break

torch.Size([32, 1, 60])
