**Load Data and do preprocessing**

***I mount my Google Drive and load the training dataset into Python lists. From the CSV file I use only the `hindi` and `english` columns and ignore the index column.***

In [None]:
# Mount Google Drive at /content/drive (Colab-only); the dataset and the model
# checkpoints used by later cells are read from / written to paths under it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import csv

column_names = []       # header row of train.csv
dataset = []            # [hindi, english] pairs, one per data row
source_sent_list = []   # raw Hindi sentences (csv column 1)
targ_sent_list = []     # raw English sentences (csv column 2)

# The encoding is explicit so the Devanagari text is decoded the same way on
# every platform, and newline='' is what the csv module asks for so that line
# breaks inside quoted fields are parsed correctly.
with open('/content/drive/MyDrive/train.csv', 'r', encoding='utf-8', newline='') as csvfile:
    csvreader = csv.reader(csvfile)

    # The first row holds the column names ([] if the file is empty).
    column_names = next(csvreader, [])

    for row in csvreader:
        # csv.reader yields [] for blank lines; skip anything without both text columns.
        if len(row) < 3:
            continue
        # row[0] is the index column and is ignored; row[1] is Hindi, row[2] is English.
        hindi_sentence, english_sentence = row[1], row[2]
        dataset.append([hindi_sentence, english_sentence])
        source_sent_list.append(hindi_sentence)
        targ_sent_list.append(english_sentence)

**Brief Overview of the pipeline**

*Unprocessed text ---> pre-processing and tokenizing ---> building the source (Hindi) and target (English) vocabularies from the tokenized text ---> converting each sentence to a tensor of the vocabulary indices of its words ---> passing the tensors to the encoder-decoder model and training it.*

**Pre-requisite for hindi text processing**

***Cloning the indic_nlp_library and setting its path. The indic_nlp library will be used for pre-processing the Hindi text provided in the training dataset.***

In [None]:
# Fetch the Indic NLP library sources and its resource files into /content,
# and install Morfessor with pip.
!git clone "https://github.com/anoopkunchukuttan/indic_nlp_library"
!git clone https://github.com/anoopkunchukuttan/indic_nlp_resources.git
!pip install Morfessor
# The path to the local git repo for Indic NLP library
INDIC_NLP_LIB_HOME=r"/content/indic_nlp_library"

# The path to the local git repo for Indic NLP Resources
INDIC_NLP_RESOURCES="/content/indic_nlp_resources"
# The library is used straight from the cloned checkout, so the checkout is
# added to the import path before `indicnlp` is imported.
import sys
sys.path.append(r'{}'.format(INDIC_NLP_LIB_HOME))
from indicnlp import common
# Point the library at the cloned resources, then run its loader.
common.set_resources_path(INDIC_NLP_RESOURCES)
from indicnlp import loader
loader.load()

Cloning into 'indic_nlp_library'...
remote: Enumerating objects: 1271, done.[K
remote: Counting objects: 100% (93/93), done.[K
remote: Compressing objects: 100% (68/68), done.[K
remote: Total 1271 (delta 50), reused 54 (delta 25), pack-reused 1178[K
Receiving objects: 100% (1271/1271), 9.56 MiB | 13.35 MiB/s, done.
Resolving deltas: 100% (654/654), done.
Cloning into 'indic_nlp_resources'...
remote: Enumerating objects: 133, done.[K
remote: Counting objects: 100% (7/7), done.[K
remote: Compressing objects: 100% (7/7), done.[K
remote: Total 133 (delta 0), reused 2 (delta 0), pack-reused 126[K
Receiving objects: 100% (133/133), 149.77 MiB | 22.37 MiB/s, done.
Resolving deltas: 100% (51/51), done.
Collecting Morfessor
  Downloading https://files.pythonhosted.org/packages/39/e6/7afea30be2ee4d29ce9de0fa53acbb033163615f849515c0b1956ad074ee/Morfessor-2.0.6-py3-none-any.whl
Installing collected packages: Morfessor
Successfully installed Morfessor-2.0.6


***Importing libraries for pre-processing Hindi and English Text***

In [None]:
import re
import string
import random
from unicodedata import normalize
from indicnlp.normalize.indic_normalize import IndicNormalizerFactory
from indicnlp.tokenize import indic_tokenize

***Here I am building my vocabulary where SOS_token is the start token , EOS_token is the end token , UNK_token is for unknown token, PAD_token is for padding.***

In [None]:
# Special vocabulary symbols: sentence start, sentence end, unknown word, padding.
SOS_token, EOS_token, UNK_token, PAD_token = '<start>', '<end>', '<unk>', '<pad>'

# Vocabulary indices reserved for the special symbols, in the same order (0, 1, 2, 3).
SOS_idx, EOS_idx, UNK_idx, PAD_idx = range(4)

class Lang:
    """Word-level vocabulary of one language.

    Attributes:
        name: label of the language passed to the constructor (e.g. 'hin', 'eng').
        word2index: word -> vocabulary index; starts with the four special tokens.
        index2word: vocabulary index -> word; starts with the four special tokens.
        word2count: word -> number of times it was added (special tokens are not counted).
        n_words: size of the vocabulary, special tokens included.
    """

    def __init__(self, name):
        self.name = name
        self.word2count = {}

        # The special tokens occupy the first four indices of every vocabulary.
        self.index2word = {
            SOS_idx: SOS_token,
            EOS_idx: EOS_token,
            UNK_idx: UNK_token,
            PAD_idx: PAD_token,
        }

        # Reverse mapping of index2word.
        self.word2index = {}
        for index, token in self.index2word.items():
            self.word2index[token] = index

        # Four entries so far: SOS, EOS, UNK and PAD.
        self.n_words = 4

    def addSentence(self, sentence):
        """Add every single-space-separated word of `sentence` to the vocabulary."""
        for token in sentence.split(' '):
            self.addWord(token)

    def addWord(self, word):
        """Register `word`: bump its count if known, otherwise give it the next free index."""
        if word in self.word2index:
            self.word2count[word] += 1
            return

        # Indices are handed out consecutively, so the current vocabulary size
        # is the index of the new word.
        new_index = self.n_words
        self.word2index[word] = new_index
        self.index2word[new_index] = word
        self.word2count[word] = 1
        self.n_words = new_index + 1

***Here I have defined functions for cleaning the source(hindi) text and target(english) text***

In [None]:
# Characters removed by clean_text: all ASCII punctuation, the ASCII digits,
# the Devanagari danda and the curly single/double quotes.
_CLEAN_TEXT_TABLE = str.maketrans('', '', string.punctuation + string.digits + '।‘’“”')


def clean_text(text):
    """Strip punctuation, quotes and ASCII digits from `text` and tidy its spaces.

    The previous implementation removed most characters through a long chain of
    (partly duplicated) `replace` calls, but stripped `string.punctuation` only
    *after* normalising whitespace, so inputs such as "a & b" came back with a
    double space, and unpaired curly quotes were left in the text. Here every
    unwanted character is removed in a single pass first and whitespace is
    normalised last.

    Args:
        text: the raw sentence (Hindi or English).

    Returns:
        The sentence without the characters listed above, with runs of spaces
        collapsed to one space and no leading/trailing whitespace.
    """
    text = text.translate(_CLEAN_TEXT_TABLE)
    # Deleting characters can leave runs of spaces behind, so collapse them now.
    text = re.sub(' +', ' ', text)
    return text.strip()

#pre-processing of one english sentence
def pre_process_english_sentence(line):
    """Lower-case, clean and ASCII-fold an English sentence.

    Steps: lower-case -> clean_text -> NFD decomposition with every non-ASCII
    code point dropped -> whitespace tokenisation -> removal of characters that
    are not in string.printable -> only purely alphabetic tokens are kept.

    Returns the surviving tokens joined by single spaces.
    """
    non_printable = re.compile('[^%s]' % re.escape(string.printable))

    cleaned = clean_text(line.lower())
    # Canonical decomposition splits accented letters into base letter + mark;
    # encoding to ASCII with errors ignored then discards the marks.
    ascii_text = normalize('NFD', cleaned).encode('ascii', 'ignore').decode('UTF-8')

    kept = []
    for token in ascii_text.split():
        token = non_printable.sub('', token)
        if token.isalpha():
            kept.append(token)
    return ' '.join(kept)

#pre-processing of one hindi sentence
def pre_process_hindi_sentence(line):
    """Clean, normalise and tokenise a Hindi sentence.

    The sentence goes through clean_text, the Indic NLP normaliser for "hi"
    (created with remove_nuktas=False) and the library's trivial tokenizer.
    Tokens containing a digit are dropped and the rest are joined by single spaces.
    """
    normalizer = IndicNormalizerFactory().get_normalizer("hi",remove_nuktas=False)
    normalized = normalizer.normalize(clean_text(line))
    kept = [
        token
        for token in indic_tokenize.trivial_tokenize(normalized)
        if not re.search(r'\d', token)
    ]
    return ' '.join(kept)

#builds the list of pre-processed [hindi, english] sentence pairs
def to_pairs(source_sent_list,targ_sent_list):
    """Pre-process both sides of the corpus.

    Args:
        source_sent_list: raw Hindi sentences.
        targ_sent_list: raw English sentences, aligned by position with the
            Hindi ones (it must hold at least as many entries).

    Returns:
        A list with one `[hindi, english]` list of pre-processed sentences per
        Hindi sentence.
    """
    pairs = []
    for position, hindi_raw in enumerate(source_sent_list):
        hindi_clean = pre_process_hindi_sentence(hindi_raw)
        english_clean = pre_process_english_sentence(targ_sent_list[position])
        pairs.append([hindi_clean, english_clean])
    return pairs

#builds the hindi and english vocabularies from the pre-processed training pairs
def prepareData(pairs):
    """Create one Lang vocabulary per language and fill it from `pairs`.

    Args:
        pairs: sequence of entries whose item 0 is a Hindi sentence and item 1
            the matching English sentence.

    Returns:
        `(input_lang, output_lang)`: the Hindi ('hin') and English ('eng') vocabularies.
    """
    input_lang, output_lang = Lang('hin'), Lang('eng')
    for entry in pairs:
        hindi_sentence, english_sentence = entry[0], entry[1]
        input_lang.addSentence(hindi_sentence)
        output_lang.addSentence(english_sentence)
    return input_lang, output_lang


# pairs_all: list of [processed hindi sentence, processed english sentence] lists,
# one per row read from the training csv.
pairs_all = to_pairs(source_sent_list,targ_sent_list)
# input_lang / output_lang: Lang vocabularies (hindi / english) built from every pair in pairs_all.
input_lang, output_lang = prepareData(pairs_all)

***Let us check if the pairs_all has been constructed correctly***

In [None]:
# Sanity check: print only the first (index, pair) entry; the break stops the loop after one item.
for i,j in enumerate(pairs_all):
  print(i,j)
  break

0 ['एल सालवाडोर मे जिन दोनो पक्षों ने सिविलयुद्ध से वापसी ली उन्होंने वही काम किये जो कैदियों की कश्मकश के निदान हैं', 'in el salvador both sides that withdrew from their civil war took moves that had been proven to mirror a prisoners dilemma strategy']


***Let us now split the pairs all into two lists one containing all source sentences and the other containing the target sentences . This will be helpful while constructing the train and the validation dataset.***

In [None]:
# Group the English sentences by their pre-processed Hindi sentence, so that a
# Hindi sentence occurring several times ends up with a list of all its
# reference translations.
source_to_target = {}
for source, target in pairs_all:
    source_to_target.setdefault(source, []).append(target)

# source_sents[k] is a Hindi sentence and target_sents[k] the list of its references.
if source_to_target:
    source_sents, target_sents = zip(*source_to_target.items())
else:
    # zip(*...) over an empty dict yields nothing to unpack; fall back to empty tuples.
    source_sents, target_sents = (), ()


***Checking that my target_sents are correctly built (each entry is the list of reference translations of one source sentence)***

In [None]:
# Sanity check: print only the first entry of target_sents (a list of reference
# translations); the break stops the loop after one item.
for i in target_sents:
  print(i)
  break

['in el salvador both sides that withdrew from their civil war took moves that had been proven to mirror a prisoners dilemma strategy']


**Splitting into train,val data (95%,5%) as test set was provided every week**





In [None]:
import numpy as np

# A fixed seed makes the shuffled train/validation split reproducible between runs.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

source_length = len(source_sents)
inidices = np.random.permutation(source_length)

# The first 95% of the shuffled unique source sentences are used for training,
# the remaining 5% for validation.
split_point = int(source_length*0.95)
training_indices, dev_indices = inidices[:split_point], inidices[split_point:]

training_source = [source_sents[k] for k in training_indices]
training_target = [target_sents[k] for k in training_indices]
dev_source = [source_sents[k] for k in dev_indices]
dev_target = [target_sents[k] for k in dev_indices]

# Unwrap the training examples: every (source, [ref1, ref2, ...]) entry becomes
# one (source, ref) example per reference. The validation side keeps its lists.
training_s = []
training_t = []
for source, tt in zip(training_source, training_target):
    training_s.extend([source] * len(tt))
    training_t.extend(tt)

training_source, training_target = training_s, training_t

***Checking the number of words in my hindi vocabulary (constructed from the training dataset)***

In [None]:
input_lang.n_words

46185

***Checking the number of words in my english vocabulary(constructed from training dataset)***

In [None]:
output_lang.n_words

32596

In [None]:
import numpy as np

***Checking the maximum length of the source and target sentences. These lengths are needed to pad the shorter sentences so that all sentences have the same length and can be stacked into one tensor.***

In [None]:
# Longest Hindi source sentence, counted in single-space-separated tokens (0 if there is no data).
max_length_src = max((len(sentence.split(' ')) for sentence in source_sents), default=0)
print (max_length_src)

# Every entry of target_sents is a *list* of reference translations, so each
# reference is measured on its own. Calling str() on the list (as was done
# before) glued all references of an entry together and over-estimated the
# length whenever a source sentence had more than one reference.
max_length_tar = max(
    (len(reference.split(' ')) for references in target_sents for reference in references),
    default=0,
)
print (max_length_tar)


394
301


**Starting the Model**

***Importing the necessary libraries for our model***

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch.nn.init as init
#setting the device to cuda if its available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

**Encoder**

In [None]:
class EncoderRNN(nn.Module):
    """Bidirectional LSTM encoder over embedded source tokens.

    Each direction uses `int(hidden_size/2)` units, so the concatenated
    per-token output has `hidden_size` features.
    """

    def __init__(self, vocab_size, hidden_size, n_layers=1):
        super().__init__()

        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.n_layers = n_layers

        # Embedding weights are re-initialised from N(0, 0.2).
        self.embedding = nn.Embedding(vocab_size, hidden_size)
        init.normal_(self.embedding.weight, mean=0.0, std=0.2)

        self.lstm = nn.LSTM(
            input_size=hidden_size,
            hidden_size=int(hidden_size/2),  # half per direction; the two directions are concatenated
            num_layers=n_layers,
            batch_first=True,  # inputs are (batch_size, seq_length, features)
            bidirectional=True,
        )

    def forward(self, word_inputs, hidden):
        """Encode a batch of index sequences.

        Args:
            word_inputs: (batch_size, seq_length) tensor of token indices.
            hidden: `(h, c)` initial state, e.g. from init_hidden.

        Returns:
            `(output, (h, c))` as produced by the LSTM: output is
            (batch_size, seq_length, hidden_size); h and c are
            (n_layers*2, batch_size, hidden_size/2).
        """
        return self.lstm(self.embedding(word_inputs), hidden)

    def init_hidden(self, batches):
        """Return an all-zero `(h, c)` state for `batches` sequences, placed on the global `device`."""
        state_shape = (self.n_layers*2, batches, int(self.hidden_size/2))
        h_s = torch.zeros(*state_shape).contiguous().to(device)
        c_s = torch.zeros(*state_shape).contiguous().to(device)
        return (h_s, c_s)

***I tested my encoder with a small dummy example to make sure it works properly.***

In [None]:
# Tiny dimensions for a smoke test of the encoder. These three names are also
# read by the decoder smoke-test cell further down.
vocab_size = 10
hidden_dim = 10
n_layers = 1

encoder_test = EncoderRNN(vocab_size, hidden_dim, n_layers).to(device)
print(encoder_test)

# Recurrent network requires initial hidden state (batch size 1 here)
encoder_hidden = encoder_test.init_hidden(1)

# Test input of size (1x3), one sequence of size 3
word_input = torch.LongTensor([[1, 2, 3]]).to(device)
#print(word_input.shape," ",encoder_hidden.shape)

# encoder_hidden is rebound to the final (h, c) state returned by the encoder.
encoder_outputs, encoder_hidden = encoder_test(word_input, encoder_hidden)

# encoder_outputs: (batch_size, seq_length, hidden_size)
# encoder_hidden[0] and encoder_hidden[1]: (n_layers*2, batch_size, hidden_size/2)
print(encoder_outputs.shape, encoder_hidden[0].shape, encoder_hidden[1].shape)

EncoderRNN(
  (embedding): Embedding(10, 10)
  (lstm): LSTM(10, 5, batch_first=True, bidirectional=True)
)
torch.Size([1, 3, 10]) torch.Size([2, 1, 5]) torch.Size([2, 1, 5])


**Decoder**

In [None]:
class DecoderRNN(nn.Module):
    """Single-step LSTM decoder with additive attention over the encoder outputs.

    Each call to forward() consumes one target token per sequence, attends over
    all encoder positions and advances the LSTM by one step. The projection to
    the vocabulary is not applied here; forward() returns the raw LSTM output.
    """

    def __init__(self, vocab_size, hidden_size, n_layers=1):
        super(DecoderRNN, self).__init__()

        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.n_layers = n_layers

        self.embedding = nn.Embedding(vocab_size, hidden_size)

        # Additive attention: score = v(tanh(weight1(decoder state) + weight2(encoder outputs)))
        self.weight1 = nn.Linear(hidden_size, hidden_size)
        self.weight2 = nn.Linear(hidden_size, hidden_size)
        self.v = nn.Linear(hidden_size, 1)

        # The LSTM input is the token embedding concatenated with the context vector.
        self.lstm = nn.LSTM(
            hidden_size + hidden_size,
            hidden_size,
            num_layers=n_layers,
            batch_first=True,  # inputs are (batch_size, seq_length, features)
            bidirectional=False
        )

        # Neither module is used by forward(). They are kept so that the
        # parameter names in state_dict() stay the same as before and existing
        # checkpoints keep loading.
        self.output = nn.Linear(hidden_size, vocab_size)
        self.softmax = nn.LogSoftmax(dim=2)

    def forward(self, word_inputs, hidden, encoder_outputs):
        """Run one decoding step.

        Args:
            word_inputs: (batch_size,) tensor with the current input token index
                of every sequence.
            hidden: `(h, c)` decoder state, each (n_layers, batch_size, hidden_size).
            encoder_outputs: (batch_size, src_seq_length, hidden_size).

        Returns:
            `(output, (h, c))`: output is (batch_size, 1, hidden_size); the new
            state has the same shape as `hidden`.
        """
        # Inputs are moved to wherever the embedding weights live, so the module
        # does not depend on a global device variable.
        embedded = self.embedding(word_inputs.to(self.embedding.weight.device)).unsqueeze(1)

        # The attention query is the hidden state of the top LSTM layer, shaped
        # (batch_size, 1, hidden_size) so that it broadcasts over the source
        # positions. With n_layers == 1 this equals hidden[0].permute(1, 0, 2);
        # using every layer (as before) fails to broadcast when n_layers > 1.
        query = hidden[0][-1].unsqueeze(1)

        attn_score = torch.tanh(self.weight1(query) + self.weight2(encoder_outputs))
        # Normalise over the source positions (dim=1).
        attn_weights = F.softmax(self.v(attn_score), dim=1)
        context_vec = torch.sum(attn_weights * encoder_outputs, dim=1, keepdim=True)

        lstm_input = torch.cat((embedded, context_vec), -1)
        output, hidden = self.lstm(lstm_input, hidden)
        return output, hidden
  




***I did a small test for the decoder code as well to make sure it works properly.***

In [None]:
# Smoke test of the decoder. It reuses vocab_size / hidden_dim / n_layers and the
# encoder_hidden / encoder_outputs produced by the encoder smoke-test cell above.
decoder_test = DecoderRNN(vocab_size, hidden_dim, n_layers).to(device)
print(decoder_test)

word_inputs = torch.LongTensor([[1, 2, 3]]).to(device)

# Merge the two direction states of the encoder, (2, 1, 5), into one decoder
# state of shape (1, 1, 10).
decoder_hidden_h = encoder_hidden[0].permute(1, 0, 2).reshape(1, 1, 10).permute(1, 0, 2)
decoder_hidden_c = encoder_hidden[1].permute(1, 0, 2).reshape(1, 1, 10).permute(1, 0, 2)
#print(decoder_hidden_h.size())

# Feed the three tokens one at a time, carrying the decoder state between steps.
# NOTE(review): `input` shadows the Python builtin of the same name in this cell.
for i in range(3):
    input = word_inputs[:, i]
    decoder_output, decoder_hidden = decoder_test(input, (decoder_hidden_h, decoder_hidden_c),encoder_outputs)
    decoder_hidden_h, decoder_hidden_c = decoder_hidden
    print(decoder_output.size(), decoder_hidden_h.size(), decoder_hidden_c.size())

DecoderRNN(
  (embedding): Embedding(10, 10)
  (weight1): Linear(in_features=10, out_features=10, bias=True)
  (weight2): Linear(in_features=10, out_features=10, bias=True)
  (v): Linear(in_features=10, out_features=1, bias=True)
  (lstm): LSTM(20, 10, batch_first=True)
  (output): Linear(in_features=10, out_features=10, bias=True)
  (softmax): LogSoftmax(dim=2)
)
torch.Size([1, 1, 10]) torch.Size([1, 1, 10]) torch.Size([1, 1, 10])
torch.Size([1, 1, 10]) torch.Size([1, 1, 10]) torch.Size([1, 1, 10])
torch.Size([1, 1, 10]) torch.Size([1, 1, 10]) torch.Size([1, 1, 10])


**Model**

***Putting the encoder decoder structure to build the seq2seq model***

In [None]:
class Seq2seq(nn.Module):
    """Encoder-decoder translation model (Hindi -> English) with attention.

    `forward_train` runs the decoder with teacher forcing and returns logits;
    `forward` greedily decodes a single source sentence and returns token indices.
    """

    def __init__(self, input_vocab_size, output_vocab_size, hidden_size, n_layers,device):
        super(Seq2seq, self).__init__()

        self.n_layers = n_layers
        self.hidden_size = hidden_size
        self.device = device

        # Both sub-modules are moved to `device` so that their weights live
        # where the inputs and hidden states are placed.
        self.encoder = EncoderRNN(input_vocab_size, hidden_size, self.n_layers).to(self.device)
        self.decoder = DecoderRNN(output_vocab_size, hidden_size, self.n_layers).to(self.device)

        # Projection from the decoder output to vocabulary logits, initialised from N(0, 0.2).
        self.W = nn.Linear(hidden_size, output_vocab_size)
        init.normal_(self.W.weight, 0.0, 0.2)

        # forward() applies the softmax to a 1-D vector of logits, so the
        # dimension is stated explicitly instead of relying on the deprecated
        # implicit choice.
        self.softmax = nn.Softmax(dim=0)

    def _forward_encoder(self, x):
        """Encode `x` (batch_size, seq_length) and build the initial decoder state.

        Returns `(h, c, encoder_outputs)`. h and c are the encoder's final
        states with the two directions merged, reshaped to
        (n_layers, batch_size, hidden_size).
        """
        batch_size = x.shape[0]
        init_hidden = self.encoder.init_hidden(batch_size)
        encoder_outputs, encoder_hidden = self.encoder(x.to(self.device), init_hidden)
        encoder_hidden_h, encoder_hidden_c = encoder_hidden
        # The results are also kept on the instance, as before.
        self.encoder_outputs = encoder_outputs
        # (n_layers*2, batch, hidden/2) -> (batch, n_layers*2, hidden/2)
        #   -> (batch, n_layers, hidden) -> (n_layers, batch, hidden)
        self.decoder_hidden_h = encoder_hidden_h.contiguous().permute(1,0,2).reshape(batch_size, self.n_layers, self.hidden_size).permute(1,0,2)
        self.decoder_hidden_c = encoder_hidden_c.contiguous().permute(1,0,2).reshape(batch_size, self.n_layers, self.hidden_size).permute(1,0,2)
        return self.decoder_hidden_h, self.decoder_hidden_c, self.encoder_outputs

    def forward_train(self, x, y):
        """Teacher-forced pass used for training.

        Args:
            x: (batch_size, src_seq_length) source token indices.
            y: (batch_size, tgt_seq_length) target token indices fed to the decoder.

        Returns:
            Logits of shape (batch_size, vocab_size, tgt_seq_length).
        """
        decoder_hidden_h, decoder_hidden_c, encoder_outputs = self._forward_encoder(x)
        H = []
        for i in range(y.shape[1]):
            input = y[:, i]
            decoder_output, decoder_hidden = self.decoder(input, (decoder_hidden_h.contiguous(), decoder_hidden_c.contiguous()), encoder_outputs)
            decoder_hidden_h, decoder_hidden_c = decoder_hidden
            # h: (batch_size, vocab_size)
            h = self.W(decoder_output.squeeze(1))
            # appended as (batch_size, vocab_size, 1) so the steps can be concatenated on dim 2
            H.append(h.unsqueeze(2))

        return torch.cat(H, dim=2)

    def forward(self, x, max_len=100):
        """Greedy decoding of ONE source sentence (used at validation/test time).

        Args:
            x: (1, src_seq_length) source token indices.
            max_len: maximum number of decoding steps (default 100, the limit
                that used to be hard-coded).

        Returns:
            A list of token indices that starts with SOS_idx and ends with
            EOS_idx, unless `max_len` steps were reached first.
        """
        # Only Python ints are returned, so no gradients are needed; no_grad
        # avoids building an autograd graph for every decoding step.
        with torch.no_grad():
            decoder_hidden_h, decoder_hidden_c, encoder_outputs = self._forward_encoder(x)

            current_y = SOS_idx
            result = [current_y]
            counter = 0
            while current_y != EOS_idx and counter < max_len:
                input = torch.tensor([current_y], device=self.device)
                decoder_output, decoder_hidden = self.decoder(input, (decoder_hidden_h, decoder_hidden_c), encoder_outputs)
                decoder_hidden_h, decoder_hidden_c = decoder_hidden
                # h: (vocab_size)
                h = self.W(decoder_output.squeeze(1)).squeeze(0)
                y = self.softmax(h)
                # Greedy choice: the most probable next token.
                _, current_y = torch.max(y, dim=0)
                current_y = current_y.item()
                result.append(current_y)
                counter += 1

        return result

**We have prepared input output pairs which are strings but input to our model should be tensors. So need to convert to tensors**

In [None]:
import os 
data_dir='/content/drive/MyDrive/'

In [None]:
#maps every single-space-separated word of `sentence` to its index in the vocabulary `lang`
def indexesFromSentence(lang, sentence):
    """Return the list of vocabulary indices of the words of `sentence`.

    `lang` must expose a `word2index` mapping; a word missing from it raises KeyError.
    """
    lookup = lang.word2index
    indexes = []
    for word in sentence.split(' '):
        indexes.append(lookup[word])
    return indexes

#turns a sentence into a padded 1-D tensor of vocabulary indices; `type` is 'hindi' or anything else (treated as english)
def tensorFromSentence(lang, sentence,type):
    """Index `sentence`, wrap it in SOS/EOS and right-pad it with PAD.

    The padded length is 2 + max_length_src for type 'hindi' and
    2 + max_length_tar otherwise (the 2 accounts for the SOS and EOS tokens).
    A sentence that is already longer than that is returned unpadded and untruncated.
    """
    indexes = [SOS_idx] + indexesFromSentence(lang, sentence) + [EOS_idx]

    longest = max_length_src if type=='hindi' else max_length_tar
    max_seq_length = 2 + longest

    missing = max_seq_length - len(indexes)
    if missing > 0:
        indexes.extend([PAD_idx] * missing)
    return torch.tensor(indexes, dtype=torch.long, device=device)


def tensorsFromPair(source,target):
    """Return `(input_tensor, target_tensor)` for one Hindi/English sentence pair.

    Each 1-D index tensor gets a trailing axis via unsqueeze(1), i.e. it becomes
    a (sequence length, 1) column, so that the columns of many sentences can
    later be concatenated along the last dimension.
    """
    hindi_column = tensorFromSentence(input_lang, source, 'hindi').unsqueeze(1)
    english_column = tensorFromSentence(output_lang, target, 'english').unsqueeze(1)
    return (hindi_column, english_column)


#building the training tensors (hindi + english) and the validation tensor (hindi only)

# One (hindi column, english column) tuple per training example; every column is (seq_len, 1).
training = [
    tensorsFromPair(source_sent, target_sent)
    for source_sent, target_sent in zip(training_source, training_target)
]

# Concatenate the columns side by side and transpose, giving (number of examples, seq_len).
x_columns, y_columns = zip(*training)
x_training = torch.cat(x_columns, dim=-1).transpose(1, 0)
y_training = torch.cat(y_columns, dim=-1).transpose(1, 0)


# Validation only needs the source side as a tensor; dev_target stays a list of reference lists.
dev_columns = [
    tensorFromSentence(input_lang, source_sent, 'hindi').unsqueeze(1)
    for source_sent in dev_source
]
x_development = torch.cat(dev_columns, dim=-1).transpose(1, 0)



***The unindex_words function will be used during the test time when the decoder provides us the indices of the predicted words we translate the indices back to words***

In [None]:
def unindex_words(lang, indices):
    """Translate vocabulary indices back to words using `lang.index2word`.

    An index missing from the mapping raises KeyError.
    """
    words = []
    for index in indices:
        words.append(lang.index2word[index])
    return words

**Training our model**

In [None]:
from torch.optim import Adam
# Seq2seq model sized to the two vocabularies: hidden dimension 256, 1 layer.
model = Seq2seq(input_lang.n_words, output_lang.n_words, 256, 1,device).to(device)
# Adam optimiser with a learning rate of 0.0001.
# NOTE(review): the name `optim` rebinds the `torch.optim` module alias imported
# earlier (`import torch.optim as optim`); the training cell calls
# optim.zero_grad()/optim.step() on this optimiser object, so the name is kept.
optim = Adam(model.parameters(), lr=0.0001)

In [None]:
#checking specifications of my model
model

Seq2seq(
  (encoder): EncoderRNN(
    (embedding): Embedding(46185, 256)
    (lstm): LSTM(256, 128, batch_first=True, bidirectional=True)
  )
  (decoder): DecoderRNN(
    (embedding): Embedding(32596, 256)
    (weight1): Linear(in_features=256, out_features=256, bias=True)
    (weight2): Linear(in_features=256, out_features=256, bias=True)
    (v): Linear(in_features=256, out_features=1, bias=True)
    (lstm): LSTM(512, 256, batch_first=True)
    (output): Linear(in_features=256, out_features=32596, bias=True)
    (softmax): LogSoftmax(dim=2)
  )
  (W): Linear(in_features=256, out_features=32596, bias=True)
  (softmax): Softmax(dim=None)
)

**We will train our model in batches**

In [None]:
import math
#yields consecutive slices of `batch_indices`, each holding at most `batch_size` items
def batch_generator(batch_indices, batch_size):
    """Split `batch_indices` into consecutive batches.

    Yields slices of `batch_indices` of length `batch_size`; the last one is
    shorter when the number of indices is not a multiple of `batch_size`.
    """
    n_batches = math.ceil(len(batch_indices)/batch_size)
    for batch_number in range(n_batches):
        first = batch_number*batch_size
        # Slicing stops at the end of the sequence by itself, so the final,
        # shorter batch needs no special case.
        yield batch_indices[first:first + batch_size]

In [None]:
# The loss is the cross-entropy between the predicted logits and the target indices.
# Every target is right-padded with PAD_idx up to the longest sentence, so the
# padding positions are excluded from the loss with ignore_index; otherwise most
# of the loss (and of the gradient) would come from predicting padding.
cross_entropy = nn.CrossEntropyLoss(ignore_index=PAD_idx)

In [None]:
pip install nltk



***I have used corpus_bleu as a scoring metric to check my models performance on validation set(constructed from the provided training dataset) across different epochs***

In [None]:
from nltk.translate.bleu_score import corpus_bleu

def bleu(n):
    """Return a BLEU-n scoring function.

    The weight vector gives 1/n to each of the first n n-gram orders and 0.0 to
    the remaining ones (up to order 4). The returned callable takes
    `(list_of_references, list_of_hypothesis)` and forwards them, with the
    weights, to `corpus_bleu`.
    """
    share = 1.0/n
    weights = [share for _ in range(n)] + [0.0 for _ in range(4-n)]

    def scorer(list_of_references, list_of_hypothesis):
        return corpus_bleu(list_of_references, list_of_hypothesis, weights)

    return scorer

def accuracy(list_of_references, list_of_hypothesis):
    """Fraction of hypotheses that exactly match one of their references.

    Args:
        list_of_references: one collection of references per example. A
            reference may be a sentence string (it is split on whitespace) or a
            sequence of tokens.
        list_of_hypothesis: one hypothesis per example, as a sequence of tokens
            (a sentence string is split on whitespace as well).

    Returns:
        Matches divided by the number of examples; 0.0 when there are none.

    Previously a tokenised hypothesis was compared against the raw references,
    so sentence-string references could never match and the score was always 0.
    """
    if len(list_of_references) == 0:
        return 0.0

    def as_tokens(sentence):
        # Bring strings and token sequences to one comparable, hashable form.
        return tuple(sentence.split()) if isinstance(sentence, str) else tuple(sentence)

    total = 0.0
    for references, hypothesis in zip(list_of_references, list_of_hypothesis):
        if as_tokens(hypothesis) in {as_tokens(reference) for reference in references}:
            total += 1.0
    return total / len(list_of_references)

# Registry of the metrics computed by score(): 'BLEU-1' ... 'BLEU-4' plus 'Accuracy'.
# Every value is a callable taking (list_of_references, list_of_hypothesis).
score_functions = {'BLEU-{}'.format(i):bleu(i) for i in range(1, 5)}
score_functions['Accuracy'] = accuracy

def score(model, X, target, desc='Scoring...'):
    """Translate every row of `X` with `model` and evaluate against `target`.

    Args:
        model: callable that maps a (1, seq_length) index tensor to a list of
            output token indices starting with SOS.
        X: iterable of 1-D source index tensors (e.g. a 2-D tensor, one row per sentence).
        target: one collection of references per row of `X`; a reference may be
            a sentence string or a sequence of tokens.
        desc: label for a progress bar; currently unused, kept for compatibility.

    Returns:
        Dict mapping every name in `score_functions` to its score.

    Changes compared with the previous version:
      * references are tokenised before scoring -- `corpus_bleu` works on token
        sequences, so raw sentence strings were being compared character by
        character against word-level hypotheses;
      * the trailing token is removed only when it really is EOS, so a
        translation cut off by the length limit keeps its last word;
      * the complete hypothesis/reference lists are no longer printed;
      * decoding runs under `torch.no_grad()`.
    """
    list_of_hypothesis = []
    with torch.no_grad():
        for x in X:
            y = model(x.unsqueeze(0).to(device))
            tokens = y[1:]  # drop SOS
            if tokens and tokens[-1] == EOS_idx:
                tokens = tokens[:-1]  # drop EOS
            list_of_hypothesis.append(unindex_words(output_lang, tokens))

    # Tuples (rather than lists) keep the references hashable for metrics that
    # put them into a set.
    list_of_references = [
        [tuple(reference.split()) if isinstance(reference, str) else tuple(reference)
         for reference in references]
        for references in target
    ]

    scores = {}
    for name, func in score_functions.items():
        scores[name] = func(list_of_references, list_of_hypothesis)
    return scores

***tqdm_notebook can be used to monitor the training progress; its import is currently commented out, so the loop below runs without a progress bar.***

In [None]:
#from tqdm import tqdm_notebook as tqdm

BATCH_SIZE = 16 # number of sentence pairs per batch
# Number of batches per epoch (ceiling division). The former int(n/BATCH_SIZE) + 1
# counted one batch too many whenever n was an exact multiple of BATCH_SIZE,
# which skewed the reported average loss.
total_batches = max(1, (len(x_training) + BATCH_SIZE - 1) // BATCH_SIZE)
indices = list(range(len(x_training))) # row indices of the training tensors

early_stop_after = 3 # stop after this many consecutive epochs without improvement
early_stop_counter = 0
best_model = None

best_score = 0.0
scoring_metric = 'BLEU-1' # validation metric that decides which epoch is "best"
scores_history = []
loss_history = []

for epoch in range(1):
    # Training
    total_loss = 0.0

    # To continue from a saved state, load it before the first epoch:
    #   if epoch==0:
    #     model.load_state_dict(torch.load('/content/drive/MyDrive/NLP_Challenge_Test_Phase/model_best_lstm_attn_26_April.pt'))

    for step, batch in enumerate(batch_generator(indices, BATCH_SIZE)):
        x = x_training[batch, :].to(device)
        # decoder input for teacher forcing: the whole sequence without its last element
        y_tf = y_training[batch, :-1].to(device)
        # expected output: the whole sequence without its first element (shifted by one step)
        y_true = y_training[batch, 1:].to(device)
        # H: (batch_size, vocab_size, seq_length)
        H = model.forward_train(x, y_tf)
        loss = cross_entropy(H, y_true)

        assert loss.item() > 0 # the loss must always be positive

        optim.zero_grad()
        loss.backward()
        optim.step()

        total_loss += loss.item()

    loss_history.append(total_loss/total_batches)
    # Checkpoint of the latest epoch, whether or not it is the best one.
    torch.save(model.state_dict(), '/content/drive/MyDrive/model_lstm_attn_26_april.pt')
    print('Epoch {} training is finished, loss: {:.4f}'.format(epoch+1, total_loss/total_batches))

    desc = 'Validating epoch {}'.format(epoch+1)
    scores = score(model, x_development.to(device), dev_target, desc=desc)
    scores_str = '\n'.join(['{}: {:.4f}'.format(name, value) for name, value in scores.items()])
    scores_history.append(scores)

    print ('Epoch {} validation is finished.\n{}'.format(
        epoch+1, scores_str
    ))

    metric = scores[scoring_metric]

    # Early Stop
    if metric > best_score:
        early_stop_counter = 0
        print('The best model is found, resetting early stop counter.')
        best_score = metric
        # best_model is another name for the same live model object, not a
        # snapshot; the weights of the best epoch are preserved by the
        # checkpoint written on the next line.
        best_model = model
        torch.save(best_model.state_dict(), '/content/drive/MyDrive/model_best_lstm_attn_26_April.pt')
    else:
        early_stop_counter += 1
        print('No improvements for {} epochs.'.format(early_stop_counter))
        if early_stop_counter >= early_stop_after:
            print('Early stop!')
            break

**Evaluation on validation set separated from training set provided**

In [None]:
#checking the model's performance on the dev set prepared from the provided training dataset
examples = zip(dev_source[10:20], dev_target[10:20], x_development[10:20]) #only 10 sentences
for source, target, x in examples:
    y = model(x.unsqueeze(0))
    # Drop SOS, and drop the last token only if it really is EOS (decoding may
    # stop at the length limit without producing EOS).
    tokens = y[1:-1] if y[-1] == EOS_idx else y[1:]
    translation = ' '.join(unindex_words(output_lang, tokens))
    # `source` is already a sentence string and `target` a list of reference
    # sentence strings. Joining them with ' ' (as was done before) iterated over
    # the characters of each string and printed them with a space in between.
    references = '\n'.join(target)

    print('Source: "{}"\nReferences:\n{}\nTranslation: "{}"\n'.format(source, references, translation))

**Performing Translation on the Test Set Provided**

In [None]:
#loading the test set provided for final week
column_names = []
test_final = []   # one single-element list [hindi sentence] per data row

# Explicit encoding so the Devanagari text decodes identically on every
# platform; newline='' as the csv module asks for.
with open('/content/drive/MyDrive/testhindistatements.csv', 'r', encoding='utf-8', newline='') as csvfile:
    csvreader = csv.reader(csvfile)

    # The first row holds the column names ([] if the file is empty).
    column_names = next(csvreader, [])

    for row in csvreader:
        # csv.reader yields [] for blank lines; skip rows that lack the sentence column.
        if len(row) < 3:
            continue
        # The Hindi sentence is read from column 2.
        test_final.append([row[2]])

In [None]:
#preprocessing the test sentences and storing them in a list
test_final_preprocessed=[]
for entry in test_final:
    # Every entry is a one-element list [sentence]. Calling str() on the list
    # (as was done before) fed its repr -- brackets, quotes and backslash
    # escapes -- into the cleaner, so the sentence itself is taken out instead.
    sentence = entry[0] if isinstance(entry, (list, tuple)) else str(entry)
    test_final_preprocessed.append(pre_process_hindi_sentence(sentence))

In [None]:
#checking one of the preprocessed text
test_final_preprocessed[0]

**Appending SOS and EOS token to hindi sentences and converting the hindi sentences to tensor objects**

In [None]:
def indexesFromSentence_test(lang, sentence):
    """Map the words of `sentence` to vocabulary indices, tolerating unseen words.

    A word that is not in `lang.word2index` (i.e. was never seen in the training
    data) is mapped to the index of UNK_token. The lookup uses `dict.get`
    instead of a bare `except`, which would also have swallowed unrelated errors.
    """
    unknown_index = lang.word2index[UNK_token]
    return [lang.word2index.get(word, unknown_index) for word in sentence.split(' ')]


def tensorFromSentence_test(lang, sentence,type):
    """Index a test sentence, wrap it in SOS/EOS and bring it to a fixed length.

    The fixed length is 2 + max_length_src for type 'hindi' and
    2 + max_length_tar otherwise. Shorter sentences are right-padded with
    PAD_idx. Test sentences may be longer than anything seen in training; they
    are truncated to the fixed length (keeping SOS and EOS), because the
    tensors of all sentences are later concatenated into one matrix and
    therefore must have equal length.

    Returns a 1-D long tensor on `device`.
    """
    if type=='hindi':
        max_seq_length = 2 + int(max_length_src)
    else:
        max_seq_length = 2 + int(max_length_tar)

    # Keep at most as many words as fit between SOS and EOS.
    words = indexesFromSentence_test(lang, sentence)[:max_seq_length - 2]
    indexes = [SOS_idx] + words + [EOS_idx]
    indexes += [PAD_idx] * (max_seq_length - len(indexes))
    return torch.tensor(indexes, dtype=torch.long, device=device)

In [None]:
#stacking the test sentences into one tensor that can be fed to the model row by row
test_columns = [
    tensorFromSentence_test(input_lang, source_sent, 'hindi').unsqueeze(1)  # (seq_len, 1) column
    for source_sent in test_final_preprocessed
]
# Columns side by side, then transposed: (number of sentences, seq_len).
test_final_tensor = torch.cat(test_columns, dim=-1).transpose(1, 0)

In [None]:
#checking the shape of the tensor 
test_final_tensor.shape

In [None]:
#initialising the model
model = Seq2seq(input_lang.n_words, output_lang.n_words, 256, 1,device).to(device)
#loading the saved state of the model; map_location lets a checkpoint written on a GPU load on a CPU-only runtime
# NOTE(review): the training cell writes its best checkpoint to
# /content/drive/MyDrive/, while this cell reads it from the
# NLP_Challenge_Test_Phase sub-folder -- confirm the file was copied there.
model.load_state_dict(torch.load('/content/drive/MyDrive/NLP_Challenge_Test_Phase/model_best_lstm_attn_26_April.pt', map_location=device))
model.eval()
examples = zip(test_final_preprocessed,test_final_tensor)
# The with-block closes the answer file even if a translation raises.
with open("/content/drive/MyDrive/NLP_Challenge_Test_Phase/answer.txt","w", encoding='utf-8') as file1:
    for source, x in examples:
        y = model(x.unsqueeze(0).to(device))
        # Drop SOS, and drop the last token only if it really is EOS (decoding
        # may stop at the length limit without producing EOS).
        tokens = y[1:-1] if y[-1] == EOS_idx else y[1:]
        translation = ' '.join(unindex_words(output_lang, tokens))

        # one predicted translation per line
        file1.write(translation+'\n')
        # `source` is already a sentence string; joining it with ' ' (as was
        # done before) printed its characters separated by spaces.
        print('Source: "{}"\nTranslation: "{}"\n'.format(source,  translation))