**Load Data and do preprocessing**

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
dataset=pd.read_csv('/content/drive/MyDrive/train.csv',encoding='utf-8',usecols=['hindi','english'])
dataset.head()

Unnamed: 0,hindi,english
0,"एल सालवाडोर मे, जिन दोनो पक्षों ने सिविल-युद्ध...","In El Salvador, both sides that withdrew from ..."
1,मैं उनके साथ कोई लेना देना नहीं है.,I have nothing to do with them.
2,-हटाओ रिक.,"Fuck them, Rick."
3,क्योंकि यह एक खुशियों भरी फ़िल्म है.,Because it's a happy film.
4,The thought reaching the eyes...,The thought reaching the eyes...


**Pre-requisite for hindi text processing**

In [None]:
!git clone "https://github.com/anoopkunchukuttan/indic_nlp_library"
!git clone https://github.com/anoopkunchukuttan/indic_nlp_resources.git
!pip install Morfessor
# The path to the local git repo for Indic NLP library
INDIC_NLP_LIB_HOME=r"/content/indic_nlp_library"

# The path to the local git repo for Indic NLP Resources
INDIC_NLP_RESOURCES="/content/indic_nlp_resources"
import sys
sys.path.append(r'{}'.format(INDIC_NLP_LIB_HOME))
from indicnlp import common
common.set_resources_path(INDIC_NLP_RESOURCES)
from indicnlp import loader
loader.load()

Cloning into 'indic_nlp_library'...
remote: Enumerating objects: 1271, done.[K
remote: Counting objects: 100% (93/93), done.[K
remote: Compressing objects: 100% (68/68), done.[K
remote: Total 1271 (delta 50), reused 54 (delta 25), pack-reused 1178[K
Receiving objects: 100% (1271/1271), 9.56 MiB | 12.39 MiB/s, done.
Resolving deltas: 100% (654/654), done.
Cloning into 'indic_nlp_resources'...
remote: Enumerating objects: 133, done.[K
remote: Counting objects: 100% (7/7), done.[K
remote: Compressing objects: 100% (7/7), done.[K
remote: Total 133 (delta 0), reused 2 (delta 0), pack-reused 126[K
Receiving objects: 100% (133/133), 149.77 MiB | 32.77 MiB/s, done.
Resolving deltas: 100% (51/51), done.
Checking out files: 100% (28/28), done.
Collecting Morfessor
  Downloading https://files.pythonhosted.org/packages/39/e6/7afea30be2ee4d29ce9de0fa53acbb033163615f849515c0b1956ad074ee/Morfessor-2.0.6-py3-none-any.whl
Installing collected packages: Morfessor
Successfully installed Morfessor

In [None]:
import re
import string
import random
from unicodedata import normalize
from indicnlp.normalize.indic_normalize import IndicNormalizerFactory
from indicnlp.tokenize import indic_tokenize



**Building the vocabulary and also assigning indexes to each word including start token(SOS_token) and end token(EOS_token)**

In [None]:
SOS_token = 0
EOS_token = 1

class Lang:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        self.n_words = 2  # Count SOS and EOS

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

**Pre-processing the text of both source and target language like tokenizing the text , removing quotes or other punctuations present**



*   The function pre_process_english_sentence takes in each english sentence and tokenises the words and removes all the unnecessary things like punctuation marks, numbers, quotes.
*   Similarly , pre_process_hindi_sentence takes in each hindi sentence and tokenises using indic tokeniser and returns the processed sentence.


*  The to_pairs function takes in the dataframe object containing out input dataset and it returns a list of lists containing processed source and target input sentences.






In [None]:
def to_pairs(df):
    english_lines =df['english'].tolist()
    hindi_lines = df['hindi'].tolist()
    pairs = []
    for i in range(len(hindi_lines)):
        pairs.append([])
        pairs[i].append(pre_process_hindi_sentence(hindi_lines[i]))
        pairs[i].append(pre_process_english_sentence(english_lines[i]))
    return pairs

def clean_text(text):
    text = text.replace(u',','')
    text = text.replace(u'"','')
    text = text.replace(u'"','')
    text = text.replace(u"‘‘",'')
    text = text.replace(u"’’",'')
    text = text.replace(u"''",'')
    text = text.replace(u"।",'')
    text=text.replace(u',','')
    text=text.replace(u'"','')
    text=text.replace(u'(','')
    text=text.replace(u')','')
    text=text.replace(u'"','')
    text=text.replace(u':','')
    text=text.replace(u"'",'')
    text=text.replace(u"‘‘",'')
    text=text.replace(u"’’",'')
    text=text.replace(u"''",'')
    text=text.replace(u".",'')
    text=text.replace(u"-",'')
    text=text.replace(u"।",'')
    text=text.replace(u"?",'')
    text=text.replace(u"\\",'')
    text=text.replace(u"_",'')
    text=text.replace("'", "")
    text=text.replace('"', "")
    text= re.sub("'", '', text)
    text= re.sub("’", '', text)
    text=re.sub('[0-9+\-*/.%]', '', text)
    text=text.strip()
    text=re.sub(' +', ' ',text)
    exclude = set(string.punctuation)
    text= ''.join(ch for ch in text if ch not in exclude)
    return text
def pre_process_english_sentence(line):
    line = line.lower()
    line = clean_text(line)
    re_print = re.compile('[^%s]' % re.escape(string.printable))
    line = normalize('NFD', line).encode('ascii', 'ignore')
    line = line.decode('UTF-8')
    line = line.split()
    line = [re_print.sub('', w) for w in line]
    line = [word for word in line if word.isalpha()]
    line = ' '.join(line)
    return line

def pre_process_hindi_sentence(line):
    #print(line)
    #line=re.sub('[^a-zA-Z]', '', line)
    #print(line)
    line = clean_text(line)
    #remove_nuktas = False
    factory = IndicNormalizerFactory()
    normalizer = factory.get_normalizer("hi",remove_nuktas=False)
    line = normalizer.normalize(line)
    
    
    tokens = list()
    for t in indic_tokenize.trivial_tokenize(line):
        tokens.append(t)
    line = tokens
    line = [word for word in line if not re.search(r'\d', word)]
    line = ' '.join(line)
    return (line)

def prepareData(pairs):
    input_lang = Lang('hin')
    output_lang = Lang('eng')
    for pair in pairs:
        input_lang.addSentence(pair[0])
        output_lang.addSentence(pair[1])
    return input_lang, output_lang


pairs_all = to_pairs(dataset)
input_lang, output_lang = prepareData(pairs_all)

In [None]:
l=pre_process_hindi_sentence('क्योंकि यह एक खुशियों भरी फ़िल्म है	')
l

'क्योंकि यह एक खुशियों भरी फ़िल्म है'

In [None]:
print(pairs_all[0][0].split(' '))

['एल', 'सालवाडोर', 'मे', 'जिन', 'दोनो', 'पक्षों', 'ने', 'सिविलयुद्ध', 'से', 'वापसी', 'ली', 'उन्होंने', 'वही', 'काम', 'किये', 'जो', 'कैदियों', 'की', 'कश्मकश', 'के', 'निदान', 'हैं']


**Lang object's n_words tells us the vocabulary size of source(Hindi) and target(English) language**

In [None]:
input_lang.n_words

46183

In [None]:
output_lang.n_words

32594

In [None]:
import numpy as np

Here I am checking the maximum sentence length of source(hindi) and target(english)

In [None]:
df = pd.DataFrame(pairs_all)
df.columns = ["hindi", "english"]
lines = df
lenght_list=[]
for l in lines.hindi:
    lenght_list.append(len(l.split(' ')))
max_length_src = np.max(lenght_list)
print (max_length_src)
lenght_list=[]
for l in lines.english:
    lenght_list.append(len(l.split(' ')))
max_length_tar = np.max(lenght_list)
print (max_length_tar)
df.head()

394
301


Unnamed: 0,hindi,english
0,एल सालवाडोर मे जिन दोनो पक्षों ने सिविलयुद्ध स...,in el salvador both sides that withdrew from t...
1,मैं उनके साथ कोई लेना देना नहीं है,i have nothing to do with them
2,हटाओ रिक,fuck them rick
3,क्योंकि यह एक खुशियों भरी फ़िल्म है,because its a happy film
4,The thought reaching the eyes,the thought reaching the eyes


**Starting the Model**

**Encoder**

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

In [None]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, 1, -1)
        output = embedded
        output, hidden = self.gru(output, hidden)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

**Decoder**

In [None]:
class DecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        output = self.embedding(input).view(1, 1, -1)
        output = F.relu(output)
        output, hidden = self.gru(output, hidden)
        output = self.softmax(self.out(output[0]))
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

**Attention Decoder**

In [None]:
class AttnDecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=output_lang.n_words):
        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, encoder_outputs):
        embedded = self.embedding(input).view(1, 1, -1)
        embedded = self.dropout(embedded)

        attn_weights = F.softmax(
            self.attn(torch.cat((embedded[0], hidden[0]), 1)), dim=1)
        attn_applied = torch.bmm(attn_weights.unsqueeze(0),
                                 encoder_outputs.unsqueeze(0))

        output = torch.cat((embedded[0], attn_applied[0]), 1)
        output = self.attn_combine(output).unsqueeze(0)

        output = F.relu(output)
        output, hidden = self.gru(output, hidden)

        output = F.log_softmax(self.out(output[0]), dim=1)
        return output, hidden, attn_weights

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

**We have prepared input output pairs which are strings but input to our model should be tensors. So need to convert to tensors**

In [None]:
def indexesFromSentence(lang, sentence):
    return [lang.word2index[word] for word in sentence.split(' ')]


def tensorFromSentence(lang, sentence):
    indexes = indexesFromSentence(lang, sentence)
    indexes.append(EOS_token)
    return torch.tensor(indexes, dtype=torch.long, device=device).view(-1, 1)


def tensorsFromPair(pair):
    input_tensor = tensorFromSentence(input_lang, pair[0])
    target_tensor = tensorFromSentence(output_lang, pair[1])
    return (input_tensor, target_tensor)

**Training our model**

I have used teacher_forcing here , teacher_forcing basically ensures that at certain sequence time say t the input to the decoder is the actual word instead of the word decoded by the decoder in the previous sequence step (t-1).

In [None]:
teacher_forcing_ratio = 0.5


def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=output_lang.n_words):
    encoder_hidden = encoder.initHidden()

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

    loss = 0

    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(
            input_tensor[ei], encoder_hidden)
        encoder_outputs[ei] = encoder_output[0, 0]

    decoder_input = torch.tensor([[SOS_token]], device=device)

    decoder_hidden = encoder_hidden

    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False

    if use_teacher_forcing:
        # Teacher forcing: Feed the target as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            loss += criterion(decoder_output, target_tensor[di])
            decoder_input = target_tensor[di]  # Teacher forcing

    else:
        # Without teacher forcing: use its own predictions as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            topv, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach()  # detach from history as input

            loss += criterion(decoder_output, target_tensor[di])
            if decoder_input.item() == EOS_token:
                break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

The asMinutes and timeSince functions are mainly used to see the time spent in each epoch 

In [None]:
import time
import math


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

In [None]:
def trainIters(encoder, decoder, n_iters, print_every=1000, plot_every=100, learning_rate=0.01):
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every
   
    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    criterion = nn.NLLLoss()
   
    training_pairs = [tensorsFromPair(random.choice(pairs_all))
                        for i in range(n_iters)]
      

    for iter in range(1, n_iters + 1):
          training_pair = training_pairs[iter - 1]
          input_tensor = training_pair[0]
          target_tensor = training_pair[1]

          loss = train(input_tensor, target_tensor, encoder,
                      decoder, encoder_optimizer, decoder_optimizer, criterion)
          print_loss_total += loss
          plot_loss_total += loss

          if iter % print_every == 0:
              print_loss_avg = print_loss_total / print_every
              print_loss_total = 0
              print('%s (%d %d%%) %.4f' % (timeSince(start, iter / n_iters),
                                          iter, iter / n_iters * 100, print_loss_avg))

          if iter % plot_every == 0:
              plot_loss_avg = plot_loss_total / plot_every
              plot_losses.append(plot_loss_avg)
              plot_loss_total = 0

    return plot_losses

**Evaluation**

In [None]:
def evaluate(encoder, decoder, sentence, max_length=output_lang.n_words):
    with torch.no_grad():
        input_tensor = tensorFromSentence(input_lang, sentence)
        input_length = input_tensor.size()[0]
        encoder_hidden = encoder.initHidden()

        encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

        for ei in range(input_length):
            encoder_output, encoder_hidden = encoder(input_tensor[ei],
                                                     encoder_hidden)
            encoder_outputs[ei] += encoder_output[0, 0]

        decoder_input = torch.tensor([[SOS_token]], device=device)  # SOS

        decoder_hidden = encoder_hidden

        decoded_words = []
        decoder_attentions = torch.zeros(max_length, max_length)

        for di in range(max_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            decoder_attentions[di] = decoder_attention.data
            topv, topi = decoder_output.data.topk(1)
            if topi.item() == EOS_token:
                decoded_words.append('<EOS>')
                break
            else:
                decoded_words.append(output_lang.index2word[topi.item()])

            decoder_input = topi.squeeze().detach()

        return decoded_words, decoder_attentions[:di + 1]

evaluateRandomly function I used to check the encoder decoder setting after I trained my model for few epochs. It evaluates the model by taking into consideration random inputs from training set

In [None]:
def evaluateRandomly(encoder, decoder, n=10,pairs_all=pairs_all):
  list_output=[]
  list_target=[]
  for i in range(n):
        pair = random.choice(pairs_all)
        print('Source : ', pair[0])
        print('Original Output :', pair[1])
        output_words, attentions = evaluate(encoder, decoder, pair[0])
        output_sentence = ' '.join(output_words)
        print('Our Translation :', output_sentence)
        list_output.append(output_sentence)
        list_target.append(pair[1])
        print('')
  return list_output,list_target

**Train and Evaluate**

Here , I have intialised my encoder and decoder and started training my model. I have printed the loss after every 10000 iterations.

In [None]:
hidden_size = 256
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
encoder1 = EncoderRNN(input_lang.n_words, hidden_size).to(device)
attn_decoder1 = AttnDecoderRNN(hidden_size, output_lang.n_words, dropout_p=0.1).to(device)
encoder1.load_state_dict(torch.load('/content/drive/MyDrive/myencoder_new.pt'))
attn_decoder1.load_state_dict(torch.load('/content/drive/MyDrive/mydecoder_new.pt'))
for epoch in range(4):
  print("Epoch ",epoch," started")
  loss=trainIters(encoder1, attn_decoder1, 102321, print_every=10000)
  torch.save(encoder1.state_dict(), '/content/drive/MyDrive/myencoder_new.pt')
  torch.save(attn_decoder1.state_dict(), '/content/drive/MyDrive/mydecoder_new.pt')
  print("Epoch ",epoch," ended")

Epoch  0  started
10m 53s (- 100m 31s) (10000 9%) 3.4415
21m 29s (- 88m 29s) (20000 19%) 3.4050
32m 17s (- 77m 51s) (30000 29%) 3.4386
42m 52s (- 66m 48s) (40000 39%) 3.3864
53m 35s (- 56m 4s) (50000 48%) 3.4343
64m 7s (- 45m 13s) (60000 58%) 3.4085
74m 45s (- 34m 30s) (70000 68%) 3.3866
85m 12s (- 23m 46s) (80000 78%) 3.3626
95m 48s (- 13m 6s) (90000 87%) 3.3604
106m 17s (- 2m 28s) (100000 97%) 3.3561
Epoch  0  ended
Epoch  1  started
10m 44s (- 99m 7s) (10000 9%) 3.3706
21m 20s (- 87m 51s) (20000 19%) 3.3244
31m 58s (- 77m 4s) (30000 29%) 3.3210
42m 33s (- 66m 17s) (40000 39%) 3.3264
53m 8s (- 55m 36s) (50000 48%) 3.3273
63m 40s (- 44m 54s) (60000 58%) 3.3128
74m 26s (- 34m 22s) (70000 68%) 3.3133
85m 6s (- 23m 44s) (80000 78%) 3.3197
95m 54s (- 13m 7s) (90000 87%) 3.3081
106m 32s (- 2m 28s) (100000 97%) 3.2999
Epoch  1  ended
Epoch  2  started
10m 40s (- 98m 34s) (10000 9%) 3.2823
21m 23s (- 88m 3s) (20000 19%) 3.2729
32m 16s (- 77m 47s) (30000 29%) 3.2884
42m 55s (- 66m 52s) (40000

In [None]:
torch.save(encoder1.state_dict(), '/content/drive/MyDrive/myencoder_new.pt')
torch.save(attn_decoder1.state_dict(), '/content/drive/MyDrive/mydecoder_new.pt')



**Testing time**

In [None]:
hidden_size = 256
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
encoder1 = EncoderRNN(input_lang.n_words, hidden_size).to(device)
attn_decoder1 = AttnDecoderRNN(hidden_size, output_lang.n_words, dropout_p=0.1).to(device)


During test time I loaded the saved model and used the loaded encoder decoder states for predictions.

In [None]:
encoder1.load_state_dict(torch.load('/content/drive/MyDrive/myencoder_new.pt'))
attn_decoder1.load_state_dict(torch.load('/content/drive/MyDrive/mydecoder_new.pt'))

<All keys matched successfully>

Here i have checked my model's performance on 10 randomly picked inputs

In [None]:
list_output,list_target=evaluateRandomly(encoder1, attn_decoder1,10,pairs_all)

Source :  व्यवसाय
Original Output : business
Our Translation : business <EOS>

Source :  इस बदबूदार खलिहान में भूल जाओ
Original Output : forget this stinking barn
Our Translation : forget this in forget <EOS>

Source :  अब अपनी पोस्ट के लिए वापस जाओ
Original Output : get back to your post now
Our Translation : get back to get back <EOS>

Source :  यह कहानी शुरू होती है इन दोनों से मेरे बच्चे
Original Output : this story starts with these two my kids
Our Translation : these start starts both starts <EOS>

Source :  ठीक है ऐलिस ठीक हो सकता है नहीं एक अजनबी
Original Output : okay alice can not be a stranger okay
Our Translation : okay might be might see just right <EOS>

Source :  गोली मत चलाना मेरे पास कोई गन नहीं है
Original Output : dont shoot im not armed
Our Translation : dont have have have have have have have have have have have have have have have have a shoot <EOS>

Source :  और शायद मैनें उन्हें डरा दिया था हँसी क्योंकि उन्होंने कुल मिला कर मुझसे वादा किया कि और मेरे पास तैयार स

In [None]:
len(list_output),len(list_target)

(10, 10)

In [None]:
list_output_temp=[]
i=0
for item in list_output:
  item=item[:-5]
  print(item ," target ->",list_target[i])
  i+=1
  list_output_temp.append(item)

so we have been have been have been have been we have been have been on the women have been have been women and women and women have been women   target -> so we have the old we have the young we have the uprising power of women and theres one megatrend which affects all of us
and one of the was to to to these to to to   target -> one of the ways they wanted to do it was to put a tracking chip inside one of the whales
and now indus script script   target -> and the indus script now has this particular property
okay told me about my didnt give us about about about about about about about about about about about about about about about   target -> okay so they didnt tell us to become doctors or lawyers or anything like that but my dad did read to us about aristotle and pioneer germfighters when lots of other kids were hearing the wheels on the bus go round and round
i mean mean mean where i meant photo   target -> i mean where was this picture taken
yeah this was last the last car   targ

Here I have used corpus_blue metric to check my model's performance

In [None]:
from nltk.translate.bleu_score import corpus_bleu

def bleu(n):
    weights = [1.0/n]*n + [0.0]*(4-n)
    return lambda list_of_references, list_of_hypothesis: corpus_bleu(list_of_references, list_of_hypothesis, weights)

def accuracy(list_of_references, list_of_hypothesis):
    total = 0.0
    for references, hypothesis in zip(list_of_references, list_of_hypothesis):
        total += 1.0 if tuple(hypothesis) in set(references) else 0.0
    return total / len(list_of_references)

score_functions = {'BLEU-{}'.format(i):bleu(i) for i in range(1, 5)}
score_functions['Accuracy'] = accuracy

def score(list_of_hypothesis, target, desc='Scoring...'):
    scores = {name:0.0 for name in score_functions.keys()}
    length = len(target)
    #list_of_hypothesis=list_of_hypothesis[:-1] #reoving eos token
    

    for name, func in score_functions.items():
        score = func(target, list_of_hypothesis)
        scores[name] = score

    return scores

In [None]:
scores=score(list_output_temp,list_target)

Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


In [None]:
scores

{'Accuracy': 0.0,
 'BLEU-1': 0.1794871794871795,
 'BLEU-2': 0.4236592728681617,
 'BLEU-3': 0.5640849044411526,
 'BLEU-4': 0.6508911374939451}

Here I have checked the sentence_bleu score for my predicted sentences

In [None]:
import nltk
#import sys
nltk.download('wordnet')
from nltk.translate.bleu_score import sentence_bleu
#from nltk.translate.meteor_score import single_meteor_score

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [None]:
total_num = len(list_output_temp)
total_bleu_scores = 0
total_meteor_scores = 0
for i in range(total_num):
  total_bleu_scores+=sentence_bleu([list_target[i].split(" ")], list_output_temp[i].split(" "))
  #total_meteor_scores+=single_meteor_score(list_target[i], list_output_temp[i])

bleu_result = total_bleu_scores/total_num
#meteor_result = total_meteor_scores/total_num

print("bleu score: ",bleu_result)
#print("meteor score: ",meteor_result)

bleu score:  0.4131154606571806


Corpus/Sentence contains 0 counts of 4-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
Corpus/Sentence contains 0 counts of 3-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


**Here onwards I have tested my model on the provided dev set**

Loading the dataset provided and converting it into a list

In [None]:
hindi_statements=pd.read_csv('/content/drive/MyDrive/Colab Notebooks/hindistatements.csv',encoding='utf-8',usecols=['hindi'])

In [None]:
test_week1=hindi_statements.values.tolist()

**Here I have pre-processed the Hindi sentences**

In [None]:
test_week1_preprocessed=[]

In [None]:
for i in range(len(test_week1)):
    clean_line=pre_process_hindi_sentence(str(test_week1[i]))
    test_week1_preprocessed.append(clean_line)

The following function was used to convert the input sentences into tensor objects so that that can be fed into the encoder

In [None]:
def indexesFromSentence_test(lang, sentence):
    list_index=[]
    for word in sentence.split(' '):
      try:
        list_index.append(lang.word2index[word])
      except:
        continue
    return list_index


def tensorFromSentence_test(lang, sentence):
    indexes = indexesFromSentence_test(lang, sentence)
    indexes.append(EOS_token)
    return torch.tensor(indexes, dtype=torch.long, device=device).view(-1, 1)

In [None]:
test_week1_tensor=[]
for source_sent in test_week1_preprocessed:
    tensor = tensorFromSentence_test(input_lang, source_sent)
    test_week1_tensor.append(tensor)
#test_week1_tensor = torch.transpose(torch.cat(test_week1_tensor, dim=-1), 1, 0)

evaluate_test function takes in the encoder and decoder along with the input tensor made in the previous step and the vocabulary of the target language

In [None]:
def evaluate_test(encoder, decoder, input_tensor, max_length=output_lang.n_words):
    with torch.no_grad():
        #input_tensor = tensorFromSentence(input_lang, sentence)
        input_length = input_tensor.size()[0]
        encoder_hidden = encoder.initHidden()

        encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

        for ei in range(input_length):
            encoder_output, encoder_hidden = encoder(input_tensor[ei],
                                                     encoder_hidden)
            encoder_outputs[ei] += encoder_output[0, 0]

        decoder_input = torch.tensor([[SOS_token]], device=device)  # SOS

        decoder_hidden = encoder_hidden

        decoded_words = []
        decoder_attentions = torch.zeros(max_length, max_length)

        for di in range(max_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            decoder_attentions[di] = decoder_attention.data
            topv, topi = decoder_output.data.topk(1)
            if topi.item() == EOS_token:
                decoded_words.append('<EOS>')
                break
            else:
                decoded_words.append(output_lang.index2word[topi.item()])

            decoder_input = topi.squeeze().detach()

        return decoded_words, decoder_attentions[:di + 1]

**Writing the predictions in answer.txt file**

In [None]:
examples = zip(test_week1_preprocessed,test_week1_tensor)
file1 = open("/content/drive/MyDrive/answer_27.txt","w")
for source,  x in examples:
    output_words, attentions = evaluate_test(encoder1, attn_decoder1, x)
    output_sentence = ' '.join(output_words)
    source = ' '.join(source)
    

    file1.write(output_sentence+'\n')
    print('Source: "{}"\nTranslation: "{}"\n'.format(source,  output_sentence))
file1.close()

Source: "अ ं त र ि क ् ष   व ा ल े   ल ो ग"
Translation: "the people <EOS>"

Source: "आ ं ट ी   य े   ख ब र   आ प क ो   ह ी   ब त ा न ा   प ड ़ े ग ा"
Translation: "and you will you you you <EOS>"

Source: "व े   आ त े   औ र   फ ि ल ् म   द े ख त े"
Translation: "they called the film <EOS>"

Source: "म ै ं   ज ा न त ा   ह ू ँ   आ प   क ् य ा   स ो च   र ह े   ह ै ं   आ प   स ो च   र ह े   प र   ब स ् त ी   त ो   क ि स ी   औ र   क ी   ज म ी न   प े   क ब ् ज े   स े   ब न त ी   ह ै   म ग र   स ो च ि ए   र ा त   क े   अ ं ध े र े   म े ं   क ब ् ज ा   न ह ी ं   ह ो त ा"
Translation: "i know you you you you you you you you you you you you you you you you you you you you you you you you you you you <EOS>"

Source: "अ ग र   म ै ं   भ ी   म ु झ े   स े   च ो र ी   क र त े   ह ै ं   ब ा क ी   म े ं   स ो च न े   क े   ल ि ए   क ् य ा   क र   र ह े   ह ै ं"
Translation: "but i what i do for too <EOS>"

Source: "ब ि द क ा   घ ो ड ़ ा   प ि छ ल ी   ट ा ं ग ो ं   प र   ख ड ़ ा   ह ु आ   छ ठ े   स

KeyboardInterrupt: ignored