In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
import torch.nn.functional as F
import numpy as np

# Instantiates the device to be used as GPU/CPU based on availability
device_gpu = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Visualization tools
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import clear_output

import random

Connecting to google drive for accessing data

In [2]:
# Load the Drive helper and mount
# from google.colab import drive
# drive.mount('/content/drive')

Since we are solving this problem at character level , hence creating vocal for english and hindi based upon the alphabets

In [3]:
eng_alphabets = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
pad_char = '-PAD-'

eng_alpha2index = {pad_char: 0}
for index, alpha in enumerate(eng_alphabets):
    eng_alpha2index[alpha] = index+1

print(eng_alpha2index)

{'-PAD-': 0, 'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5, 'F': 6, 'G': 7, 'H': 8, 'I': 9, 'J': 10, 'K': 11, 'L': 12, 'M': 13, 'N': 14, 'O': 15, 'P': 16, 'Q': 17, 'R': 18, 'S': 19, 'T': 20, 'U': 21, 'V': 22, 'W': 23, 'X': 24, 'Y': 25, 'Z': 26}


In [4]:
# Hindi Unicode Hex Range is 2304:2432. Source: https://en.wikipedia.org/wiki/Devanagari_(Unicode_block)

hindi_alphabets = [chr(alpha) for alpha in range(2304, 2432)]
hindi_alphabet_size = len(hindi_alphabets)

hindi_alpha2index = {pad_char: 0}
for index, alpha in enumerate(hindi_alphabets):
    hindi_alpha2index[alpha] = index+1

print(hindi_alpha2index)

{'-PAD-': 0, 'ऀ': 1, 'ँ': 2, 'ं': 3, 'ः': 4, 'ऄ': 5, 'अ': 6, 'आ': 7, 'इ': 8, 'ई': 9, 'उ': 10, 'ऊ': 11, 'ऋ': 12, 'ऌ': 13, 'ऍ': 14, 'ऎ': 15, 'ए': 16, 'ऐ': 17, 'ऑ': 18, 'ऒ': 19, 'ओ': 20, 'औ': 21, 'क': 22, 'ख': 23, 'ग': 24, 'घ': 25, 'ङ': 26, 'च': 27, 'छ': 28, 'ज': 29, 'झ': 30, 'ञ': 31, 'ट': 32, 'ठ': 33, 'ड': 34, 'ढ': 35, 'ण': 36, 'त': 37, 'थ': 38, 'द': 39, 'ध': 40, 'न': 41, 'ऩ': 42, 'प': 43, 'फ': 44, 'ब': 45, 'भ': 46, 'म': 47, 'य': 48, 'र': 49, 'ऱ': 50, 'ल': 51, 'ळ': 52, 'ऴ': 53, 'व': 54, 'श': 55, 'ष': 56, 'स': 57, 'ह': 58, 'ऺ': 59, 'ऻ': 60, '़': 61, 'ऽ': 62, 'ा': 63, 'ि': 64, 'ी': 65, 'ु': 66, 'ू': 67, 'ृ': 68, 'ॄ': 69, 'ॅ': 70, 'ॆ': 71, 'े': 72, 'ै': 73, 'ॉ': 74, 'ॊ': 75, 'ो': 76, 'ौ': 77, '्': 78, 'ॎ': 79, 'ॏ': 80, 'ॐ': 81, '॑': 82, '॒': 83, '॓': 84, '॔': 85, 'ॕ': 86, 'ॖ': 87, 'ॗ': 88, 'क़': 89, 'ख़': 90, 'ग़': 91, 'ज़': 92, 'ड़': 93, 'ढ़': 94, 'फ़': 95, 'य़': 96, 'ॠ': 97, 'ॡ': 98, 'ॢ': 99, 'ॣ': 100, '।': 101, '॥': 102, '०': 103, '१': 104, '२': 105, '३': 106, '४': 107, '५': 108, '६': 109, '७': 

In [5]:

hindi_alphabets = [chr(alpha) for alpha in range(2304, 2432)]
hindi_alphabet_size = len(hindi_alphabets)

hindi_alpha2index_r = {}
for index, alpha in enumerate(hindi_alphabets):
    hindi_alpha2index_r[alpha] = index

print(hindi_alpha2index_r)

{'ऀ': 0, 'ँ': 1, 'ं': 2, 'ः': 3, 'ऄ': 4, 'अ': 5, 'आ': 6, 'इ': 7, 'ई': 8, 'उ': 9, 'ऊ': 10, 'ऋ': 11, 'ऌ': 12, 'ऍ': 13, 'ऎ': 14, 'ए': 15, 'ऐ': 16, 'ऑ': 17, 'ऒ': 18, 'ओ': 19, 'औ': 20, 'क': 21, 'ख': 22, 'ग': 23, 'घ': 24, 'ङ': 25, 'च': 26, 'छ': 27, 'ज': 28, 'झ': 29, 'ञ': 30, 'ट': 31, 'ठ': 32, 'ड': 33, 'ढ': 34, 'ण': 35, 'त': 36, 'थ': 37, 'द': 38, 'ध': 39, 'न': 40, 'ऩ': 41, 'प': 42, 'फ': 43, 'ब': 44, 'भ': 45, 'म': 46, 'य': 47, 'र': 48, 'ऱ': 49, 'ल': 50, 'ळ': 51, 'ऴ': 52, 'व': 53, 'श': 54, 'ष': 55, 'स': 56, 'ह': 57, 'ऺ': 58, 'ऻ': 59, '़': 60, 'ऽ': 61, 'ा': 62, 'ि': 63, 'ी': 64, 'ु': 65, 'ू': 66, 'ृ': 67, 'ॄ': 68, 'ॅ': 69, 'ॆ': 70, 'े': 71, 'ै': 72, 'ॉ': 73, 'ॊ': 74, 'ो': 75, 'ौ': 76, '्': 77, 'ॎ': 78, 'ॏ': 79, 'ॐ': 80, '॑': 81, '॒': 82, '॓': 83, '॔': 84, 'ॕ': 85, 'ॖ': 86, 'ॗ': 87, 'क़': 88, 'ख़': 89, 'ग़': 90, 'ज़': 91, 'ड़': 92, 'ढ़': 93, 'फ़': 94, 'य़': 95, 'ॠ': 96, 'ॡ': 97, 'ॢ': 98, 'ॣ': 99, '।': 100, '॥': 101, '०': 102, '१': 103, '२': 104, '३': 105, '४': 106, '५': 107, '६': 108, '७': 109, '८': 110,

In [6]:

eng_alphabets = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
pad_char = '-PAD-'

eng_alpha2index_r = {}
for index, alpha in enumerate(eng_alphabets):
    eng_alpha2index_r[alpha] = index

print(eng_alpha2index_r)

{'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4, 'F': 5, 'G': 6, 'H': 7, 'I': 8, 'J': 9, 'K': 10, 'L': 11, 'M': 12, 'N': 13, 'O': 14, 'P': 15, 'Q': 16, 'R': 17, 'S': 18, 'T': 19, 'U': 20, 'V': 21, 'W': 22, 'X': 23, 'Y': 24, 'Z': 25}


In [7]:
len(hindi_alpha2index)

129

Methods to clean the english and hindi data

In [8]:
import re
non_eng_letters_regex = re.compile('[^a-zA-Z ]')

# Remove all English non-letters
def cleanEnglishVocab(line):
    line = line.replace('-', ' ').replace(',', ' ').upper()
    line = non_eng_letters_regex.sub('', line)
    return line.split()

In [9]:
# Remove all Hindi non-letters
def cleanHindiVocab(line):
    line = line.replace('-', ' ').replace(',', ' ')
    cleaned_line = ''
    for char in line:
        if char in hindi_alpha2index or char == ' ':
            cleaned_line += char
    return cleaned_line.split()

Methods to convert hindi and english words into tensors based upon the characters in the words

In [10]:
def eng_word_to_tensor(word, letter2index, device = 'cpu'):
    rep = torch.zeros(len(word)+1, 1, len(letter2index)).to(device)
    for letter_index, letter in enumerate(word):
        pos = letter2index[letter]
        rep[letter_index][0][pos] = 1
    pad_pos = letter2index[pad_char]
    rep[letter_index+1][0][pad_pos] = 1
    return rep

def hindi_word_to_tensor(word, letter2index, device = 'cpu'):
    gt_rep = torch.zeros([len(word)+1, 1], dtype=torch.long).to(device)
    for letter_index, letter in enumerate(word):
        pos = letter2index[letter]
        gt_rep[letter_index][0] = pos
    gt_rep[letter_index+1][0] = letter2index[pad_char]
    return gt_rep

In [11]:
def eng_word_to_tensor_r(word, letter2index, device = 'cpu'):
    rep = torch.zeros(len(word), 1, len(letter2index)).to(device)
    for letter_index, letter in enumerate(word):
        pos = letter2index[letter]
        rep[letter_index][0][pos] = 1
    # pad_pos = letter2index[pad_char]
    # rep[letter_index+1][0][pad_pos] = 1
    return rep

def hindi_word_to_tensor_r(word, letter2index, device = 'cpu'):
    gt_rep = torch.zeros([len(word), 1], dtype=torch.long).to(device)
    for letter_index, letter in enumerate(word):
        pos = letter2index[letter]
        gt_rep[letter_index][0] = pos
    # gt_rep[letter_index+1][0] = letter2index[pad_char]
    return gt_rep

changing director to the proper folder where the data exists

In [12]:
# cd drive/MyDrive/"Machine Transliteration"/

Following is the dataloader class being used for this problem

In [13]:
import pandas as pd
from torch.utils.data import Dataset


class TransliterationDataLoader(Dataset):
    def __init__(self, filename):
        self.eng_words, self.hindi_words = self.readXmlDataset(filename, cleanHindiVocab)
        self.shuffle_indices = list(range(len(self.eng_words)))
        random.shuffle(self.shuffle_indices)
        self.shuffle_start_index = 0
        
    def __len__(self):
        return len(self.eng_words)
    
    def __getitem__(self, idx):
        return self.eng_words[idx], self.hindi_words[idx]
    
    def readXmlDataset(self, filename, lang_vocab_cleaner):
        trainData=pd.read_xml(filename)
        trainData['SourceName']=trainData['SourceName'].apply(lambda x: cleanEnglishVocab(x))
        trainData['TargetName']=trainData['TargetName'].apply(lambda x: cleanHindiVocab(x))
        lang1_words = []
        lang2_words = []
        for i, (wordlist1, wordlist2) in enumerate(zip(trainData['SourceName'].tolist(), trainData['TargetName'].tolist())):
            # Skip noisy data
            if len(wordlist1) != len(wordlist2):
                print('Skipping: ', wordlist1, ' - ', wordlist2)
                continue
            lang1_words.extend(wordlist1)
            lang2_words.extend(wordlist2)

        return lang1_words, lang2_words
    
    def get_random_sample(self):
        return self.__getitem__(np.random.randint(len(self.eng_words)))
    
    def get_batch_from_array(self, batch_size, array):
        end = self.shuffle_start_index + batch_size
        batch = []
        if end >= len(self.eng_words):
            batch = [array[i] for i in self.shuffle_indices[0:end%len(self.eng_words)]]
            end = len(self.eng_words)
        return batch + [array[i] for i in self.shuffle_indices[self.shuffle_start_index : end]]
    
    def get_batch(self, batch_size, postprocess = True):
        eng_batch = self.get_batch_from_array(batch_size, self.eng_words)
        hindi_batch = self.get_batch_from_array(batch_size, self.hindi_words)
        self.shuffle_start_index += batch_size + 1
        
        # Reshuffle if 1 epoch is complete
        if self.shuffle_start_index >= len(self.eng_words):
            random.shuffle(self.shuffle_indices)
            self.shuffle_start_index = 0
            
        return eng_batch, hindi_batch

In [15]:
! pip install pandas==1.3.0



^C


Defaulting to user installation because normal site-packages is not writeable
Collecting pandas==1.3.0
  Using cached pandas-1.3.0.tar.gz (4.7 MB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'error'


  error: subprocess-exited-with-error
  
  × pip subprocess to install build dependencies did not run successfully.
  │ exit code: 2
  ╰─> [115 lines of output]
      Ignoring numpy: markers 'python_version == "3.7" and (platform_machine != "arm64" or platform_system != "Darwin") and platform_machine != "aarch64"' don't match your environment
      Ignoring numpy: markers 'python_version == "3.8" and (platform_machine != "arm64" or platform_system != "Darwin") and platform_machine != "aarch64"' don't match your environment
      Ignoring numpy: markers 'python_version == "3.7" and platform_machine == "aarch64"' don't match your environment
      Ignoring numpy: markers 'python_version == "3.8" and platform_machine == "aarch64"' don't match your environment
      Ignoring numpy: markers 'python_version == "3.8" and platform_machine == "arm64" and platform_system == "Darwin"' don't match your environment
      Ignoring numpy: markers 'python_version == "3.9" and platform_machine == "arm6

In [16]:
train_data = TransliterationDataLoader('NEWS2018_M-EnHi_trn.xml')

Skipping:  ['AUSTRALIAN', 'NATIONAL', 'UNIVERSITY']  -  ['ऑस्ट्रेलियननेशनल', 'यूनिवर्सिटी']
Skipping:  ['AZAMNAGAR', 'ROAD']  -  ['आज़मनगर']
Skipping:  ['BAL', 'KRISHNA']  -  ['बालकृष्णा']
Skipping:  ['BARHARWA', 'JUNCTION']  -  ['बरहरवा']
Skipping:  ['CAPE', 'TOWN']  -  ['केपटाउन']
Skipping:  ['COLOURPLUS', 'FASHIONS']  -  ['कलर', 'प्लस', 'फ़ैशन्स']
Skipping:  ['DIBANG', 'VALLEY']  -  ['दिबंगवैली']
Skipping:  ['ENVOY', 'COMMUNICATIONS', 'GROUP']  -  ['एन्वॉय', 'कम्युनिकेशंस']
Skipping:  ['FAKHRUN', 'NISA']  -  ['फखरुन्निसा']
Skipping:  ['JAHAN', 'AARA']  -  ['जहाँआरा']
Skipping:  ['KARA', 'KUM']  -  ['काराकुम']
Skipping:  ['KELVINGROVE', 'ART', 'GALLERY', 'AND', 'MUSEUM']  -  ['केल्विनग्रोव', 'आर्ट', 'एण्ड', 'म्युज़ियम']
Skipping:  ['KING', 'EDWARD', 'VII']  -  ['किंग', 'एडवर्ड']
Skipping:  ['LONDONHEATHROW']  -  ['लंदन', 'हीथ्रो']
Skipping:  ['MASS', 'MUTUAL', 'LIFE']  -  ['मास', 'म्युच्युअल', 'लाइफ़', 'इंश्योरेंस']
Skipping:  ['MAUNA', 'LOA']  -  ['मौनालोआ']
Skipping:  ['NAVABHARAT', 

In [17]:
print("Train Set Size:\t", len(train_data))

print('\nSample data from train-set:')
for i in range(10):
    eng, hindi = train_data.get_random_sample()
    print(eng + ' - ' + hindi)

Train Set Size:	 19115

Sample data from train-set:
SHAILA - शैला
MURRAY - मुर्रे
RATUL - रतुल
ADALAT - अदालत
JUNCTION - जंक्शन
WASNA - वस्ना
TERI - तेरी
EESAA - ईसा
BHAVATMAJ - भावत्मज
KI - की


In [18]:
eng, hindi = train_data.get_random_sample()
eng_rep = eng_word_to_tensor(eng, eng_alpha2index)
print(eng, eng_rep)

SHIBLI tensor([[[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 1., 0., 0., 0., 0., 0., 0., 0.]],

        [[0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],

        [[0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],

        [[0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],

        [[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],

        [[0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],

        [[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]]])


In [19]:
hindi_gt = hindi_word_to_tensor(hindi, hindi_alpha2index)
print(hindi, hindi_gt)

शिब्ली tensor([[55],
        [64],
        [45],
        [78],
        [51],
        [65],
        [ 0]])


In [20]:
# import torch
# all_letters=hindi_alpha2index
# n_letters=129
# # Find letter index from all_letters, e.g. "a" = 0
# class letter_to_index(object):
#   def __init__(self, all_letters,n_letters):
#     self.all_letters=all_letters
#     self.n_letters=n_letters
#     self.pad_char = '-PAD-'

#   def letterToIndex(self,letter):
#       return self.all_letters[letter]

#   # Just for demonstration, turn a letter into a <1 x n_letters> Tensor
#   def letterToTensor(self,letter):
#       tensor = torch.zeros(1, self.n_letters)
#       tensor[0][self.letterToIndex(letter)] = 1
#       return tensor

#   # Turn a line into a <line_length x 1 x n_letters>,
#   # or an array of one-hot letter vectors
#   def lineToTensor(self,line):
#       tensor = torch.zeros(30, 1, self.n_letters)
#       for li, letter in enumerate(line):
#           tensor[li][0][self.letterToIndex(letter)] = 1
#       for i in range(len(line),30,1):
#         tensor[i][0][self.letterToIndex(self.pad_char)]=1 
#       return tensor

# ltiobj=letter_to_index(hindi_alpha2index,129)
# print(ltiobj.letterToTensor('म'))
# print(len('परशुराम'))

# print(ltiobj.lineToTensor('परशुराम')[6])

In [21]:

# def eng_word_to_tensor(word, letter2index, device = 'cpu'):
#   ltiobj=letter_to_index(letter2index,30)
#   rep=ltiobj.lineToTensor(word)
#   return rep

# def hindi_word_to_tensor(word, letter2index, device = 'cpu'):
#     gt_rep = torch.zeros([len(word)+1, 1], dtype=torch.long).to(device)
#     for letter_index, letter in enumerate(word):
#         pos = letter2index[letter]
#         gt_rep[letter_index][0] = pos
#     gt_rep[letter_index+1][0] = letter2index[pad_char]
#     return gt_rep

# # def hindi_word_to_tensor(word, letter2index, device = 'cpu'):
# #   ltiobj=letter_to_index(letter2index,129)
# #   rep=ltiobj.lineToTensor(word)
# #   return rep

In [22]:
# from torch.utils.data import Dataset,DataLoader
# class TransliterationDataset(Dataset):

#   def __init__(self,x,y):
#     # x=train_df.iloc[:,1].values
#     # y=train_df.iloc[:,2].values
#     self.x_train=x
#     self.y_train=y

#   def __len__(self):
#     return len(self.y_train)
  
#   def __getitem__(self,idx):
#     X = eng_word_to_tensor(self.x_train[idx], eng_alpha2index)
#     y = hindi_word_to_tensor(self.y_train[idx], hindi_alpha2index)
#     return X,y

In [23]:
# len(eng_alpha2index)
# #eng_word_to_tensor('PARSHURAM',eng_alpha2index)

In [24]:
# len(hindi_alpha2index)
# hindi_word_to_tensor('परशुराम',hindi_alpha2index)

In [25]:
# MAX_LENGTH = 30
# class AttnDecoderRNN(nn.Module):
#     def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=MAX_LENGTH):
#         super(AttnDecoderRNN, self).__init__()
#         self.hidden_size = hidden_size
#         self.output_size = output_size
#         self.dropout_p = dropout_p
#         self.max_length = max_length

#         self.embedding = nn.Embedding(self.output_size, self.hidden_size)
#         self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
#         self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
#         self.dropout = nn.Dropout(self.dropout_p)
#         self.gru = nn.GRU(self.hidden_size, self.hidden_size)
#         self.out = nn.Linear(self.hidden_size, self.output_size)

#     def forward(self, input, hidden, encoder_outputs):
#         embedded = self.embedding(input).view(1, 1, -1)
#         embedded = self.dropout(embedded)

#         attn_weights = F.softmax(
#             self.attn(torch.cat((embedded[0], hidden[0]), 1)), dim=1)
#         attn_applied = torch.bmm(attn_weights.unsqueeze(0),
#                                  encoder_outputs.unsqueeze(0))

#         output = torch.cat((embedded[0], attn_applied[0]), 1)
#         output = self.attn_combine(output).unsqueeze(0)

#         output = F.relu(output)
#         output, hidden = self.gru(output, hidden)

#         output = F.log_softmax(self.out(output[0]), dim=1)
#         return output, hidden, attn_weights

#     def initHidden(self):
#         return torch.zeros(1, 1, self.hidden_size, device=device)

The model i will be using for this assignment would be Encoder Decoder attention model

In [26]:
MAX_OUTPUT_CHARS=30
class Transliteration_EncoderDecoder_Attention(nn.Module):
    
    def __init__(self, input_size, hidden_size, output_size, verbose=False):
        super(Transliteration_EncoderDecoder_Attention, self).__init__()
        
        self.hidden_size = hidden_size
        self.output_size = output_size
        
        self.encoder_rnn_cell = nn.GRU(input_size, hidden_size)
        self.decoder_rnn_cell = nn.GRU(hidden_size*2, hidden_size)
        
        self.h2o = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=2)
        
        self.U = nn.Linear(self.hidden_size, self.hidden_size)
        self.W = nn.Linear(self.hidden_size, self.hidden_size)
        self.attn = nn.Linear(self.hidden_size, 1)
        self.out2hidden = nn.Linear(self.output_size, self.hidden_size)   
        
        self.verbose = verbose
        
    def forward(self, input, max_output_chars = MAX_OUTPUT_CHARS, device = 'cpu', ground_truth = None):
        
        # encoder
        encoder_outputs, hidden = self.encoder_rnn_cell(input)
        encoder_outputs = encoder_outputs.view(-1, self.hidden_size)
        
        if self.verbose:
            print('Encoder output', encoder_outputs.shape)
        
        # decoder
        decoder_state = hidden
        decoder_input = torch.zeros(1, 1, self.output_size).to(device)
        
        outputs = []
        U = self.U(encoder_outputs)
        
        if self.verbose:
            print('Decoder state', decoder_state.shape)
            print('Decoder intermediate input', decoder_input.shape)
            print('U * Encoder output', U.shape)
        
        for i in range(max_output_chars):
            
            W = self.W(decoder_state.view(1, -1).repeat(encoder_outputs.shape[0], 1))
            V = self.attn(torch.tanh(U + W))
            attn_weights = F.softmax(V.view(1, -1), dim = 1) 
            
            if self.verbose:
                print('W * Decoder state', W.shape)
                print('V', V.shape)
                print('Attn', attn_weights.shape)
            
            attn_applied = torch.bmm(attn_weights.unsqueeze(0),
                                 encoder_outputs.unsqueeze(0))
            
            embedding = self.out2hidden(decoder_input)
            decoder_input = torch.cat((embedding[0], attn_applied[0]), 1).unsqueeze(0)
            
            if self.verbose:
                print('Attn LC', attn_applied.shape)
                print('Decoder input', decoder_input.shape)
                
            out, decoder_state = self.decoder_rnn_cell(decoder_input, decoder_state)
            
            if self.verbose:
                print('Decoder intermediate output', out.shape)
                
            out = self.h2o(decoder_state)
            out = self.softmax(out)
            outputs.append(out.view(1, -1))
            
            if self.verbose:
                print('Decoder output', out.shape)
                self.verbose = False
            
            max_idx = torch.argmax(out, 2, keepdim=True)
            if not ground_truth is None:
                max_idx = ground_truth[i].reshape(1, 1, 1)
            one_hot = torch.zeros(out.shape, device=device)
            one_hot.scatter_(2, max_idx, 1) 
            
            decoder_input = one_hot.detach()
            
        return outputs

In [27]:
net_attn = Transliteration_EncoderDecoder_Attention(len(eng_alpha2index), 256, len(hindi_alpha2index), verbose=True)

In [28]:
len(eng_alpha2index)

27

In [29]:
out=net_attn(eng_word_to_tensor('INDIA',eng_alpha2index))

Encoder output torch.Size([6, 256])
Decoder state torch.Size([1, 1, 256])
Decoder intermediate input torch.Size([1, 1, 129])
U * Encoder output torch.Size([6, 256])
W * Decoder state torch.Size([6, 256])
V torch.Size([6, 1])
Attn torch.Size([1, 6])
Attn LC torch.Size([1, 1, 256])
Decoder input torch.Size([1, 1, 512])
Decoder intermediate output torch.Size([1, 1, 256])
Decoder output torch.Size([1, 1, 129])


In [30]:
print(len(out))
for i in range(len(out)):
    print(out[i].shape, list(hindi_alpha2index.keys())[list(hindi_alpha2index.values()).index(torch.argmax(out[i]))])

30
torch.Size([1, 129]) ॎ
torch.Size([1, 129]) ॡ
torch.Size([1, 129]) ॡ
torch.Size([1, 129]) ॡ
torch.Size([1, 129]) ॡ
torch.Size([1, 129]) ॡ
torch.Size([1, 129]) ॡ
torch.Size([1, 129]) ॡ
torch.Size([1, 129]) ॡ
torch.Size([1, 129]) ॡ
torch.Size([1, 129]) ॡ
torch.Size([1, 129]) ॡ
torch.Size([1, 129]) ॡ
torch.Size([1, 129]) ॡ
torch.Size([1, 129]) ॡ
torch.Size([1, 129]) ॡ
torch.Size([1, 129]) ॡ
torch.Size([1, 129]) ॡ
torch.Size([1, 129]) ॡ
torch.Size([1, 129]) ॡ
torch.Size([1, 129]) ॡ
torch.Size([1, 129]) ॡ
torch.Size([1, 129]) ॡ
torch.Size([1, 129]) ॡ
torch.Size([1, 129]) ॡ
torch.Size([1, 129]) ॡ
torch.Size([1, 129]) ॡ
torch.Size([1, 129]) ॡ
torch.Size([1, 129]) ॡ
torch.Size([1, 129]) ॡ


Utilities methods for training

In [31]:
def train_batch(net, opt, criterion, batch_size, device = 'cpu', teacher_force = False):
    
    net.train().to(device)
    opt.zero_grad()
    eng_batch, hindi_batch = train_data.get_batch(batch_size)
    
    total_loss = 0
    for i in range(batch_size):
        
        input = eng_word_to_tensor(eng_batch[i], eng_alpha2index, device)
        gt = hindi_word_to_tensor(hindi_batch[i], hindi_alpha2index, device)
        outputs = net(input, gt.shape[0], device, ground_truth = gt if teacher_force else None)
        
        for index, output in enumerate(outputs):
            loss = criterion(output, gt[index]) / batch_size
            loss.backward(retain_graph = True)
            total_loss += loss
        
    opt.step()
    return total_loss/batch_size

In [32]:
def train_batch_r(net, opt, criterion, batch_size, device = 'cpu', teacher_force = False):
    
    net.train().to(device)
    opt.zero_grad()
    eng_batch, hindi_batch = train_data.get_batch(batch_size)
    
    total_loss = 0
    for i in range(batch_size):
        
        input = eng_word_to_tensor_r(eng_batch[i], eng_alpha2index_r, device)
        gt = hindi_word_to_tensor_r(hindi_batch[i], hindi_alpha2index_r, device)
        outputs = net(input, gt.shape[0], device, ground_truth = gt if teacher_force else None)
        
        for index, output in enumerate(outputs):
            loss = criterion(output, gt[index]) / batch_size
            loss.backward(retain_graph = True)
            total_loss += loss
        
    opt.step()
    return total_loss/batch_size

In [33]:
def train_setup_r(net, lr = 0.01, n_batches = 100, batch_size = 10, momentum = 0.9, display_freq=5, device = 'cpu'):
    
    net = net.to(device)
    criterion = nn.NLLLoss(ignore_index = -1)
    opt = optim.Adam(net.parameters(), lr=lr)
    teacher_force_upto = n_batches//3
    
    loss_arr = np.zeros(n_batches + 1)
    prev_loss=0.5
    
    for i in range(n_batches):
        loss_arr[i+1] = (loss_arr[i]*i + train_batch_r(net, opt, criterion, batch_size, device = device, teacher_force = i<teacher_force_upto ))/(i + 1)
        
        if i%display_freq == display_freq-1:
            clear_output(wait=True)
            
            print('Iteration', i, 'Loss', loss_arr[i])
            if loss_arr[i]<prev_loss:
              prev_loss=loss_arr[i]
              torch.save(net, 'model_best_6.pt')

            plt.figure()
            plt.plot(loss_arr[1:i], '-*')
            plt.xlabel('Iteration')
            plt.ylabel('Loss')
            plt.show()
            print('\n\n')
            
    
    return loss_arr

In [34]:
def train_setup(net, lr = 0.01, n_batches = 100, batch_size = 10, momentum = 0.9, display_freq=5, device = 'cpu'):
    
    net = net.to(device)
    criterion = nn.NLLLoss(ignore_index = -1)
    opt = optim.Adam(net.parameters(), lr=lr)
    teacher_force_upto = n_batches//3
    
    loss_arr = np.zeros(n_batches + 1)
    prev_loss=0.5
    
    for i in range(n_batches):
        loss_arr[i+1] = (loss_arr[i]*i + train_batch(net, opt, criterion, batch_size, device = device, teacher_force = i<teacher_force_upto ))/(i + 1)
        
        if i%display_freq == display_freq-1:
            clear_output(wait=True)
            
            print('Iteration', i, 'Loss', loss_arr[i])
            if loss_arr[i]<prev_loss:
              prev_loss=loss_arr[i]
              torch.save(net, 'model_best_6.pt')

            plt.figure()
            plt.plot(loss_arr[1:i], '-*')
            plt.xlabel('Iteration')
            plt.ylabel('Loss')
            plt.show()
            print('\n\n')
            
    
    return loss_arr

In [35]:
net_att = Transliteration_EncoderDecoder_Attention(len(eng_alpha2index), 256, len(hindi_alpha2index))

In [36]:
net_att_r = Transliteration_EncoderDecoder_Attention(len(eng_alpha2index_r), 256, len(hindi_alpha2index_r))

Following code is for model training with and without padding

In [37]:
loss_history = train_setup_r(net_att_r, lr=0.002, n_batches=5000, batch_size = 64, display_freq=10, device = device_gpu)

In [38]:
loss_history = train_setup(net_att, lr=0.002, n_batches=10000, batch_size = 64, display_freq=10, device = device_gpu)

Infrence from Model_trained_without_padding and Model_with_padding

In [39]:
device = torch.device('cpu')
model_without_padding = torch.load('model_best_1.pt',map_location=device)
model_with_padding = torch.load('model_best_6.pt',map_location=device)
# model.eval()

In [40]:
def test(model, word, device = 'cpu'):
    net_attn = model.eval().to(device)
    outputs=net_attn(eng_word_to_tensor(word,eng_alpha2index),30)
    hindi_output = ''
#     print(len(outputs))
    for out in outputs:
        val, indices = out.topk(1)
        index = indices.tolist()[0][0]
        if index == 0:
            break
        hindi_char = hindi_alphabets[index-1]
        hindi_output += hindi_char
    print(word + ' - ' + hindi_output)
    return hindi_output

In [41]:
def test_r(model, word, device = 'cpu'):
    net_attn = model.eval().to(device)
    outputs=net_attn(eng_word_to_tensor_r(word,eng_alpha2index_r),30)
    hindi_output = ''
#     print(len(outputs))
    prev_index=['0','0']
    for out in outputs:
        val, indices = out.topk(1)
        index = indices.tolist()[0][0]
#         print('type is ',type(prev_index))
        if index==int(prev_index[0]) or index==int(prev_index[1]) :
#           print(val.tolist()[0][0])
          break
        else:
          del prev_index[0]
          prev_index.append(index)
#           print(index)
#           print(val.tolist()[0][0])
        hindi_char = hindi_alphabets[index]
        hindi_output += hindi_char

    print(word + ' - ' + hindi_output)
    return hindi_output

In [42]:
len(test_r(model_without_padding, 'DEVNAGRI', device = 'cpu'))

DEVNAGRI - देवनार्


7

In [43]:
len(test(model_with_padding, 'GOOD', device = 'cpu'))

GOOD - गूड


3

In [44]:
test_data = TransliterationDataLoader('NEWS2018_M-EnHi_dev.xml')

Skipping:  ['STATS', 'CHIPPAC']  -  ['स्टेट्सचिपपैक']


In [45]:
def calc_accuracy(net, device = 'cpu'):
    net = net.eval().to(device)
    predictions = []
    accuracy = 0
    for i in range(len(test_data)):
        eng, hindi = test_data[i]
        gt = hindi_word_to_tensor(hindi, hindi_alpha2index, device)
        outputs=net(eng_word_to_tensor(eng,eng_alpha2index),gt.shape[0],device)
#         print(gt.shape[0])
        # outputs = infer(net, eng, gt.shape[0], device)
        correct = 0
        hindi_output=''
        for index, out in enumerate(outputs):
            val, indices = out.topk(1)
            hindi_pos = indices.tolist()[0]
#             print(hindi_pos[0])
            if hindi_pos[0]==0:
                break
            hindi_char = hindi_alphabets[hindi_pos[0]-1]
            hindi_output += hindi_char
#             print(eng+' - '+hindi_output)
            
            if hindi_pos[0] == gt[index][0]:
                correct += 1
        
        accuracy += correct/gt.shape[0]
    accuracy /= len(test_data)
    return accuracy

In [46]:
calc_accuracy(model_with_padding)

0.6444283968443626

In [47]:
# for i, (data, labels) in enumerate(train_loader):
#   print(data.shape, labels.shape)
#   print(data[1],labels[1])
#   break;

In [48]:
def solve():
    s = input("Enter a word: ")
    test(model_with_padding, s.upper(), device="cpu")
    pass

In [54]:
solve()

ANURAG - अनुराग


In [57]:
def solve2():
    s = input("Enter a sentence: ")
    words = list(map(lambda word: word.upper(), s.split()))
    for word in words:
        test(model_with_padding, word, device="cpu")

In [58]:
solve2()

PRINTING - प्रिंटिंग
HELLO - हेलो
WORLD - वर्ल्ड
