<a href="https://colab.research.google.com/github/Shruti-Subu/transliteration/blob/main/Untitled8.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
import torch.nn.functional as F
import numpy as np

# Instantiates the device to be used as GPU/CPU based on availability
device_gpu = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Visualization tools
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import clear_output

import random

In [2]:
torch.cuda.is_available()

True

In [4]:
test="https://github.com/Shruti-Subu/transliteration/blob/main/NEWS2012-Ref-EnHi-1000.xml"
train="https://github.com/Shruti-Subu/transliteration/blob/main/NEWS2012-Training-EnHi-13937.xml"

In [3]:
test_dataset="/content/NEWS2012-Ref-EnHi-1000.xml"
train_dataset="/content/NEWS2012-Training-EnHi-13937.xml"

# **DATA MANAGEMENT**

**ALPHABET SETUP**

In [7]:
eng_alphabets = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
pad_char = '-PAD-'
eng_alpha2index={pad_char:0}
for index,alpha in enumerate(eng_alphabets):
  eng_alpha2index[alpha]=index+1
print(eng_alpha2index)

{'-PAD-': 0, 'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5, 'F': 6, 'G': 7, 'H': 8, 'I': 9, 'J': 10, 'K': 11, 'L': 12, 'M': 13, 'N': 14, 'O': 15, 'P': 16, 'Q': 17, 'R': 18, 'S': 19, 'T': 20, 'U': 21, 'V': 22, 'W': 23, 'X': 24, 'Y': 25, 'Z': 26}


In [9]:
# Hindi Unicode Hex Range is 2304:2432.
hindi_alphabets=[chr(alpha) for alpha in range(2304,2432)]
hindi_alphabet_size=len(hindi_alphabets)

hindi_alpha2index={pad_char:0}
for index,alpha in enumerate(hindi_alphabets):
  hindi_alpha2index[alpha]=index+1
print(hindi_alpha2index)

{'-PAD-': 0, 'ऀ': 1, 'ँ': 2, 'ं': 3, 'ः': 4, 'ऄ': 5, 'अ': 6, 'आ': 7, 'इ': 8, 'ई': 9, 'उ': 10, 'ऊ': 11, 'ऋ': 12, 'ऌ': 13, 'ऍ': 14, 'ऎ': 15, 'ए': 16, 'ऐ': 17, 'ऑ': 18, 'ऒ': 19, 'ओ': 20, 'औ': 21, 'क': 22, 'ख': 23, 'ग': 24, 'घ': 25, 'ङ': 26, 'च': 27, 'छ': 28, 'ज': 29, 'झ': 30, 'ञ': 31, 'ट': 32, 'ठ': 33, 'ड': 34, 'ढ': 35, 'ण': 36, 'त': 37, 'थ': 38, 'द': 39, 'ध': 40, 'न': 41, 'ऩ': 42, 'प': 43, 'फ': 44, 'ब': 45, 'भ': 46, 'म': 47, 'य': 48, 'र': 49, 'ऱ': 50, 'ल': 51, 'ळ': 52, 'ऴ': 53, 'व': 54, 'श': 55, 'ष': 56, 'स': 57, 'ह': 58, 'ऺ': 59, 'ऻ': 60, '़': 61, 'ऽ': 62, 'ा': 63, 'ि': 64, 'ी': 65, 'ु': 66, 'ू': 67, 'ृ': 68, 'ॄ': 69, 'ॅ': 70, 'ॆ': 71, 'े': 72, 'ै': 73, 'ॉ': 74, 'ॊ': 75, 'ो': 76, 'ौ': 77, '्': 78, 'ॎ': 79, 'ॏ': 80, 'ॐ': 81, '॑': 82, '॒': 83, '॓': 84, '॔': 85, 'ॕ': 86, 'ॖ': 87, 'ॗ': 88, 'क़': 89, 'ख़': 90, 'ग़': 91, 'ज़': 92, 'ड़': 93, 'ढ़': 94, 'फ़': 95, 'य़': 96, 'ॠ': 97, 'ॡ': 98, 'ॢ': 99, 'ॣ': 100, '।': 101, '॥': 102, '०': 103, '१': 104, '२': 105, '३': 106, '४': 107, '५': 108, '६': 109, '७': 

**HELPER FUNCTION FOR DATA PRE-PROCESSING**

In [11]:
import re
non_eng_letters_regex = re.compile('[^a-zA-Z ]')

#remove all non-eng words
def cleanEnglishVocab(line):
  line=line.replace('-',' ').replace('.',' ')
  line=non_eng_letters_regex.sub('',line)
  return line.split()

#remove non hindi words
def cleanHindiVocab(line):
  line=line.replace('-',' ').replace('.',' ')
  cleaned_line=''
  for char in line:
    if char in hindi_alpha2index or char==' ':
      cleaned_line+=char
  return cleaned_line.split()



**DATASET LOADING**

In [32]:
from torch.utils.data import Dataset
import xml.etree.ElementTree as ET

class TransliterationDataLoader(Dataset):
    def __init__(self, filename):
        self.eng_words, self.hindi_words = self.readXmlDataset(filename, cleanHindiVocab)
        self.shuffle_indices = list(range(len(self.eng_words)))
        random.shuffle(self.shuffle_indices)
        self.shuffle_start_index = 0
        
    def __len__(self):
        return len(self.eng_words)
    
    def __getitem__(self, idx):
        return self.eng_words[idx], self.hindi_words[idx]
    
    def readXmlDataset(self, filename, lang_vocab_cleaner):
        transliterationCorpus = ET.parse(filename).getroot()
        lang1_words = []
        lang2_words = []

        for line in transliterationCorpus:
            wordlist1 = cleanEnglishVocab(line[0].text)
            wordlist2 = lang_vocab_cleaner(line[1].text)

            # Skip noisy data
            if len(wordlist1) != len(wordlist2):
                print('Skipping: ', line[0].text, ' - ', line[1].text)
                continue

            for word in wordlist1:
                lang1_words.append(word)
            for word in wordlist2:
                lang2_words.append(word)

        return lang1_words, lang2_words
    
    def get_random_sample(self):
        return self.__getitem__(np.random.randint(len(self.eng_words)))
    
    def get_batch_from_array(self, batch_size, array):
        end = self.shuffle_start_index + batch_size
        batch = []
        if end >= len(self.eng_words):
            batch = [array[i] for i in self.shuffle_indices[0:end%len(self.eng_words)]]
            end = len(self.eng_words)
        return batch + [array[i] for i in self.shuffle_indices[self.shuffle_start_index : end]]
    
    def get_batch(self, batch_size, postprocess = True):
        eng_batch = self.get_batch_from_array(batch_size, self.eng_words)
        hindi_batch = self.get_batch_from_array(batch_size, self.hindi_words)
        self.shuffle_start_index += batch_size + 1
        
        # Reshuffle if 1 epoch is complete
        if self.shuffle_start_index >= len(self.eng_words):
            random.shuffle(self.shuffle_indices)
            self.shuffle_start_index = 0
            
        return eng_batch, hindi_batch

In [33]:
train_dl=TransliterationDataLoader(train_dataset)
test_dl=TransliterationDataLoader(test_dataset)


Skipping:  BARHARWA JUNCTION  -  बरहरवा
Skipping:  STATE BNK TR  -  स्टेट बैंक ऑफ त्रावणकोर
Skipping:  SOUTH ARLINGTON CHURCH OF CHRIST  -  साउथ अर्लिंग्टन
Skipping:  KING EDWARD VII  -  किंग एडवर्ड
Skipping:  DIBANG VALLEY  -  दिबंगवैली
Skipping:  ORDER OF VASA  -  ऑडर ऑफ़ द वासा
Skipping:  AZAMNAGAR ROAD  -  आज़मनगर
Skipping:  CAPE TOWN  -  केपटाउन
Skipping:  NEW ZEALAND  -  न्यूज़ीलैंड
Skipping:  SEA OF THE HEBRIDES  -  सी ऑफ हरब्रिड्‍स
Skipping:  RAMCOIND  -  राम्को इंड
Skipping:  KELVINGROVE ART GALLERY AND MUSEUM  -  केल्व‍िनग्रोव आर्ट एण्ड म्युज़ियम
Skipping:  AUSTRALIAN NATIONAL UNIVERSITY  -  ऑस्ट्रेलियननेशनल यूनिवर्सिटी
Skipping:  JAHAN AARA  -  जहाँआरा
Skipping:  NAVABHARAT FERRO ALLOYS  -  नव भारत फ़ैरो अलॉय
Skipping:  RAMA LINGESHWARA  -  रामालिंगेश्वर
Skipping:  FAKHRUN NISA  -  फखरुन्निसा
Skipping:  REDIFF.COM INDIA LIMITED  -  रेडिफ़ डॉट कॉम इंडिया लिमिटेड
Skipping:  OMKARNATH THAKUR  -  ओंकार नाथ ठाकुर
Skipping:  OPENTV  -  ओपन टीवी
Skipping:  ENVOY COMMUNICATIONS GROUP  

In [34]:
print("Train Set Size:\t", len(train_dl))
print("Test Set Size:\t", len(test_dl))

print('\nSample data from train-set:')
for i in range(10):
    eng, hindi = train_dl.get_random_sample()
    print(eng + ' - ' + hindi)

Train Set Size:	 20531
Test Set Size:	 1000

Sample data from train-set:
AWAZ - आवाज़
SARAGADI - सारागड़ी
HASEENA - हसीना
GRUNDY - ग्रंडी
BOWDEN - बोडन
Chubb - शुब
Anjasaa - अंजसा
KHWAB - ख्वाब
DOBHAKOL - दोभाकोल
CUDDALORE - कुडालोर


# **ENCODING THE WORDS**

In [35]:
def word_rep(word, letter2index, device = 'cpu'):
    rep = torch.zeros(len(word)+1, 1, len(letter2index)).to(device)
    for letter_index, letter in enumerate(word):
        pos = letter2index[letter]
        rep[letter_index][0][pos] = 1
    pad_pos = letter2index[pad_char]
    rep[letter_index+1][0][pad_pos] = 1
    return rep

def gt_rep(word, letter2index, device = 'cpu'):
    gt_rep = torch.zeros([len(word)+1, 1], dtype=torch.long).to(device)
    for letter_index, letter in enumerate(word):
        pos = letter2index[letter]
        gt_rep[letter_index][0] = pos
    gt_rep[letter_index+1][0] = letter2index[pad_char]
    return gt_rep

In [37]:
eng, hindi = train_dl.get_random_sample()
eng_rep = word_rep(eng, eng_alpha2index)
print(eng, eng_rep)

AWARD tensor([[[0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],

        [[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 1., 0., 0., 0.]],

        [[0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],

        [[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 1., 0., 0., 0., 0., 0., 0., 0., 0.]],

        [[0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],

        [[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]]])


In [38]:
hindi_rep=gt_rep(hindi,hindi_alpha2index)
print(hindi,hindi_rep)

अवार्ड tensor([[ 6],
        [54],
        [63],
        [49],
        [78],
        [34],
        [ 0]])


# **NETWORK ARCHITECTURE**

**ENCODER DECODER(GRU)**

In [45]:
MAX_OUTPUT_CHAR=30
class Transliteration_EncoderDecoder(nn.Module):

  def __init__(self,input_size,hidden_size,output_size,verbose):
     super(Transliteration_EncoderDecoder,self).__init__()
     self.hidden_size=hidden_size
     self.output_size=output_size

     self.encoder_rnn_cell=nn.GRU(input_size,hidden_size)
     self.decode_rnn_cell=nn.GRU(output_size,hidden_size)

     self.h2o=nn.Linear(hidden_size,output_size)
     self.softmax=nn.LogSoftmax(dim=2)
     
     self.verbose=verbose

  def forward(self,input,max_output_char=MAX_OUTPUT_CHAR,device='cpu',ground_truth=None):
      #encoder
      out,hidden=self.encoder_rnn_cell(input)

      if self.verbose:
        print('Encoder input', input.shape)
        print('Encoder output', out.shape)
        print('Encoder hidden', hidden.shape)

        #decoder
      decoder_state=hidden
      decoder_input=torch.zeros(1,1,self.output_size).to(device)
      output=[]

      if self.verbose:
        print('Decoder input ',decoder_input.shape)
        print('Decoder state ',decoder_state.shape)

      for i in range(max_output_char):
        out,decoder_state=self.decode_rnn_cell(decoder_input,decoder_state)

        if self.verbose:
          print('Decoder intermediate output ',out.shape)
        
        out=self.h2o(decoder_state)
        out=self.softmax(out)
        output.append(out.view(1,-1))

        if self.verbose:
          print('decoder output ',out.shape)
          self.verbose=False

        max_idx = torch.argmax(out, 2, keepdim=True)
        if not ground_truth is None:
          max_idx=ground_truth[i].reshape(1,1,1)
        
        one_hot=torch.FloatTensor(out.shape).to(device)
        one_hot.zero_()
        one_hot.scatter_(2,max_idx,1)

        decoder_input=one_hot.detach()
      return output



In [46]:
net = Transliteration_EncoderDecoder(len(eng_alpha2index), 256, len(hindi_alpha2index), verbose=True)

In [47]:
# unlike fully connceted or CNN model , we have to write an inference routine in case of sequence model.
def infer(net, eng_word,shape,device ='cpu'):
    # net.eval()
    input_ = word_rep(eng_word,eng_alpha2index,device) # convert the name into one hot encoding.
    outputs = net(input_,shape,device) # initilise the hidden layer.
    
    return outputs

In [50]:
out = infer(net, 'INDIA', 30)

In [51]:
print(len(out))
for i in range(len(out)):
    print(out[i].shape, list(hindi_alpha2index.keys())[list(hindi_alpha2index.values()).index(torch.argmax(out[i]))])

30
torch.Size([1, 129]) य
torch.Size([1, 129]) य
torch.Size([1, 129]) य
torch.Size([1, 129]) य
torch.Size([1, 129]) य
torch.Size([1, 129]) य
torch.Size([1, 129]) य
torch.Size([1, 129]) य
torch.Size([1, 129]) य
torch.Size([1, 129]) य
torch.Size([1, 129]) य
torch.Size([1, 129]) य
torch.Size([1, 129]) य
torch.Size([1, 129]) य
torch.Size([1, 129]) य
torch.Size([1, 129]) य
torch.Size([1, 129]) य
torch.Size([1, 129]) य
torch.Size([1, 129]) य
torch.Size([1, 129]) य
torch.Size([1, 129]) य
torch.Size([1, 129]) य
torch.Size([1, 129]) य
torch.Size([1, 129]) य
torch.Size([1, 129]) य
torch.Size([1, 129]) य
torch.Size([1, 129]) य
torch.Size([1, 129]) य
torch.Size([1, 129]) य
torch.Size([1, 129]) य


#**TRAINING**

**CORE TRAINER**