In [6]:
import os
import string
import numpy as np
import pandas as pd
from string import digits
import re
import tensorflow as tf
import matplotlib.ticker as ticker
from sklearn.model_selection import train_test_split
import unicodedata
import io
import time
import warnings
import sys
import xml.etree.ElementTree as ET

filename = "NEWS2018_M-EnHi_trn.xml"

In [7]:
hindi_alphabets = [chr(alpha) for alpha in range(2304, 2432)]
hindi_alphabet_size = len(hindi_alphabets)

hindi_alpha2index = {'': 0,'' : 1}
for index, alpha in enumerate(hindi_alphabets):
    hindi_alpha2index[alpha] = index+1


In [8]:
def unicode_to_ascii(s):
    return ''.join(c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn')

def preprocess_sentence(w):
    w = unicode_to_ascii(w.lower().strip())
    w = re.sub(r"([?.!,¿])", r" \1 ", w)
    w = re.sub(r'[" "]+', " ", w)
    w = re.sub(r"[^a-zA-Z?.!,¿]+", " ", w)
    w = w.rstrip().strip()
    return w

def hindi_preprocess_sentence(w):
    
    w = unicode_to_ascii(w.strip())
    w = w.replace('-', ' ').replace(',', ' ')
    cleaned_line = ''
    for char in w:
        if char in hindi_alpha2index or char == ' ':
            cleaned_line += char

    cleaned_line = cleaned_line.rstrip().strip()
    
    cleaned_line = cleaned_line.split()
    
    return cleaned_line

In [15]:
def create_dataset(filename):
    transliterationCorpus = ET.parse(filename).getroot()
    en = []
    hd = []
    for line in transliterationCorpus:
        wordlist1 = line[0].text 
        wordlist2 = line[1].text 
        if len(wordlist1) != len(wordlist2):
            continue

        en_1 = preprocess_sentence(wordlist1)
        en_1 = '@' + en_1 + '#'
        en.append(en_1)
            
        # for word in wordlist2:
        hd_1 = hindi_preprocess_sentence(wordlist2)[0]
        hd_1 = '@' + hd_1 + '#'
        hd.append(hd_1)
            
    # print(hd)
    return hd, en

In [16]:
def max_length(tensor):
    return max(len(t) for t in tensor)

In [17]:
class WordIndex():
  def __init__(self, lang):
    self.lang = lang
    self.word2idx = {}
    self.idx2word = {}
    self.vocab = set()
    
    self.create_index()
    
  def create_index(self):
    for phrase in self.lang:
      for l in phrase:
        self.vocab.update(l)
    
    self.vocab = sorted(self.vocab)
    
    self.word2idx[''] = 0
    for index, word in enumerate(self.vocab):
      self.word2idx[word] = index + 1
    
    for word, index in self.word2idx.items():
      self.idx2word[index] = word

In [18]:
def load_dataset(filename):
    
    targ_lang, inp_lang = create_dataset(filename)
    
    inp_lang_1 = WordIndex(inp_lang)
    targ_lang_1 = WordIndex(targ_lang)
    
    input_tensor = [[inp_lang_1.word2idx[s] for s in en] for en in inp_lang]
    
    target_tensor = [[targ_lang_1.word2idx[s] for s in hn] for hn in targ_lang]
    
    max_length_inp, max_length_tar = max_length(input_tensor), max_length(target_tensor)
    
    input_tensor = tf.keras.preprocessing.sequence.pad_sequences(input_tensor, maxlen=max_length_inp, padding='post')
    
    target_tensor = tf.keras.preprocessing.sequence.pad_sequences(target_tensor, maxlen=max_length_tar,  padding='post')

    return input_tensor, target_tensor, inp_lang_1, targ_lang_1

In [19]:
input_tensor, target_tensor, inp_lang, targ_lang = load_dataset(filename)
max_length_targ, max_length_inp = max_length(target_tensor), max_length(input_tensor)

In [20]:
input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(input_tensor, target_tensor, test_size=0.2)
print(len(input_tensor_train), len(target_tensor_train), len(input_tensor_val), len(target_tensor_val))

2489 2489 623 623


In [21]:
def convert(lang, tensor):
  for t in tensor:
    if t!=0:
      print ("%d ----> %s" % (t, lang.idx2word[t]))
    
print ("Input Language; index to word mapping")
convert(inp_lang, input_tensor_train[6])
print ()
print ("Target Language; index to word mapping")
convert(targ_lang, target_tensor_train[6])

Input Language; index to word mapping
4 ----> @
23 ----> s
13 ----> i
22 ----> r
13 ----> i
25 ----> u
23 ----> s
2 ----> #

Target Language; index to word mapping
2 ----> @
44 ----> स
47 ----> ि
39 ----> र
47 ----> ि
38 ----> य
44 ----> स
1 ----> #


In [22]:
BUFFER_SIZE = len(input_tensor_train)
BATCH_SIZE = 64
steps_per_epoch = len(input_tensor_train)//BATCH_SIZE
embedding_dim = 128
units = 256
vocab_inp_size = len(inp_lang.word2idx)+1
vocab_tar_size = len(targ_lang.word2idx)+1

dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train)).shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

2023-02-01 10:57:33.816962: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-02-01 10:57:34.262019: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 18068 MB memory:  -> device: 0, name: NVIDIA A100-PCIE-40GB MIG 3g.20gb, pci bus id: 0000:c1:00.0, compute capability: 8.0


In [23]:
class Encoder(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
    super(Encoder, self).__init__()
    self.batch_sz = batch_sz
    self.enc_units = enc_units
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.gru = tf.keras.layers.GRU(self.enc_units,
                                   return_sequences=True,
                                   return_state=True,
                                   recurrent_initializer='glorot_uniform')

  def call(self, x, hidden):
    x = self.embedding(x)
    output, state = self.gru(x, initial_state = hidden)
    return output, state

  def initialize_hidden_state(self):
    return tf.zeros((self.batch_sz, self.enc_units))

encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)

In [24]:
class BahdanauAttention(tf.keras.layers.Layer):
  def __init__(self, units):
    super(BahdanauAttention, self).__init__()
    self.W1 = tf.keras.layers.Dense(units)
    self.W2 = tf.keras.layers.Dense(units)
    self.V = tf.keras.layers.Dense(1)

  def call(self, query, values):
    hidden_with_time_axis = tf.expand_dims(query, 1)
    score = self.V(tf.nn.tanh(
        self.W1(values) + self.W2(hidden_with_time_axis)))
    attention_weights = tf.nn.softmax(score, axis=1)
    context_vector = attention_weights * values
    context_vector = tf.reduce_sum(context_vector, axis=1)
    return context_vector, attention_weights

In [25]:
class Decoder(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
    super(Decoder, self).__init__()
    self.batch_sz = batch_sz
    self.dec_units = dec_units
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.gru = tf.keras.layers.GRU(self.dec_units,
                                   return_sequences=True,
                                   return_state=True,
                                   recurrent_initializer='glorot_uniform')
    self.fc = tf.keras.layers.Dense(vocab_size)
    self.attention = BahdanauAttention(self.dec_units)

  def call(self, x, hidden, enc_output):
    context_vector, attention_weights = self.attention(hidden, enc_output)
    x = self.embedding(x)
    x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)
    output, state = self.gru(x)
    output = tf.reshape(output, (-1, output.shape[2]))
    x = self.fc(output)
    return x, state, attention_weights

decoder = Decoder(vocab_tar_size, embedding_dim, units, BATCH_SIZE)

In [26]:
optimizer = tf.keras.optimizers.Adam()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

def loss_function(real, pred):
  mask = tf.math.logical_not(tf.math.equal(real, 0))
  loss_ = loss_object(real, pred)
  mask = tf.cast(mask, dtype=loss_.dtype)
  loss_ *= mask
  return tf.reduce_mean(loss_)

In [27]:
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 encoder=encoder,
                                 decoder=decoder)

In [28]:
@tf.function
def train_step(inp, targ, enc_hidden):
  loss = 0
  with tf.GradientTape() as tape:
    enc_output, enc_hidden = encoder(inp, enc_hidden)
    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([targ_lang.word2idx['@']] * BATCH_SIZE, 1)
    # Teacher forcing
    for t in range(1, targ.shape[1]):
      predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)
      loss += loss_function(targ[:, t], predictions)
      dec_input = tf.expand_dims(targ[:, t], 1)

  batch_loss = (loss / int(targ.shape[1]))
  variables = encoder.trainable_variables + decoder.trainable_variables
  gradients = tape.gradient(loss, variables)
  optimizer.apply_gradients(zip(gradients, variables))      
  return batch_loss

In [30]:
EPOCHS = 40

for epoch in range(EPOCHS):
  start = time.time()
  enc_hidden = encoder.initialize_hidden_state()
  total_loss = 0
  for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
    batch_loss = train_step(inp, targ, enc_hidden)
    total_loss += batch_loss
    if batch % 100 == 0:
        print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
                                                     batch,
                                                     batch_loss.numpy()))
  if (epoch + 1) % 2 == 0:
    checkpoint.save(file_prefix = checkpoint_prefix)

  print('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                      total_loss / steps_per_epoch))
  print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

Epoch 1 Batch 0 Loss 0.4296
Epoch 1 Loss 0.3750
Time taken for 1 epoch 0.9649903774261475 sec

Epoch 2 Batch 0 Loss 0.3969
Epoch 2 Loss 0.3261
Time taken for 1 epoch 1.026418924331665 sec

Epoch 3 Batch 0 Loss 0.2872
Epoch 3 Loss 0.2866
Time taken for 1 epoch 0.9521527290344238 sec

Epoch 4 Batch 0 Loss 0.3075
Epoch 4 Loss 0.2691
Time taken for 1 epoch 0.9899158477783203 sec

Epoch 5 Batch 0 Loss 0.2188
Epoch 5 Loss 0.2423
Time taken for 1 epoch 0.9469757080078125 sec

Epoch 6 Batch 0 Loss 0.2305
Epoch 6 Loss 0.2329
Time taken for 1 epoch 0.9923355579376221 sec

Epoch 7 Batch 0 Loss 0.2066
Epoch 7 Loss 0.2233
Time taken for 1 epoch 0.925208568572998 sec

Epoch 8 Batch 0 Loss 0.1739
Epoch 8 Loss 0.2110
Time taken for 1 epoch 0.9997899532318115 sec

Epoch 9 Batch 0 Loss 0.1499
Epoch 9 Loss 0.1981
Time taken for 1 epoch 0.9528632164001465 sec

Epoch 10 Batch 0 Loss 0.1647
Epoch 10 Loss 0.1817
Time taken for 1 epoch 1.011082410812378 sec

Epoch 11 Batch 0 Loss 0.1746
Epoch 11 Loss 0.1673
T

In [31]:
def evaluate(sentence):
    attention_plot = np.zeros((max_length_targ, max_length_inp))
    sentence = preprocess_sentence(sentence)
    # inputs = [print(i) for i in sentence]
    inputs = [inp_lang.word2idx[i] for i in sentence]
    inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs],
                                                           maxlen=max_length_inp,
                                                           padding='post')
    inputs = tf.convert_to_tensor(inputs)
    result = ''
    hidden = [tf.zeros((1, units))]
    enc_out, enc_hidden = encoder(inputs, hidden)
    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([targ_lang.word2idx['@']], 0)
    for t in range(max_length_targ):
        predictions, dec_hidden, attention_weights = decoder(dec_input,
                                                             dec_hidden,
                                                             enc_out)
        predicted_id = tf.argmax(predictions[0]).numpy()
        result += targ_lang.idx2word[predicted_id] + ' '
        if targ_lang.idx2word[predicted_id] == '#':
            return result, sentence
        dec_input = tf.expand_dims([predicted_id], 0)
    return result, sentence

In [55]:
def transliterate(sentence):
    result, sentence = evaluate(sentence)
    # print('Input: %s' % (sentence))
    # print('transliteration: {}'.format(''.join(result.split(' '))))
    # print(format(''.join(result.split(' '))))
    return format(''.join(result.split(' ')))

In [44]:
# restoring the latest checkpoint in checkpoint_dir
checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f67545beda0>

In [56]:
transliterate('rohit')

'रोहित#'

In [72]:
transliterate('Kya')

'कया#'

In [73]:
transliterate('Doctor')

'डोकटर#'

In [74]:
class Export(tf.Module):
  def __init__(self, model):
    self.model = model

  @tf.function(input_signature=[tf.TensorSpec(dtype=tf.string, shape=[None])])
  def translate(self, inputs):
    return self.model.translate(inputs)


In [75]:
def compare_similarity(word1, word2):
    count = 0
    
    # Convert the words to character n-grams
    word1_ngrams = [word1[i:i+3] for i in range(len(word1)-2)]
    word2_ngrams = [word2[i:i+3] for i in range(len(word2)-2)]

    # Calculate the number of common n-grams
    common_ngrams = len(set(word1_ngrams) & set(word2_ngrams))
    if (len(word1_ngrams) + len(word2_ngrams)) > 0:
    # Calculate the similarity score
        similarity_score = common_ngrams / (len(word1_ngrams) + len(word2_ngrams))
    
        if similarity_score > 0:

            count = 1

        return similarity_score, count
    else:
        
        return 0,0

In [76]:
transliterationCorpus = ET.parse("NEWS2018_M-EnHi_trn.xml").getroot()
en = []
hd = []

acc = 0

similarity_score = 0

similarity_total =0

total_count = 0

prev_similarity = 0

max_sim = 0

for line in transliterationCorpus:
    wordlist1 = line[0].text 
    wordlist2 = line[1].text 
    if len(wordlist1) != len(wordlist2):
        continue

    en_1 = preprocess_sentence(wordlist1)
    # en_1 = '@' + en_1 + '#'
    en.append(en_1)

    # for word in wordlist2:
    hd_1 = hindi_preprocess_sentence(wordlist2)[0]
    # hd_1 = '@' + hd_1 + '#'
    hd.append(hd_1)

for i in range(len(en)):
    
    convt = transliterate(en[i])
    
    similarity_score, count = compare_similarity(hd[i], convt)
    
    if similarity_score > prev_similarity:
        
        max_sim = similarity_score
        
    prev_similarity = similarity_score
    
    similarity_total = similarity_score + similarity_total
    
    total_count = total_count + count
    
similarity_avg = similarity_total/total_count
    
acc = total_count/len(en)

print('Similarity average score :', similarity_avg)

print('Accuracy :', acc)
    


Similarity average score : 0.28190008530825555
Accuracy : 0.49325192802056556
