In [26]:
#Necessary Libraries

import numpy as np
import tensorflow as tf
from random import randint
import matplotlib.pyplot as plt
from numpy import array,argmax,array_equal
import keras.backend as K
from tensorflow.keras import models,Input
from tensorflow.keras import backend as K
from tensorflow.keras.models import Sequential, Model, load_model
from tensorflow.keras.layers import LSTM, Bidirectional, SimpleRNN, GRU,Lambda,Dense, Flatten
from tensorflow.keras.optimizers import Adam
import matplotlib.ticker as ticker
tf.keras.backend.set_floatx('float64')
import csv

In [5]:
#Loading the dakshina dataset

!wget https://storage.googleapis.com/gresearch/dakshina/dakshina_dataset_v1.0.tar
!tar -xf dakshina_dataset_v1.0.tar

--2022-05-15 15:00:07--  https://storage.googleapis.com/gresearch/dakshina/dakshina_dataset_v1.0.tar
Resolving storage.googleapis.com (storage.googleapis.com)... 172.253.117.128, 74.125.20.128, 108.177.98.128, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|172.253.117.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2008340480 (1.9G) [application/x-tar]
Saving to: ‘dakshina_dataset_v1.0.tar’


2022-05-15 15:00:15 (248 MB/s) - ‘dakshina_dataset_v1.0.tar’ saved [2008340480/2008340480]



In [6]:
#Listing the lexicon files available for Hindi (hi)

!ls dakshina_dataset_v1.0/hi/lexicons

hi.translit.sampled.dev.tsv   hi.translit.sampled.train.tsv
hi.translit.sampled.test.tsv


In [56]:
#Paths of the Hindi training and test lexicon files
train_dir, test_dir = (
    "./dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.train.tsv",
    "./dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.test.tsv",
)

In [57]:
# Reading the raw corpus
#returns the native(Hindi) and romanized(English) versions of the words in the corpus

import io
def raw_corpus(crp):
  """Read a tab-separated corpus file and return its romanized and native words.

  For every usable line the first tab-separated field is collected as the
  native (Hindi) word and the second field as the romanized (English) word;
  any further fields are ignored. Lines that do not yield at least two
  fields after trailing whitespace is stripped are skipped.

  Args:
    crp: Path of the UTF-8 encoded TSV file.

  Returns:
    A tuple ``(Eng, Hindi)`` of two equally long lists of strings.
  """
  Eng = []
  Hindi = []
  with open(crp, encoding='utf-8') as f:
    for line in f:
      tokens = line.rstrip().split("\t")
      # rstrip() also removes a trailing tab, so a line such as "word\t\n"
      # produces a single field; checking the field count (instead of only
      # looking for a tab in the raw line) avoids an IndexError on tokens[1].
      if len(tokens) < 2:
        continue
      Eng.append(tokens[1])
      Hindi.append(tokens[0])
  return Eng, Hindi

In [58]:
# Load both splits as (romanized source words, native target words)
train_src, train_tgt = raw_corpus(train_dir)
test_src, test_tgt = raw_corpus(test_dir)

# Report how many word pairs each split contains
for label, words in (("Training examples: ", train_src), ("Testing examples: ", test_src)):
    print(label, len(words))

Training examples:  44204
Testing examples:  4502


In [59]:
# Unshuffled ("_ns") source/target word lists for the training and test splits
ip_txt_ns, tgt_txt_ns = [], []
test_ip_txt_ns, test_tgt_txt_ns = [], []
# Character vocabularies of the source and target side
ip_char, tgt_char = set(), set()

In [60]:
# Wrap every target word in its sequence markers and collect the vocabularies.
# "B" : "start sequence" character
# "E" : "end sequence" character
# Training pairs are processed first, then test pairs; both splits feed the
# same source and target character sets.
for src_words, tgt_words, ip_store, tgt_store in (
    (train_src, train_tgt, ip_txt_ns, tgt_txt_ns),
    (test_src, test_tgt, test_ip_txt_ns, test_tgt_txt_ns),
):
    for txt_ip, txt_tgt in zip(src_words, tgt_words):
        txt_tgt = "B" + txt_tgt + "E"
        ip_store.append(txt_ip)
        tgt_store.append(txt_tgt)
        # set.update adds each character of the string exactly once
        ip_char.update(txt_ip)
        tgt_char.update(txt_tgt)

In [61]:
#Shuffling the training and test datasets
# A seeded generator makes the shuffled order (and therefore which examples
# survive any later truncation) repeatable across kernel restarts.
shuffle_rng = np.random.default_rng(42)

# Each array is a random permutation of the row indices of its split
train_arr = shuffle_rng.permutation(len(train_src))
test_arr = shuffle_rng.permutation(len(test_src))


In [62]:
# Training pairs reordered according to the shuffled index array
ips_txt = [ip_txt_ns[idx] for idx in train_arr]
tgts_txt = [tgt_txt_ns[idx] for idx in train_arr]

In [63]:
# Test pairs reordered according to the shuffled index array
test_ip_txt = [test_ip_txt_ns[idx] for idx in test_arr]
test_tgt_txt = [test_tgt_txt_ns[idx] for idx in test_arr]

In [64]:
# Add the " " padding character and freeze each vocabulary as a sorted list.
# Building a fresh set first keeps the cell re-runnable: on a second run
# ip_char / tgt_char are already lists, which have no .add method.
ip_char = sorted(set(ip_char) | {" "})
tgt_char = sorted(set(tgt_char) | {" "})

In [65]:
# Vocabulary sizes (number of distinct characters, padding included) per side
enc_tokens, dec_tokens = len(ip_char), len(tgt_char)

In [66]:
# Longest source / target string (in characters) of the training split ...
max_enc_seq_length = max(map(len, ips_txt))
max_dec_seq_length = max(map(len, tgts_txt))
# ... and of the test split
test_max_enc_seq_length = max(map(len, test_ip_txt))
test_max_dec_seq_length = max(map(len, test_tgt_txt))

In [70]:
# character -> index lookups for the source and target vocabularies
ip_tk_idx = {ch: pos for pos, ch in enumerate(ip_char)}
tgt_tk_idx = {ch: pos for pos, ch in enumerate(tgt_char)}
# index -> character lookups (the inverse mappings)
rev_src_char_idx = {pos: ch for ch, pos in ip_tk_idx.items()}
rev_tgt_char_idx = {pos: ch for ch, pos in tgt_tk_idx.items()}

In [71]:
# Keep only the first 44160 shuffled training pairs. 44160 is divisible by 32,
# 64 and 128 -- presumably chosen so training only sees full batches; TODO confirm.
train_limit = 44160
trc_ip_txt, trc_tgt_txt = ips_txt[:train_limit], tgts_txt[:train_limit]

In [72]:
# Zero-initialised one-hot tensors for the training pairs:
# (number of pairs, maximum sequence length, vocabulary size)
train_rows = len(trc_ip_txt)
ip_encd = np.zeros((train_rows, max_enc_seq_length, enc_tokens), dtype=np.float64)
tgt_decd = np.zeros((train_rows, max_dec_seq_length, dec_tokens), dtype=np.float64)

In [73]:
# One-hot encode the training pairs. Positions past the end of each string are
# filled with the one-hot vector of the " " padding character.
for i, (txt_ip, txt_tgt) in enumerate(zip(trc_ip_txt, trc_tgt_txt)):
    for m, n in enumerate(txt_ip):
        ip_encd[i, m, ip_tk_idx[n]] = 1.0
    # len() instead of the leftover loop index: an empty string is then padded
    # from position 0 rather than from a stale (or undefined) index.
    ip_encd[i, len(txt_ip):, ip_tk_idx[" "]] = 1.0
    for m, n in enumerate(txt_tgt):
        tgt_decd[i, m, tgt_tk_idx[n]] = 1.0
    tgt_decd[i, len(txt_tgt):, tgt_tk_idx[" "]] = 1.0

In [74]:
# Zero-initialised one-hot tensors for the test pairs; the sequence-length axes
# use the training maxima (max_enc_seq_length / max_dec_seq_length).
test_ip_shape = (len(test_ip_txt), max_enc_seq_length, enc_tokens)
test_tgt_shape = (len(test_tgt_txt), max_dec_seq_length, dec_tokens)
test_ip_encd = np.zeros(test_ip_shape, dtype=np.float64)
test_tgt_decd = np.zeros(test_tgt_shape, dtype=np.float64)

In [75]:
# One-hot encode the test pairs. Positions past the end of each string are
# filled with the one-hot vector of the " " padding character.
for i, (txt_ip, txt_tgt) in enumerate(zip(test_ip_txt, test_tgt_txt)):

    for t, n in enumerate(txt_ip):
        test_ip_encd[i, t, ip_tk_idx[n]] = 1.0
    # len() instead of the leftover loop index: an empty string is then padded
    # from position 0 rather than from a stale (or undefined) index.
    test_ip_encd[i, len(txt_ip):, ip_tk_idx[" "]] = 1.0

    for t, n in enumerate(txt_tgt):
        test_tgt_decd[i, t, tgt_tk_idx[n]] = 1.0
    test_tgt_decd[i, len(txt_tgt):, tgt_tk_idx[" "]] = 1.0

In [76]:
class Bahdanau(tf.keras.layers.Layer):
  """Additive attention layer built from three Dense projections.

  ``W1`` projects the query, ``W2`` projects the attended sequence and ``V``
  reduces their tanh-combined sum to one score per position.
  """

  def __init__(self, units):
    """Create the projections; ``units`` is the width of ``W1`` and ``W2``."""
    super().__init__()
    self.W1 = tf.keras.layers.Dense(units)
    self.W2 = tf.keras.layers.Dense(units)
    self.V = tf.keras.layers.Dense(1)

  def call(self, query, testue):
    """Return ``(vc, aw)``: the weighted sum of ``testue`` and the weights.

    Presumably ``query`` is (batch, features) and ``testue`` is
    (batch, time, features) -- TODO confirm against the caller.
    """
    # Insert a length-1 axis at position 1 so the projected query broadcasts
    # against the projected sequence along that axis.
    expanded_query = tf.expand_dims(query, 1)
    combined = self.W1(expanded_query) + self.W2(testue)
    score = self.V(tf.nn.tanh(combined))

    # Normalise the scores over axis 1, then sum the weighted sequence over it.
    aw = tf.nn.softmax(score, axis=1)
    vc = tf.reduce_sum(aw * testue, axis=1)
    return vc, aw

In [80]:
class Seq_to_Seq_with_attention(object):
  """Character-level encoder-decoder model with Bahdanau attention.

  The constructor only records hyperparameters. ``fit_model`` builds the Keras
  model, trains it, measures exact-match accuracy on the notebook's test
  arrays and writes the predictions to ``Attention_Predictions.tsv``.
  """

  def __init__(self, cell = 'RNN', hidden_layer=32, learning_rate= 1e-3, drop_out = 0.3,
               epochs = 10, batch_size = 32, attention = 'bahdanau'):
    """Store the hyperparameters.

    Args:
      cell: Recurrent layer type: 'RNN', 'GRU' or 'LSTM'.
      hidden_layer: Units of the recurrent layers and of the attention layer.
      learning_rate: Learning rate passed to the Adam optimizer.
      drop_out: ``dropout`` argument of the recurrent layers.
      epochs: Number of training epochs.
      batch_size: Batch size used for training and prediction.
      attention: Attention type; only 'bahdanau' is implemented.
    """
    self.cell = cell
    self.hidden_layer = hidden_layer
    self.learning_rate = learning_rate
    self.drop_out = drop_out
    self.epochs = epochs
    self.batch_size = batch_size
    self.attention = attention

  def fit_model(self, ip_encd, tgt_decd):
    """Build, train and evaluate the model, then write the predictions file.

    Besides its arguments the method reads these notebook globals:
    ``max_enc_seq_length``, ``max_dec_seq_length``, ``enc_tokens``,
    ``dec_tokens``, ``test_ip_encd``, ``test_tgt_decd``, ``test_ip_txt``,
    ``test_tgt_txt``, ``tgt_tk_idx`` and ``rev_tgt_char_idx``.

    Args:
      ip_encd: One-hot encoded source sequences used as model input.
      tgt_decd: One-hot encoded target sequences used as training labels.

    Raises:
      ValueError: If ``self.cell`` or ``self.attention`` is not supported.

    Side effects:
      Prints the exact-match test accuracy and writes
      ``Attention_Predictions.tsv`` to the working directory.
    """
    cell_classes = {'LSTM': LSTM, 'GRU': GRU, 'RNN': SimpleRNN}
    if self.cell not in cell_classes:
      raise ValueError("cell must be one of 'LSTM', 'GRU' or 'RNN', got %r" % (self.cell,))
    if self.attention != 'bahdanau':
      raise ValueError("attention must be 'bahdanau', got %r" % (self.attention,))
    cell_cls = cell_classes[self.cell]
    suffix = self.cell.lower()  # layer names: encoder_lstm, decoder_gru, ...

    # Encoder: returns the full output sequence plus its final state(s)
    # (two states for LSTM, one for GRU / SimpleRNN).
    ip_encds = Input(shape=(max_enc_seq_length, enc_tokens), name='encoder_inputs')
    encoder = cell_cls(self.hidden_layer, return_sequences=True, return_state=True,
                       dropout=self.drop_out, name='encoder_' + suffix)
    enc_ops, *states = encoder(ip_encds)

    # Attention layer
    attention = Bahdanau(self.hidden_layer)

    # Decoder layers: each step consumes one time step made of the attention
    # context concatenated with the previous output distribution.
    decoder = cell_cls(self.hidden_layer, dropout=self.drop_out, return_state=True,
                       name='decoder_' + suffix)
    dec_dense = Dense(dec_tokens, activation='softmax', name='decoder_dense')

    # Constant first decoder input: a one-hot vector on index 0. Its leading
    # dimension is batch_size, so every batch fed to the model must contain
    # exactly batch_size rows.
    ips = np.zeros((self.batch_size, 1, dec_tokens))
    ips[:, 0, 0] = 1

    dec_ops = states[0]  # the encoder's hidden state is the first attention query
    all_ops = []
    for _step in range(max_dec_seq_length):
      vc, _weights = attention(dec_ops, enc_ops)
      vc = tf.expand_dims(vc, 1)
      ips = tf.concat([vc, ips], axis=-1)

      dec_ops, *states = decoder(ips, initial_state=states)

      ops = tf.expand_dims(dec_dense(dec_ops), 1)
      all_ops.append(ops)
      ips = ops  # the prediction becomes the next step's input

    dec_ops = Lambda(lambda x: K.concatenate(x, axis=1))(all_ops)
    model = Model(ip_encds, dec_ops, name='model_encoder_decoder')

    # `learning_rate` replaces the deprecated `lr` keyword.
    optimizer = Adam(learning_rate=self.learning_rate, beta_1=0.9, beta_2=0.999)
    model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

    model.fit(ip_encd, tgt_decd,
              batch_size=self.batch_size,
              epochs=self.epochs)

    # Evaluation on the first 4352 test rows (hard-coded count).
    v_t = 4352
    pred = model.predict(test_ip_encd[:v_t], batch_size=self.batch_size)
    pred_ids = np.argmax(pred, axis=-1)
    true_ids = np.argmax(test_tgt_decd[:v_t], axis=-1)

    # A prediction counts as right only when every position matches.
    right = int(np.sum(np.all(pred_ids == true_ids, axis=1)))
    total = len(pred_ids)

    # End marker and padding are looked up by character rather than by index.
    skip_ids = {tgt_tk_idx['E'], tgt_tk_idx[' ']}
    rows = self._prediction_rows(test_ip_txt[:v_t], test_tgt_txt[:v_t], pred_ids,
                                 rev_tgt_char_idx, skip_ids)

    with open('Attention_Predictions.tsv', 'w', newline='', encoding="utf-8") as file:
      writer = csv.writer(file, delimiter='\t')
      writer.writerows(rows)

    test_accuracy = right/total
    print(test_accuracy)

  @staticmethod
  def _decode_indices(indices, idx_to_char, skip_ids):
    """Turn token indices into text, ignoring position 0 and any ``skip_ids``."""
    return ''.join(idx_to_char[int(i)] for i in indices[1:] if int(i) not in skip_ids)

  @staticmethod
  def _prediction_rows(inputs, targets, pred_ids, idx_to_char, skip_ids):
    """Build the TSV rows: one header row, then one row per prediction.

    Each data row is ``[serial number, input word, target word without its
    first and last character, decoded prediction]``.
    """
    # The header is a row of its own; a flat list of strings would be written
    # by csv.writer.writerows as one row per header word, split into characters.
    rows = [['Sno', 'Input Data', 'Target data', 'Predicted Data']]
    for sno, (src, tgt, ids) in enumerate(zip(inputs, targets, pred_ids), start=1):
      decoded = Seq_to_Seq_with_attention._decode_indices(ids, idx_to_char, skip_ids)
      rows.append([sno, src, tgt[1:-1], decoded])
    return rows

In [83]:
#best hyperparameters
# architecture
best_cell = 'LSTM'
best_attention = 'bahdanau'
best_hidden_layer = 128
best_drop_out = 0.2
# optimisation
best_learning_rate = 0.001
best_batch_size = 64
best_epochs = 15

In [None]:
# Train and evaluate the model with the best hyperparameters. best_attention
# is now passed explicitly instead of relying on the constructor default.
rnn_model = Seq_to_Seq_with_attention(
    cell=best_cell,
    hidden_layer=best_hidden_layer,
    learning_rate=best_learning_rate,
    drop_out=best_drop_out,
    epochs=best_epochs,
    batch_size=best_batch_size,
    attention=best_attention,
)

rnn_model.fit_model(ip_encd, tgt_decd)

  super(Adam, self).__init__(name, **kwargs)


Epoch 1/15