In [1]:
#Necessary Libraries

import numpy as np
from math import log,log1p
from numpy import array
from numpy import argmax
import csv
import keras
from keras.layers import Input, LSTM, Dense, Embedding, GRU, Dropout, SimpleRNN
from keras.models import Model,load_model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.vis_utils import plot_model
import tensorflow as tf
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam



In [2]:
#Install WandB

%pip install wandb -q
import wandb
from wandb.keras import WandbCallback

[K     |████████████████████████████████| 1.8 MB 8.9 MB/s 
[K     |████████████████████████████████| 181 kB 49.5 MB/s 
[K     |████████████████████████████████| 145 kB 52.9 MB/s 
[K     |████████████████████████████████| 63 kB 1.6 MB/s 
[?25h  Building wheel for pathtools (setup.py) ... [?25l[?25hdone


In [3]:
wandb.login()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize


wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit: ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [4]:
wandb.init(project="CS6910_DL_Assignment_3", entity="nomads")

[34m[1mwandb[0m: Currently logged in as: [33mtalksick[0m ([33mnomads[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [5]:
#Loading the dakshina dataset

!wget https://storage.googleapis.com/gresearch/dakshina/dakshina_dataset_v1.0.tar
!tar -xf dakshina_dataset_v1.0.tar

--2022-05-15 06:22:06--  https://storage.googleapis.com/gresearch/dakshina/dakshina_dataset_v1.0.tar
Resolving storage.googleapis.com (storage.googleapis.com)... 74.125.195.128, 74.125.20.128, 108.177.98.128, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|74.125.195.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2008340480 (1.9G) [application/x-tar]
Saving to: ‘dakshina_dataset_v1.0.tar’


2022-05-15 06:22:21 (134 MB/s) - ‘dakshina_dataset_v1.0.tar’ saved [2008340480/2008340480]



In [6]:
#Selecting the Hindi language

!ls dakshina_dataset_v1.0/hi/lexicons

hi.translit.sampled.dev.tsv   hi.translit.sampled.train.tsv
hi.translit.sampled.test.tsv


In [7]:
#Directory for Training,Validation and Testing
train_dir = "./dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.train.tsv"
val_dir = "./dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.dev.tsv"
test_dir = "./dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.test.tsv"

In [8]:
# Reading the raw corpus
#returns the native(Hindi) and romanized(English) versions of the words in the corpus

import io
def raw_corpus(crp):
  Eng = []
  Hindi= []

  with io.open(crp, encoding ='utf-8') as f:
    for line in f:
      if '\t' not in line:
        continue
      tokens = line.rstrip().split("\t")
      Eng.append(tokens[1])
      Hindi.append(tokens[0])
      
  return Eng, Hindi                                                             


In [9]:
train_src, train_tgt = raw_corpus(train_dir)
test_src, test_tgt = raw_corpus(test_dir)


In [10]:
#Shuffling the Training and Validation dataset

train_arr = np.arange(len(train_src))
np.random.shuffle(train_arr)
test_arr = np.arange(len(test_src))
np.random.shuffle(test_arr)

In [11]:
ip_txt_ns = []
tgt_txt_ns = []

test_ip_txt_ns = []
test_tgt_txt_ns = []

src_char = set()
tgt_char = set()

for (txt_ip, txt_tgt) in zip(train_src, train_tgt):

    txt_tgt = "B" + txt_tgt + "E"

    ip_txt_ns.append(txt_ip)
    
    tgt_txt_ns.append(txt_tgt)

    for char in txt_ip:

        if char not in src_char:

            src_char.add(char)

    for char in txt_tgt:

        if char not in tgt_char:

            tgt_char.add(char)

for (txt_ip, txt_tgt) in zip(test_src, test_tgt):

    txt_tgt = "B" + txt_tgt + "E"

    test_ip_txt_ns.append(txt_ip)

    test_tgt_txt_ns.append(txt_tgt)

    for char in txt_ip:

        if char not in src_char:

            src_char.add(char)
            
    for char in txt_tgt:

        if char not in tgt_char:

            tgt_char.add(char)



In [12]:
ip_txt= []
tgt_txt= []

for i in range(len(train_src)):

    ip_txt.append(ip_txt_ns[train_arr[i]])

    tgt_txt.append(tgt_txt_ns[train_arr[i]])

test_ip_txt= []
test_tgt_txt= []

for i in range(len(test_src)):

    test_ip_txt.append(test_ip_txt_ns[test_arr[i]])
    
    test_tgt_txt.append(test_tgt_txt_ns[test_arr[i]])

src_char.add(" ")
tgt_char.add(" ")

src_char = sorted(list(src_char))
tgt_char = sorted(list(tgt_char))

In [13]:
enc_tokens = len(src_char)
dec_tokens = len(tgt_char)

max_enc_seq_length = max([len(txt) for txt in ip_txt])
max_dec_seq_length = max([len(txt) for txt in tgt_txt])

test_max_enc_seq_length = max([len(txt) for txt in test_ip_txt])
test_max_dec_seq_length = max([len(txt) for txt in test_tgt_txt])


In [14]:
src_idx = dict([(char, i) for i, char in enumerate(src_char)])
tgt_idx = dict([(char, i) for i, char in enumerate(tgt_char)])


In [15]:
enc_ip = np.zeros((len(ip_txt), max_enc_seq_length), dtype="float32")

dec_ip = np.zeros((len(ip_txt), max_dec_seq_length), dtype="float32")

dec_tgt = np.zeros((len(ip_txt), max_dec_seq_length, dec_tokens), dtype="float32")

for i, (txt_ip, txt_tgt) in enumerate(zip(ip_txt, tgt_txt)):

    for t, char in enumerate(txt_ip):

        enc_ip[i, t] = src_idx[char]

    enc_ip[i, t + 1 :] = src_idx[" "]

    for t, char in enumerate(txt_tgt):

        dec_ip[i, t] = tgt_idx[char]

        if t > 0:

            dec_tgt[i, t - 1, tgt_idx[char]] = 1.0

    dec_ip[i, t + 1: ] = tgt_idx[" "]
    dec_tgt[i, t:, tgt_idx[" "]] = 1.0

test_enc_ip = np.zeros((len(ip_txt), test_max_enc_seq_length), dtype="float32")

test_dec_ip = np.zeros((len(ip_txt), test_max_dec_seq_length), dtype="float32")

test_dec_tgt = np.zeros((len(ip_txt), test_max_dec_seq_length, dec_tokens), dtype="float32")

for i, (txt_ip, txt_tgt) in enumerate(zip(test_ip_txt, test_tgt_txt)):

    for t, char in enumerate(txt_ip):

        test_enc_ip[i, t] = src_idx[char]

    test_enc_ip[i, t + 1 :] = src_idx[" "]

    for t, char in enumerate(txt_tgt):

        test_dec_ip[i, t] = tgt_idx[char]

        if t > 0:

            test_dec_tgt[i, t - 1, tgt_idx[char]] = 1.0
    
    test_dec_ip[i, t + 1: ] = tgt_idx[" "]
    
    test_dec_tgt[i, t:, tgt_idx[" "]] = 1.0

In [16]:
rev_src_char_idx = dict((i, char) for char, i in src_idx.items())

rev_tgt_char_idx = dict((i, char) for char, i in tgt_idx.items())



In [17]:
x_test = test_enc_ip
y_test = test_tgt_txt

In [21]:
class Seq_to_Seq(object):

  def __init__(self,cell = 'RNN',ip_emb = 32,epochs = 10, hidden_layer=32,batch_size = 32, learning_rate= 1e-3, 
               dropout=0.4,pred ='greedy',beam_width = 5,num_enc = 1,num_dec = 1):
    
        self.cell = cell
        self.ip_emb = ip_emb
        self.hidden_layer = hidden_layer
        self.learning_rate = learning_rate
        self.dropout = dropout
        self.pred = pred
        self.epochs = epochs
        self.batch_size = batch_size
        self.beam_width = beam_width
        self.num_enc = num_enc
        self.num_dec = num_dec

  def fit_model(self,enc_ip,dec_ip,dec_tgt,x_test, y_test):

        # Define an input sequence and process it.
        enc_ips = Input(shape=(None, ),name = 'Enc_ips')

        enc_emb =  Embedding(enc_tokens, self.ip_emb , mask_zero = True,name = 'Enc_emb')(enc_ips)

        enc_ops = enc_emb

        if self.cell == 'LSTM':

            enc_lstm = LSTM(self.hidden_layer, return_state=True,dropout = self.dropout, return_sequences=True, name="Enc_hidden_1")

            enc_ops, hs, cs = enc_lstm(enc_ops)

            enc_states = [hs, cs]

            # Add a LSTM layer with hidden_layer internal units.

            for i in range( 2, self.num_enc +1):

                layer_name = ('Enc_hidden_%d') %i

                enc_lstm = LSTM(self.hidden_layer, return_state=True,dropout = self.dropout, return_sequences=True, name=layer_name)

                enc_ops, hs, cs = enc_lstm(enc_ops,initial_state = enc_states)

                enc_states = [hs, cs]

        elif self.cell == 'GRU':

            enc_gru = GRU(self.hidden_layer, return_state=True,dropout = self.dropout, return_sequences=True, name="Enc_hidden_1")

            enc_ops, hs = enc_gru(enc_ops)

            enc_states = [hs]

            for i in range(2, self.num_enc +1):

                layer_name = ('Enc_hidden_%d') %i

                enc_gru = GRU(self.hidden_layer, return_state=True,dropout = self.dropout, return_sequences=True, name=layer_name)

                enc_ops, hs = enc_gru(enc_ops, initial_state = enc_states)

                enc_states = [hs]  

        elif self.cell == 'RNN':

            enc_rnn = SimpleRNN(self.hidden_layer, return_state=True,dropout = self.dropout, return_sequences=True, name="Enc_hidden_1")

            enc_ops, hs = enc_rnn(enc_ops)

            enc_states = [hs]

            for i in range(2, self.num_enc +1):

                layer_name = ('Enc_hidden_%d') %i

                enc_rnn = SimpleRNN(self.hidden_layer, return_state=True,dropout = self.dropout, return_sequences=True, name=layer_name)

                enc_ops, hs = enc_rnn(enc_ops, initial_state = enc_states)

                enc_states = [hs]  

        # Set up the dec, using `enc_states` as initial state.
        dec_ips = Input(shape=(None,), name = 'Dec_ips')

        dec_emb_layer = Embedding(dec_tokens, self.hidden_layer, mask_zero = True, name = 'Dec_emb')

        dec_emb = dec_emb_layer(dec_ips)

        dec_ops = dec_emb

        if self.cell == 'LSTM':

            dec_lstm = LSTM(self.hidden_layer, return_sequences=True, return_state=True,dropout = self.dropout, name="Dec_hidden_1")

            dec_ops, _, _ = dec_lstm(dec_ops, initial_state = enc_states)
          
            for i in range(2, self.num_dec +1):

              layer_name = ('Dec_hidden_%d') %i

              dec_lstm = LSTM(self.hidden_layer, return_sequences=True, return_state=True,dropout = self.dropout, name=layer_name)

              dec_ops, _, _ = dec_lstm(dec_ops, initial_state = enc_states)

        elif self.cell == 'GRU':
            dec_gru = GRU(self.hidden_layer, return_sequences=True, return_state=True,dropout = self.dropout, name="Dec_hidden_1")

            dec_ops, _ = dec_gru(dec_ops, initial_state = enc_states)

            for i in range(2, self.num_dec+1):

              layer_name = ('Dec_hidden_%d') %i

              dec_gru = GRU(self.hidden_layer, return_sequences=True, return_state=True,dropout = self.dropout, name=layer_name)

              dec_ops, _ = dec_gru(dec_ops, initial_state = enc_states)

        elif self.cell == 'RNN':
            dec_rnn = SimpleRNN(self.hidden_layer, return_sequences=True, return_state=True,dropout = self.dropout, name="Dec_hidden_1")

            dec_ops, _ = dec_rnn(dec_ops, initial_state = enc_states)

            for i in range(2, self.num_dec+1):

              layer_name = ('Dec_hidden_%d') %i

              dec_rnn = SimpleRNN(self.hidden_layer, return_sequences=True, return_state=True,dropout = self.dropout, name=layer_name)

              dec_ops, _ = dec_rnn(dec_ops, initial_state = enc_states)

        dec_dense = Dense(dec_tokens, activation='softmax', name = 'dense')

        dec_ops = dec_dense(dec_ops)

        # Define the model that takes enc and dec input 
        # to output dec_ops
        model = Model([enc_ips, dec_ips], dec_ops)
        model.summary()
        
        # Define the optimizer
        optimizer = Adam(lr=self.learning_rate, beta_1=0.9, beta_2=0.999)
        model.compile(loss = "categorical_crossentropy", optimizer = optimizer, metrics=['accuracy'])
      
        model.fit(
            [enc_ip, dec_ip],
            dec_tgt,
            batch_size=self.batch_size,
            epochs=self.epochs,
            )
        
        enc_model,dec_model = self.inference_model(model)
        dl=['Sno','Input Data','Target data','Predicted Data']
    
        total = 0
        right = 0
        for i in range(len(y_test)):
          input_seq = x_test[i : i + 1]
          result = self.decode_sequence(enc_model,dec_model,input_seq)

          target = y_test[i]
          target = target[1:len(target)-1]
          result = result[0:len(result)-1]
          dl1=[i+1,test_ip_txt[i],target,result]
          dl.append(dl1)

          if result.strip() == target.strip():
            right = right + 1
          
          total = total + 1
          accuracy_epoch = right/total
        
        with open('Vanilla_Predictions.tsv','w',newline='',encoding='utf-8') as file:
            writer=csv.writer(file,delimiter='\t')
            writer.writerows(dl)

        test_accuracy = right/total
        print(test_accuracy)
    
  def inference_model(self,model):
        enc_ips = model.input[0]  

        if self.cell == 'RNN' or self.cell == 'GRU':

          enc_ops, hs_enc = model.get_layer('Enc_hidden_'+ str(self.num_enc)).output

          enc_states = [hs_enc]

          enc_model = Model(enc_ips, enc_states)

          dec_ips = model.input[1]  

          dec_ops = model.get_layer('Dec_emb')(dec_ips)

          dec_states_ips = []

          dec_states = []

          for i in range(1,self.num_dec +1):

            dec_state_input_h = keras.Input(shape=(self.hidden_layer,))

            curr_states_ips = [dec_state_input_h]

            dec = model.get_layer('Dec_hidden_'+ str(i))

            dec_ops, hs_dec = dec(dec_ops, initial_state=curr_states_ips)

            dec_states += [hs_dec]

            dec_states_ips += curr_states_ips

        elif self.cell == 'LSTM':

          enc_ops, hs_enc, cs_enc = model.get_layer('Enc_hidden_'+ str(self.num_enc)).output 

          enc_states = [hs_enc, cs_enc]

          enc_model = Model(enc_ips, enc_states)

          dec_ips = model.input[1]  

          dec_ops = model.get_layer('Dec_emb')(dec_ips)

          dec_states_ips = []

          dec_states = []

          for i in range(1,self.num_dec +1):
            dec_state_input_h = keras.Input(shape=(self.hidden_layer,))

            dec_state_input_c = keras.Input(shape=(self.hidden_layer,))

            curr_states_ips = [dec_state_input_h, dec_state_input_c]

            dec = model.get_layer('Dec_hidden_'+ str(i))

            dec_ops, hs_dec, cs_dec = dec(dec_ops, initial_state=curr_states_ips)

            dec_states += [hs_dec, cs_dec]

            dec_states_ips += curr_states_ips


        dec_dense = model.get_layer('dense')

        dec_ops = dec_dense(dec_ops)

        dec_model = Model([dec_ips] + dec_states_ips, [dec_ops] + dec_states)

        return enc_model,dec_model

  def decode_sequence(self,enc_model,dec_model,input_seq):

        # Encode the input as state vectors.
        states_value = [enc_model.predict(input_seq)] * self.num_dec
        
        # Generate empty target sequence of length 1.
        target_seq = np.zeros((1, 1))

        # Populate the first character of target sequence with the start character.
        target_seq[0, 0] = tgt_idx['B']

        # Sampling loop for a batch of sequences
        # (to simplify, here we assume a batch of size 1).
        stop_condition = False
        decoded_sentence = ""

        while not stop_condition:

            if self.cell == 'RNN' or self.cell == 'GRU':

              dummy = dec_model.predict([target_seq] + [states_value])

              output_tokens, states_value = dummy[0],dummy[1:]
              
            elif self.cell == 'LSTM':  

              dummy = dec_model.predict([target_seq] + states_value)

              output_tokens, states_value = dummy[0],dummy[1:]

            if self.pred == 'greedy':

              beam_w = 1
            elif self.pred == 'beam_search':

              beam_w = self.beam_width

            sampled_token_index = self.beam_search_dec(output_tokens[0,:,:], beam_w)
            sampled_token_index = sampled_token_index[beam_w-1][0]

            # Sample a token
            sampled_token_index = np.argmax(output_tokens[0, -1, :])

            sampled_char = rev_tgt_char_idx[sampled_token_index]

            decoded_sentence += sampled_char

            # Exit condition: either hit max length
            # or find stop character.
            if sampled_char == 'E' or len(decoded_sentence) > max_dec_seq_length:
                stop_condition = True

            # Update the target sequence (of length 1).
            target_seq = np.zeros((1, 1))
            target_seq[0, 0] = sampled_token_index


        return decoded_sentence
  
  def beam_search_dec(self,data, k):
    
        sequences = [[list(), 0.0]]
        # walk over each step in sequence
        for row in data:
          all_candidates = list()
          # expand each current candidate
          for i in range(len(sequences)):
            seq, score = sequences[i]
            for j in range(len(row)):
              candidate = [seq + [j], score - log(row[j])]
              #candidate = [seq + [j], score - log1p(row[j])]
              all_candidates.append(candidate)
          # order all candidates by score
          ordered = sorted(all_candidates, key=lambda tup:tup[1])
          # select k best
          sequences = ordered[:k]
        return sequences

In [22]:
#Best Hyperparameters

best_batch_size = 64
best_beam_width = 3
best_cell = 'LSTM'
best_dec_search = 'beam_search'
best_dropout = 0.2
best_epochs = 10
best_hidden_layer = 128
best_ip_emb = 64
best_learning_rate = 0.001
best_num_dec = 3
best_num_enc = 3

In [24]:
rnn_model = Seq_to_Seq(best_cell, ip_emb = best_ip_emb, hidden_layer=best_hidden_layer,
                learning_rate= best_learning_rate, dropout=best_dropout,pred= best_dec_search,epochs = best_epochs,
                batch_size = best_batch_size, beam_width = best_beam_width, num_enc =best_num_enc, num_dec = best_num_dec)
  
rnn_model.fit_model(enc_ip,dec_ip,dec_tgt,x_test, y_test)

Model: "model_3"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 Enc_ips (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 Enc_emb (Embedding)            (None, None, 64)     1728        ['Enc_ips[0][0]']                
                                                                                                  
 Enc_hidden_1 (LSTM)            [(None, None, 128),  98816       ['Enc_emb[0][0]']                
                                 (None, 128),                                                     
                                 (None, 128)]                                                     
                                                                                            

  super(Adam, self).__init__(name, **kwargs)


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
0.3351843625055531
