In [16]:
# Importing the required packages
import argparse
import os
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import keras
import cv2

In [7]:
# Install (or upgrade) the Weights & Biases client from inside the notebook.
# NOTE(review): no version is pinned, so a re-run may install a different release.
!pip install wandb --upgrade
import wandb
# Public-API client; a later cell uses `api` to look up a sweep and download a model file.
api = wandb.Api()

Collecting wandb
  Downloading wandb-0.12.16-py2.py3-none-any.whl (1.8 MB)
[?25l[K     |▏                               | 10 kB 31.1 MB/s eta 0:00:01[K     |▍                               | 20 kB 19.4 MB/s eta 0:00:01[K     |▌                               | 30 kB 15.4 MB/s eta 0:00:01[K     |▊                               | 40 kB 14.2 MB/s eta 0:00:01[K     |█                               | 51 kB 6.9 MB/s eta 0:00:01[K     |█                               | 61 kB 8.2 MB/s eta 0:00:01[K     |█▎                              | 71 kB 8.6 MB/s eta 0:00:01[K     |█▌                              | 81 kB 8.4 MB/s eta 0:00:01[K     |█▋                              | 92 kB 9.2 MB/s eta 0:00:01[K     |█▉                              | 102 kB 7.5 MB/s eta 0:00:01[K     |██                              | 112 kB 7.5 MB/s eta 0:00:01[K     |██▏                             | 122 kB 7.5 MB/s eta 0:00:01[K     |██▍                             | 133 kB 7.5 MB/s eta 0:00:01

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize


wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit: ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [8]:
# b9780837d34fae7e3219f11d7f38fe86fc87d616
%%capture
!wget https://storage.googleapis.com/gresearch/dakshina/dakshina_dataset_v1.0.tar
!tar -xf dakshina_dataset_v1.0.tar

In [2]:
# Copy every file from the extracted archive's hi/lexicons directory into the working directory.
# NOTE(review): the absolute /content prefix presumably targets Google Colab — confirm before running elsewhere.
!cp /content/dakshina_dataset_v1.0/hi/lexicons/* ./

In [3]:
# File names of the train / dev / test lexicon TSV files, given relative to the working directory.
train_path = "hi.translit.sampled.train.tsv"
dev_path = "hi.translit.sampled.dev.tsv"
test_path = "hi.translit.sampled.test.tsv"

In [4]:
import tensorflow as tf
# Enable memory growth on every GPU TensorFlow can see.
# Iterating (instead of indexing [0]) keeps this cell from raising IndexError on a
# CPU-only runtime, where the device list is empty, and applies the same setting to
# all GPUs rather than only the first one.
physical_devices = tf.config.list_physical_devices('GPU')
for gpu_device in physical_devices:
    tf.config.experimental.set_memory_growth(gpu_device, enable=True)

In [5]:
import numpy as np
import tensorflow as tf
import pandas as pd

def load_data(path):
    """Read a lexicon TSV file and return its (indic, english) word pairs.

    The file is parsed as header-less, tab-separated text. The first two columns
    are named ``indic`` and ``english``; a third, unnamed column is parsed and then
    discarded. Blank lines are skipped.

    Args:
        path: Path of the TSV file to read.

    Returns:
        pandas.DataFrame with exactly the columns ``indic`` and ``english``,
        restricted to rows in which both fields are non-empty.
    """
    # Decode explicitly instead of relying on the platform default encoding.
    # NOTE(review): assumes the lexicon files are UTF-8 — confirm against the dataset.
    with open(path, encoding="utf-8") as f:
        data = pd.read_csv(
            f,
            sep='\t',
            header=None,
            names=["indic", "english", ""],
            skip_blank_lines=True,
            index_col=None,
            # Treat only an empty field as missing. With pandas' default sentinels,
            # genuine words such as "nan", "null" or "NA" would be parsed as NaN
            # and silently dropped by the filter below.
            keep_default_na=False,
            na_values=[""],
        )
    has_both_words = data['indic'].notna() & data['english'].notna()
    return data.loc[has_both_words, ['indic', 'english']]

def _encode_split(english_words, indic_words, english_char_to_idx, indic_char_to_idx, max_enc_len, max_dec_len):
    """Encode one split of (english, indic) word pairs as space-padded arrays.

    Args:
        english_words: Sequence of source words.
        indic_words: Sequence of target words, already wrapped in the start/end markers.
        english_char_to_idx: Character -> id mapping for the source side.
        indic_char_to_idx: Character -> id mapping for the target side.
        max_enc_len: Width of the encoder-input array.
        max_dec_len: Width of the decoder-input / decoder-target arrays.

    Returns:
        Tuple ``(encoder_input, decoder_input, decoder_target)`` of float32 arrays:
        ``(n, max_enc_len)`` source character ids, ``(n, max_dec_len)`` target
        character ids, and ``(n, max_dec_len, vocab)`` one-hot targets shifted one
        timestep ahead of ``decoder_input``.

    Raises:
        KeyError: If a word contains a character missing from the mappings.
        IndexError: If a word is longer than the corresponding array width.
    """
    n_words = len(english_words)
    enc_pad_id = english_char_to_idx[' ']
    dec_pad_id = indic_char_to_idx[' ']

    encoder_input = np.zeros((n_words, max_enc_len), dtype="float32")
    decoder_input = np.zeros((n_words, max_dec_len), dtype="float32")
    decoder_target = np.zeros((n_words, max_dec_len, len(indic_char_to_idx)), dtype="float32")

    for i, (input_word, target_word) in enumerate(zip(english_words, indic_words)):
        # Replace every source character by its id, then pad the tail with the space id.
        for t, char in enumerate(input_word):
            encoder_input[i, t] = english_char_to_idx[char]
        # Slice by the word length rather than the leaked loop variable, so an empty
        # word cannot reuse a stale index from the previous pair.
        encoder_input[i, len(input_word):] = enc_pad_id

        for t, char in enumerate(target_word):
            decoder_input[i, t] = indic_char_to_idx[char]
            if t > 0:
                # The one-hot target runs one timestep ahead of the decoder input.
                decoder_target[i, t - 1, indic_char_to_idx[char]] = 1.0
        decoder_input[i, len(target_word):] = dec_pad_id
        # The target holds one position fewer than the input, so its padding starts one step earlier.
        decoder_target[i, max(len(target_word) - 1, 0):, dec_pad_id] = 1.0

    return encoder_input, decoder_input, decoder_target


def preprocess(train_path, dev_path, test_path, batch_size):
    """Load the three lexicon splits and encode them for a character-level seq2seq model.

    The vocabularies and the maximum sequence lengths are derived from the training
    split only. Indic words are wrapped in "\\t" (start) and "\\n" (end). A space is
    part of both vocabularies and is used as the padding character; its id is its
    position in the sorted vocabulary.

    Args:
        train_path: Path of the training TSV file.
        dev_path: Path of the validation TSV file.
        test_path: Path of the test TSV file.
        batch_size: Accepted for interface compatibility; not used here.

    Returns:
        A 7-tuple:
        ``(encoder_train_english, decoder_train_english, decoder_train_indic)``,
        ``(encoder_val_english, decoder_val_english, decoder_val_indic)``,
        ``(val_english, val_indic)`` — the raw validation words (indic with markers),
        ``(encoder_test_english, decoder_test_english, decoder_test_indic)``,
        ``(english_char_set, indic_char_set, max_seq_len_english_encoder, max_seq_len_indic_decoder)``,
        ``(indic_char_to_idx, indic_idx_to_char)``,
        ``(english_char_to_idx, english_idx_to_char)``.

    Raises:
        KeyError: If a validation/test word uses a character unseen in training.
        IndexError: If a validation/test word is longer than the longest training word.
    """
    train_df = load_data(train_path)
    val_df = load_data(dev_path)
    test_df = load_data(test_path)

    train_english = train_df['english'].values
    val_english = val_df['english'].values
    test_english = test_df['english'].values

    # "\t" is considered as the "start" character
    # "\n" is considered as the "end" character.
    # Both are added to the indic transliterated words.
    train_indic = "\t" + train_df['indic'].values + "\n"
    val_indic = "\t" + val_df['indic'].values + "\n"
    test_indic = "\t" + test_df['indic'].values + "\n"

    # Character vocabularies come from the training split; the space used for padding is always included.
    english_chars = {' '}
    indic_chars = {' '}
    for word_english, word_indic in zip(train_english, train_indic):
        english_chars.update(word_english)
        indic_chars.update(word_indic)

    english_char_set = sorted(english_chars)
    indic_char_set = sorted(indic_chars)

    # Character <-> index lookup tables (index = position in the sorted vocabulary).
    english_char_to_idx = {char: idx for idx, char in enumerate(english_char_set)}
    indic_char_to_idx = {char: idx for idx, char in enumerate(indic_char_set)}
    english_idx_to_char = {idx: char for char, idx in english_char_to_idx.items()}
    indic_idx_to_char = {idx: char for char, idx in indic_char_to_idx.items()}

    # Longest training word on each side fixes the array widths for every split.
    max_seq_len_english_encoder = max([len(word) for word in train_english])
    max_seq_len_indic_decoder = max([len(word) for word in train_indic])

    encoder_train_english, decoder_train_english, decoder_train_indic = _encode_split(
        train_english, train_indic, english_char_to_idx, indic_char_to_idx,
        max_seq_len_english_encoder, max_seq_len_indic_decoder,
    )
    encoder_val_english, decoder_val_english, decoder_val_indic = _encode_split(
        val_english, val_indic, english_char_to_idx, indic_char_to_idx,
        max_seq_len_english_encoder, max_seq_len_indic_decoder,
    )
    encoder_test_english, decoder_test_english, decoder_test_indic = _encode_split(
        test_english, test_indic, english_char_to_idx, indic_char_to_idx,
        max_seq_len_english_encoder, max_seq_len_indic_decoder,
    )

    return (encoder_train_english, decoder_train_english, decoder_train_indic), (encoder_val_english, decoder_val_english, decoder_val_indic), (val_english, val_indic), (encoder_test_english, decoder_test_english, decoder_test_indic), (english_char_set, indic_char_set, max_seq_len_english_encoder, max_seq_len_indic_decoder), (indic_char_to_idx, indic_idx_to_char), (english_char_to_idx, english_idx_to_char)
    

#Reference : Keras Documentation.
#https://keras.io/examples/nlp/lstm_seq2seq/
#https://stackoverflow.com/questions/54176051/invalidargumenterror-indicesi-0-x-is-not-in-0-x-in-keras

In [10]:
from tensorflow import keras
from keras.layers import Dense, Input,LSTM,SimpleRNN,GRU,TimeDistributed,Embedding
from tensorflow.keras.optimizers import Adam,Nadam
# import wandb
import tensorflow as tf
from tensorflow.keras.layers import Concatenate, AdditiveAttention
# from wandb.keras import WandbCallback
import numpy as np
import pandas as pd


class Model(object):
    """Character-level encoder-decoder transliteration model with additive attention.

    ``build_model`` creates the training graph (stacked recurrent encoder, stacked
    recurrent decoder initialised from the encoder states, additive attention over
    the encoder outputs, softmax over the indic vocabulary). ``inference_setup``
    rewires the same layers into separate encoder and decoder models, and
    ``decode_sequence`` uses those to decode greedily one character at a time.
    """

    def __init__(self, english_char_set, indic_char_set, max_seq_len_english_encoder, max_seq_len_indic_decoder, indic_char_to_idx, indic_idx_to_char, english_char_to_idx, english_idx_to_char, cell ="LSTM", optimizer = "adam", embedding_size = 32, num_enc_layers = 5, num_dec_layers =2, num_hidden_layers = 64, dropout = 0):
        """Store the vocabulary information and the hyper-parameters.

        Args:
            english_char_set: Source vocabulary; only its length is used.
            indic_char_set: Target vocabulary; only its length is used.
            max_seq_len_english_encoder: Encoder sequence length used at inference time.
            max_seq_len_indic_decoder: Decoder sequence length; also caps decoding.
            indic_char_to_idx / indic_idx_to_char: Target character <-> id mappings.
            english_char_to_idx / english_idx_to_char: Source character <-> id mappings.
            cell: Recurrent cell type: "rnn", "lstm" or "gru" (case-insensitive).
            optimizer: Optimizer passed to ``compile``.
            embedding_size: Width of both embedding layers.
            num_enc_layers: Number of stacked encoder layers.
            num_dec_layers: Number of stacked decoder layers.
            num_hidden_layers: Number of units in every recurrent layer.
            dropout: Dropout rate passed to every recurrent layer.

        Raises:
            ValueError: If ``cell`` is not one of the supported cell types.
        """
        # The methods below compare against lowercase names, so the default "LSTM"
        # (and any other casing) must be normalised or no recurrent layer is built.
        cell_name = str(cell).lower()
        if cell_name not in ("rnn", "lstm", "gru"):
            raise ValueError(
                "cell must be one of 'rnn', 'lstm' or 'gru' (case-insensitive), got %r" % (cell,)
            )

        self.len_enc_charset = len(english_char_set)
        self.len_dec_charset = len(indic_char_set)
        self.max_seq_len_english_encoder = max_seq_len_english_encoder
        self.max_seq_len_indic_decoder = max_seq_len_indic_decoder
        self.indic_char_to_idx = indic_char_to_idx
        self.indic_idx_to_char = indic_idx_to_char
        self.english_char_to_idx = english_char_to_idx
        self.english_idx_to_char = english_idx_to_char
        self.cell = cell_name
        self.embedding_size = embedding_size
        self.num_enc_layers = num_enc_layers
        self.num_dec_layers= num_dec_layers
        self.num_hidden_layers =num_hidden_layers
        self.encoder_model = None
        self.decoder_model = None
        self.model = None
        self.dropout = dropout
        self.num_epochs = None
        self.batch_size = None
        self.optimizer = optimizer
        self.enc_layers = []
        self.dec_layers = []
        # True once inference_setup() has built the models for the current training graph.
        self._inference_ready = False

    def _states_per_layer(self):
        """Return how many state tensors one recurrent layer carries (2 for LSTM, else 1)."""
        return 2 if self.cell == "lstm" else 1

    def _new_recurrent_layer(self):
        """Create one recurrent layer of the configured cell type that returns sequences and states."""
        layer_types = {"rnn": SimpleRNN, "lstm": LSTM, "gru": GRU}
        return layer_types[self.cell](self.num_hidden_layers, dropout = self.dropout, return_state = True, return_sequences = True)

    def build_model(self):
        """Build and compile the training model; the result is stored in ``self.model``.

        Decoder layer ``j`` is initialised with the final state(s) of encoder layer ``j``.

        Raises:
            ValueError: If there are more decoder layers than encoder layers, because
                each decoder layer needs an encoder layer to take its initial state from.
        """
        if self.num_dec_layers > self.num_enc_layers:
            raise ValueError(
                "num_dec_layers (%d) must not exceed num_enc_layers (%d)"
                % (self.num_dec_layers, self.num_enc_layers)
            )

        encoder_inputs = Input(shape=(None,), name="encoder_input")
        encoder_outputs = Embedding(self.len_enc_charset, self.embedding_size, name = "encoder_embedding")(encoder_inputs)
        self.enc_layers = []
        self.dec_layers = []
        encoder_states = list()
        for _ in range(self.num_enc_layers):
            encoder = self._new_recurrent_layer()
            # A layer with return_state=True yields [sequence_output, state, ...].
            encoder_outputs, *layer_states = encoder(encoder_outputs)
            encoder_states.append(list(layer_states))
            self.enc_layers.append(encoder)

        self.encoder_model = keras.Model(encoder_inputs,encoder_states)

        # NOTE(review): this input is declared with a fixed length, yet decode_sequence
        # feeds length-1 sequences through it — confirm the installed Keras tolerates that.
        decoder_inputs = keras.Input(shape=(self.max_seq_len_indic_decoder, ), name = "decoder_input")

        decoder_outputs = Embedding(self.len_dec_charset, self.embedding_size, name = "decoder_embedding")(decoder_inputs)

        for j in range(self.num_dec_layers):
            decoder = self._new_recurrent_layer()
            decoder_outputs, *_ = decoder(decoder_outputs, initial_state = encoder_states[j])
            self.dec_layers.append(decoder)

        # Attention: decoder outputs are the queries, the top encoder outputs are the values.
        decoder_attn = AdditiveAttention(name="attention_layer")
        decoder_concat = Concatenate(name="concatenate_layer")
        cont_vec, attn_wts = decoder_attn([decoder_outputs, encoder_outputs],return_attention_scores=True)
        decoder_outputs = decoder_concat([decoder_outputs,cont_vec])

        dec_dense =Dense(self.len_dec_charset, activation="softmax", name="dense_layer")
        dec_pred = dec_dense(decoder_outputs)

        model = keras.Model([encoder_inputs, decoder_inputs], dec_pred)

        model.compile(
            optimizer=self.optimizer, loss="categorical_crossentropy", metrics=["accuracy"]
        )

        self.model = model
        # Any previously built inference models refer to the old layers.
        self._inference_ready = False

    def train(self, encoder_train_english, decoder_train_english, decoder_train_indic, encoder_val_english, decoder_val_english, decoder_val_indic, num_epochs =10, batch_size = 64):
        """Fit ``self.model`` on the training arrays, validating on the validation arrays.

        Args:
            encoder_train_english, decoder_train_english: Model inputs for training.
            decoder_train_indic: Training targets.
            encoder_val_english, decoder_val_english: Model inputs for validation.
            decoder_val_indic: Validation targets.
            num_epochs: Number of epochs; also stored on the instance.
            batch_size: Batch size; also stored on the instance.
        """
        self.num_epochs = num_epochs
        self.batch_size = batch_size
        self.model.fit(
            x = [encoder_train_english, decoder_train_english],
            y = decoder_train_indic,
            validation_data = ([encoder_val_english, decoder_val_english], decoder_val_indic),
            batch_size = self.batch_size,
            epochs = self.num_epochs,
            # callbacks = [WandbCallback()]
        )

    def inference_setup(self):
        """Rewire the trained layers into ``self.encoder_model`` and ``self.decoder_model``.

        The encoder model outputs the flat list of states of every encoder layer
        followed by the top layer's output sequence. The decoder model takes the
        decoder input, one Input per decoder state and the encoder output sequence,
        and returns the softmax output, the updated decoder states and the attention scores.
        """
        encoder_inputs = self.model.input[0]
        encoder_outputs = self.model.get_layer('encoder_embedding')(encoder_inputs)

        encoder_states = []
        for enc_layer in self.enc_layers[:self.num_enc_layers]:
            encoder_outputs, *layer_states = enc_layer(encoder_outputs)
            encoder_states += list(layer_states)

        self.encoder_model = keras.Model(encoder_inputs, encoder_states + [encoder_outputs])

        decoder_inputs = self.model.input[1]
        decoder_outputs = self.model.get_layer('decoder_embedding')(decoder_inputs)

        dec_states = []
        dec_initial_states = []
        for dec_layer in self.dec_layers[:self.num_dec_layers]:
            # One placeholder per state tensor of this layer (h and c for LSTM, h otherwise).
            layer_initial_states = [
                Input(shape=(self.num_hidden_layers,)) for _ in range(self._states_per_layer())
            ]
            decoder_outputs, *layer_states = dec_layer(decoder_outputs, initial_state=layer_initial_states)
            dec_initial_states += layer_initial_states
            dec_states += list(layer_states)

        attention_layer = self.model.get_layer('attention_layer')
        attention_input = Input(shape=(self.max_seq_len_english_encoder,self.num_hidden_layers))
        context_vector, alphas = attention_layer([decoder_outputs, attention_input], return_attention_scores=True)

        concat_layer = self.model.get_layer('concatenate_layer')
        decoder_outputs = concat_layer([decoder_outputs, context_vector])

        # Dense layer
        decoder_dense = self.model.get_layer('dense_layer')
        decoder_outputs = decoder_dense(decoder_outputs)

        # Decoder model
        self.decoder_model = keras.Model(
            [decoder_inputs] + dec_initial_states + [attention_input], [decoder_outputs] + dec_states + [alphas])
        self._inference_ready = True

    def decode_sequence(self, input_seq):
        """Greedily decode one encoded source word.

        Args:
            input_seq: Encoder input passed to ``encoder_model.predict``. Only batch
                row 0 of the predictions is read — presumably a single padded word
                of character ids; confirm against the caller.

        Returns:
            Tuple ``(decoded_sentence, attention_weights)``: the decoded characters
            (including the terminating "\\n" when one is produced) and a list with
            one attention-score vector per decoded character.
        """
        # Build the inference models once instead of on every call; they share
        # their layers (and therefore weights) with the training model.
        if not getattr(self, "_inference_ready", False):
            self.inference_setup()

        encoder_results = self.encoder_model.predict(input_seq)
        attention_input = encoder_results[-1]

        # The encoder model returns the states of every encoder layer, but the decoder
        # model only accepts states for its own layers. Keep those of the first
        # num_dec_layers encoder layers, mirroring the pairing used in build_model.
        n_decoder_states = self.num_dec_layers * self._states_per_layer()
        dec_states = list(encoder_results[:-1])[:n_decoder_states]

        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = self.indic_char_to_idx["\t"]

        attention_weights = []
        stop_condition = False
        decoded_sentence = ""
        while not stop_condition:
            output_tokens = self.decoder_model.predict([target_seq] + dec_states + [attention_input])
            sampled_token_index = np.argmax(output_tokens[0][0, -1, :])
            sampled_char = self.indic_idx_to_char[sampled_token_index]
            decoded_sentence += sampled_char

            # Stop on the end marker or once the output exceeds the maximum decoder length.
            if sampled_char == "\n" or len(decoded_sentence) > self.max_seq_len_indic_decoder:
                stop_condition = True

            # Feed the sampled character and the updated states back in for the next step.
            target_seq = np.zeros((1, 1))
            target_seq[0, 0] = sampled_token_index

            dec_states = list(output_tokens[1:-1])
            attention_weights.append(output_tokens[-1][0][0])

        return decoded_sentence, attention_weights


In [17]:

# Look up the hyper-parameter sweep through the W&B public-API client `api`.
sweep = api.sweep("cs6910_a2/CS6910_A3/7hfn1ujr")
# Order the sweep's runs by their summary "val_accuracy", best first; runs without the metric count as 0.
runs = sorted(sweep.runs, key=lambda run: run.summary.get("val_accuracy", 0), reverse=True)
# Download the top run's "model-best.h5" file (replace=True is passed to the download call).
runs[0].file("model-best.h5").download(replace=True)
print("Best model saved to model-best.h5")
# NOTE(review): image_size / input_shape are not used by anything visible in this notebook
# chunk and look like leftovers from another task — confirm before removing.
# `input_shape` is the same list object as `image_size`, so the append below turns both into [224, 224, 3].
image_size = [224,224]
input_shape= image_size
input_shape.append(3)
# Load the downloaded Keras model from disk and print its layer summary.
model = keras.models.load_model('./model-best.h5')
model.summary()

Best model saved to model-best.h5
Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 encoder_input (InputLayer)     [(None, None)]       0           []                               
                                                                                                  
 decoder_input (InputLayer)     [(None, 21)]         0           []                               
                                                                                                  
 encoder_embedding (Embedding)  (None, None, 128)    3456        ['encoder_input[0][0]']          
                                                                                                  
 decoder_embedding (Embedding)  (None, 21, 128)      8448        ['decoder_input[0][0]']          
                                                          