In [None]:
import numpy as np
import pandas as pd 
import random
import tensorflow as tf
from tensorflow.keras.layers import SimpleRNN, GRU, LSTM, Embedding, Layer, Dense, Flatten
from tensorflow.keras.preprocessing.text import Tokenizer
import tensorflow.keras.backend as K

In [None]:
# Colab-only: mount Google Drive at /content/drive so files stored under
# /content/drive/MyDrive can be read and written by the cells below.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
def data_process(directory, tokenize_english=None, tokenize_tamil=None):
    """Turn a tab-separated lexicon file into a shuffled, padded tf.data.Dataset.

    Column 1 of the file is treated as the input text (tokenised with
    ``tokenize_english``) and column 0 as the target text (tokenised with
    ``tokenize_tamil``). Every entry is wrapped as "\\t" + text + "\\n" so that
    the tab acts as a start marker and the newline as an end marker.

    Args:
        directory: Path of the TSV file (no header row).
        tokenize_english: Optional already-fitted character-level Tokenizer
            for the input column. When None a new one is fitted on this file.
        tokenize_tamil: Optional already-fitted character-level Tokenizer for
            the target column. When None a new one is fitted on this file.

    Returns:
        A tuple ``(dataset, tokenize_english, tokenize_tamil)`` where
        ``dataset`` yields ``(input_ids, target_ids)`` pairs, post-padded to
        the longest sequence found in this file.
    """
    # dtype=str + keep_default_na=False: pandas otherwise parses words such as
    # "nan", "null" or "NA" as missing values, which silently drops the
    # start/end markers for those rows and feeds the literal text "nan" to the
    # tokenizers instead of the real word.
    df = pd.read_csv(directory, sep="\t", header=None, dtype=str, keep_default_na=False)

    def _with_markers(column):
        # fillna("") covers rows that are short a field; those become "\t\n".
        return ("\t" + df[column].fillna("").astype(str) + "\n").tolist()

    input_texts = _with_markers(1)
    target_texts = _with_markers(0)

    if tokenize_english is None:
        tokenize_english = Tokenizer(char_level=True)
        tokenize_english.fit_on_texts(input_texts)

    input_lang_tensor = tokenize_english.texts_to_sequences(input_texts)
    input_lang_tensor = tf.keras.preprocessing.sequence.pad_sequences(input_lang_tensor, padding='post')

    if tokenize_tamil is None:
        tokenize_tamil = Tokenizer(char_level=True)
        tokenize_tamil.fit_on_texts(target_texts)

    targ_lang_tensor = tokenize_tamil.texts_to_sequences(target_texts)
    targ_lang_tensor = tf.keras.preprocessing.sequence.pad_sequences(targ_lang_tensor, padding='post')

    dataset = tf.data.Dataset.from_tensor_slices((input_lang_tensor, targ_lang_tensor))
    # Buffer as large as the dataset, i.e. a full shuffle.
    dataset = dataset.shuffle(len(dataset))

    return dataset, tokenize_english, tokenize_tamil

In [None]:
def layer_selection(neuron, dropout, layer_name, return_sequences=False, return_state=False):
    """Build one recurrent Keras layer of the requested kind.

    Args:
        neuron: Number of units of the layer.
        dropout: Value forwarded as the layer's ``dropout`` argument.
        layer_name: One of "SimpleRNN", "GRU" or "LSTM".
        return_sequences: Forwarded to the layer.
        return_state: Forwarded to the layer.

    Returns:
        A new SimpleRNN, GRU or LSTM layer instance.

    Raises:
        ValueError: If ``layer_name`` is not one of the supported names
            (previously this silently returned None and failed much later).
    """
    if layer_name == "SimpleRNN":
        layer_class = SimpleRNN
    elif layer_name == "GRU":
        layer_class = GRU
    elif layer_name == "LSTM":
        layer_class = LSTM
    else:
        raise ValueError(
            "Unknown layer_name {!r}; expected 'SimpleRNN', 'GRU' or 'LSTM'".format(layer_name)
        )

    return layer_class(units=neuron, dropout=dropout, return_sequences=return_sequences, return_state=return_state)

class Attention(Layer):
  """Additive attention computed from a decoder/encoder state over encoder outputs.

  The score of each encoder position is ``VT(tanh(Q(state) + k(encoder_out)))``;
  the scores are soft-maxed along axis 1 and used to form a weighted sum of
  ``encoder_out``.
  """

  def __init__(self, neuron):
    """Create the three projection layers; ``neuron`` is the projection width."""
    super().__init__()
    self.Q = Dense(neuron)   # projects the (concatenated) state
    self.k = Dense(neuron)   # projects every encoder output
    self.VT = Dense(1)       # reduces each projected position to a scalar score

  def call(self, encoder_state, encoder_out):
    """Return ``(context, att_weights)`` for the given state and encoder outputs.

    ``encoder_state`` is a list of state tensors that are concatenated along
    axis 1 (for an LSTM that means both of its states).
    ``encoder_out`` is presumably (batch, time, units) - TODO confirm.
    """
    # Join all state tensors and add a length-1 axis so the sum below broadcasts
    # against axis 1 of encoder_out.
    query = tf.expand_dims(tf.concat(encoder_state, 1), 1)
    energy = tf.nn.tanh(self.Q(query) + self.k(encoder_out))
    att_weights = tf.nn.softmax(self.VT(energy), axis=1)
    context = tf.reduce_sum(att_weights * encoder_out, axis=1)
    return context, att_weights

class TranslationEncoder(tf.keras.Model):
    """Embedding followed by a stack of recurrent layers that encodes the input ids."""

    def __init__(self, emb_dimension, neuron, dropout, layer, no_of_layers, encoder_word_size):
        """Store the hyper-parameters and create the embedding and recurrent layers.

        Args:
            emb_dimension: Output size of the embedding.
            neuron: Units of every recurrent layer.
            dropout: Dropout value handed to every recurrent layer.
            layer: "SimpleRNN", "GRU" or "LSTM".
            no_of_layers: Number of stacked recurrent layers.
            encoder_word_size: Size of the input vocabulary (embedding input_dim).
        """
        super(TranslationEncoder, self).__init__()
        self.neuron = neuron
        self.dropout = dropout
        self.layer = layer
        self.no_of_layers = no_of_layers
        self.embedding = Embedding(encoder_word_size, emb_dimension)
        self.create_layers()

    def call(self, statex, hidden, training=None):
        """Encode a batch of token ids.

        Args:
            statex: Integer token ids fed to the embedding.
            hidden: List of initial state tensors for the first recurrent layer
                (see ``build_hidden_state``).
            training: Optional Keras training flag forwarded to the recurrent
                layers (it controls whether their dropout is applied). None
                lets Keras decide from the calling context.

        Returns:
            ``(state_output, state)``: the sequence output of the last layer and
            the list of final state tensors of the last layer.
        """
        output = self.embedding(statex)
        state = hidden

        # Each layer starts from the final state of the layer below it. The
        # state is passed explicitly through `initial_state` rather than by
        # handing the layer a `[output, *states]` list and relying on Keras to
        # split it implicitly.
        for layer in self.netwrk_layers:
            results = layer(output, initial_state=state, training=training)
            output, state = results[0], list(results[1:])

        return output, state

    def create_layers(self):
        """Create ``no_of_layers`` recurrent layers that return sequences and states."""
        self.netwrk_layers = [
            layer_selection(self.neuron, self.dropout, self.layer, return_sequences=True, return_state=True)
            for _ in range(self.no_of_layers)
        ]

    def build_hidden_state(self, batch_size):
        """Return the all-zero initial state: two tensors for "LSTM", one otherwise."""
        state_count = 2 if self.layer == "LSTM" else 1
        return [tf.zeros((batch_size, self.neuron)) for _ in range(state_count)]


class TransaltionDecoder(tf.keras.Model):
    """Single-step decoder: embedding, optional attention, recurrent stack, softmax layer."""

    def __init__(self, emb_dimension, neuron, dropout, layer, no_of_layers, decoder_word_size, att=False):
        """Store the hyper-parameters and create all sub-layers.

        Args:
            emb_dimension: Output size of the embedding.
            neuron: Units of every recurrent layer (and of the attention projections).
            dropout: Dropout value handed to every recurrent layer.
            layer: "SimpleRNN", "GRU" or "LSTM".
            no_of_layers: Number of stacked recurrent layers.
            decoder_word_size: Size of the target vocabulary (embedding input_dim
                and width of the softmax output).
            att: When True an Attention layer is created and used in ``call``.
        """
        super(TransaltionDecoder, self).__init__()
        self.neuron = neuron
        self.dropout = dropout
        self.layer = layer
        self.no_of_layers = no_of_layers
        self.embedding_layer = Embedding(input_dim=decoder_word_size, output_dim=emb_dimension)
        self.att = att

        self.dense = Dense(decoder_word_size, activation="softmax")
        self.flatten = Flatten()
        if self.att:
            self.att_layer = Attention(self.neuron)
        self.create_layers()

    def call(self, h, hidden, encoder_out=None, training=None):
        """Run one decoding step.

        Args:
            h: Integer ids of the current decoder input tokens.
            hidden: List of state tensors used as the initial state of the first
                recurrent layer (and as the attention query when ``att`` is on).
            encoder_out: Encoder sequence output; required when ``att`` is True,
                ignored otherwise.
            training: Optional Keras training flag forwarded to the recurrent
                layers (controls their dropout). None lets Keras decide.

        Returns:
            ``(state_output, state, att_weights)``: softmax probabilities over the
            target vocabulary, the final state list of the last recurrent layer,
            and the attention weights (None when attention is disabled).

        Raises:
            ValueError: If attention is enabled but ``encoder_out`` is None.
        """
        output = self.embedding_layer(h)

        if self.att:
            if encoder_out is None:
                raise ValueError("encoder_out is required when the decoder is built with att=True")
            context, att_weights = self.att_layer(hidden, encoder_out)
            # Prepend the context vector to the embedded token along the feature axis.
            output = tf.concat([tf.expand_dims(context, 1), output], -1)
        else:
            att_weights = None

        # Thread the state through the stack explicitly instead of passing the
        # previous layer's `[output, *states]` list as the next layer's input.
        state = hidden
        for layer in self.netwrk_layers:
            results = layer(output, initial_state=state, training=training)
            output, state = results[0], list(results[1:])

        state_output = self.dense(self.flatten(output))

        return state_output, state, att_weights

    def create_layers(self):
        """Create the recurrent stack; only the last layer drops the time axis."""
        self.netwrk_layers = [
            layer_selection(self.neuron, self.dropout, self.layer, return_sequences=True, return_state=True)
            for _ in range(self.no_of_layers - 1)
        ]
        self.netwrk_layers.append(layer_selection(self.neuron, self.dropout, self.layer, return_sequences=False, return_state=True))



In [None]:
class NLPModel():
    """Encoder-decoder transliteration model driven by a hand-written training loop.

    Owns a TranslationEncoder and a TransaltionDecoder and offers training
    (``fit_model``), evaluation (``stats``) and greedy decoding of a single word
    (``word_translation``).
    """

    def __init__(self, emb_dimension, neuron, dropout, no_of_layers, layer, att=False):
        """Store the hyper-parameters; the networks themselves are built by ``initialize``."""
        self.emb_dimension = emb_dimension
        self.no_of_layers = no_of_layers
        self.layer = layer
        self.neuron = neuron
        self.dropout = dropout
        self.att = att
        self.batch_size = 64
        # Defaults for attributes that fit_model normally sets, so the step
        # functions and word_translation fail clearly (or behave sensibly)
        # when they are used before fit_model.
        self.teacher_forcing = 1.0
        self.max_english_length = None
        self.max_tamil_length = None

    def initialize(self, input_lang_tokenizer, targ_lang_tokenizer, loss, optimizer, metric):
        """Build encoder/decoder for the given tokenizers and store loss, optimizer and metric."""
        self.input_lang_tokenizer = input_lang_tokenizer
        self.targ_lang_tokenizer = targ_lang_tokenizer
        # +1 because tokenizer indices start at 1; id 0 is the padding value.
        encoder_word_size = len(self.input_lang_tokenizer.word_index) + 1
        decoder_word_size = len(self.targ_lang_tokenizer.word_index) + 1
        self.encoder = TranslationEncoder(self.emb_dimension, self.neuron, self.dropout, self.layer, self.no_of_layers, encoder_word_size)
        self.decoder = TransaltionDecoder(self.emb_dimension, self.neuron, self.dropout, self.layer, self.no_of_layers, decoder_word_size, self.att)
        self.loss = loss
        self.optimizer = optimizer
        self.metric = metric

    def _start_tokens(self, tamil):
        """Return an int32 tensor shaped like ``tamil[:, :1]`` filled with the start-marker (tab) id.

        The batch size is taken from the tensor itself rather than from
        ``self.batch_size``, which a traced tf.function would freeze.
        """
        start_id = self.targ_lang_tokenizer.word_index["\t"]
        return tf.fill(tf.shape(tamil[:, :1]), start_id)

    @tf.function
    def training(self, english, tamil, encoder_state):
        """Run one optimisation step on a batch.

        Args:
            english: Batch of input token ids.
            tamil: Batch of target token ids; column 0 is the start marker.
            encoder_state: Initial encoder state (see ``build_hidden_state``).

        Returns:
            ``(total_batch_loss, accuracy)``: the summed per-step loss divided by
            the target length, and the metric's accumulated result so far.
        """
        loss = 0
        # Drawn inside the graph so a fresh decision is made on every call; a
        # Python-level random.random() would be evaluated only once, when the
        # function is traced. Note that self.teacher_forcing itself is still
        # read at trace time.
        use_teacher_forcing = tf.random.uniform(()) < self.teacher_forcing

        with tf.GradientTape() as t:
            # training=True is required for the recurrent layers' dropout to be
            # applied; Keras layers default to inference mode when called
            # directly from a custom loop.
            encoder_out, encoder_state = self.encoder(english, encoder_state, training=True)
            decoder_state = encoder_state
            decoder_input = self._start_tokens(tamil)

            for i in range(1, tamil.shape[1]):
                predictions, decoder_state, _ = self.decoder(decoder_input, decoder_state, encoder_out, training=True)
                target = tamil[:, i]
                loss = loss + self.loss(target, predictions)
                self.metric.update_state(target, predictions)

                # Next input: the ground-truth token (teacher forcing) or the
                # model's own greedy prediction.
                predicted = tf.argmax(predictions, axis=1, output_type=tf.int32)
                next_tokens = tf.where(use_teacher_forcing, tf.cast(target, tf.int32), predicted)
                decoder_input = tf.expand_dims(next_tokens, 1)

        total_batch_loss = loss / tamil.shape[1]

        total_variables = self.encoder.trainable_variables + self.decoder.trainable_variables
        final_gradients = t.gradient(loss, total_variables)
        self.optimizer.apply_gradients(zip(final_gradients, total_variables))

        return total_batch_loss, self.metric.result()

    @tf.function
    def validation(self, english, tamil, encoder_state):
        """Evaluate one batch with greedy decoding (no teacher forcing, no weight update).

        Returns:
            ``(total_batch_loss, accuracy)`` with the same meaning as in ``training``.
        """
        loss = 0
        encoder_out, encoder_state = self.encoder(english, encoder_state, training=False)
        decoder_state = encoder_state
        decoder_input = self._start_tokens(tamil)
        for t in range(1, tamil.shape[1]):
            predictions, decoder_state, _ = self.decoder(decoder_input, decoder_state, encoder_out, training=False)
            loss = loss + self.loss(tamil[:, t], predictions)
            self.metric.update_state(tamil[:, t], predictions)
            predictions = tf.argmax(predictions, 1)
            decoder_input = tf.expand_dims(predictions, 1)
        total_batch_loss = loss / tamil.shape[1]

        return total_batch_loss, self.metric.result()

    def _run_epoch(self, batches, step_fn, steps):
        """Run ``step_fn`` over ``steps`` batches and return ``(mean_loss, accuracy)``.

        The metric is reset first, so its final result is the accuracy over
        exactly these batches. (Averaging the per-batch return values instead
        would average cumulative readings and over-weight early batches.)
        """
        self.metric.reset_states()
        encoder_state = self.encoder.build_hidden_state(self.batch_size)
        loss_sum = 0
        for english, tamil in batches.take(steps):
            batch_loss, _ = step_fn(english, tamil, encoder_state)
            loss_sum = loss_sum + batch_loss
        return loss_sum / steps, self.metric.result()

    def fit_model(self, dataset, v_data, batch_size=64, epochs=20, teacher_forcing=0.9):
        """Train for ``epochs`` epochs, validating and reporting metrics after each one.

        Args:
            dataset: Unbatched training dataset of ``(input_ids, target_ids)``.
            v_data: Unbatched validation dataset of the same structure.
            batch_size: Batch size; incomplete final batches are dropped.
            epochs: Number of passes over ``dataset``.
            teacher_forcing: Probability, per training batch, of feeding the
                ground-truth token to the decoder instead of its own prediction.

        Raises:
            ValueError: If either dataset holds fewer than ``batch_size`` examples.
        """
        self.teacher_forcing = teacher_forcing
        self.batch_size = batch_size

        steps = len(dataset) // self.batch_size
        steps_in_validation = len(v_data) // self.batch_size
        if steps == 0 or steps_in_validation == 0:
            raise ValueError(
                "batch_size={} leaves no complete batch (train examples: {}, validation examples: {})".format(
                    self.batch_size, len(dataset), len(v_data)))

        dataset = dataset.batch(self.batch_size, drop_remainder=True)
        v_data = v_data.batch(self.batch_size, drop_remainder=True)

        # Remember the padded lengths; word_translation needs them later.
        english_sample, tamil_sample = next(iter(dataset))
        self.max_tamil_length = tamil_sample.shape[1]
        self.max_english_length = english_sample.shape[1]

        for epoch in range(1, epochs+1):
            average_loss, accuracy = self._run_epoch(dataset, self.training, steps)
            average_validation_loss, validation_accuracy = self._run_epoch(v_data, self.validation, steps_in_validation)

            metrics = {"epoch": epoch,
                       "training_loss": float(average_loss),
                       "validation_loss": float(average_validation_loss),
                       "training_accuracy": float(accuracy) * 100,
                       "validation_accuracy": float(validation_accuracy) * 100}
            if wandb.run is not None:
                wandb.log(metrics)
            else:
                # No active W&B run: print instead of failing inside wandb.log.
                print(metrics)

    def stats(self, test_data, batch_size=100):
        """Compute loss and accuracy of the model on ``test_data``.

        Args:
            test_data: Unbatched dataset of ``(input_ids, target_ids)``.
            batch_size: Evaluation batch size; also stored in ``self.batch_size``.

        Returns:
            ``(average_test_loss, average_test_accuracy)``; the accuracy is a
            fraction (not a percentage).

        Raises:
            ValueError: If ``test_data`` holds fewer than ``batch_size`` examples.
        """
        self.batch_size = batch_size
        steps_in_test = len(test_data) // batch_size
        if steps_in_test == 0:
            raise ValueError(
                "batch_size={} leaves no complete batch ({} test examples)".format(batch_size, len(test_data)))
        test_data = test_data.batch(batch_size, drop_remainder=True)

        average_test_loss, average_test_accuracy = self._run_epoch(test_data, self.validation, steps_in_test)

        return average_test_loss, average_test_accuracy

    def word_translation(self, english_word):
        """Greedily decode the transliteration of a single input word.

        Args:
            english_word: Input word without start/end markers.

        Returns:
            The predicted string, without the start and end markers.

        Raises:
            RuntimeError: If called before ``fit_model`` (the padded lengths are unknown).
        """
        if self.max_english_length is None or self.max_tamil_length is None:
            raise RuntimeError("word_translation requires fit_model to have been called first")

        english_word = "\t"+english_word+"\n"
        english = self.input_lang_tokenizer.texts_to_sequences([english_word])
        english = tf.keras.preprocessing.sequence.pad_sequences(english, maxlen=self.max_english_length, padding="post")

        encoder_state = self.encoder.build_hidden_state(1)
        encoder_out, encoder_state = self.encoder(english, encoder_state, training=False)
        decoder_state = encoder_state
        decoder_input = tf.expand_dims([self.targ_lang_tokenizer.word_index["\t"]], 1)

        characters = []
        for step in range(1, self.max_tamil_length):
            pred, decoder_state, _ = self.decoder(decoder_input, decoder_state, encoder_out, training=False)
            pred = tf.argmax(pred, 1)
            # Id 0 is the padding value and has no entry in index_word; treat it
            # like the end marker instead of raising KeyError.
            next_character = self.targ_lang_tokenizer.index_word.get(pred.numpy().item())
            if next_character is None or next_character == "\n":
                break
            characters.append(next_character)
            decoder_input = tf.expand_dims(pred, 1)

        # Nothing is stripped here: the end marker is never appended, so when
        # the length limit is hit every collected character is a real one.
        return "".join(characters)


In [None]:
!pip install wandb

Collecting wandb
  Downloading wandb-0.12.16-py2.py3-none-any.whl (1.8 MB)
[K     |████████████████████████████████| 1.8 MB 5.3 MB/s 
Collecting setproctitle
  Downloading setproctitle-1.2.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (29 kB)
Collecting shortuuid>=0.5.0
  Downloading shortuuid-1.0.9-py3-none-any.whl (9.4 kB)
Collecting pathtools
  Downloading pathtools-0.1.2.tar.gz (11 kB)
Collecting sentry-sdk>=1.0.0
  Downloading sentry_sdk-1.5.12-py2.py3-none-any.whl (145 kB)
[K     |████████████████████████████████| 145 kB 37.5 MB/s 
Collecting GitPython>=1.0.0
  Downloading GitPython-3.1.27-py3-none-any.whl (181 kB)
[K     |████████████████████████████████| 181 kB 42.1 MB/s 
Collecting docker-pycreds>=0.4.0
  Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl (9.0 kB)
Collecting gitdb<5,>=4.0.1
  Downloading gitdb-4.0.9-py3-none-any.whl (63 kB)
[K     |████████████████████████████████| 63 kB 1.7 MB/s 
Collecting smmap<6,>

In [None]:
# Import Weights & Biases, authenticate, and open a run.
# NOTE(review): this wandb.init() is called without a project or config, while
# train() calls wandb.init() again with its own project/entity - confirm that
# the extra run opened here is intended.
import wandb
wandb.login()
wandb.init()

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mtejoram[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
def train():
    """Train one model with the hyper-parameters found in ``wandb.config``.

    Intended as the function run by a W&B sweep agent: it opens a run, reads
    the hyper-parameters, names the run after them, loads the train and dev
    lexicons and calls ``NLPModel.fit_model`` for 15 epochs.
    """
    config_defaults = {"emb_dimension": 64,
                       "neuron": 128,
                       "dropout": 0,
                       "no_of_layers": 1,
                       "layer": "LSTM",
                       "att": False,
                       "teacher_forcing": 1.0
                       }

    # The defaults must be handed to wandb.init, otherwise wandb.config has no
    # such keys when the function runs outside a sweep. Values supplied by a
    # sweep take precedence over these defaults.
    wandb.init(config=config_defaults, project="cs6910-assignment3", entity="tejoram")
    config = wandb.config

    emb_dimension = config.emb_dimension
    neuron = config.neuron
    dropout = config.dropout
    no_of_layers = config.no_of_layers
    layer = config.layer
    att = config.att
    teacher_forcing = config.teacher_forcing

    wandb.run.name = "emd_{}_u_{}_d_{}_No.l_{}_l.type_{}_at_{}_tf_{}".format(
        emb_dimension, neuron, dropout, no_of_layers, layer, att, teacher_forcing)

    train_data = "/content/drive/MyDrive/Assignment3/dakshina_dataset_v1.0/ta/lexicons/ta.translit.sampled.train.tsv"
    validation_data= "/content/drive/MyDrive/Assignment3/dakshina_dataset_v1.0/ta/lexicons/ta.translit.sampled.dev.tsv"

    # The tokenizers are fitted on the training file and reused for the dev file.
    dataset, input_lang_tokenizer, targ_lang_tokenizer = data_process(train_data)
    v_data, _, _ = data_process(validation_data, input_lang_tokenizer, targ_lang_tokenizer)

    model = NLPModel(emb_dimension, neuron, dropout, no_of_layers, layer, att)
    model.initialize(input_lang_tokenizer, targ_lang_tokenizer, loss=tf.keras.losses.SparseCategoricalCrossentropy(), optimizer = tf.keras.optimizers.Adam(), metric = tf.keras.metrics.SparseCategoricalAccuracy())
    model.fit_model(dataset, v_data, epochs=15, teacher_forcing=teacher_forcing)



In [None]:
# Grid sweep that maximises "validation_accuracy". Every hyper-parameter lists
# its candidate values; with a single candidate each, the grid holds one run.
sweep_config = dict(
    metric=dict(name="validation_accuracy", goal="maximize"),
    name="Assignment3",
    method="grid",
    parameters={
        hyperparameter: {"values": candidates}
        for hyperparameter, candidates in (
            ("emb_dimension", [64]),
            ("neuron", [256]),
            ("dropout", [0.2]),
            ("no_of_layers", [3]),
            ("layer", ["LSTM"]),
            ("att", [False]),
            ("teacher_forcing", [1.0]),
        )
    },
)

In [None]:
# Register the sweep described by sweep_config under the W&B project
# "Assignment3" and keep the returned id for the agent.
# NOTE(review): train() passes project="cs6910-assignment3" to wandb.init -
# confirm which project the sweep runs are meant to end up in.
sweep_id = wandb.sweep(sweep_config, project="Assignment3")

Create sweep with ID: 6ltjhljk
Sweep URL: https://wandb.ai/tejoram/Assignment3/sweeps/6ltjhljk


In [None]:
# Start a sweep agent that calls train() for the configurations of sweep_id.
wandb.agent(sweep_id, function=train)

In [None]:
def besttest(emb_dimension, neuron, dropout, no_of_layers, layer, att, teacher_forcing=1.0):
    """Train a model with the given hyper-parameters and evaluate it on the test lexicon.

    Trains for 20 epochs on the train/dev files, prints character-level loss
    and accuracy on the test file, decodes every test word, prints the
    exact-match word accuracy and writes inputs, targets and predictions to
    ``save_outputs.csv`` on the mounted drive.

    Args:
        emb_dimension, neuron, dropout, no_of_layers, layer, att: Forwarded to ``NLPModel``.
        teacher_forcing: Forwarded to ``NLPModel.fit_model``.

    Returns:
        The trained ``NLPModel``.
    """
    train_data= "/content/drive/MyDrive/Assignment3/dakshina_dataset_v1.0/ta/lexicons/ta.translit.sampled.train.tsv"
    validation_data= "/content/drive/MyDrive/Assignment3/dakshina_dataset_v1.0/ta/lexicons/ta.translit.sampled.dev.tsv"
    test_data= "/content/drive/MyDrive/Assignment3/dakshina_dataset_v1.0/ta/lexicons/ta.translit.sampled.test.tsv"

    model = NLPModel(emb_dimension, neuron, dropout, no_of_layers, layer, att)
    dataset, input_lang_tokenizer, targ_lang_tokenizer = data_process(train_data)
    v_data, _, _ = data_process(validation_data, input_lang_tokenizer, targ_lang_tokenizer)
    model.initialize(input_lang_tokenizer, targ_lang_tokenizer, loss=tf.keras.losses.SparseCategoricalCrossentropy(), optimizer = tf.keras.optimizers.Adam(), metric = tf.keras.metrics.SparseCategoricalAccuracy())
    model.fit_model(dataset, v_data, epochs=20, teacher_forcing=teacher_forcing)

    test_data_processed, _, _ = data_process(test_data, model.input_lang_tokenizer, model.targ_lang_tokenizer)
    test_loss, test_accuracy = model.stats(test_data_processed)
    print("\ncharacter level accuracy:",test_accuracy)
    print("\ncharacter level loss:",test_loss)

    # Read everything as text: with pandas' default NA handling, words such as
    # "nan" or "null" would be parsed as missing values and then compared and
    # translated as the literal string "nan".
    td = pd.read_csv(test_data, sep="\t", header=None, dtype=str, keep_default_na=False)
    tamil = td[0].astype(str).tolist()
    english = td[1].astype(str).tolist()

    predictions = [model.word_translation(english_word) for english_word in english]

    # A word counts as correct only when the whole predicted string matches.
    correct_words = sum(predicted == target for predicted, target in zip(predictions, tamil))
    word_accuracy = correct_words / len(predictions) if predictions else 0.0
    print("\n Word Accuracy",word_accuracy)

    df = pd.DataFrame({"inputs": english, "targets": tamil, "predictions": predictions})
    df.to_csv("/content/drive/MyDrive/Assignment3/save_outputs.csv")

    return model

In [None]:
# Train the final model with the chosen hyper-parameters and evaluate it on the
# test set; besttest() also writes the per-word predictions to Drive.
model = besttest(emb_dimension=256, neuron=256, dropout=0.2, no_of_layers=3, layer="LSTM", att=False, teacher_forcing=1.0)