<a href="https://colab.research.google.com/github/RahulSundar/CS6910-DeepLearningFundamentals/blob/main/Assignment3/Assignment3_Seq2Seq_NeuralMachineTranslation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 1. Downloading the **Dakshina** dataset:


In [None]:
!pip install wandb

!wget https://storage.googleapis.com/gresearch/dakshina/dakshina_dataset_v1.0.tar

!tar -xf dakshina_dataset_v1.0.tar

## 2. Processing of the **Dakshina** dataset

### 2.1 Data Processing Class:

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
import cv2
import pathlib


class DataProcessing():
    """Load the Dakshina transliteration lexicons for one language and one-hot encode them.

    The train/dev/test TSV files are read from
    ``<DATAPATH>/<target_lang>/lexicons/<target_lang>.translit.sampled.{train,dev,test}.tsv``.
    Character vocabularies are built from the training split only and are reused
    unchanged for the validation and test splits.

    Every target word (in all three splits) is wrapped in a leading tab and a
    trailing newline, which act as start-of-sequence and end-of-sequence markers
    for the decoder. Sequences are padded to the split's maximum length with a
    space character.

    Attributes set by the constructor:
        train, val, test: the raw ``pandas.DataFrame`` of each split.
        {train,val,test}_encoder_input, *_decoder_input, *_decoder_target:
            float32 one-hot arrays of shape (samples, max_length, vocab_size).
        source_char2int / source_int2char, target_char2int / target_int2char:
            the character <-> index lookups built from the training split.
    """

    # Column names given to the header-less lexicon files.
    _COLUMNS = ["tgt", "src", "count"]
    # Markers wrapped around every target word.
    _START_TOKEN = "\t"
    _END_TOKEN = "\n"
    # Character used to pad every sequence up to the maximum length.
    _PAD_TOKEN = " "

    def __init__(self, DATAPATH, source_lang = 'en', target_lang = "te"):
        """Read the three splits under ``DATAPATH`` and encode them.

        Args:
            DATAPATH: root folder of the extracted Dakshina dataset.
            source_lang: label of the source language (stored, not used for paths).
            target_lang: language code selecting the sub-folder and file prefix.
        """
        self.source_lang = source_lang
        self.target_lang = target_lang

        lexicon_dir = os.path.join(DATAPATH, target_lang, "lexicons")
        self.trainpath = os.path.join(lexicon_dir, target_lang + ".translit.sampled.train.tsv")
        self.valpath = os.path.join(lexicon_dir, target_lang + ".translit.sampled.dev.tsv")
        self.testpath = os.path.join(lexicon_dir, target_lang + ".translit.sampled.test.tsv")

        self.train = self._read_lexicon(self.trainpath)
        self.val = self._read_lexicon(self.valpath)
        self.test = self._read_lexicon(self.testpath)

        # Training split: builds the vocabularies and the encoded arrays.
        self.train_data = self.preprocess(self.train["src"].to_list(), self.train["tgt"].to_list())
        (
            self.train_encoder_input,
            self.train_decoder_input,
            self.train_decoder_target,
            self.source_vocab,
            self.target_vocab,
        ) = self.train_data
        self.source_char2int, self.source_int2char = self.source_vocab
        self.target_char2int, self.target_int2char = self.target_vocab

        # Validation / test splits: only encoding is needed, because the
        # dictionary lookups must stay the ones built from the training split.
        self.val_data = self._encode_with_train_vocab(self.val)
        self.val_encoder_input, self.val_decoder_input, self.val_decoder_target = self.val_data

        self.test_data = self._encode_with_train_vocab(self.test)
        self.test_encoder_input, self.test_decoder_input, self.test_decoder_target = self.test_data

    def _read_lexicon(self, path):
        """Read one header-less, tab-separated lexicon file into a DataFrame."""
        # keep_default_na=False stops pandas from turning ordinary words such as
        # "null", "NA" or "nan" into missing values.
        return pd.read_csv(path, sep="\t", names=self._COLUMNS, keep_default_na=False)

    def _to_words(self, source, target):
        """Convert raw entries to strings and wrap each target in start/end markers."""
        source_words = [str(x) for x in source]
        target_words = [self._START_TOKEN + str(x) + self._END_TOKEN for x in target]
        return source_words, target_words

    def _encode_with_train_vocab(self, frame):
        """Encode a non-training split with the lookups built from the training split."""
        # Apply the same string conversion and start/end wrapping as for training,
        # so the decoder input/target alignment matches what the model is trained on.
        source_words, target_words = self._to_words(frame["src"].to_list(), frame["tgt"].to_list())
        return self.encode(
            source_words,
            target_words,
            list(self.source_char2int.keys()),
            list(self.target_char2int.keys()),
            source_char2int=self.source_char2int,
            target_char2int=self.target_char2int,
        )

    def dictionary_lookup(self, vocab):
        """Return ``(char2int, int2char)`` dictionaries for the characters in ``vocab``."""
        char2int = {char: i for i, char in enumerate(vocab)}
        int2char = {i: char for char, i in char2int.items()}
        return char2int, int2char


    def encode(self, source, target, source_chars, target_chars, source_char2int=None, target_char2int=None):
        """One-hot encode parallel lists of source and target strings.

        Args:
            source: list of source strings.
            target: list of target strings (already wrapped in start/end markers).
            source_chars: source vocabulary; its length is the encoder one-hot width.
            target_chars: target vocabulary; its length is the decoder one-hot width.
            source_char2int, target_char2int: existing lookups. When both are
                omitted, new lookups are generated from the vocabularies.

        Returns:
            ``(encoder_input, decoder_input, decoder_target)`` when lookups were
            supplied, otherwise
            ``(encoder_input, decoder_input, decoder_target, source_vocab, target_vocab)``
            where each vocab is a ``(char2int, int2char)`` pair.

        Raises:
            ValueError: if only one of the two lookups is supplied.
            KeyError: if a string contains a character missing from its lookup.
        """
        num_encoder_tokens = len(source_chars)
        num_decoder_tokens = len(target_chars)
        # default=0 keeps empty inputs from raising; they produce empty arrays.
        max_source_length = max((len(txt) for txt in source), default=0)
        max_target_length = max((len(txt) for txt in target), default=0)

        source_vocab, target_vocab = None, None
        if source_char2int is None and target_char2int is None:
            print("Generating the dictionary lookups for character to integer mapping and back")
            source_char2int, source_int2char = self.dictionary_lookup(source_chars)
            target_char2int, target_int2char = self.dictionary_lookup(target_chars)

            source_vocab = (source_char2int, source_int2char)
            target_vocab = (target_char2int, target_int2char)
        elif source_char2int is None or target_char2int is None:
            raise ValueError("source_char2int and target_char2int must be supplied together")

        encoder_input_data = np.zeros(
            (len(source), max_source_length, num_encoder_tokens), dtype="float32"
        )
        decoder_input_data = np.zeros(
            (len(source), max_target_length, num_decoder_tokens), dtype="float32"
        )
        decoder_target_data = np.zeros(
            (len(source), max_target_length, num_decoder_tokens), dtype="float32"
        )

        for i, (input_text, target_text) in enumerate(zip(source, target)):
            for t, char in enumerate(input_text):
                encoder_input_data[i, t, source_char2int[char]] = 1.0
            # Pad from the end of the word. Using the word length (rather than the
            # leftover loop index) keeps empty strings correctly padded.
            encoder_input_data[i, len(input_text):, source_char2int[self._PAD_TOKEN]] = 1.0

            for t, char in enumerate(target_text):
                decoder_input_data[i, t, target_char2int[char]] = 1.0
                if t > 0:
                    # decoder_target_data is ahead of decoder_input_data by one
                    # timestep and does not include the start character.
                    decoder_target_data[i, t - 1, target_char2int[char]] = 1.0
            decoder_input_data[i, len(target_text):, target_char2int[self._PAD_TOKEN]] = 1.0
            # The target sequence is one step shorter, so its padding starts one step earlier.
            decoder_target_data[i, max(len(target_text) - 1, 0):, target_char2int[self._PAD_TOKEN]] = 1.0

        if source_vocab is not None and target_vocab is not None:
            return (
                encoder_input_data,
                decoder_input_data,
                decoder_target_data,
                source_vocab,
                target_vocab,
            )
        return encoder_input_data, decoder_input_data, decoder_target_data


    def preprocess(self, source , target):
        """Build the vocabularies from ``source``/``target`` and encode them.

        Entries are converted to strings, each target is wrapped in the start and
        end markers, and the sorted sets of characters (plus the padding space)
        become the vocabularies.

        Returns:
            The 5-tuple produced by :meth:`encode` when it generates new lookups.
        """
        source_words, target_words = self._to_words(source, target)

        source_chars = sorted({char for word in source_words for char in word})
        target_chars = sorted({char for word in target_words for char in word})

        # The padding space must be in both vocabularies for encode() to work.
        # It is appended only when absent: a duplicate entry would make the
        # one-hot width larger than the lookup dictionary.
        if self._PAD_TOKEN not in source_chars:
            source_chars.append(self._PAD_TOKEN)
        if self._PAD_TOKEN not in target_chars:
            target_chars.append(self._PAD_TOKEN)

        num_encoder_tokens = len(source_chars)
        num_decoder_tokens = len(target_chars)
        max_source_length = max((len(txt) for txt in source_words), default=0)
        max_target_length = max((len(txt) for txt in target_words), default=0)

        print("Number of samples:", len(source_words))
        print("Source Vocab length:", num_encoder_tokens)
        print("Target Vocab length:", num_decoder_tokens)
        print("Max sequence length for inputs:", max_source_length)
        print("Max sequence length for outputs:", max_target_length)

        return self.encode(source_words, target_words, source_chars, target_chars)


### 2.2 Processing the database

By default, the source language is English (`en`) and the target language is Telugu (`te`).

In [None]:
import numpy as np
import pandas as pd
import os

# Folder the lexicons are read from (expected to be the extracted Dakshina archive).
DATAPATH = "./dakshina_dataset_v1.0"

# Build the encoded train/val/test arrays. The language codes are spelled out
# here for readability; they equal the constructor defaults (English -> Telugu).
dataBase = DataProcessing(DATAPATH, source_lang="en", target_lang="te")



## 3. Recurrent-neural-network-based model for sequence-to-sequence machine translation
### 3.1 Seq2Seq **Translation** Model class

In [None]:
import os

import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt

from tensorflow.keras import layers
 

#from tensorflow.keras import Input, Model
from tensorflow.keras.layers import Dense, Input, InputLayer, Flatten, Activation, LSTM, SimpleRNN, GRU, TimeDistributed
from tensorflow.keras.utils import plot_model
from tensorflow.keras.models import load_model, Sequential,  Model
from tensorflow.keras.callbacks import EarlyStopping


import wandb


class S2STranslation():
    """Configurable encoder-decoder (seq2seq) Keras model built from stacked recurrent layers.

    The configuration mapping must provide:
        numEncoders: number of stacked encoder layers (at least 1).
        numDecoders: number of stacked decoder layers.
        cell_type:   "RNN" (SimpleRNN), "LSTM" or "GRU".
        latentDim:   number of units in every recurrent layer.
        dropout:     input dropout rate of every recurrent layer.
        hidden:      width of the ReLU dense layer placed before the softmax.
    """

    def __init__(self, modelConfigDict, srcChar2Int, tgtChar2Int, using_pretrained_model = False):
        """Store the hyperparameters and the character lookups.

        Args:
            modelConfigDict: mapping indexed with the keys listed in the class docstring.
            srcChar2Int: source character -> index lookup; its size is the encoder input width.
            tgtChar2Int: target character -> index lookup; its size is the decoder
                input width and the softmax output width.
            using_pretrained_model: accepted for interface compatibility; currently unused.
        """
        #self.native_vocabulary = modelConfigDict["native_vocabulary"]
        self.numEncoders = modelConfigDict["numEncoders"]
        self.cell_type = modelConfigDict["cell_type"]
        self.latentDim = modelConfigDict["latentDim"]
        self.dropout = modelConfigDict["dropout"]
        self.numDecoders = modelConfigDict["numDecoders"]
        self.hidden = modelConfigDict["hidden"]
        self.tgtChar2Int = tgtChar2Int
        self.srcChar2Int = srcChar2Int

    def build_configurable_model(self):
        """Build the (uncompiled) training model.

        Returns:
            A Keras ``Model`` taking ``[encoder_inputs, decoder_inputs]`` (one-hot
            sequences) and returning a softmax over the target vocabulary for
            every decoder timestep.

        Raises:
            ValueError: if ``cell_type`` is not "RNN", "LSTM" or "GRU", or if
                ``numEncoders`` is smaller than 1.
        """
        supported_cells = ("RNN", "LSTM", "GRU")
        if self.cell_type not in supported_cells:
            raise ValueError(
                "Unsupported cell_type %r; expected one of %s"
                % (self.cell_type, ", ".join(supported_cells))
            )
        if self.numEncoders < 1:
            # Without an encoder layer there is no state to initialise the decoder with.
            raise ValueError("numEncoders must be at least 1, got %r" % (self.numEncoders,))

        recurrent_layer = {"RNN": SimpleRNN, "LSTM": LSTM, "GRU": GRU}[self.cell_type]

        def make_layer():
            # All encoder and decoder layers share the same hyperparameters.
            return recurrent_layer(
                self.latentDim,
                return_state=True,
                return_sequences=True,
                dropout=self.dropout,
            )

        # encoder: each layer consumes the output sequence of the previous one.
        encoder_inputs = Input(shape=(None, len(self.srcChar2Int)))
        encoder_outputs = encoder_inputs
        encoder_states = []
        for _ in range(self.numEncoders):
            # SimpleRNN/GRU return (outputs, state); LSTM returns (outputs, state_h, state_c).
            encoder_outputs, *encoder_states = make_layer()(encoder_outputs)

        # decoder: every layer starts from the final state(s) of the last encoder layer.
        decoder_inputs = Input(shape=(None, len(self.tgtChar2Int)))
        decoder_outputs = decoder_inputs
        for _ in range(self.numDecoders):
            decoder_outputs, *_decoder_states = make_layer()(
                decoder_outputs, initial_state=encoder_states
            )

        # dense
        hidden = Dense(self.hidden, activation="relu")
        hidden_outputs = hidden(decoder_outputs)
        decoder_dense = Dense(len(self.tgtChar2Int), activation="softmax")
        decoder_outputs = decoder_dense(hidden_outputs)
        model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

        return model

### 3.2 Model Training


In [None]:
import numpy as np
import pandas as pd
import os

from tensorflow.keras import Input, Model
from tensorflow.keras.layers import RNN, LSTM, GRU, Dense
from tensorflow.keras.utils import plot_model
from tensorflow.keras.models import load_model
from tensorflow.keras.callbacks import EarlyStopping

import wandb
from wandb.keras import WandbCallback


import tensorflow as tf

physical_devices = tf.config.list_physical_devices('GPU')
# Let TensorFlow grow GPU memory on demand instead of reserving it all up front.
# The setting is applied to every visible GPU; with no GPU the loop is a no-op.
for gpu in physical_devices:
    try:
        tf.config.experimental.set_memory_growth(gpu, True)
    except (ValueError, RuntimeError) as err:
        # ValueError: invalid device. RuntimeError: the runtime is already
        # initialised and virtual devices can no longer be modified.
        # Best effort only: report the problem and carry on.
        print(f"Could not enable memory growth on {gpu}: {err}")


def train():
    """Run one training job tracked by Weights & Biases.

    Starts a wandb run (hyperparameters come from ``wandb.config``, falling back
    to the defaults below when no sweep supplies them), builds the seq2seq model
    from the module-level ``dataBase``, trains it with early stopping on
    ``val_accuracy`` and saves it under ``./TrainedModels/<run name>``.

    The wandb run is always closed: normally on success, and with a non-zero
    exit code if anything fails, after which the exception is re-raised.
    """
    config_defaults = {
        "cell_type": "RNN",
        "latentDim": 256,
        "hidden": 128,
        "optimiser": "rmsprop",
        "numEncoders": 1,
        "numDecoders": 1,
        "dropout": 0.2,
        "epochs": 1,
        "batch_size": 64,
    }

    wandb.init(config=config_defaults,  project="CS6910-Assignment-3", entity="rahulsundar")
    try:
        config = wandb.config

        # Human-readable run name that encodes the hyperparameters of this run.
        wandb.run.name = (
            f"{config.cell_type}{dataBase.source_lang}{config.numEncoders}"
            f"_{dataBase.target_lang}"
            f"_{config.numDecoders}"
            f"_{config.optimiser}"
            f"_{config.epochs}"
            f"_{config.dropout}"
            f"_{config.batch_size}"
            f"_{config.latentDim}"
        )
        wandb.run.save()

        modelInit = S2STranslation(config,srcChar2Int=dataBase.source_char2int, tgtChar2Int=dataBase.target_char2int)
        model = modelInit.build_configurable_model()
        model.summary()

        model.compile(
            optimizer=config.optimiser,
            loss="categorical_crossentropy",
            metrics=["accuracy"],
        )

        # Stop when val_accuracy has not improved by at least 0.01 for 5 epochs.
        earlystopping = EarlyStopping(
            monitor="val_accuracy", min_delta=0.01, patience=5, verbose=2, mode="auto"
        )

        model.fit(
            [dataBase.train_encoder_input, dataBase.train_decoder_input],
            dataBase.train_decoder_target,
            batch_size=config.batch_size,
            epochs=config.epochs,
            validation_data=([dataBase.val_encoder_input, dataBase.val_decoder_input], dataBase.val_decoder_target),
            callbacks=[earlystopping, WandbCallback()],
        )

        # Make sure the output folder exists before saving into it.
        output_dir = "./TrainedModels"
        os.makedirs(output_dir, exist_ok=True)
        # NOTE(review): the path has no file extension; newer Keras releases may
        # require ".keras" or ".h5" here - confirm against the installed version.
        model.save(os.path.join(output_dir, wandb.run.name))
    except Exception:
        # Close the run as failed so it is not left open, then propagate the error.
        wandb.finish(exit_code=1)
        raise
    wandb.finish()

    #return model






Running the train function without sweep: 

In [None]:
# Set to True to launch one training run with the default hyperparameters
# defined inside train() (no hyperparameter sweep). It is off by default, so
# running the whole notebook goes straight to the sweep cell below.
RUN_SINGLE_TRAINING = False

if RUN_SINGLE_TRAINING:
    train()

Running the wandb sweep: 

In [None]:
  
# Bayesian hyperparameter search that maximises validation accuracy.
# Each entry under "parameters" lists the discrete values the sweep may pick.
sweep_config = dict(
    name="Bayesian Sweep without attention",
    method="bayes",
    metric=dict(name="val_accuracy", goal="maximize"),
    parameters=dict(
        cell_type=dict(values=["RNN", "GRU", "LSTM"]),
        latentDim=dict(values=[256]),
        hidden=dict(values=[128, 64]),
        optimiser=dict(values=["rmsprop", "adam"]),
        numEncoders=dict(values=[1, 2, 3]),
        numDecoders=dict(values=[1, 2, 3]),
        dropout=dict(values=[0.1, 0.2, 0.3]),
        epochs=dict(values=[5, 10, 15, 20]),
        batch_size=dict(values=[32, 64]),
    ),
)

# Register the sweep, then let this notebook act as an agent that calls
# train() once per sampled configuration, for at most 200 runs.
sweep_id = wandb.sweep(
    sweep_config,
    entity="rahulsundar",
    project="CS6910-Assignment-3",
)

wandb.agent(sweep_id, train, count=200)

Copy the trained models to Google Drive

In [None]:
# Mount Google Drive at /content/gdrive so the notebook can write files to it.
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
!cp -rf ./TrainedModels /content/gdrive/MyDrive/CS6910/Assignment3/