In [1]:
#JV

In [2]:
import numpy as np

import pandas as pd

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from torch.utils.data import Dataset,TensorDataset, DataLoader, RandomSampler
from torch.nn.utils.rnn import pad_sequence
import random
from functools import partial

import os
import gc
import time
import math

import matplotlib.pyplot as plt
#plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np

from tqdm.notebook import tqdm

#!python3 -m pip install wandb
import wandb

from Core_Utils import *

from Encoder_Decoder_Architecture import *

In [3]:
# Fix every random number generator used in the notebook to one seed so repeated runs match.
seed = 23

for _seed_rng in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
    _seed_rng(seed)

# Request deterministic cuDNN kernels.
torch.backends.cudnn.deterministic = True

In [13]:
class MachineTranslator:

    """
    Bundles an Encoder/Decoder pair with the routines used to train it and to measure its accuracy.
    """

    def __init__(self,source_vocab_size,target_vocab_size,hidden_size,embedding_size,rnn_type,batch_size,pad_token_id,dropout,num_layers,bidirectional,use_attention,device):

        """
        Build the encoder and the decoder and move both to `device`.

        Params:

            source_vocab_size : The vocabulary size of the source language.
            target_vocab_size : The vocabulary size of the target language.
            hidden_size : The dimension of the hidden state of the recurrent cell.
            embedding_size : The dimension of the embedding used.
            rnn_type : "GRU"/"LSTM"/"RNN". The training code compares encoder.rnn_type with the upper-case
                       string "LSTM"; whether Encoder normalises the case itself is not visible here, so pass it upper-cased.
            batch_size : The batch size used for training; forwarded to the Decoder only.
            pad_token_id : The id corresponding to the token <pad>; used as padding_idx of both modules.
            dropout : Dropout probability, forwarded to both modules.
            num_layers(int) : The number of recurrent layers, shared by the encoder and the decoder.
            bidirectional : True/False, forwarded to both modules.
            use_attention : Boolean, forwarded to the Decoder.
            device : The device on which both modules are placed.

        Returns:

            None.
        """
        self.device = device

        self.encoder = Encoder(source_vocab_size = source_vocab_size, hidden_size = hidden_size,embedding_size=embedding_size,rnn_type = rnn_type,padding_idx=pad_token_id,num_layers=num_layers,bidirectional=bidirectional,dropout=dropout).to(self.device)

        self.decoder = Decoder(hidden_size = hidden_size,embedding_size=embedding_size, target_vocab_size = target_vocab_size,batch_size = batch_size,rnn_type = rnn_type,use_attention = use_attention, padding_idx = pad_token_id,num_layers = num_layers,bidirectional = bidirectional,dropout=dropout,device=self.device).to(self.device)

    @staticmethod
    def _initial_encoder_state(encoder, batch_size, device):

        """
        Return the (hidden, cell) pair handed to the encoder: zero tensors for an LSTM, (None, None) otherwise.
        """
        if encoder.rnn_type == "LSTM":
            state_shape = (encoder.num_layers*encoder.D, batch_size, encoder.hidden_size)
            return torch.zeros(*state_shape, device=device), torch.zeros(*state_shape, device=device)

        return None, None

    @staticmethod
    def _padding_mask(target_tensor, pad_token_id):

        """
        Return a boolean mask shaped like `target_tensor` that is True from the first <pad> of each row onwards.

        Rows that contain no <pad> token get an all-False mask. (argmax alone returns 0 for such rows,
        which would wrongly mark the whole word as padding.)
        """
        ## argmax is not supported for bool on cuda, hence casting it to long.
        is_pad = (target_tensor == pad_token_id).to(torch.long)
        seq_len = target_tensor.size(1)

        padding_start = torch.argmax(is_pad,dim=1)
        ## rows without any padding start "padding" past the last position.
        padding_start = padding_start.masked_fill(is_pad.sum(dim=1) == 0, seq_len)

        positions = torch.arange(seq_len, device=target_tensor.device).unsqueeze(0)
        return positions >= padding_start.unsqueeze(1)

    @staticmethod
    def _percentage(numerator, denominator):

        """
        Return numerator/denominator as a percentage rounded to 2 decimals, or 0.0 when the denominator is 0.
        """
        if denominator == 0:
            return 0.0
        return round(numerator*100/denominator,2)

    def train_epoch(self,train_loader, encoder, decoder, encoder_optim,decoder_optim, loss_criterion,teacher_forcing_ratio,ignore_padding=True,device='cpu'):

        """
        Method to train the encoder-decoder model for 1 epoch.

        Params:

            train_loader : The dataloader of the train data. Each batch must unpack into 4 items, the first two
                           being the input and target tensors; train_loader.dataset.pad_token_id is read when
                           ignore_padding is True.
            encoder : The encoder model object.
            decoder : The decoder model object.
            encoder_optim : A torch optim object, corresponding to the optimizer of encoder.
            decoder_optim : A torch optim object, corresponding to the optimizer of decoder.
            loss_criterion : The loss criterion.
            teacher_forcing_ratio : The teacher forcing ratio to be used.
            ignore_padding : True by default. If True, positions from the first <pad> onwards do not affect
                             whether a word counts as correct; if False every position must match.
            device : Device on which the initial LSTM states are created. CPU by default.

        Returns:
            (loss, accuracy) of the epoch: the mean of the per-batch losses rounded to 4 decimals and the
            word-level accuracy in percent rounded to 2 decimals.

        """

        tot_correct_word_preds = 0
        tot_words = 0
        epoch_loss = 0

        for data in tqdm(train_loader):

            input_tensor, target_tensor,_,_ = data

            encoder_optim.zero_grad()
            decoder_optim.zero_grad()

            batch_size = input_tensor.shape[0]

            encoder_hidden, encoder_cell = self._initial_encoder_state(encoder, batch_size, device)

            encoder_hidden_contexts, encoder_last_hidden, encoder_cell = encoder(input_tensor,encoder_hidden,encoder_cell)

            decoder_outputs, _, _ = decoder(encoder_hidden_contexts, encoder_last_hidden,encoder_cell, target_tensor=target_tensor,teacher_forcing_ratio = teacher_forcing_ratio)

            multi_step_preds = torch.argmax(decoder_outputs,dim=2)
            multi_step_pred_correctness = (multi_step_preds ==  target_tensor)

            if ignore_padding:
                ## OR-ing with the mask makes sure that the padding tokens do not affect the correctness of the word
                mask = self._padding_mask(target_tensor, train_loader.dataset.pad_token_id)
                multi_step_pred_correctness = torch.logical_or(multi_step_pred_correctness,mask)

            ## words are counted in both modes so that the accuracy below is always defined.
            tot_correct_word_preds += (torch.all(multi_step_pred_correctness,dim=1).int().sum()).item()
            tot_words += multi_step_preds.shape[0]

            loss = loss_criterion(
                decoder_outputs.view(-1, decoder_outputs.size(-1)),
                target_tensor.view(-1)
            )
            loss.backward()

            encoder_optim.step()
            decoder_optim.step()

            epoch_loss += loss.item()

        ## max(..., 1) keeps an empty loader from raising ZeroDivisionError.
        epoch_loss = round(epoch_loss / max(len(train_loader),1),4)
        epoch_accuracy = self._percentage(tot_correct_word_preds,tot_words)

        return epoch_loss,epoch_accuracy

    def train(self,train_loader,valid_loader, encoder, decoder, epochs,padding_idx,optimiser = "adam",loss="crossentropy",weight_decay=0, lr=0.001,teacher_forcing_ratio = 0,device='cpu',wandb_logging = False):

        """
        The method to train the encoder-decoder model. Makes use of train_epoch and compute_accuracy and
        prints (and optionally logs to wandb) the train/validation loss and accuracy after every epoch.

        Params:

            train_loader : The dataloader corresponding to the train data.
            valid_loader : The dataloader corresponding to the validation data.
            encoder : The encoder model object.
            decoder : The decoder model object.
            epochs : Number of epochs of training.
            padding_idx : The index corresponding to the <pad> token; ignored by the loss.
            optimiser : The optimiser used for training, "adam"/"nadam"/"rmsprop", default : "adam". (Case insensitive)
            loss : The loss function, only "crossentropy" is supported. (Case insensitive)
            weight_decay : weight_decay passed to both optimisers.
            lr : The learning rate, default is 0.001
            teacher_forcing_ratio : Teacher forcing ratio, default is 0.
            device : Default is CPU.
            wandb_logging : Default is False.

        Returns:

            None.

        Raises:

            ValueError : if `optimiser` or `loss` is not one of the supported names.

        """

        ## specify the optimiser; the class is looked up by name so only the requested one has to exist in torch.optim.
        optimiser_class_names = {"adam": "Adam", "nadam": "NAdam", "rmsprop": "RMSprop"}
        optimiser_key = optimiser.lower()
        if optimiser_key not in optimiser_class_names:
            raise ValueError(f"Unsupported optimiser '{optimiser}'; expected one of {sorted(optimiser_class_names)}")

        optimiser_class = getattr(optim, optimiser_class_names[optimiser_key])
        encoder_optimizer = optimiser_class(encoder.parameters(), lr=lr,weight_decay=weight_decay)
        decoder_optimizer = optimiser_class(decoder.parameters(), lr=lr,weight_decay=weight_decay)

        ## Specify the loss criteria
        if loss.lower() != "crossentropy":
            raise ValueError(f"Unsupported loss '{loss}'; only 'crossentropy' is supported")
        loss_criterion = nn.CrossEntropyLoss(ignore_index = padding_idx).to(device)

        for epoch in tqdm(range(epochs)):

            ## Train for 1 epoch.
            train_loss,train_accuracy = self.train_epoch(train_loader, encoder, decoder, encoder_optimizer, decoder_optimizer, loss_criterion,teacher_forcing_ratio,device=device)

            ## compute validation accuracy.
            val_loss,_,val_accuracy = self.compute_accuracy(valid_loader,encoder,decoder,loss_criterion,ignore_padding=True,device=device)

            print(f"Epoch {epoch+1}\t Train Loss : {train_loss}\t Train Acc : {train_accuracy}% \t Val Loss : {val_loss}\t Val Acc : {val_accuracy}%")
            if wandb_logging:
                wandb.log({'epoch': epoch+1,'train loss': train_loss, 'train accuracy': train_accuracy, 'Validation loss': val_loss, 'Validation accuracy': val_accuracy})

    def compute_accuracy(self,dataloader,encoder,decoder,criterion,ignore_padding = True,device='cpu'):

        """
        Method to compute the loss and accuracy of the model (encoder-decoder) over a dataloader, without teacher forcing.

        This method returns word and character level accuracy.

            Word Level Accuracy : Accuracy is computed at the word level and a word is right iff every character is predicted correctly.
            Char Level Accuracy : Accuracy is computed by comparing each predicted character wrt the correct char.

        Params:

            dataloader : The train/test/valid dataloader. dataloader.dataset.pad_token_id is read when ignore_padding is True.
            encoder : The encoder
            decoder : The decoder
            criterion : The loss criterion applied to every batch.
            ignore_padding : If True, padding positions cannot make a word wrong (word level accuracy) and are
                            not counted at all (char level accuracy).

                            If False, padding is treated like any other character in both accuracies.
            device : Device on which the initial LSTM states are created. CPU by default.

        Returns:

            (loss, char_level_accuracy, word_level_accuracy): the mean of the per-batch losses rounded to 4
            decimals, and the two accuracies in percent rounded to 2 decimals.

        Both modules are switched to eval mode for the duration of the call and put back in their previous mode afterwards.
        """

        tot_chars = 0
        tot_words = 0

        tot_correct_char_preds = 0
        tot_correct_word_preds = 0

        loss = 0
        num_batches = 0

        ## remember the current modes so that they can be restored, even if a batch raises.
        encoder_was_training = encoder.training
        decoder_was_training = decoder.training

        encoder.eval()
        decoder.eval()

        try:

            with torch.no_grad():

                for data in dataloader:

                    input_tensor, target_tensor,_,_ = data

                    batch_size = input_tensor.shape[0]

                    encoder_hidden, encoder_cell = self._initial_encoder_state(encoder, batch_size, device)

                    encoder_hidden_contexts, encoder_last_hidden, encoder_cell = encoder(input_tensor,encoder_hidden,encoder_cell)
                    ## even though we are passing target tensor, the teacher forcing ratio is 0, so no teacher forcing
                    decoder_outputs, _, _ = decoder(encoder_hidden_contexts, encoder_last_hidden,encoder_cell, target_tensor = target_tensor,teacher_forcing_ratio = 0)

                    loss += criterion(decoder_outputs.view(-1, decoder_outputs.size(-1)), target_tensor.view(-1)).item()
                    num_batches += 1

                    ## For a batch, for each position find the most probable output character.
                    multi_step_preds = torch.argmax(decoder_outputs,dim=2)
                    multi_step_pred_correctness = (multi_step_preds ==  target_tensor)

                    tot_words += multi_step_preds.shape[0]

                    if ignore_padding: ## if padding has to be ignored.

                        mask = self._padding_mask(target_tensor, dataloader.dataset.pad_token_id)
                        non_pad_positions = torch.logical_not(mask)

                        ##doing a logical OR with the mask makes sure that the padding tokens do not affect the correctness of the word
                        tot_correct_word_preds += (torch.all(torch.logical_or(multi_step_pred_correctness,mask),dim=1).int().sum()).item()

                        ##only the non padding characters take part in the char level accuracy.
                        tot_correct_char_preds += (torch.logical_and(multi_step_pred_correctness,non_pad_positions).int().sum()).item()
                        tot_chars += non_pad_positions.int().sum().item()

                    else: ##otherwise.

                        tot_correct_word_preds += (torch.all(multi_step_pred_correctness,dim=1).int().sum()).item()

                        tot_correct_char_preds += (multi_step_pred_correctness.int().sum()).item()
                        tot_chars += multi_step_preds.numel()

        finally:

            encoder.train(encoder_was_training)
            decoder.train(decoder_was_training)

        char_lvl_accuracy = self._percentage(tot_correct_char_preds,tot_chars)
        word_lvl_accuracy = self._percentage(tot_correct_word_preds,tot_words)

        ## every batch contributes one loss value, so average over batches (same convention as train_epoch).
        loss = loss / num_batches if num_batches else 0.0

        return round(loss,4),char_lvl_accuracy,word_lvl_accuracy
        
                

In [8]:
def evaluate(encoder, decoder, word, language_processor,device = "cpu"):

    """
    Run a single source word through the encoder-decoder and decode the prediction.

    Params:

        encoder : The encoder model object.
        decoder : The decoder model object; called with eval_mode=True and no target tensor.
        word : The source word to convert.
        language_processor : Object providing encode_word / decode_word and the source_lang / target_lang attributes.
        device : Device on which the input tensor and the initial LSTM states are created. Default "cpu".

    Returns:

        (decoded_word, attentions): whatever language_processor.decode_word returns for the predicted ids,
        and the third value returned by the decoder.

    Both modules are switched to eval mode for the duration of the call and put back in their previous mode afterwards.
    """

    lp = language_processor

    ## remember the current modes so that they can be restored, even if decoding raises.
    encoder_was_training = encoder.training
    decoder_was_training = decoder.training

    encoder.eval()
    decoder.eval()

    try:

        with torch.no_grad():

            input_tensor = torch.tensor(lp.encode_word(word,lp.source_lang,padding=False,append_eos = True)).to(device).view(1,-1)

            batch_size = 1

            if encoder.rnn_type == "LSTM":
                encoder_hidden = torch.zeros(encoder.num_layers*encoder.D, batch_size, encoder.hidden_size, device=device)
                encoder_cell = torch.zeros(encoder.num_layers*encoder.D, batch_size, encoder.hidden_size, device=device)
            else:
                encoder_hidden = None
                encoder_cell = None

            encoder_hidden_contexts, encoder_last_hidden, encoder_cell = encoder(input_tensor,encoder_hidden,encoder_cell)

            decoder_outputs, _, attentions = decoder(encoder_hidden_contexts, encoder_last_hidden,encoder_cell ,eval_mode = True,target_tensor = None)

            ## one row of scores per decoded step; the number of steps is taken from the decoder output
            ## instead of being fixed to 30.
            decoder_outputs = decoder_outputs.view(-1, decoder_outputs.size(-1))

            _, topi = decoder_outputs.topk(1)
            ## squeeze only the top-k axis so that a single decoded step still yields a 1-d tensor.
            decoded_ids = topi.squeeze(-1)

            decoded_word = lp.decode_word(decoded_ids.cpu().numpy(),lp.target_lang)

    finally:

        encoder.train(encoder_was_training)
        decoder.train(decoder_was_training)

    return decoded_word, attentions

In [19]:
def _pick_local_device():
    """Return the MPS device when PyTorch reports it as available, otherwise CUDA, otherwise CPU."""
    mps_backend = getattr(torch.backends, "mps", None)
    if mps_backend is not None and mps_backend.is_available():
        return torch.device("mps")
    return torch.device("cuda" if torch.cuda.is_available() else "cpu")


def _load_split(lang_dir, target_lang, mode, vocab_from=None):
    """Create the LanguageProcessor of one data split, optionally reusing the vocabulary of `vocab_from`."""
    lp = LanguageProcessor(language_directory=lang_dir,target_lang_name=target_lang,mode=mode,meta_tokens=True)

    if vocab_from is not None:
        ## to make sure that the same vocabulary and dictionaries are used everywhere
        lp.source_char2id = vocab_from.source_char2id
        lp.source_id2char = vocab_from.source_id2char
        lp.target_char2id = vocab_from.target_char2id
        lp.target_id2char = vocab_from.target_id2char

    return lp


def setup_and_start_expt(config,wandb_log = True,kaggle=False):

    """
    Build the train/validation loaders and a MachineTranslator from `config`, train it and return it.

    Params:

        config : Mapping (a dict or wandb.config) with the keys 'batch_size', 'hidden_size', 'epochs',
                 'optimiser', 'weight_decay', 'lr', 'num_layers', 'rnn_type', 'bidirectional',
                 'teacher_forcing_ratio' and 'dropout'. 'use_attention' is optional and defaults to True.
                 Any 'embedding_size' entry is not read: the embedding size is set equal to 'hidden_size'.
        wandb_log : If True the per-epoch metrics are logged to wandb.
        kaggle : If True the Kaggle input directory is used and the device is CUDA when available, else CPU.
                 If False the local data directory is used and the device is MPS, CUDA or CPU, whichever is
                 available first.

    Returns:

        The trained MachineTranslator.
    """

    batch_size = config['batch_size']
    target_lang = "tel"

    if kaggle:
        base_dir = "/kaggle/input/aksharantar-sampled/aksharantar_sampled/"
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    else:
        base_dir = "aksharantar_sampled/"
        ## falls back to CUDA/CPU instead of failing on machines without MPS.
        device = _pick_local_device()

    lang_dir = base_dir + target_lang + "/"

    ##creating train loader
    train_lp = _load_split(lang_dir, target_lang, "train")
    pad_token_id = train_lp.source_char2id['<pad>']

    collate_fn_ptr = partial(collate_fn,pad_token_id=pad_token_id,device=device)

    train_dataset = WordDataset(train_lp,device=device)
    train_loader = DataLoader(train_dataset, batch_size=batch_size,collate_fn=collate_fn_ptr, shuffle=True)

    ## creating validation loader (the test split is not needed for training, so it is not loaded here)
    valid_lp = _load_split(lang_dir, target_lang, "valid", vocab_from=train_lp)

    valid_dataset = WordDataset(valid_lp,device=device)
    valid_loader = DataLoader(valid_dataset, batch_size=batch_size,collate_fn=collate_fn_ptr, shuffle=True)

    source_vocab_size = len(train_lp.source_char2id)
    target_vocab_size = len(train_lp.target_char2id)

    hidden_size = config['hidden_size']
    ## the embedding size is tied to the hidden size.
    embedding_size = hidden_size

    epochs = config['epochs']
    optimiser = config['optimiser']
    weight_decay = config['weight_decay']
    lr = config['lr']
    num_layers = config['num_layers']

    ## Allowed Values : "GRU"/"RNN"/"LSTM" (not case sensitive)
    rnn_type = config['rnn_type'].upper()

    bidirectional = config['bidirectional']
    teacher_forcing_ratio = config['teacher_forcing_ratio']
    dropout = config['dropout']

    ## sweep configurations may not define this key; attention is then enabled.
    use_attention = config['use_attention'] if 'use_attention' in config else True

    model = MachineTranslator(source_vocab_size,target_vocab_size,hidden_size,embedding_size,rnn_type,batch_size,pad_token_id,dropout,num_layers,bidirectional,use_attention,device)

    model.train(train_loader,valid_loader,model.encoder, model.decoder, epochs,padding_idx = pad_token_id,optimiser = optimiser,weight_decay=weight_decay, lr=lr,teacher_forcing_ratio = teacher_forcing_ratio,device=device,wandb_logging = wandb_log)

    return model

In [20]:
"""config = {
    'hidden_size':128,
        
        'embedding_size':128,

        'rnn_type' : "gru",
        
        'batch_size':256,
        
        'optimiser': "adam",

        'num_layers' : 2,

        'lr':  1e-3,

        'dropout' : 0.1,
        
        'epochs' : 15,

        'teacher_forcing_ratio' : 0.3,

        'bidirectional' :  True,
    
        'weight_decay': 0
}

config = {
    'hidden_size':512,
        
        'embedding_size':512,

        'rnn_type' : "lstm",
        
        'batch_size':64,
        
        'optimiser': "nadam",

        'num_layers' : 3,

        'lr':  1e-3,

        'dropout' : 0.4,
        
        'epochs' : 15,

        'teacher_forcing_ratio' : 0.4,

        'bidirectional' :  True,
    
        'weight_decay': 1e-5
}"""

# Hyper-parameters of a single local run; the keys are read by setup_and_start_expt.
# NOTE: setup_and_start_expt sets the embedding size equal to hidden_size, so 'embedding_size' is not used.
config = dict(
    batch_size=64,
    bidirectional=True,
    dropout=0.4,
    embedding_size=128,
    epochs=1,
    hidden_size=512,
    lr=3e-4,
    num_layers=4,
    optimiser="nadam",
    rnn_type="lstm",
    teacher_forcing_ratio=0.4,
    weight_decay=1e-5,
    use_attention=True,
)

model = setup_and_start_expt(config, wandb_log=False, kaggle=False)


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/800 [00:00<?, ?it/s]

In [16]:
# Evaluate the trained `model` on the test split.

# Keep the data on the same device as the model's encoder/decoder.
device = model.device

batch_size = config['batch_size']
kaggle = False

target_lang = "tel"

if kaggle:
    base_dir = "/kaggle/input/aksharantar-sampled/aksharantar_sampled/"
else:
    base_dir = "aksharantar_sampled/"


use_meta_tokens = True
append_eos = 1

lang_dir = base_dir + target_lang + "/"


##creating train loader
train_lp = LanguageProcessor(language_directory=lang_dir,target_lang_name=target_lang,mode="train",meta_tokens=use_meta_tokens)
## the ids of these tokens are the same in the source and target language
start_token_id = train_lp.source_char2id['<start>']
end_token_id = train_lp.source_char2id['<end>']
pad_token_id = train_lp.source_char2id['<pad>']

collate_fn_ptr = partial(collate_fn,pad_token_id=pad_token_id,device=device)

train_dataset = WordDataset(train_lp,device=device)
train_loader = DataLoader(train_dataset, batch_size=batch_size,collate_fn=collate_fn_ptr, shuffle=True)

## creating test loader
test_lp = LanguageProcessor(language_directory=lang_dir,target_lang_name=target_lang,mode="test",meta_tokens=use_meta_tokens)

## to make sure that the same vocabulary and dictionaries are used everywhere
test_lp.source_char2id = train_lp.source_char2id
test_lp.source_id2char = train_lp.source_id2char
test_lp.target_char2id = train_lp.target_char2id
test_lp.target_id2char = train_lp.target_id2char

test_dataset = WordDataset(test_lp,device=device)
test_loader = DataLoader(test_dataset, batch_size=batch_size,collate_fn=collate_fn_ptr, shuffle=True)

# Ignore <pad> positions in the loss, using the id from the vocabulary rather than a literal.
criterion = nn.CrossEntropyLoss(ignore_index = pad_token_id).to(device)

# compute_accuracy is a method of MachineTranslator and works on the model's own encoder/decoder.
test_loss,_,test_accuracy = model.compute_accuracy(test_loader,model.encoder,model.decoder,criterion,ignore_padding=True,device=device)

test_loss, test_accuracy


In [17]:
word = "srirama"

# Use the trained model's own encoder/decoder and device; no top-level `encoder`/`decoder` is defined in the visible cells.
decoded_word, attentions = evaluate(model.encoder, model.decoder, word, train_lp,device = model.device)


IndexError: Dimension out of range (expected to be in range of [-2, 1], but got 2)

In [None]:
# Credentials are left to wandb (e.g. the WANDB_API_KEY environment variable) instead of a key literal in the notebook.
wandb.login()

# Random-search sweep that maximises the 'Validation accuracy' metric logged during training.
# Every key read by setup_and_start_expt must be listed under 'parameters'.
sweep_config = {
    'method': 'random',
    'name' : 'PA3 Hyper Sweep GRU',
    'metric': {
      'name': 'Validation accuracy',
      'goal': 'maximize'
    },
    'parameters': {

         'hidden_size':{
            'values':[128]#[64,128,256,512]
        },

        'embedding_size':{
            'values':[128]#[64,128,256,512]
        },

        'rnn_type':{
            'values':["gru"]#['lstm','rnn','gru']
        },

        'batch_size':{
            'values':[64]#[32,64,128,256]
        },

        'optimiser': {
            'values': ["adam"]#,"rmsprop","nadam"]
        },

        'num_layers' :{
            'values' : [2]#[1,2,3,4,5]
        },

        'lr': {
            'values': [1e-3]#[1e-2,1e-3,1e-4,3e-4]
        },

        'dropout' : {

            'values' : [0.1]#[0,0.1,0.2,0.3,0.4]
        },

        'epochs' : {

            'values' : [15]
        },

        'teacher_forcing_ratio' : {
            'values' : [0.3]#[0,0.1,0.2,0.3,0.4,0.5]
        },

        'bidirectional' : {
            'values' : [True]#[True,False]
        },
        'weight_decay': {
            'values': [0,1e-3,]#,1e-3,5e-3,5e-4]
        },

        # read by setup_and_start_expt as config['use_attention']
        'use_attention': {
            'values': [True]
        },
        }
    }

sweep_id = wandb.sweep(sweep=sweep_config, project='JV_CS23M036_TEJASVI_DL_ASSIGNMENT3')

In [None]:
def main():
    '''
    Entry point handed to the wandb sweep agent.

    Opens a wandb run, names it after the hyper-parameter values found in wandb.config
    and trains a model with exactly those values.

    '''

    with wandb.init() as run:

        cfg = wandb.config

        # Each hyper-parameter appears once in the name. The "-es_" field repeats hidden_size,
        # as the original name did.
        run_name = (
            f"-hl_{cfg.num_layers}-hs_{cfg.hidden_size}-es_{cfg.hidden_size}-biDir_{cfg.bidirectional}"
            f"-rnn_type_{cfg.rnn_type}-optim_{cfg.optimiser}-lr_{cfg.lr}-reg_{cfg.weight_decay}"
            f"-epochs_{cfg.epochs}-tf_ratio_{cfg.teacher_forcing_ratio}-dropout_{cfg.dropout}"
        )

        wandb.run.name=run_name

        setup_and_start_expt(cfg,wandb_log = True,kaggle=False)


# Start the sweep agent for the sweep created above; every invocation of main runs one configuration.
wandb.agent(sweep_id, function=main,count=400) # calls main function for count number of times.
# Close the wandb run once the agent returns.
wandb.finish()

In [None]:
"""op1,_ = evaluate(encoder, decoder, word="srirama", language_processor=train_lp,device = device)

op2,_ = evaluate(encoder, decoder, word="tejasvi", language_processor=train_lp,device = device)

op1_string = "".join(op1)
print(op1_string[:8])

op2_string = "".join(op2)
print(op2_string[:7])

print()
"""

### JV

###### To Do:

1. Add LSTM,RNN support : Train
2. Model parameter Initialization.
3. Attention.
4. Now create a seq2seq class, specify attention = True for attention to work.

Hyper Params:

1. Batch size
2. Hidden layer size
3. Embedding size
4. number of encoder layers
5. number of decoder layers
6. bidirectional
7. Teacher forcing (tf) ratio: 0/0.1/0.2/0.3/0.5
8. Optimizer
9. Learning Rate
10. Batch size
11. Dropout

In [None]:
"""
Dropout inside the GRU/LSTM/RNN layers can only be applied when num_layers > 1, so with a single layer a dropout of 0 is used there.
However, the specified dropout is still applied to the embedding.

considering encoder_layers = decoder_layers = num_layers.

considering embedding_size = hidden_size.


"""