In [1]:
#JV

In [2]:
import numpy as np

import pandas as pd

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from torch.utils.data import Dataset,TensorDataset, DataLoader, RandomSampler
from torch.nn.utils.rnn import pad_sequence
import random

import os
import gc
import time
import math

import matplotlib.pyplot as plt
#plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np

from tqdm.notebook import tqdm

#!python3 -m pip install wandb
import wandb

In [3]:
seed = 23

random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cudnn.deterministic = True

In [4]:
class LanguageProcessor:

    def __init__(self,language_directory,target_lang_name,mode="train",meta_tokens=True):

        """
        Default Constructor for this class.

        Params:

            language_directory : ex : "aksharantar_sampled/tel/"
            mode : "train" or "test" or "valid", accordingly the appropriate dataset is read.
            meta_tokens : If true creates the first three tokens of the dictionary as <start>,<end>,<pad>.
            
        """

        self.meta_tokens = meta_tokens ## boolean variable, if 1, then <start>,<end> and <pad> tokens are cosidered in the vocab.
        self.language_directory = language_directory
        self.target_lang_name = target_lang_name
        self.mode = mode ## accordingly helps to read and generate the appropriate dataset.
    
        self.source_lang = 0
        self.target_lang = 1

        self.source_max_len = self.find_max_len(self.source_lang)
        self.target_max_len = self.find_max_len(self.target_lang)

        self.max_len = max(self.source_max_len,self.target_max_len)+1 ##accomodating End token also, irrespective of whether it is used.

        self.source_char2id,self.source_id2char = self.build_char_vocab(self.source_lang,self.source_max_len)
        self.target_char2id,self.target_id2char = self.build_char_vocab(self.target_lang,self.target_max_len)


    def find_max_len(self,lang):

        """
        Method to find the maximum length of a word across train/test and validation data.

        This would help in padding, the embedding accordingly.

        Params:

            lang : 0/1 (source/target) language for which the length of the longest word must be found.
        
        """

        train_df = pd.read_csv(self.language_directory+self.target_lang_name+"_train.csv",header=None)
        test_df = pd.read_csv(self.language_directory+self.target_lang_name+"_test.csv",header=None)
        valid_df = pd.read_csv(self.language_directory+self.target_lang_name+"_valid.csv",header=None)

        train_max_len = len(max(list(train_df[lang]), key = len))
        test_max_len = len(max(list(test_df[lang]), key = len))
        valid_max_len = len(max(list(valid_df[lang]), key = len))

        del train_df
        del test_df
        del valid_df

        gc.collect()

        return max(train_max_len,test_max_len,valid_max_len)

    def build_char_vocab(self,lang_id,max_len=None):

        """
        Method to create a vocabulary of characters in language corresponding to lang_id.
        """

        df = pd.read_csv(self.language_directory+self.target_lang_name+"_"+self.mode+".csv",header=None)

        self.data = df.to_numpy()

        lang_chars = []
        lang_words = df[lang_id].to_numpy()
    
        for word in lang_words:
            lang_chars += list(word)
    
        unique_lang_chars =  sorted(list(set(lang_chars)))
        
        if self.meta_tokens:
            char2id_dict = {'<start>':0,'<end>':1,'<pad>': 2}
            id2char_dict = {0:'<start>',1:'<end>',2:'<pad>'}
            
        else:
            char2id_dict = {}
            id2char_dict = {}

        start = len(char2id_dict) ##Key of each language character starts based on meta tokens are used or not.
    
        for i in range(len(unique_lang_chars)):
            char2id_dict[unique_lang_chars[i]] = i+start
            id2char_dict[i+start] = unique_lang_chars[i]
    
        del df
        del lang_chars
        del unique_lang_chars

        gc.collect()
    
        return char2id_dict,id2char_dict

    def encode_word(self,word,lang_id,padding=True,append_eos = True):

        """
        Method to encode characters of a given word.

        Params:

            word: The word to be encoded.
            lang_id : 0/1 for source/target lang.
            padding : If true, the word encoding would be padded upto max len.
            append_eos : Appends <end> token at the end of every word.
        
        """

        if lang_id == self.source_lang:
            char2id_dict = self.source_char2id
            
        else:
            char2id_dict = self.target_char2id
        
        max_len = self.max_len

        word_encoding = []
        
        #if lang_id == self.source_lang:
        #    word_encoding = [char2id_dict['<start>']] ##every input starts with the <start> token.
        
        for i in word.lower():
            word_encoding.append(char2id_dict[i])

        #if append_eos:
        #    word_encoding.append(char2id_dict['<end>'])

        ## pad till maxlen, if padding is used.
        if padding:
            word_encoding += [char2id_dict['<pad>']] * (max_len - len(word_encoding))
        
        return np.array(word_encoding)

    def decode_word(self,code_word,lang_id):

        """
        Method to decode an encoded word.

        Params:

            code_word : The encoded word.
            lang_id : 0/1 for source/target lang.
        """
    
        word = []

        if lang_id == self.source_lang:
            id2char_dict = self.source_id2char
            char2id_dict = self.source_char2id
            
        else:
            id2char_dict = self.target_id2char
            char2id_dict = self.target_char2id

        start_idx = 0#1-lang_id
        
        for i in code_word[start_idx:]:
            ## if we reached <end>, then stop decoding
            if self.meta_tokens and i == char2id_dict['<end>'] or i == char2id_dict['<pad>']:
                break
            
            word.append(id2char_dict[i])
            
        return np.array(word)
            

In [5]:
class WordDataset(Dataset):

    """
    Class that inherits and overrides the methods of Dataset class. This helps in creating a data loader.
    """
    
    def __init__(self, language_processor,append_eos=True,device='cpu'):

        self.lp = language_processor
        self.data = self.lp.data
        self.device = device
        self.append_eos = append_eos

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        input_word, output_word = self.data[idx]
        
        input_sequence = self.lp.encode_word(input_word,self.lp.source_lang,padding=False,append_eos=self.append_eos)
        output_sequence = self.lp.encode_word(output_word,self.lp.target_lang,padding=False,append_eos=self.append_eos)
        

        #if len(input_sequence) != len(output_sequence):
        #    print(input_word,len(input_word),output_word,len(output_word))
        
        return torch.tensor(input_sequence).to(self.device), torch.tensor(output_sequence).to(self.device)
        #return input_sequence, output_sequence


In [6]:
pad_token_id = 2
device = torch.device("mps")
#torch.device("cuda" if torch.cuda.is_available() else "cpu")

def collate_fn(batch):
    
    input_words, target_words = zip(*batch)
    
    padded_inputs = pad_sequence(input_words, batch_first=True, padding_value=pad_token_id)
    
    padded_targets = pad_sequence(target_words, batch_first=True, padding_value=pad_token_id)
    
    input_lengths = torch.LongTensor([len(seq) for seq in input_words]).to(device)
    target_lengths = torch.LongTensor([len(seq) for seq in target_words]).to(device)
    
    return padded_inputs, padded_targets, input_lengths, target_lengths

In [7]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size,embedding_size,rnn_type,padding_index, dropout=0.1,num_layers=1,bidirectional=False):
        
        super(EncoderRNN, self).__init__()

        self.num_layers = num_layers
        self.hidden_size = hidden_size

        self.D = 1 ##the number of directions in which the input is viewed.
        if bidirectional:
            self.D = 2
        
        self.rnn_type = rnn_type
        self.embedding = nn.Embedding(input_size, hidden_size,padding_idx=padding_index)
        #self.gru = nn.GRU(hidden_size, hidden_size,num_layers = num_layers,bidirectional=bidirectional, batch_first=True)
        if self.rnn_type == "GRU":
            self.rnn = nn.GRU(hidden_size, hidden_size, batch_first=True,num_layers = num_layers,bidirectional = bidirectional,dropout=dropout)
        elif self.rnn_type == "RNN":
            self.rnn = nn.RNN(hidden_size, hidden_size, batch_first=True,num_layers = num_layers,bidirectional = bidirectional,dropout=dropout)
        elif self.rnn_type == "LSTM":
            self.rnn = nn.LSTM(hidden_size, hidden_size, batch_first=True,num_layers = num_layers,bidirectional = bidirectional,dropout=dropout)
        
        self.dropout = nn.Dropout(dropout)

    def forward(self, input,hidden=None,cell=None):
        embedded = self.dropout(self.embedding(input))
        if self.rnn_type == "LSTM":
            output, (hidden, cell) = self.rnn(embedded, (hidden, cell))
        else:
            output, hidden = self.rnn(embedded)
            cell = None
        #output, hidden = self.gru(embedded)
        return output, hidden,cell

In [8]:
class BahdanauAttention(nn.Module):
    def __init__(self, hidden_size,D,expected_dim,batch_size):
        super(BahdanauAttention, self).__init__()
        self.Wa = nn.Linear(hidden_size*expected_dim, hidden_size)
        self.Ua = nn.Linear(hidden_size*D, hidden_size)
        self.Va = nn.Linear(hidden_size, 1)
        self.batch_size = batch_size

    def forward(self, query, keys):

        #print(f"(encoder_outputs)keys.shape:{keys.shape}")
        #print(f"\nquery.shape:{query.shape}")
        reshaped_query = query.reshape(self.batch_size,1,-1)
        #print(f"reshaped_query.shape:{reshaped_query.shape}")
        #print(f"Wa(query.reshape(batch_size,1,-1)).shape : {self.Wa(reshaped_query).shape}")
        #print(f"Ua(keys).shape : {self.Ua(keys).shape}")
        
        scores = self.Va(torch.tanh(self.Wa(query.reshape(self.batch_size,1,-1)) + self.Ua(keys)))

        #print(f"scores.shape : {scores.shape}")
        scores = scores.squeeze(2).unsqueeze(1)
        #print(f"scores.squeeze(2).unsqueeze(1).shape : {scores.shape}")

        weights = F.softmax(scores, dim=-1)
        #print(f"weights.shape : {weights.shape}\n")
        context = torch.bmm(weights, keys)

        return context, weights

In [9]:
class AttnDecoderRNN(nn.Module):
    def __init__(self, hidden_size,embedding_size, output_size,rnn_type,max_len,batch_size,padding_index,start_token_id, dropout=0.1,num_layers=1,bidirectional=False,device = "cpu"):
        super(AttnDecoderRNN, self).__init__()

        self.num_layers = num_layers
        self.start_token_id = start_token_id
        
        self.D = 1 ##the number of directions in which the input is viewed.
        if bidirectional:
            self.D = 2
            
        ## In h0 (the input to the decoder) first dimension expected is number of directions X number of layers 
        self.expected_h0_dim1 = self.D*self.num_layers

        self.rnn_type = rnn_type
        
        self.embedding = nn.Embedding(output_size, hidden_size*self.D,padding_idx=padding_index)
        self.attention = BahdanauAttention(hidden_size,self.D,self.expected_h0_dim1,batch_size)

        #self.gru = nn.GRU(2 * self.D * hidden_size, hidden_size,num_layers=self.num_layers,bidirectional = bidirectional, batch_first=True)
        if self.rnn_type == "GRU":
            self.rnn = nn.GRU(2 * self.D * hidden_size, hidden_size, batch_first=True,num_layers = num_layers,bidirectional = bidirectional,dropout=dropout)
        elif self.rnn_type == "RNN":
            self.rnn = nn.RNN(2 * self.D * hidden_size, hidden_size, batch_first=True,num_layers = num_layers,bidirectional = bidirectional,dropout=dropout)
        elif self.rnn_type == "LSTM":
            self.rnn = nn.LSTM(2 * self.D * hidden_size, hidden_size, batch_first=True,num_layers = num_layers,bidirectional = bidirectional,dropout=dropout)
        
        self.out = nn.Linear(hidden_size*self.D, output_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, encoder_outputs, encoder_hidden,encoder_cell, target_tensor=None,MAX_LENGTH=28,teacher_forcing_ratio = 0):
        batch_size = encoder_outputs.size(0)
        decoder_input = torch.empty(batch_size, 1, dtype=torch.long, device=device).fill_(self.start_token_id)
        decoder_hidden = encoder_hidden
        decoder_cell = encoder_cell ## the cell state, which is initially same as that of encoder, (applies to LSTM unit only)
        decoder_outputs = []
        attentions = []

        max_len = target_tensor.size(1)

        for i in range(max_len):
            decoder_output, decoder_hidden,decoder_cell, attn_weights = self.forward_step(decoder_input, decoder_hidden,decoder_cell, encoder_outputs)
            decoder_outputs.append(decoder_output)
            attentions.append(attn_weights)

            teacher_force = random.random() < teacher_forcing_ratio

            if (target_tensor is not None) and (teacher_force):
                # Teacher forcing: Feed the target as the next input
                decoder_input = target_tensor[:, i].unsqueeze(1) # Teacher forcing
            else:
                # Without teacher forcing: use its own predictions as the next input
                _, topi = decoder_output.topk(1)
                decoder_input = topi.squeeze(-1).detach()  # detach from history as input

        decoder_outputs = torch.cat(decoder_outputs, dim=1)
        decoder_outputs = F.log_softmax(decoder_outputs, dim=-1)
        attentions = torch.cat(attentions, dim=1)

        return decoder_outputs, decoder_hidden, attentions


    def forward_step(self, input, hidden,cell, encoder_outputs):
        embedded =  self.dropout(self.embedding(input))
        

        if hidden.shape[0] != self.expected_h0_dim1:
            reshaped_hidden = hidden.repeat(self.expected_h0_dim1,1,1)
        else:
            reshaped_hidden = hidden

        #print(f"encoder_outputs.shape:{encoder_outputs.shape}")
        #print(f"hidden.shape:{hidden.shape}")
        #print(f"reshaped_hidden.shape:{reshaped_hidden.shape}")

        query = reshaped_hidden.permute(1, 0, 2)
        #print(f"query.shape:{query.shape}")
        context, attn_weights = self.attention(query, encoder_outputs)
        #print(f"embedded.shape:{embedded.shape}")
        #print(f"context.shape:{context.shape}")
        #print(f"attn_weights.shape:{attn_weights.shape}")
        input = torch.cat((embedded, context), dim=2)
        #print(f"input_gru.shape:{input_gru.shape}")
        
        

        #output, hidden = self.gru(input_gru, reshaped_hidden)
        if self.rnn_type == "LSTM":
            output, (hidden, cell) = self.rnn(input, (reshaped_hidden, cell))
        else:
            output, hidden = self.rnn(input, reshaped_hidden)
            cell = None
        
        #print(f"output1.shape:{output.shape}")
        output = self.out(output)
        #print(f"output2.shape:{output.shape}")
        #print("========================================================================")

        return output, hidden,cell, attn_weights

In [10]:
def train_epoch(dataloader, encoder, decoder, encoder_optimizer,decoder_optimizer, criterion,teacher_forcing_ratio=0,ignore_padding=True):

    tot_correct_word_preds = 0
    tot_words = 0
    
    total_loss = 0
    for data in tqdm(dataloader):
        input_tensor, target_tensor,_,tar_max_len = data

        max_len = torch.max(tar_max_len).item()
        
        encoder_optimizer.zero_grad()
        decoder_optimizer.zero_grad()

        batch_size = data[0].shape[0]

        #print(data[0].shape,batch_size)

        if encoder.rnn_type == "LSTM":
            encoder_hidden = torch.zeros(encoder.num_layers*encoder.D, batch_size, encoder.hidden_size, device=device)
            encoder_cell = torch.zeros(encoder.num_layers*encoder.D, batch_size, encoder.hidden_size, device=device)
        else:
            encoder_hidden = None
            encoder_cell = None

        encoder_outputs, encoder_hidden, encoder_cell = encoder(input_tensor,encoder_hidden,encoder_cell)

        decoder_outputs, _, _ = decoder(encoder_outputs, encoder_hidden,encoder_cell, target_tensor,max_len,teacher_forcing_ratio = teacher_forcing_ratio)

        multi_step_preds = torch.argmax(decoder_outputs,dim=2)
        multi_step_pred_correctness = (multi_step_preds ==  target_tensor)
        num_words = batch_size
        
        if ignore_padding: ## if padding has to be ignored.

            ## for each word, based on the padding token ID, find the first occurance of the padding token, marking the begining of padding.
            
            ## argmax is not supported for bool on cuda, hence casting it to long.
            padding_start = torch.argmax((target_tensor == pad_token_id).to(torch.long),dim=1).to(device)
            ## Creating a mask with 1's in each position of a padding token
            mask = (torch.arange(target_tensor.size(1)).unsqueeze(0).to(device) >= padding_start.unsqueeze(1))
            #print(mask)
            
            ##doing a logical OR with the mask makes sure that the padding tokens do not affect the correctness of the word
            tot_correct_word_preds += (torch.all(torch.logical_or(multi_step_pred_correctness,mask),dim=1).int().sum()).item()
            tot_words += num_words

        loss = criterion(
            decoder_outputs.view(-1, decoder_outputs.size(-1)),
            target_tensor.view(-1)
        )
        loss.backward()

        encoder_optimizer.step()
        decoder_optimizer.step()

        total_loss += loss.item()

    epoch_loss = round(total_loss / len(dataloader),4)
    epoch_accuracy = round(tot_correct_word_preds*100/tot_words,2)

    return epoch_loss,epoch_accuracy



In [11]:
def train(train_dataloader,valid_dataloader, encoder, decoder, n_epochs,padding_idx,optimiser = "adam",loss="crossentropy",weight_decay=0, lr=0.001,teacher_forcing = False,teacher_forcing_ratio = 0,device = 'cpu',print_every=100, plot_every=100,wandb_logging = False):
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    ## specify the optimiser
    if optimiser.lower() == "adam":
        encoder_optimizer = optim.Adam(encoder.parameters(), lr=lr,weight_decay=weight_decay)
        decoder_optimizer = optim.Adam(decoder.parameters(), lr=lr,weight_decay=weight_decay)

    elif optimiser.lower() == "nadam":
        encoder_optimizer = optim.NAdam(encoder.parameters(), lr=lr,weight_decay=weight_decay)
        decoder_optimizer = optim.NAdam(decoder.parameters(), lr=lr,weight_decay=weight_decay)

    elif optimiser.lower() == "rmsprop":
        encoder_optimizer = optim.RMSprop(encoder.parameters(), lr=lr,weight_decay=weight_decay)
        decoder_optimizer = optim.RMSprop(decoder.parameters(), lr=lr,weight_decay=weight_decay)
        
    ## Specify the loss criteria
    if loss.lower() == "crossentropy":
        criterion = nn.CrossEntropyLoss(ignore_index = padding_idx).to(device)

    lp = train_dataloader.dataset.lp


    for epoch in tqdm(range(1, n_epochs + 1)):
        train_loss,train_accuracy = train_epoch(train_dataloader, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion,teacher_forcing_ratio)
        print_loss_total += train_loss
        plot_loss_total += train_loss
        val_loss,_,val_word_level_accuracy = compute_accuracy(valid_dataloader,encoder,decoder,criterion,padding_token_id = lp.source_char2id['<pad>'],end_token_id = lp.source_char2id['<end>'],ignore_padding=True,device=device)

        print(f"Epoch {epoch}\t Train Loss : {train_loss}\t Train Acc : {train_accuracy}% \t Val Loss : {val_loss}\t Val Acc : {val_word_level_accuracy}%")
        
        if wandb_logging:
            wandb.log({'epoch': epoch,'train loss': train_loss, 'train accuracy': train_accuracy, 'Validation loss': val_loss, 'Validation accuracy': val_word_level_accuracy})
        #print(f"JV Char Accuracy:{JV_char_accuracy}%\t JV Word Accuracy:{JV_word_accuracy}%\t Loss : {loss}")

In [12]:
def compute_accuracy(dataloader,encoder,decoder,loss_criterion,padding_token_id,end_token_id = 1,ignore_padding = True,device='cpu'):

    """
    Method to compute the accuracy using the model (encoder-decoder) using dataloader.

    This method returns word and character level accuracy.

        Word Level Accuracy : Accuracy is computed at the word level and a word is right iff every character is predicted correctly.
        Char Level Accuracy : Accuracy is computed by comparing each predicted character wrt the correct char.

    Params:

        dataloader : The train/test/valid dataloader.
        encoder : The encoder 
        decoder : The decoder
        padding_token_id : The id of the padding token.
        ignore_padding : If True, then in word level accuracy, the padding characters are ignored in computing the word level accuracy.
                        char level accuracy, the padding characters are not considered at all.

                        If false, padding is considered to be a part of the word (for word level accuracy) and 
    """

    word_level_accuracy = 0

    tot_words = 0

    tot_correct_word_preds = 0

    loss = 0

    criterion = loss_criterion.to(device)

    with torch.no_grad():

        train = 0

        if encoder.training and decoder.training: ## reset the the model back to train mode
            train = 1

        encoder.eval()
        decoder.eval()

        for data in dataloader:
            
            input_tensor, target_tensor,_,tar_max_len = data

            max_len = torch.max(tar_max_len).item()

            batch_size = data[0].shape[0]
    
            if encoder.rnn_type == "LSTM":
                encoder_hidden = torch.zeros(encoder.num_layers*encoder.D, batch_size, encoder.hidden_size, device=device)
                encoder_cell = torch.zeros(encoder.num_layers*encoder.D, batch_size, encoder.hidden_size, device=device)
            else:
                encoder_hidden = None
                encoder_cell = None
    
            encoder_outputs, encoder_hidden, encoder_cell = encoder(input_tensor,encoder_hidden,encoder_cell)

            ## even though we are passing target tensor, the teacher forcing ratio is 0, so no teacher forcing
            decoder_outputs, _, _ = decoder(encoder_outputs, encoder_hidden,encoder_cell, target_tensor,max_len,teacher_forcing_ratio = 0)
            
            loss += criterion(
                decoder_outputs.view(-1, decoder_outputs.size(-1)),
                target_tensor.view(-1)).item()
    
            ## For a batch, for each character find the most probable output word.
            multi_step_preds = torch.argmax(decoder_outputs,dim=2)
            multi_step_pred_correctness = (multi_step_preds ==  target_tensor)
            num_chars = multi_step_preds.numel() ##find the total number of characters in the current batch
            num_words = batch_size ##find the total number of words in the current batch.
    
            if ignore_padding: ## if padding has to be ignored.
    
                ## for each word, based on the padding token ID, find the first occurance of the padding token, marking the begining of padding.
                
                ## argmax is not supported for bool on cuda, hence casting it to long.
                padding_start = torch.argmax((target_tensor == pad_token_id).to(torch.long),dim=1).to(device)
                ## Creating a mask with 1's in each position of a padding token
                mask = (torch.arange(target_tensor.size(1)).unsqueeze(0).to(device) >= padding_start.unsqueeze(1))
                #print(mask)
                
                ##doing a logical OR with the mask makes sure that the padding tokens do not affect the correctness of the word
                tot_correct_word_preds += (torch.all(torch.logical_or(multi_step_pred_correctness,mask),dim=1).int().sum()).item()
                tot_words += num_words
    
            else: ##otherwise.
    
                tot_correct_word_preds += (torch.all(multi_step_pred_correctness,dim=1).int().sum()).item()
                tot_words += num_words
                
                tot_correct_char_preds += (multi_step_pred_correctness.int().sum()).item()
                tot_chars += num_chars

        #print(tot_correct_char_preds,tot_chars)
        #print(tot_correct_word_preds,tot_words)
    
        word_lvl_accuracy = round(tot_correct_word_preds*100/tot_words,2)

        loss /= dataloader.dataset.data.shape[0]

        if train:

            encoder.train()
            decoder.train()
    
        return round(loss,4),None,word_lvl_accuracy

In [13]:
import time
import math

def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)

def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

In [14]:
def setup_and_start_expt(config,wandb_log = True,kaggle=False):
    
    batch_size = config['batch_size']
    target_lang = "tel"


    if kaggle:
        base_dir = "/kaggle/input/aksharantar-sampled/aksharantar_sampled/"
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    else:
        base_dir = "aksharantar_sampled/"
        device = torch.device("mps")
    
    use_meta_tokens = True
    append_eos = 1
    
    lang_dir = base_dir + target_lang + "/"
    
    ##creating train loader
    train_lp = LanguageProcessor(language_directory=lang_dir,target_lang_name=target_lang,mode="train",meta_tokens=use_meta_tokens)
    ## the ids of these tokens are the same in the source and target language
    start_token_id = train_lp.source_char2id['<start>']
    end_token_id = train_lp.source_char2id['<end>']
    pad_token_id = train_lp.source_char2id['<pad>']
    
    train_dataset = WordDataset(train_lp,device=device)
    train_loader = DataLoader(train_dataset, batch_size=batch_size,collate_fn=collate_fn, shuffle=True)
    
    ## creating test loader
    test_lp = LanguageProcessor(language_directory=lang_dir,target_lang_name=target_lang,mode="test",meta_tokens=use_meta_tokens)
    
    ## to make sure that the same vocabulary and dictionaries are used everywhere
    test_lp.source_char2id = train_lp.source_char2id
    test_lp.source_id2char = train_lp.source_id2char
    test_lp.target_char2id = train_lp.target_char2id
    test_lp.target_id2char = train_lp.target_id2char
    
    test_dataset = WordDataset(test_lp,device=device)
    test_loader = DataLoader(test_dataset, batch_size=batch_size,collate_fn=collate_fn, shuffle=True)
    
    ## creating validation loader
    valid_lp = LanguageProcessor(language_directory=lang_dir,target_lang_name=target_lang,mode="valid",meta_tokens=use_meta_tokens)
    valid_lp.source_char2id = train_lp.source_char2id
    valid_lp.source_id2char = train_lp.source_id2char
    valid_lp.target_char2id = train_lp.target_char2id
    valid_lp.target_id2char = train_lp.target_id2char
    
    valid_dataset = WordDataset(valid_lp,device=device)
    valid_loader = DataLoader(valid_dataset, batch_size=batch_size,collate_fn=collate_fn, shuffle=True)
    
    ##in principle these are all fixed across train/test/valid data
    
    #source_max_len = train_lp.source_max_len
    #target_max_len = train_lp.target_max_len
    
    input_size = len(list(train_lp.source_char2id.keys()))
    output_size = len(list(train_lp.target_char2id.keys()))
    
    hidden_size = config['hidden_size']
    embedding_size = hidden_size
    
    epochs = config['epochs']
    
    optimiser = config['optimiser']
    
    weight_decay = config['weight_decay']
    
    lr = config['lr']
    
    num_encoder_layers = config['num_layers']
    num_decoder_layers = num_encoder_layers
    
    ## Allowed Values : "GRU"/"RNN"/"LSTM" (not case sensitive)
    rnn_type = config['rnn_type'].upper()
    
    bidirectional = config['bidirectional']
    teacher_forcing_ratio = config['teacher_forcing_ratio']

    teacher_forcing = False
    
    if teacher_forcing_ratio>0:
        teacher_forcing = True
    
    
    #loss_criterion =  nn.CrossEntropyLoss(ignore_index=pad_token_id)
    
    dropout=config['dropout']
    
    encoder = EncoderRNN(input_size = input_size, hidden_size = hidden_size,embedding_size=embedding_size,rnn_type = rnn_type,padding_index=pad_token_id,num_layers=num_encoder_layers,bidirectional=bidirectional,dropout=dropout).to(device)
    
    decoder = AttnDecoderRNN(hidden_size = hidden_size,embedding_size=embedding_size, output_size = output_size,rnn_type = rnn_type,max_len = train_lp.max_len,batch_size = batch_size,start_token_id = start_token_id, padding_index = None,num_layers = num_decoder_layers,bidirectional = bidirectional,dropout=dropout,device=device).to(device)
    
    #train(train_loader,valid_loader, encoder, decoder, 3,loss_criterion=loss_criterion, print_every=3, plot_every=5,device=device,teacher_forcing = teacher_forcing,teacher_forcing_ratio=teacher_forcing_ratio)

    train(train_loader,valid_loader, encoder, decoder, n_epochs = epochs,padding_idx = pad_token_id,optimiser = optimiser,weight_decay=weight_decay, lr=lr,teacher_forcing = teacher_forcing,teacher_forcing_ratio = teacher_forcing_ratio,device=device,wandb_logging = wandb_log)

In [22]:
"""config = {
    'hidden_size':128,
        
        'embedding_size':128,

        'rnn_type' : "gru",
        
        'batch_size':256,
        
        'optimiser': "adam",

        'num_layers' : 2,

        'lr':  1e-3,

        'dropout' : 0.1,
        
        'epochs' : 15,

        'teacher_forcing_ratio' : 0.3,

        'bidirectional' :  True,
    
        'weight_decay': 0
}"""

config = {
    'hidden_size':512,
        
        'embedding_size':512,

        'rnn_type' : "lstm",
        
        'batch_size':64,
        
        'optimiser': "nadam",

        'num_layers' : 3,

        'lr':  1e-3,

        'dropout' : 0.4,
        
        'epochs' : 15,

        'teacher_forcing_ratio' : 0.4,

        'bidirectional' :  True,
    
        'weight_decay': 1e-5
}

setup_and_start_expt(config,wandb_log = False,kaggle=False)

  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/800 [00:00<?, ?it/s]

Epoch 1	 Train Loss : 0.9818	 Train Acc : 25.38% 	 Val Loss : 0.0117	 Val Acc : 44.36%


  0%|          | 0/800 [00:00<?, ?it/s]

Epoch 2	 Train Loss : 0.388	 Train Acc : 50.45% 	 Val Loss : 0.0109	 Val Acc : 49.39%


  0%|          | 0/800 [00:00<?, ?it/s]

Epoch 3	 Train Loss : 0.3316	 Train Acc : 54.96% 	 Val Loss : 0.0104	 Val Acc : 50.93%


  0%|          | 0/800 [00:00<?, ?it/s]

Epoch 4	 Train Loss : 0.3085	 Train Acc : 57.15% 	 Val Loss : 0.0101	 Val Acc : 51.27%


  0%|          | 0/800 [00:00<?, ?it/s]

Epoch 5	 Train Loss : 0.2933	 Train Acc : 58.81% 	 Val Loss : 0.01	 Val Acc : 51.39%


  0%|          | 0/800 [00:00<?, ?it/s]

Epoch 6	 Train Loss : 0.2844	 Train Acc : 59.66% 	 Val Loss : 0.0098	 Val Acc : 52.32%


  0%|          | 0/800 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [16]:
wandb.login(key="")

sweep_config = {
    'method': 'random',
    'name' : 'PA3 Hyper Sweep Attention GRU',
    'metric': {
      'name': 'Validation accuracy',
      'goal': 'maximize'
    },
    'parameters': {
        
         'hidden_size':{
            'values':[64,128,256,512]
        },
        
        'embedding_size':{
            'values':[64,128,256,512]
        },

        'rnn_type':{
            'values':['gru','lstm','rnn']
        },
        
        'batch_size':{
            'values':[64,128,256,512]
        },
        
        'optimiser': {
            'values': ["adam","rmsprop","nadam"]
        },

        'num_layers' :{
            'values' : [1,2,3,4]
        },

        'lr': {
            'values': [1e-2,1e-3,1e-4,3e-4]
        },

        'dropout' : {

            'values' : [0,0.1,0.2,0.3,0.4]
        },
        
        'epochs' : {

            'values' : [15]
        },

        'teacher_forcing_ratio' : {
            'values' : [0,0.1,0.2,0.3,0.4,0.5]
        },

        'bidirectional' : {
            'values' : [True,False]
        },
        'weight_decay': {
            'values': [0,1e-3,1e-4,1e-5]
        },
        }
    }

sweep_id = wandb.sweep(sweep=sweep_config, project='JV_CS23M036_TEJASVI_DL_ASSIGNMENT3')

[34m[1mwandb[0m: Currently logged in as: [33mcs23m036[0m ([33mtmajestical[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /Users/tejasmalladi/.netrc


Create sweep with ID: 8haktxy5
Sweep URL: https://wandb.ai/tmajestical/JV_CS23M036_TEJASVI_DL_ASSIGNMENT3/sweeps/8haktxy5


In [17]:
def main():
    '''
    WandB calls main function each time with differnet combination.

    We can retrive the same and use the same values for our hypermeters.

    '''


    with wandb.init() as run:

        run_name="-hl_"+str(wandb.config.num_layers)+"-hs_"+str(wandb.config.hidden_size)+"-es_"+str(wandb.config.hidden_size)+"-biDir_"+str(wandb.config.bidirectional)

        run_name = run_name+"-rnn_type_"+str(wandb.config.rnn_type)+run_name+"-optim_"+str(wandb.config.optimiser)+"-lr_"+str(wandb.config.lr)+"-reg_"+str(wandb.config.weight_decay)+"-epochs_"+str(wandb.config.epochs)+"-tf_ratio_"+str(wandb.config.teacher_forcing_ratio)

        run_name = run_name+"-dropout_"+str(wandb.config.dropout)

        wandb.run.name=run_name

        setup_and_start_expt(wandb.config,wandb_log = True,kaggle=False)


wandb.agent(sweep_id, function=main,count=10) # calls main function for count number of times.
wandb.finish()

[34m[1mwandb[0m: Agent Starting Run: eibr0j71 with config:
[34m[1mwandb[0m: 	batch_size: 256
[34m[1mwandb[0m: 	bidirectional: False
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embedding_size: 256
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	hidden_size: 512
[34m[1mwandb[0m: 	lr: 0.0003
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimiser: nadam
[34m[1mwandb[0m: 	rnn_type: rnn
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0.1
[34m[1mwandb[0m: 	weight_decay: 0


  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

Epoch 1	 Train Loss : 2.9608	 Train Acc : 1.07% 	 Val Loss : 0.0087	 Val Acc : 1.39%


Traceback (most recent call last):
  File "/var/folders/z4/__x0n5jn4f1c4z527hs4r4_00000gn/T/ipykernel_8571/2210584461.py", line 20, in main
    setup_and_start_expt(wandb.config,wandb_log = True,kaggle=False)
  File "/var/folders/z4/__x0n5jn4f1c4z527hs4r4_00000gn/T/ipykernel_8571/982323058.py", line 95, in setup_and_start_expt
    train(train_loader,valid_loader, encoder, decoder, n_epochs = epochs,padding_idx = pad_token_id,optimiser = optimiser,weight_decay=weight_decay, lr=lr,teacher_forcing = teacher_forcing,teacher_forcing_ratio = teacher_forcing_ratio,device=device,wandb_logging = wandb_log)
  File "/var/folders/z4/__x0n5jn4f1c4z527hs4r4_00000gn/T/ipykernel_8571/1550435551.py", line 35, in train
    wandb.log({'epoch': epoch,'train loss': train_loss, 'train accuracy': train_accuracy, 'Validation loss': val_loss, 'Validation accuracy': val_accuracy})
NameError: name 'val_accuracy' is not defined


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

Run eibr0j71 errored:
Traceback (most recent call last):
  File "/Users/tejasmalladi/miniconda3/envs/dl/lib/python3.8/site-packages/wandb/agents/pyagent.py", line 308, in _run_job
    self._function()
  File "/var/folders/z4/__x0n5jn4f1c4z527hs4r4_00000gn/T/ipykernel_8571/2210584461.py", line 20, in main
    setup_and_start_expt(wandb.config,wandb_log = True,kaggle=False)
  File "/var/folders/z4/__x0n5jn4f1c4z527hs4r4_00000gn/T/ipykernel_8571/982323058.py", line 95, in setup_and_start_expt
    train(train_loader,valid_loader, encoder, decoder, n_epochs = epochs,padding_idx = pad_token_id,optimiser = optimiser,weight_decay=weight_decay, lr=lr,teacher_forcing = teacher_forcing,teacher_forcing_ratio = teacher_forcing_ratio,device=device,wandb_logging = wandb_log)
  File "/var/folders/z4/__x0n5jn4f1c4z527hs4r4_00000gn/T/ipykernel_8571/1550435551.py", line 35, in train
    wandb.log({'epoch': epoch,'train loss': train_loss, 'train accuracy': train_accuracy, 'Validation loss': val_loss, 'Val

  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/800 [00:00<?, ?it/s]

Epoch 1	 Train Loss : 1.9392	 Train Acc : 3.88% 	 Val Loss : 0.0303	 Val Acc : 4.15%


Traceback (most recent call last):
  File "/var/folders/z4/__x0n5jn4f1c4z527hs4r4_00000gn/T/ipykernel_8571/2210584461.py", line 20, in main
    setup_and_start_expt(wandb.config,wandb_log = True,kaggle=False)
  File "/var/folders/z4/__x0n5jn4f1c4z527hs4r4_00000gn/T/ipykernel_8571/982323058.py", line 95, in setup_and_start_expt
    train(train_loader,valid_loader, encoder, decoder, n_epochs = epochs,padding_idx = pad_token_id,optimiser = optimiser,weight_decay=weight_decay, lr=lr,teacher_forcing = teacher_forcing,teacher_forcing_ratio = teacher_forcing_ratio,device=device,wandb_logging = wandb_log)
  File "/var/folders/z4/__x0n5jn4f1c4z527hs4r4_00000gn/T/ipykernel_8571/1550435551.py", line 35, in train
    wandb.log({'epoch': epoch,'train loss': train_loss, 'train accuracy': train_accuracy, 'Validation loss': val_loss, 'Validation accuracy': val_accuracy})
NameError: name 'val_accuracy' is not defined


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

Run 9ygb3s5f errored:
Traceback (most recent call last):
  File "/Users/tejasmalladi/miniconda3/envs/dl/lib/python3.8/site-packages/wandb/agents/pyagent.py", line 308, in _run_job
    self._function()
  File "/var/folders/z4/__x0n5jn4f1c4z527hs4r4_00000gn/T/ipykernel_8571/2210584461.py", line 20, in main
    setup_and_start_expt(wandb.config,wandb_log = True,kaggle=False)
  File "/var/folders/z4/__x0n5jn4f1c4z527hs4r4_00000gn/T/ipykernel_8571/982323058.py", line 95, in setup_and_start_expt
    train(train_loader,valid_loader, encoder, decoder, n_epochs = epochs,padding_idx = pad_token_id,optimiser = optimiser,weight_decay=weight_decay, lr=lr,teacher_forcing = teacher_forcing,teacher_forcing_ratio = teacher_forcing_ratio,device=device,wandb_logging = wandb_log)
  File "/var/folders/z4/__x0n5jn4f1c4z527hs4r4_00000gn/T/ipykernel_8571/1550435551.py", line 35, in train
    wandb.log({'epoch': epoch,'train loss': train_loss, 'train accuracy': train_accuracy, 'Validation loss': val_loss, 'Val

  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/800 [00:00<?, ?it/s]

Epoch 1	 Train Loss : 2.7948	 Train Acc : 2.49% 	 Val Loss : 0.0299	 Val Acc : 4.1%


Traceback (most recent call last):
  File "/var/folders/z4/__x0n5jn4f1c4z527hs4r4_00000gn/T/ipykernel_8571/2210584461.py", line 20, in main
    setup_and_start_expt(wandb.config,wandb_log = True,kaggle=False)
  File "/var/folders/z4/__x0n5jn4f1c4z527hs4r4_00000gn/T/ipykernel_8571/982323058.py", line 95, in setup_and_start_expt
    train(train_loader,valid_loader, encoder, decoder, n_epochs = epochs,padding_idx = pad_token_id,optimiser = optimiser,weight_decay=weight_decay, lr=lr,teacher_forcing = teacher_forcing,teacher_forcing_ratio = teacher_forcing_ratio,device=device,wandb_logging = wandb_log)
  File "/var/folders/z4/__x0n5jn4f1c4z527hs4r4_00000gn/T/ipykernel_8571/1550435551.py", line 35, in train
    wandb.log({'epoch': epoch,'train loss': train_loss, 'train accuracy': train_accuracy, 'Validation loss': val_loss, 'Validation accuracy': val_accuracy})
NameError: name 'val_accuracy' is not defined


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

Run 2cb1t69t errored:
Traceback (most recent call last):
  File "/Users/tejasmalladi/miniconda3/envs/dl/lib/python3.8/site-packages/wandb/agents/pyagent.py", line 308, in _run_job
    self._function()
  File "/var/folders/z4/__x0n5jn4f1c4z527hs4r4_00000gn/T/ipykernel_8571/2210584461.py", line 20, in main
    setup_and_start_expt(wandb.config,wandb_log = True,kaggle=False)
  File "/var/folders/z4/__x0n5jn4f1c4z527hs4r4_00000gn/T/ipykernel_8571/982323058.py", line 95, in setup_and_start_expt
    train(train_loader,valid_loader, encoder, decoder, n_epochs = epochs,padding_idx = pad_token_id,optimiser = optimiser,weight_decay=weight_decay, lr=lr,teacher_forcing = teacher_forcing,teacher_forcing_ratio = teacher_forcing_ratio,device=device,wandb_logging = wandb_log)
  File "/var/folders/z4/__x0n5jn4f1c4z527hs4r4_00000gn/T/ipykernel_8571/1550435551.py", line 35, in train
    wandb.log({'epoch': epoch,'train loss': train_loss, 'train accuracy': train_accuracy, 'Validation loss': val_loss, 'Val

  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/800 [00:00<?, ?it/s]

Epoch 1	 Train Loss : 3.507	 Train Acc : 2.42% 	 Val Loss : 0.0539	 Val Acc : 2.17%


Traceback (most recent call last):
  File "/var/folders/z4/__x0n5jn4f1c4z527hs4r4_00000gn/T/ipykernel_8571/2210584461.py", line 20, in main
    setup_and_start_expt(wandb.config,wandb_log = True,kaggle=False)
  File "/var/folders/z4/__x0n5jn4f1c4z527hs4r4_00000gn/T/ipykernel_8571/982323058.py", line 95, in setup_and_start_expt
    train(train_loader,valid_loader, encoder, decoder, n_epochs = epochs,padding_idx = pad_token_id,optimiser = optimiser,weight_decay=weight_decay, lr=lr,teacher_forcing = teacher_forcing,teacher_forcing_ratio = teacher_forcing_ratio,device=device,wandb_logging = wandb_log)
  File "/var/folders/z4/__x0n5jn4f1c4z527hs4r4_00000gn/T/ipykernel_8571/1550435551.py", line 35, in train
    wandb.log({'epoch': epoch,'train loss': train_loss, 'train accuracy': train_accuracy, 'Validation loss': val_loss, 'Validation accuracy': val_accuracy})
NameError: name 'val_accuracy' is not defined


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

Run 1vwzofhh errored:
Traceback (most recent call last):
  File "/Users/tejasmalladi/miniconda3/envs/dl/lib/python3.8/site-packages/wandb/agents/pyagent.py", line 308, in _run_job
    self._function()
  File "/var/folders/z4/__x0n5jn4f1c4z527hs4r4_00000gn/T/ipykernel_8571/2210584461.py", line 20, in main
    setup_and_start_expt(wandb.config,wandb_log = True,kaggle=False)
  File "/var/folders/z4/__x0n5jn4f1c4z527hs4r4_00000gn/T/ipykernel_8571/982323058.py", line 95, in setup_and_start_expt
    train(train_loader,valid_loader, encoder, decoder, n_epochs = epochs,padding_idx = pad_token_id,optimiser = optimiser,weight_decay=weight_decay, lr=lr,teacher_forcing = teacher_forcing,teacher_forcing_ratio = teacher_forcing_ratio,device=device,wandb_logging = wandb_log)
  File "/var/folders/z4/__x0n5jn4f1c4z527hs4r4_00000gn/T/ipykernel_8571/1550435551.py", line 35, in train
    wandb.log({'epoch': epoch,'train loss': train_loss, 'train accuracy': train_accuracy, 'Validation loss': val_loss, 'Val

  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/400 [00:00<?, ?it/s]

Epoch 1	 Train Loss : 2.7038	 Train Acc : 1.46% 	 Val Loss : 0.0161	 Val Acc : 2.17%


Traceback (most recent call last):
  File "/var/folders/z4/__x0n5jn4f1c4z527hs4r4_00000gn/T/ipykernel_8571/2210584461.py", line 20, in main
    setup_and_start_expt(wandb.config,wandb_log = True,kaggle=False)
  File "/var/folders/z4/__x0n5jn4f1c4z527hs4r4_00000gn/T/ipykernel_8571/982323058.py", line 95, in setup_and_start_expt
    train(train_loader,valid_loader, encoder, decoder, n_epochs = epochs,padding_idx = pad_token_id,optimiser = optimiser,weight_decay=weight_decay, lr=lr,teacher_forcing = teacher_forcing,teacher_forcing_ratio = teacher_forcing_ratio,device=device,wandb_logging = wandb_log)
  File "/var/folders/z4/__x0n5jn4f1c4z527hs4r4_00000gn/T/ipykernel_8571/1550435551.py", line 35, in train
    wandb.log({'epoch': epoch,'train loss': train_loss, 'train accuracy': train_accuracy, 'Validation loss': val_loss, 'Validation accuracy': val_accuracy})
NameError: name 'val_accuracy' is not defined


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

Run 7d0eq1el errored:
Traceback (most recent call last):
  File "/Users/tejasmalladi/miniconda3/envs/dl/lib/python3.8/site-packages/wandb/agents/pyagent.py", line 308, in _run_job
    self._function()
  File "/var/folders/z4/__x0n5jn4f1c4z527hs4r4_00000gn/T/ipykernel_8571/2210584461.py", line 20, in main
    setup_and_start_expt(wandb.config,wandb_log = True,kaggle=False)
  File "/var/folders/z4/__x0n5jn4f1c4z527hs4r4_00000gn/T/ipykernel_8571/982323058.py", line 95, in setup_and_start_expt
    train(train_loader,valid_loader, encoder, decoder, n_epochs = epochs,padding_idx = pad_token_id,optimiser = optimiser,weight_decay=weight_decay, lr=lr,teacher_forcing = teacher_forcing,teacher_forcing_ratio = teacher_forcing_ratio,device=device,wandb_logging = wandb_log)
  File "/var/folders/z4/__x0n5jn4f1c4z527hs4r4_00000gn/T/ipykernel_8571/1550435551.py", line 35, in train
    wandb.log({'epoch': epoch,'train loss': train_loss, 'train accuracy': train_accuracy, 'Validation loss': val_loss, 'Val

  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/400 [00:00<?, ?it/s]

Epoch 1	 Train Loss : 1.5307	 Train Acc : 11.87% 	 Val Loss : 0.0085	 Val Acc : 31.76%


Traceback (most recent call last):
  File "/var/folders/z4/__x0n5jn4f1c4z527hs4r4_00000gn/T/ipykernel_8571/2210584461.py", line 20, in main
    setup_and_start_expt(wandb.config,wandb_log = True,kaggle=False)
  File "/var/folders/z4/__x0n5jn4f1c4z527hs4r4_00000gn/T/ipykernel_8571/982323058.py", line 95, in setup_and_start_expt
    train(train_loader,valid_loader, encoder, decoder, n_epochs = epochs,padding_idx = pad_token_id,optimiser = optimiser,weight_decay=weight_decay, lr=lr,teacher_forcing = teacher_forcing,teacher_forcing_ratio = teacher_forcing_ratio,device=device,wandb_logging = wandb_log)
  File "/var/folders/z4/__x0n5jn4f1c4z527hs4r4_00000gn/T/ipykernel_8571/1550435551.py", line 35, in train
    wandb.log({'epoch': epoch,'train loss': train_loss, 'train accuracy': train_accuracy, 'Validation loss': val_loss, 'Validation accuracy': val_accuracy})
NameError: name 'val_accuracy' is not defined


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

Run bb23h3rf errored:
Traceback (most recent call last):
  File "/Users/tejasmalladi/miniconda3/envs/dl/lib/python3.8/site-packages/wandb/agents/pyagent.py", line 308, in _run_job
    self._function()
  File "/var/folders/z4/__x0n5jn4f1c4z527hs4r4_00000gn/T/ipykernel_8571/2210584461.py", line 20, in main
    setup_and_start_expt(wandb.config,wandb_log = True,kaggle=False)
  File "/var/folders/z4/__x0n5jn4f1c4z527hs4r4_00000gn/T/ipykernel_8571/982323058.py", line 95, in setup_and_start_expt
    train(train_loader,valid_loader, encoder, decoder, n_epochs = epochs,padding_idx = pad_token_id,optimiser = optimiser,weight_decay=weight_decay, lr=lr,teacher_forcing = teacher_forcing,teacher_forcing_ratio = teacher_forcing_ratio,device=device,wandb_logging = wandb_log)
  File "/var/folders/z4/__x0n5jn4f1c4z527hs4r4_00000gn/T/ipykernel_8571/1550435551.py", line 35, in train
    wandb.log({'epoch': epoch,'train loss': train_loss, 'train accuracy': train_accuracy, 'Validation loss': val_loss, 'Val

#### To Do

1. ~Add support for multi-layers [currently bidirectional is only supported]~
2. ~Ignore Padding Index~
3. Add support for LSTM and RNN
4. Include Teacher forcing, with ratio.
5. Add support for eval mode : to predict for individual words