In [1]:
#JV

In [2]:
import numpy as np

import pandas as pd

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from torch.utils.data import Dataset,TensorDataset, DataLoader, RandomSampler
from torch.nn.utils.rnn import pad_sequence
import random
from functools import partial

import os
import gc
import time
import math

import matplotlib.pyplot as plt
#plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np

from tqdm.notebook import tqdm

#!python3 -m pip install wandb
import wandb

from Core_Utils import *

In [3]:
seed = 23

## Seed every random number generator the notebook uses (python, numpy, torch CPU, torch CUDA)
## with the same value, in that order.
for _seed_rng in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
    _seed_rng(seed)

## Ask cuDNN to use deterministic algorithms.
torch.backends.cudnn.deterministic = True

In [4]:
class BahdanauAttention(nn.Module):
    """
    The class to implement Additive attention aka Bahdanau Attention. As seen in lectures.

    score(s, h_j) = V_att^T tanh(U_att s + W_att h_j), followed by a softmax over the
    encoder time steps and a weighted sum of the encoder contexts.
    """
    def __init__(self, hidden_size,D,expected_dim,batch_size):
        """
        Params:
            hidden_size : The dimension of the hidden state of the recurrent cell.
            D : Multiplier on hidden_size giving the feature size of each encoder context (W_att input).
            expected_dim : Multiplier on hidden_size giving the size of the flattened decoder state (U_att input).
            batch_size : Stored for reference only; forward() reads the batch size from its inputs.
        """
        super(BahdanauAttention, self).__init__()
        self.U_att = nn.Linear(hidden_size*expected_dim, hidden_size)
        self.W_att = nn.Linear(hidden_size*D, hidden_size)
        self.V_att = nn.Linear(hidden_size, 1)
        self.batch_size = batch_size

    def forward(self, decoder_prev_hidden, encoder_contexts):

        """
        The method that takes decoder hiddenstates and encoder hidden contexts to produce attention weighted context vector.
        Params:
            decoder_prev_hidden : The decoder's hidden state at t-1, batch first; it is flattened to (batch, 1, -1).
            encoder_contexts : The encoder hidden states from all time steps, batch first.

        Returns:
            context : bmm(weights, encoder_contexts), i.e. one context vector per batch element.
            weights : The softmax-normalised attention weights over the encoder time steps.
        """

        ## Use the batch size of the tensors actually passed in (not the one given to the constructor),
        ## so that a smaller last batch and single-word evaluation work as well.
        batch_size = encoder_contexts.size(0)
        decoder_prev_hidden = decoder_prev_hidden.reshape(batch_size,1,-1)

        ## Following the same expression as seen in lectures. [Slide 256 in https://www.cse.iitm.ac.in/~miteshk/CS7015/Slides/Teaching/pdf/Lecture16.pdf]
        energies = torch.tanh(self.U_att(decoder_prev_hidden) + self.W_att(encoder_contexts))
        ## V_att leaves a trailing dimension of size 1; move it so that the time steps are last.
        scores = self.V_att(energies).squeeze(2).unsqueeze(1)
        weights = F.softmax(scores, dim=-1)
        ##compute context of each word in the batch, by considering attention
        context = torch.bmm(weights, encoder_contexts)

        return context, weights

In [5]:
class Encoder(nn.Module):
    """
    The class that implements the encoder using Recurrent Units RNN/LSTM/GRU, as needed, by extending the nn.Module class from torch.

    """
    def __init__(self, source_vocab_size,hidden_size,embedding_size,rnn_type = "GRU",padding_idx = None ,dropout=0.1,num_layers = 1,bidirectional = False):

        """
        The constructor of the Encoder Class.

        Params:

            source_vocab_size : The vocabulary size of the source language.
            hidden_size : The dimension of the hidden state of the recurrent cell.
            embedding_size : The dimension of the embedding used.
            rnn_type : "GRU"/"LSTM"/"RNN", case INsensitive. Default : "GRU".
            padding_idx : The id corresponding to the token <pad>.
            dropout : Droput probability. Default : 0.1
            num_layers(int) : The number of encoder (recurrence unit) layers. Default : 1
            bidirectional : True/False. If True, encoding is done by parsing input L->R and R->L, hence doubling the hiddenstate size. Default False.

        Return:
            None.

        Raises:
            ValueError : if rnn_type is not one of "GRU"/"LSTM"/"RNN".

        """

        super(Encoder, self).__init__()

        self.num_layers = num_layers
        self.hidden_size = hidden_size
        self.embedding_size = embedding_size
        ## stored in upper case so that comparisons against "LSTM"/"GRU"/"RNN" are case insensitive.
        self.rnn_type = rnn_type.upper()

        rnn_classes = {"GRU": nn.GRU, "RNN": nn.RNN, "LSTM": nn.LSTM}
        if self.rnn_type not in rnn_classes:
            raise ValueError(f"Unsupported rnn_type '{rnn_type}'. Expected one of {sorted(rnn_classes)}.")

        self.D = 2 if bidirectional else 1 ##the number of directions in which the input is viewed.

        ## dropout inside the recurrent unit is only applied between layers, so it needs more than one layer.
        self.rnn_dropout = dropout if self.num_layers>1 else 0

        ##create an embedding layer, and ignore padding index
        self.embedding = nn.Embedding(source_vocab_size, self.embedding_size,padding_idx = padding_idx)

        self.rnn = rnn_classes[self.rnn_type](self.embedding_size, hidden_size, batch_first=True,num_layers = num_layers,bidirectional = bidirectional,dropout=self.rnn_dropout)

        self.dropout = nn.Dropout(dropout)

    def forward(self, input,hidden=None,cell=None):

        """
        The method to perform forward pass of the encoder.

        Params :
            Input : The encoded batch of input tensors.
            hidden : Default is None. Initial hidden state, used only when the unit is LSTM.
            cell : Default is None. Initial cell state, used only when the unit is LSTM.
                   If either of hidden/cell is None, the LSTM starts from its default (zero) state.

        Returns :
            output, hidden, cell.
            hidden is the final hidden state; cell is the final cell state for LSTM and None otherwise.

        """

        input_embedding = self.dropout(self.embedding(input))

        if self.rnn_type == "LSTM":
            if hidden is None or cell is None:
                ## nn.LSTM cannot take (None, None); omit the state so that it uses zeros.
                output, (hidden, cell) = self.rnn(input_embedding)
            else:
                output, (hidden, cell) = self.rnn(input_embedding, (hidden, cell))
        else:
            ## GRU/RNN always start from the default (zero) state; a passed `hidden` is not used.
            output, hidden = self.rnn(input_embedding)
            cell = None

        return output, hidden, cell

In [19]:
class Decoder(nn.Module):

    """
    The class to implement Decoder in the encoder-decoder architecture using "RNN"/"LSTM"/"GRU".

    While the code is flexible enough to support separate types of recurrent units for encoder and decoder,
    In this assignment, I have chosen to use the same type recurrent unit for both.

    """

    ## Number of decoding steps in eval_mode, where no target tensor fixes the length.
    ## An arbitrary number, larger than the expected word length.
    EVAL_MAX_LEN = 30

    def __init__(self, hidden_size,embedding_size,target_vocab_size,rnn_type,batch_size,use_attention = True,padding_idx = None,num_layers = 1,bidirectional = False,dropout=0,device = "cpu"):

        """
        The constructor of this class. Perfoms setup necessary for training.

        hidden_size : The dimension of the hidden state of the recurrent cell.
        embedding_size : The dimension of the embedding used.
        target_vocab_size : The vocabulary size of the target language.
        rnn_type : "GRU"/"LSTM"/"RNN", case INsensitive.
        batch_size : The batch size used for training. It is forwarded to BahdanauAttention.
        use_attention : Boolean variable, default True, indicating to make use of BahdanauAttention.
        padding_idx : The id corresponding to the token <pad>.
        num_layers(int) : The number of decoder (recurrence unit) layers. Default : 1
        bidirectional : True/False. If True, the recurrent unit is bidirectional, hence doubling its output size. Default False.
        dropout : Droput probability, applied between recurrent layers only when num_layers > 1. Default : 0
        device : Kept as an attribute; forward() places its tensors on the device of its inputs. Default : "cpu".

        Raises ValueError if rnn_type is not one of "GRU"/"LSTM"/"RNN".

        """

        super(Decoder, self).__init__()

        self.num_layers = num_layers
        ## stored in upper case so that comparisons against "LSTM"/"GRU"/"RNN" are case insensitive.
        self.rnn_type = rnn_type.upper()
        self.device = device
        self.hidden_size = hidden_size
        self.embedding_size = embedding_size
        self.use_attention = use_attention
        self.D = 2 if bidirectional else 1 ##the number of directions in which the input is viewed.

        rnn_classes = {"GRU": nn.GRU, "RNN": nn.RNN, "LSTM": nn.LSTM}
        if self.rnn_type not in rnn_classes:
            raise ValueError(f"Unsupported rnn_type '{rnn_type}'. Expected one of {sorted(rnn_classes)}.")

        ## In h0 (the input to the decoder) first dimension expected is number of directions X number of layers
        self.expected_h0_dim1 = self.D*self.num_layers

        ##create an embedding layer, and ignore padding index
        self.embedding = nn.Embedding(target_vocab_size, self.embedding_size,padding_idx = padding_idx)

        if self.use_attention:
            self.attention = BahdanauAttention(hidden_size,self.D,self.expected_h0_dim1,batch_size)
            ## the attention context (D*hidden_size features) is concatenated to the embedding.
            recurrent_unit_input_dim = self.embedding_size + self.D*hidden_size
        else:
            recurrent_unit_input_dim = self.embedding_size

        ## dropout inside the recurrent unit is only applied between layers, so it needs more than one layer.
        self.rnn_dropout = dropout if self.num_layers>1 else 0

        self.rnn = rnn_classes[self.rnn_type](recurrent_unit_input_dim, hidden_size, batch_first=True,num_layers = num_layers,bidirectional = bidirectional,dropout=self.rnn_dropout)

        ## Passing the hidden state through a fully conencted layer and then applying softmax
        self.output_layer = nn.Linear(self.hidden_size*self.D, target_vocab_size)

    def _match_layer_dim(self, state):
        """Repeat `state` along dim 0 when its first dimension differs from the one the recurrent unit expects."""
        if state.shape[0] != self.expected_h0_dim1:
            return state.repeat(self.expected_h0_dim1,1,1)
        return state

    def forward(self, encoder_hidden_contexts, encoder_last_hidden,encoder_cell,target_tensor,eval_mode = False,teacher_forcing_ratio=0):
        """
        Method that Implements the forward pass of the decoder.

        Params:

            encoder_hidden_contexts : The hidden contexts from all time steps of the encoder.
            encoder_last_hidden : The the last hidden state of the encoder, the context, passed as the first decoder_hidden.
            encoder_cell : The encoder's cell state; it is used only when the recurrent unit is LSTM.
            target_tensor : The target ids, (batch, max_word_len). Required unless eval_mode is True.
            eval_mode : Boolean variable, if true, it decodes a single word (batch size 1) for EVAL_MAX_LEN steps.
            teacher_forcing_ratio : value in [0,1]. It is essentially the probability, with which true input is fed into the decoder at a time step. Default is 0.

        Returns decoder_outputs,decoder_hidden,attentions
            decoder_outputs : log-probabilities over the target vocabulary, (batch, steps, vocab).
            decoder_hidden : the hidden state after the last step.
            attentions : list with the attention weights of every step, or None when attention is not used.

        """

        ## eval mode is for looking at a specific word that is predicted to compare with the correct word.
        if eval_mode:
            batch_size = 1
            max_word_len = self.EVAL_MAX_LEN
        else:
            batch_size = encoder_hidden_contexts.size(0)
            max_word_len = target_tensor.size(1)

        #the decoder predicts one character at a time, and hence we use a list to store all the predictions.
        decoder_outputs = []
        attentions = [] if self.use_attention else None

        ## At the first time step <SOS> token (which has an id 0, is fed as an input to the decoder).
        ## It is created on the device of the encoder outputs, so the module works wherever its inputs live.
        decoder_input = torch.zeros((batch_size, 1), dtype=torch.long, device=encoder_hidden_contexts.device)
        decoder_hidden = encoder_last_hidden ## in the first time step of the decoder, the output of the encoder is the input.
        decoder_cell = encoder_cell ## the cell state, which is initially same as that of encoder, (applies to LSTM unit only)

        for step in range(max_word_len):

            if eval_mode:
                decoder_input = decoder_input.view(1,-1)

            embedding = self.embedding(decoder_input)

            reshaped_hidden = self._match_layer_dim(decoder_hidden)

            if self.use_attention:
                ## the attention part: bring the hidden state to batch-first before scoring.
                decoder_prev_hidden = reshaped_hidden.permute(1, 0, 2)
                context_vector, attention_weights = self.attention(decoder_prev_hidden, encoder_hidden_contexts)
                tmp_input = torch.cat((embedding, context_vector), dim=2)
            else:
                ## introducing non-lineartiy through ReLU activation
                tmp_input = F.relu(embedding)

            if self.rnn_type == "LSTM":
                tmp_output, (decoder_hidden, decoder_cell) = self.rnn(tmp_input, (reshaped_hidden, self._match_layer_dim(decoder_cell)))
            else:
                tmp_output, decoder_hidden = self.rnn(tmp_input, reshaped_hidden)

            ## tmp_output is (batch, 1, D*hidden). It must stay 3-D: squeezing dim 0 would drop the
            ## batch axis whenever the batch size is 1 and break the dim=2 argmax below.
            decoder_output = self.output_layer(tmp_output)

            ## randomly sample a number in (0,1) and if the number is less than the teacher forcing ratio
            ## apply teacher forcing at the current step
            apply_teacher_forcing = random.random() < teacher_forcing_ratio

            if (target_tensor is not None) and (apply_teacher_forcing):
                ## Teacher forcing: Feed the target as the next input
                ## extract the 't'th token from th target string to feed as input at "t"th time step.
                decoder_input = target_tensor[:, step].unsqueeze(1)
            else:
                ##greedily pick predictions, i.e pick the character corresponding to the hightest probability
                _,preds = torch.max(decoder_output,dim=2)
                decoder_input = preds.detach()

            decoder_outputs.append(decoder_output)
            if self.use_attention:
                attentions.append(attention_weights)

        ## concatenate the predictions across all the timesteps into a singel tensor
        ## found in literature that log_softmax does better than softmax, hence going with that.
        decoder_outputs = torch.cat(decoder_outputs, dim=1)
        decoder_outputs = F.log_softmax(decoder_outputs, dim=-1)

        ## the idea is to have a common API for both attention and normal decoder, achiveing ease of use.
        return decoder_outputs, decoder_hidden, attentions

In [7]:
def train_epoch(dataloader, encoder, decoder, encoder_optimizer,decoder_optimizer, criterion,teacher_forcing_ratio,ignore_padding=True,device='cpu'):

    """
    Run one pass over `dataloader`, updating the encoder and the decoder after every batch.

    Params:

        dataloader : Yields 4-tuples whose first two entries are the input and target tensors.
                     When ignore_padding is True, `dataloader.dataset.pad_token_id` is read as well.
        encoder : The encoder.
        decoder : The decoder.
        encoder_optimizer : Optimiser stepping the encoder parameters.
        decoder_optimizer : Optimiser stepping the decoder parameters.
        criterion : Loss applied to the flattened decoder outputs and targets.
        teacher_forcing_ratio : Passed through to the decoder.
        ignore_padding : If True, padding positions of the target never make a word count as wrong.
        device : Device on which the initial LSTM states are created.

    Returns:
        (epoch_loss, epoch_accuracy) : mean loss per batch rounded to 4 places, and word level
        accuracy in percent rounded to 2 places. Both are 0.0 when the dataloader yields no batches.
    """

    tot_correct_word_preds = 0
    tot_words = 0
    total_loss = 0

    for data in tqdm(dataloader):
        input_tensor, target_tensor,_,_ = data

        encoder_optimizer.zero_grad()
        decoder_optimizer.zero_grad()

        batch_size = input_tensor.shape[0]

        if encoder.rnn_type == "LSTM":
            state_shape = (encoder.num_layers*encoder.D, batch_size, encoder.hidden_size)
            encoder_hidden = torch.zeros(*state_shape, device=device)
            encoder_cell = torch.zeros(*state_shape, device=device)
        else:
            encoder_hidden = None
            encoder_cell = None

        encoder_hidden_contexts, encoder_last_hidden, encoder_cell = encoder(input_tensor,encoder_hidden,encoder_cell)

        decoder_outputs, _, _ = decoder(encoder_hidden_contexts, encoder_last_hidden,encoder_cell, target_tensor=target_tensor,teacher_forcing_ratio = teacher_forcing_ratio)

        multi_step_preds = torch.argmax(decoder_outputs,dim=2)
        multi_step_pred_correctness = (multi_step_preds ==  target_tensor)

        if ignore_padding:
            ## Mark the padding positions directly. Locating the first pad with argmax returns 0 for a
            ## word without any padding, which would mask the whole word and always count it as correct.
            ## This assumes the pad token only ever appears as trailing padding.
            pad_mask = (target_tensor == dataloader.dataset.pad_token_id)
            ##doing a logical OR with the mask makes sure that the padding tokens do not affect the correctness of the word
            multi_step_pred_correctness = torch.logical_or(multi_step_pred_correctness,pad_mask)

        ## a word is right iff every (relevant) character is predicted correctly.
        tot_correct_word_preds += torch.all(multi_step_pred_correctness,dim=1).sum().item()
        tot_words += multi_step_preds.shape[0]

        loss = criterion(
            decoder_outputs.view(-1, decoder_outputs.size(-1)),
            target_tensor.view(-1)
        )
        loss.backward()

        encoder_optimizer.step()
        decoder_optimizer.step()

        total_loss += loss.item()

    num_batches = len(dataloader)
    epoch_loss = round(total_loss / num_batches,4) if num_batches else 0.0
    epoch_accuracy = round(tot_correct_word_preds*100/tot_words,2) if tot_words else 0.0

    return epoch_loss,epoch_accuracy

In [8]:
def compute_accuracy(dataloader,encoder,decoder,criterion,padding_token_id,end_token_id = 1,ignore_padding = True,device='cpu'):

    """
    Method to compute the accuracy using the model (encoder-decoder) using dataloader.

    This method returns the loss along with character and word level accuracy.

        Word Level Accuracy : Accuracy is computed at the word level and a word is right iff every character is predicted correctly.
        Char Level Accuracy : Accuracy is computed by comparing each predicted character wrt the correct char.

    Params:

        dataloader : The train/test/valid dataloader; yields 4-tuples whose first two entries are the input and target tensors.
        encoder : The encoder
        decoder : The decoder
        criterion : Loss applied to the flattened decoder outputs and targets.
        padding_token_id : The id of the padding token in the target tensors.
        end_token_id : Accepted for API compatibility; it is not used.
        ignore_padding : If True, then in word level accuracy, the padding characters are ignored in computing the word level accuracy.
                        char level accuracy, the padding characters are not considered at all.

                        If False, padding is considered to be a part of the word for both accuracies.
        device : Device on which the initial LSTM states are created.

    Returns:
        (loss, char_lvl_accuracy, word_lvl_accuracy) : mean loss per batch (the same normalisation
        as train_epoch) rounded to 4 places, and both accuracies in percent rounded to 2 places.
        All are 0.0 when the dataloader yields no batches.

    The encoder and the decoder are switched to eval mode for the computation; whichever of them
    was in train mode is put back into train mode afterwards, even if an error occurs.
    """

    tot_chars = 0
    tot_words = 0

    tot_correct_char_preds = 0
    tot_correct_word_preds = 0

    loss = 0.0
    num_batches = 0

    encoder_was_training = encoder.training
    decoder_was_training = decoder.training

    encoder.eval()
    decoder.eval()

    try:
        with torch.no_grad():

            for data in dataloader:

                input_tensor, target_tensor,_,_ = data

                batch_size = input_tensor.shape[0]

                if encoder.rnn_type == "LSTM":
                    state_shape = (encoder.num_layers*encoder.D, batch_size, encoder.hidden_size)
                    encoder_hidden = torch.zeros(*state_shape, device=device)
                    encoder_cell = torch.zeros(*state_shape, device=device)
                else:
                    encoder_hidden = None
                    encoder_cell = None

                encoder_hidden_contexts, encoder_last_hidden, encoder_cell = encoder(input_tensor,encoder_hidden,encoder_cell)
                ## even though we are passing target tensor, the teacher forcing ratio is 0, so no teacher forcing
                decoder_outputs, _, _ = decoder(encoder_hidden_contexts, encoder_last_hidden,encoder_cell, target_tensor = target_tensor,teacher_forcing_ratio = 0)

                loss += criterion(
                    decoder_outputs.view(-1, decoder_outputs.size(-1)),
                    target_tensor.view(-1)).item()
                num_batches += 1

                ## For a batch, for each character find the most probable output character.
                multi_step_preds = torch.argmax(decoder_outputs,dim=2)
                multi_step_pred_correctness = (multi_step_preds ==  target_tensor)

                if ignore_padding: ## if padding has to be ignored.

                    ## Mark the padding positions directly. Locating the first pad with argmax returns 0 for a
                    ## word without any padding, which would mask the whole word: it would always count as a
                    ## correct word and none of its characters would be counted.
                    ## This assumes the pad token only ever appears as trailing padding.
                    pad_mask = (target_tensor == padding_token_id)
                    non_pad_mask = torch.logical_not(pad_mask)

                    ##doing a logical OR with the mask makes sure that the padding tokens do not affect the correctness of the word
                    tot_correct_word_preds += torch.all(torch.logical_or(multi_step_pred_correctness,pad_mask),dim=1).sum().item()

                    ## only the non padding characters take part in the character level accuracy.
                    tot_correct_char_preds += torch.logical_and(multi_step_pred_correctness,non_pad_mask).sum().item()
                    tot_chars += non_pad_mask.sum().item()

                else: ##otherwise.

                    tot_correct_word_preds += torch.all(multi_step_pred_correctness,dim=1).sum().item()

                    tot_correct_char_preds += multi_step_pred_correctness.sum().item()
                    tot_chars += multi_step_preds.numel()

                tot_words += multi_step_preds.shape[0]

    finally:
        ## reset the the model back to train mode
        if encoder_was_training:
            encoder.train()
        if decoder_was_training:
            decoder.train()

    char_lvl_accuracy = round(tot_correct_char_preds*100/tot_chars,2) if tot_chars else 0.0
    word_lvl_accuracy = round(tot_correct_word_preds*100/tot_words,2) if tot_words else 0.0

    ## the criterion already averages within a batch, so average the per-batch losses over the batches.
    mean_loss = loss/num_batches if num_batches else 0.0

    return round(mean_loss,4),char_lvl_accuracy,word_lvl_accuracy

In [9]:
def train(train_dataloader,valid_loader, encoder, decoder, n_epochs,padding_idx,optimiser = "adam",loss="crossentropy",weight_decay=0, lr=0.001,teacher_forcing = False,teacher_forcing_ratio = 0,print_every=100, plot_every=100,device='cpu',wandb_logging = False):

    """
    Train the encoder-decoder pair for `n_epochs` epochs, validating after every epoch.

    Params:

        train_dataloader : Dataloader for training; `train_dataloader.dataset.lp` supplies the <pad>/<end> ids used for validation.
        valid_loader : Dataloader on which validation loss and accuracy are computed after every epoch.
        encoder : The encoder.
        decoder : The decoder.
        n_epochs : The number of epochs.
        padding_idx : The id of the padding token; it is ignored by the loss.
        optimiser : "adam"/"nadam"/"rmsprop", case insensitive. Default : "adam".
        loss : Only "crossentropy" (case insensitive) is supported.
        weight_decay : Weight decay passed to both optimisers.
        lr : Learning rate passed to both optimisers.
        teacher_forcing, print_every, plot_every : Accepted for API compatibility; they are not used.
        teacher_forcing_ratio : Passed through to train_epoch.
        device : The device on which the processing happens.
        wandb_logging : If True, the per-epoch metrics are logged to wandb.

    Raises:
        ValueError : if `optimiser` or `loss` is not one of the supported names.
    """

    optimiser_classes = {"adam": optim.Adam, "nadam": optim.NAdam, "rmsprop": optim.RMSprop}

    ## fail early with a clear message, instead of failing later on an undefined optimiser/criterion.
    optimiser_name = optimiser.lower()
    if optimiser_name not in optimiser_classes:
        raise ValueError(f"Unsupported optimiser '{optimiser}'. Expected one of {sorted(optimiser_classes)}.")

    if loss.lower() != "crossentropy":
        raise ValueError(f"Unsupported loss '{loss}'. Expected 'crossentropy'.")

    ## specify the optimiser
    optimiser_class = optimiser_classes[optimiser_name]
    encoder_optimizer = optimiser_class(encoder.parameters(), lr=lr,weight_decay=weight_decay)
    decoder_optimizer = optimiser_class(decoder.parameters(), lr=lr,weight_decay=weight_decay)

    ## Specify the loss criteria
    criterion = nn.CrossEntropyLoss(ignore_index = padding_idx).to(device)

    lp = train_dataloader.dataset.lp

    for epoch in tqdm(range(1, n_epochs + 1)):

        train_loss,train_accuracy = train_epoch(train_dataloader, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion,teacher_forcing_ratio,device=device)
        val_loss,_,val_accuracy = compute_accuracy(valid_loader,encoder,decoder,criterion,padding_token_id = lp.source_char2id['<pad>'],end_token_id = lp.source_char2id['<end>'],ignore_padding=True,device=device)

        print(f"Epoch {epoch}\t Train Loss : {train_loss}\t Train Acc : {train_accuracy}% \t Val Loss : {val_loss}\t Val Acc : {val_accuracy}%")
        if wandb_logging:
            wandb.log({'epoch': epoch,'train loss': train_loss, 'train accuracy': train_accuracy, 'Validation loss': val_loss, 'Validation accuracy': val_accuracy})


In [10]:
def evaluate(encoder, decoder, word, language_processor,device = "cpu"):

    """
    Transliterate a single word with the encoder-decoder pair.

    Params:

        encoder : The encoder.
        decoder : The decoder; it is called with eval_mode = True and no target tensor.
        word : The source word.
        language_processor : Provides encode_word/decode_word and the source_lang/target_lang names.
        device : The device on which the input tensor and the initial LSTM states are created.

    Returns:
        (decoded_word, decoder_attn) : whatever `decode_word` returns for the greedily chosen id of
        every decoding step, and the attentions returned by the decoder.

    The encoder and the decoder are switched to eval mode for the computation; whichever of them
    was in train mode is put back into train mode afterwards, even if an error occurs.
    """

    lp = language_processor

    encoder_was_training = encoder.training
    decoder_was_training = decoder.training

    encoder.eval()
    decoder.eval()

    try:
        with torch.no_grad():

            input_tensor = torch.tensor(lp.encode_word(word,lp.source_lang,padding=False,append_eos = True)).to(device).view(1,-1)

            batch_size = 1

            if encoder.rnn_type == "LSTM":
                state_shape = (encoder.num_layers*encoder.D, batch_size, encoder.hidden_size)
                encoder_hidden = torch.zeros(*state_shape, device=device)
                encoder_cell = torch.zeros(*state_shape, device=device)
            else:
                encoder_hidden = None
                encoder_cell = None

            encoder_hidden_contexts, encoder_last_hidden, encoder_cell = encoder(input_tensor,encoder_hidden,encoder_cell)

            decoder_outputs, _, decoder_attn = decoder(encoder_hidden_contexts, encoder_last_hidden,encoder_cell ,eval_mode = True,target_tensor = None)

            ## One row of scores per decoding step. The number of steps is taken from the decoder output
            ## itself, instead of a hard coded count that has to match the decoder's eval length.
            step_scores = decoder_outputs.reshape(-1, decoder_outputs.size(-1))
            decoded_ids = step_scores.argmax(dim=-1)

            decoded_word = lp.decode_word(decoded_ids.cpu().numpy(),lp.target_lang)

    finally:
        ## reset the the model back to train mode
        if encoder_was_training:
            encoder.train()
        if decoder_was_training:
            decoder.train()

    return decoded_word, decoder_attn

In [11]:
def setup_and_start_expt(config,wandb_log = True,kaggle=False):

    """
    Build the data loaders and the encoder/decoder described by `config`, then train them.

    Params:

        config : Mapping (a dict or wandb.config) with the keys 'batch_size', 'hidden_size', 'epochs',
                 'optimiser', 'weight_decay', 'lr', 'num_layers', 'rnn_type', 'bidirectional',
                 'teacher_forcing_ratio' and 'dropout'.
                 The embedding size is set equal to 'hidden_size'; an 'embedding_size' entry is not read.
        wandb_log : If True, the per-epoch metrics are logged to wandb.
        kaggle : If True, the Kaggle dataset path is used and the device is cuda when available, else cpu.
                 Otherwise the local dataset path is used and the device is the first available of mps, cuda, cpu.

    Returns:
        (encoder, decoder) after training.
    """

    batch_size = config['batch_size']
    target_lang = "tel"

    if kaggle:
        base_dir = "/kaggle/input/aksharantar-sampled/aksharantar_sampled/"
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    else:
        base_dir = "aksharantar_sampled/"
        ## prefer Apple's MPS backend, but do not fail on machines (or torch builds) without it.
        mps_backend = getattr(torch.backends, "mps", None)
        if mps_backend is not None and mps_backend.is_available():
            device = torch.device("mps")
        elif torch.cuda.is_available():
            device = torch.device("cuda")
        else:
            device = torch.device("cpu")

    use_meta_tokens = True

    lang_dir = base_dir + target_lang + "/"

    ##creating train loader
    train_lp = LanguageProcessor(language_directory=lang_dir,target_lang_name=target_lang,mode="train",meta_tokens=use_meta_tokens)
    ## the ids of the meta tokens are the same in the source and target language
    pad_token_id = train_lp.source_char2id['<pad>']

    collate_fn_ptr = partial(collate_fn,pad_token_id=pad_token_id,device=device)

    def load_split(mode):
        """Create the language processor of a split, sharing the vocabulary of the train split."""
        split_lp = LanguageProcessor(language_directory=lang_dir,target_lang_name=target_lang,mode=mode,meta_tokens=use_meta_tokens)
        ## to make sure that the same vocabulary and dictionaries are used everywhere
        split_lp.source_char2id = train_lp.source_char2id
        split_lp.source_id2char = train_lp.source_id2char
        split_lp.target_char2id = train_lp.target_char2id
        split_lp.target_id2char = train_lp.target_id2char
        return split_lp

    def build_loader(split_lp, shuffle):
        """Wrap a language processor into a dataset and a batched dataloader."""
        dataset = WordDataset(split_lp,device=device)
        return DataLoader(dataset, batch_size=batch_size,collate_fn=collate_fn_ptr, shuffle=shuffle)

    train_loader = build_loader(train_lp, shuffle=True)

    ## Evaluation data is not shuffled, so that validation does not depend on the sampling order.
    ## The test loader is built alongside the others but is not used in this function.
    test_loader = build_loader(load_split("test"), shuffle=False)
    valid_loader = build_loader(load_split("valid"), shuffle=False)

    ##in principle these are all fixed across train/test/valid data
    source_vocab_size = len(train_lp.source_char2id)
    target_vocab_size = len(train_lp.target_char2id)

    hidden_size = config['hidden_size']
    ## considering embedding_size = hidden_size
    embedding_size = hidden_size

    epochs = config['epochs']
    optimiser = config['optimiser']
    weight_decay = config['weight_decay']
    lr = config['lr']

    ## considering encoder_layers = decoder_layers = num_layers
    num_encoder_layers = config['num_layers']
    num_decoder_layers = num_encoder_layers

    ## Allowed Values : "GRU"/"RNN"/"LSTM" (not case sensitive)
    rnn_type = config['rnn_type'].upper()

    bidirectional = config['bidirectional']
    teacher_forcing_ratio = config['teacher_forcing_ratio']
    teacher_forcing = teacher_forcing_ratio>0

    dropout=config['dropout']

    encoder = Encoder(source_vocab_size = source_vocab_size, hidden_size = hidden_size,embedding_size=embedding_size,rnn_type = rnn_type,padding_idx=pad_token_id,num_layers=num_encoder_layers,bidirectional=bidirectional,dropout=dropout).to(device)

    decoder = Decoder(hidden_size = hidden_size,embedding_size=embedding_size, target_vocab_size = target_vocab_size,batch_size = batch_size,rnn_type = rnn_type, padding_idx = None,num_layers = num_decoder_layers,bidirectional = bidirectional,dropout=dropout,device=device).to(device)

    train(train_loader,valid_loader, encoder, decoder, epochs,padding_idx = pad_token_id,optimiser = optimiser,weight_decay=weight_decay, lr=lr,teacher_forcing = teacher_forcing,teacher_forcing_ratio = teacher_forcing_ratio,device=device,wandb_logging = wandb_log)

    return encoder,decoder

In [20]:
## Earlier experiment presets, kept for reference as named dictionaries (they used to be parked
## inside a string literal). Assign one of them to `config` below to re-run it.
## Note: setup_and_start_expt sets the embedding size equal to 'hidden_size'; the 'embedding_size'
## entries are not read by it.
GRU_BASELINE_CONFIG = {
    'hidden_size': 128,
    'embedding_size': 128,
    'rnn_type': "gru",
    'batch_size': 256,
    'optimiser': "adam",
    'num_layers': 2,
    'lr': 1e-3,
    'dropout': 0.1,
    'epochs': 15,
    'teacher_forcing_ratio': 0.3,
    'bidirectional': True,
    'weight_decay': 0,
}

LSTM_LARGE_CONFIG = {
    'hidden_size': 512,
    'embedding_size': 512,
    'rnn_type': "lstm",
    'batch_size': 64,
    'optimiser': "nadam",
    'num_layers': 3,
    'lr': 1e-3,
    'dropout': 0.4,
    'epochs': 15,
    'teacher_forcing_ratio': 0.4,
    'bidirectional': True,
    'weight_decay': 1e-5,
}

## The configuration used for this run.
config = {
    'batch_size': 64,
    'bidirectional': True,
    'dropout': 0.4,
    'embedding_size': 128,
    'epochs': 1,
    'hidden_size': 512,
    'lr': 3e-4,
    'num_layers': 4,
    'optimiser': "nadam",
    'rnn_type': "lstm",
    'teacher_forcing_ratio': 0.4,
    'weight_decay': 1e-5,
}

encoder,decoder = setup_and_start_expt(config,wandb_log = False,kaggle=False)


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/800 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
## Never hard code the API key in the notebook: read it from the environment.
## When WANDB_API_KEY is not set, key is None and wandb falls back to its own credential lookup.
wandb.login(key=os.environ.get("WANDB_API_KEY"))

## Random search over the hyperparameters below, maximising the validation accuracy logged by train().
## The trailing comments keep the wider value ranges used in earlier sweeps.
sweep_config = {
    'method': 'random',
    'name' : 'PA3 Hyper Sweep GRU',
    'metric': {
      'name': 'Validation accuracy',
      'goal': 'maximize'
    },
    'parameters': {
        'hidden_size': {'values': [128]},  #[64,128,256,512]
        'embedding_size': {'values': [128]},  #[64,128,256,512]
        'rnn_type': {'values': ["gru"]},  #['lstm','rnn','gru']
        'batch_size': {'values': [64]},  #[32,64,128,256]
        'optimiser': {'values': ["adam"]},  #,"rmsprop","nadam"]
        'num_layers': {'values': [2]},  #[1,2,3,4,5]
        'lr': {'values': [1e-3]},  #[1e-2,1e-3,1e-4,3e-4]
        'dropout': {'values': [0.1]},  #[0,0.1,0.2,0.3,0.4]
        'epochs': {'values': [15]},
        'teacher_forcing_ratio': {'values': [0.3]},  #[0,0.1,0.2,0.3,0.4,0.5]
        'bidirectional': {'values': [True]},  #[True,False]
        'weight_decay': {'values': [0, 1e-3]},  #,1e-3,5e-3,5e-4]
    }
}

sweep_id = wandb.sweep(sweep=sweep_config, project='JV_CS23M036_TEJASVI_DL_ASSIGNMENT3')

In [None]:
def main():
    '''
    WandB calls main function each time with differnet combination.

    We can retrive the same and use the same values for our hypermeters.

    The run is named after its hyperparameters, each of which appears exactly once in the name,
    and the experiment is then started with wandb logging enabled.
    '''

    with wandb.init() as run:

        cfg = wandb.config

        name_parts = [
            "-hl_"+str(cfg.num_layers),
            "-hs_"+str(cfg.hidden_size),
            ## the embedding size is set equal to the hidden size by setup_and_start_expt.
            "-es_"+str(cfg.hidden_size),
            "-biDir_"+str(cfg.bidirectional),
            "-rnn_type_"+str(cfg.rnn_type),
            "-optim_"+str(cfg.optimiser),
            "-lr_"+str(cfg.lr),
            "-reg_"+str(cfg.weight_decay),
            "-epochs_"+str(cfg.epochs),
            "-tf_ratio_"+str(cfg.teacher_forcing_ratio),
            "-dropout_"+str(cfg.dropout),
        ]

        wandb.run.name = "".join(name_parts)

        setup_and_start_expt(cfg,wandb_log = True,kaggle=False)


## Start the sweep agent for the sweep created above; it runs `main` up to `count` times.
wandb.agent(sweep_id, function=main,count=400) # calls main function for count number of times.
## Close the active wandb run, if any, once the agent has returned.
wandb.finish()

In [None]:
## Disabled example of evaluate(): the whole cell is a string literal, so nothing below is executed.
## NOTE(review): it refers to `train_lp` and `device`, which only exist as locals of
## setup_and_start_expt; they would have to be defined at notebook level before enabling it.
"""op1,_ = evaluate(encoder, decoder, word="srirama", language_processor=train_lp,device = device)

op2,_ = evaluate(encoder, decoder, word="tejasvi", language_processor=train_lp,device = device)

op1_string = "".join(op1)
print(op1_string[:8])

op2_string = "".join(op2)
print(op2_string[:7])

print()
"""

### JV

###### To Do:

1. Add LSTM,RNN support : Train
2. Model parameter Initialization.
3. Attention.
4. Now create a seq2seq class, specify attention = True for attention to work.

Hyper Params:

1. Batch size
2. Hidden layer size
3. Embedding size
4. number of encoder layers
5. number of decoder layers
6. bidirectional
7. Teacher forcing (tf) ratio: 0/0.1/0.2/0.3/0.5
8. Optimizer
9. Learning Rate
10. Batch size
11. Dropout

In [None]:
"""
Dropout within GRU/LSTM/RNN can be applied only when num_layers>1, so with a single layer a recurrent dropout of 0 is used.
However, the specified dropout is still applied to the encoder's embedding output.

considering encoder_layers = decoder_layers = num_layers.

considering embedding_size = hidden_size.


"""