In [1]:
#JV

In [2]:
import numpy as np

import pandas as pd

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from torch.utils.data import Dataset,TensorDataset, DataLoader, RandomSampler
from torch.nn.utils.rnn import pad_sequence
import random
from functools import partial

import os
import gc
import time
import math

import matplotlib.pyplot as plt
#plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np

from tqdm.notebook import tqdm

#!python3 -m pip install wandb
import wandb

from Core_Utils import *

from Encoder_Decoder_Architecture import *

from Machine_Translator import *

In [3]:
# Seed every random number generator the notebook touches so that a
# Restart-and-Run-All starts from the same random state each time.
seed = 23

for _seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
    _seed_fn(seed)

# Request deterministic cuDNN kernels.
torch.backends.cudnn.deterministic = True

In [8]:
def evaluate(encoder, decoder, word, language_processor,device = "cpu"):
    """Greedy-decode a single source word with a trained encoder/decoder pair.

    Args:
        encoder: Encoder module. Must expose ``rnn_type`` and, for LSTMs,
            ``num_layers``, ``D`` and ``hidden_size``; it is called as
            ``encoder(input_tensor, hidden, cell)`` and must return
            ``(hidden_contexts, last_hidden, cell)``.
        decoder: Decoder module, called with ``eval_mode=True`` and
            ``target_tensor=None``; must return ``(outputs, _, attentions)``.
        word: The source-language word to convert.
        language_processor: Object providing ``encode_word``, ``decode_word``,
            ``source_lang`` and ``target_lang``.
        device: Device on which the input tensor and the initial LSTM states
            are created.

    Returns:
        ``(decoded_word, attentions)`` where ``decoded_word`` is whatever
        ``language_processor.decode_word`` returns for the arg-max ids and
        ``attentions`` is passed through unchanged from the decoder.

    The train/eval mode of each module is recorded separately and restored on
    exit, including when an exception is raised.
    """
    lp = language_processor

    # Track the two modules independently: restoring "both or neither" would
    # leave a module in eval mode whenever only one of them was training.
    encoder_was_training = encoder.training
    decoder_was_training = decoder.training

    encoder.eval()
    decoder.eval()

    try:
        with torch.no_grad():
            input_ids = lp.encode_word(word, lp.source_lang, padding=False, append_eos=True)
            input_tensor = torch.tensor(input_ids).to(device).view(1, -1)

            batch_size = 1  # a single word is decoded at a time

            if encoder.rnn_type == "LSTM":
                state_shape = (encoder.num_layers * encoder.D, batch_size, encoder.hidden_size)
                encoder_hidden = torch.zeros(state_shape, device=device)
                encoder_cell = torch.zeros(state_shape, device=device)
            else:
                encoder_hidden = None
                encoder_cell = None

            encoder_hidden_contexts, encoder_last_hidden, encoder_cell = encoder(
                input_tensor, encoder_hidden, encoder_cell
            )

            decoder_outputs, _, attentions = decoder(
                encoder_hidden_contexts,
                encoder_last_hidden,
                encoder_cell,
                eval_mode=True,
                target_tensor=None,
            )

            # Flatten to (steps, scores) without hard-coding the number of
            # decoding steps (previously fixed at 30).
            # NOTE(review): assumes the last dimension of the decoder output
            # indexes the target vocabulary -- confirm against the Decoder.
            step_scores = decoder_outputs.reshape(-1, decoder_outputs.size(-1))

            _, top_ids = step_scores.topk(1)
            # Drop only the top-k axis so a single-step output stays 1-D.
            decoded_ids = top_ids.squeeze(-1)

            decoded_word = lp.decode_word(decoded_ids.cpu().numpy(), lp.target_lang)
    finally:
        # Put each module back in the mode the caller had it in.
        encoder.train(encoder_was_training)
        decoder.train(decoder_was_training)

    return decoded_word, attentions

In [19]:
def _pick_device(kaggle):
    """Return the torch device to run on.

    On Kaggle: CUDA when available, otherwise CPU. Locally: Apple MPS when this
    PyTorch build exposes the backend and reports it available; otherwise fall
    back to CUDA, then CPU, rather than failing later on a machine without MPS.
    """
    fallback = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    if kaggle:
        return fallback
    mps_backend = getattr(torch.backends, "mps", None)
    if mps_backend is not None and mps_backend.is_available():
        return torch.device("mps")
    return fallback


def _adopt_vocabulary(lp, reference_lp):
    """Point ``lp`` at the char<->id dictionaries of ``reference_lp``."""
    lp.source_char2id = reference_lp.source_char2id
    lp.source_id2char = reference_lp.source_id2char
    lp.target_char2id = reference_lp.target_char2id
    lp.target_id2char = reference_lp.target_id2char


def setup_and_start_expt(config,wandb_log = True,kaggle=False):
    """Build the data loaders and the model for one configuration, then train it.

    Args:
        config: Mapping indexable with ``[]`` (a plain dict or ``wandb.config``)
            providing: ``batch_size``, ``hidden_size``, ``epochs``,
            ``optimiser``, ``weight_decay``, ``lr``, ``num_layers``,
            ``rnn_type`` ("GRU"/"RNN"/"LSTM", case-insensitive),
            ``bidirectional``, ``teacher_forcing_ratio``, ``dropout`` and
            ``use_attention``.
        wandb_log: Forwarded to ``model.train`` as ``wandb_logging``.
        kaggle: Selects the Kaggle dataset path and device policy when True,
            the local ones otherwise.

    Returns:
        The ``MachineTranslator`` instance after ``model.train`` has been called.

    Note:
        The embedding size is tied to ``hidden_size``; an ``embedding_size``
        entry in ``config`` is not read here.
    """
    batch_size = config['batch_size']
    target_lang = "tel"
    use_meta_tokens = True

    if kaggle:
        base_dir = "/kaggle/input/aksharantar-sampled/aksharantar_sampled/"
    else:
        base_dir = "aksharantar_sampled/"
    device = _pick_device(kaggle)

    lang_dir = base_dir + target_lang + "/"

    def make_language_processor(mode):
        return LanguageProcessor(language_directory=lang_dir,target_lang_name=target_lang,mode=mode,meta_tokens=use_meta_tokens)

    ## creating train loader
    train_lp = make_language_processor("train")
    ## the id of this token is the same in the source and target language
    pad_token_id = train_lp.source_char2id['<pad>']

    collate_fn_ptr = partial(collate_fn,pad_token_id=pad_token_id,device=device)

    def make_loader(lp):
        dataset = WordDataset(lp,device=device)
        return DataLoader(dataset, batch_size=batch_size,collate_fn=collate_fn_ptr, shuffle=True)

    train_loader = make_loader(train_lp)

    ## creating test loader
    ## NOTE(review): the test loader is built but not consumed by the training
    ## call below; it is kept so the test split is still loaded as before.
    test_lp = make_language_processor("test")
    ## to make sure that the same vocabulary and dictionaries are used everywhere
    _adopt_vocabulary(test_lp, train_lp)
    test_loader = make_loader(test_lp)

    ## creating validation loader
    valid_lp = make_language_processor("valid")
    _adopt_vocabulary(valid_lp, train_lp)
    valid_loader = make_loader(valid_lp)

    ## vocabulary sizes come from the training split and are shared by all splits
    source_vocab_size = len(train_lp.source_char2id)
    target_vocab_size = len(train_lp.target_char2id)

    hidden_size = config['hidden_size']
    embedding_size = hidden_size

    epochs = config['epochs']
    optimiser = config['optimiser']
    weight_decay = config['weight_decay']
    lr = config['lr']

    ## the same layer count is passed for the whole model
    num_layers = config['num_layers']

    ## Allowed Values : "GRU"/"RNN"/"LSTM" (not case sensitive)
    rnn_type = config['rnn_type'].upper()

    bidirectional = config['bidirectional']
    teacher_forcing_ratio = config['teacher_forcing_ratio']
    dropout = config['dropout']
    use_attention = config['use_attention']

    model = MachineTranslator(source_vocab_size,target_vocab_size,hidden_size,embedding_size,rnn_type,batch_size,pad_token_id,dropout,num_layers,bidirectional,use_attention,device)

    model.train(train_loader,valid_loader,model.encoder, model.decoder, epochs,padding_idx = pad_token_id,optimiser = optimiser,weight_decay=weight_decay, lr=lr,teacher_forcing_ratio = teacher_forcing_ratio,device=device,wandb_logging = wandb_log)

    return model

In [20]:
"""config = {
    'hidden_size':128,
        
        'embedding_size':128,

        'rnn_type' : "gru",
        
        'batch_size':256,
        
        'optimiser': "adam",

        'num_layers' : 2,

        'lr':  1e-3,

        'dropout' : 0.1,
        
        'epochs' : 15,

        'teacher_forcing_ratio' : 0.3,

        'bidirectional' :  True,
    
        'weight_decay': 0
}

config = {
    'hidden_size':512,
        
        'embedding_size':512,

        'rnn_type' : "lstm",
        
        'batch_size':64,
        
        'optimiser': "nadam",

        'num_layers' : 3,

        'lr':  1e-3,

        'dropout' : 0.4,
        
        'epochs' : 15,

        'teacher_forcing_ratio' : 0.4,

        'bidirectional' :  True,
    
        'weight_decay': 1e-5
}"""

# Hyper-parameters for a single quick (one-epoch) local run without wandb logging.
# NOTE: setup_and_start_expt ties the embedding size to hidden_size, so the
# 'embedding_size' entry below is informational only.
config = dict(
    batch_size=64,
    bidirectional=True,
    dropout=0.4,
    embedding_size=128,
    epochs=1,
    hidden_size=512,
    lr=3e-4,
    num_layers=4,
    optimiser="nadam",
    rnn_type="lstm",
    teacher_forcing_ratio=0.4,
    weight_decay=1e-5,
    use_attention=True,
)

model = setup_and_start_expt(config, wandb_log=False, kaggle=False)


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/800 [00:00<?, ?it/s]

Epoch 1	 Train Loss : 1.532	 Train Acc : 9.21% 	 Val Loss : 0.0169	 Val Acc : 26.34%


In [16]:
# Evaluate the model trained above on the held-out test split.

batch_size = config['batch_size']
kaggle = False

target_lang = "tel"

if kaggle:
    base_dir = "/kaggle/input/aksharantar-sampled/aksharantar_sampled/"
else:
    base_dir = "aksharantar_sampled/"

# Take the encoder/decoder from the trained model instead of relying on
# notebook globals that no visible cell defines.
encoder, decoder = model.encoder, model.decoder

# Build the batches on the device the trained model already lives on, so the
# inputs and the weights cannot end up on different devices.
# NOTE(review): assumes the encoder is an nn.Module with at least one parameter.
device = next(encoder.parameters()).device


use_meta_tokens = True
append_eos = 1

lang_dir = base_dir + target_lang + "/"


##creating train loader
train_lp = LanguageProcessor(language_directory=lang_dir,target_lang_name=target_lang,mode="train",meta_tokens=use_meta_tokens)
## the ids of these tokens are the same in the source and target language
start_token_id = train_lp.source_char2id['<start>']
end_token_id = train_lp.source_char2id['<end>']
pad_token_id = train_lp.source_char2id['<pad>']

collate_fn_ptr = partial(collate_fn,pad_token_id=pad_token_id,device=device)

train_dataset = WordDataset(train_lp,device=device)
train_loader = DataLoader(train_dataset, batch_size=batch_size,collate_fn=collate_fn_ptr, shuffle=True)

## creating test loader
test_lp = LanguageProcessor(language_directory=lang_dir,target_lang_name=target_lang,mode="test",meta_tokens=use_meta_tokens)

## to make sure that the same vocabulary and dictionaries are used everywhere
test_lp.source_char2id = train_lp.source_char2id
test_lp.source_id2char = train_lp.source_id2char
test_lp.target_char2id = train_lp.target_char2id
test_lp.target_id2char = train_lp.target_id2char

test_dataset = WordDataset(test_lp,device=device)
test_loader = DataLoader(test_dataset, batch_size=batch_size,collate_fn=collate_fn_ptr, shuffle=True)

# Ignore padding positions in the loss by looking the id up rather than
# hard-coding it.
# NOTE(review): this previously used the literal 2 -- confirm '<pad>' maps to 2
# if earlier numbers must be reproduced exactly.
criterion = nn.CrossEntropyLoss(ignore_index = pad_token_id).to(device)

test_loss,_,test_accuracy = compute_accuracy(test_loader,encoder,decoder,criterion,padding_token_id = pad_token_id,end_token_id = end_token_id,ignore_padding=True,device=device)

print(f"Test Loss : {test_loss}\t Test Acc : {test_accuracy}")


In [17]:
word = "srirama"

# Decode with the encoder/decoder of the model trained above (no visible cell
# defines bare `encoder`/`decoder` globals), and create the inputs on the
# device the model actually lives on instead of a hard-coded "mps".
# NOTE(review): assumes the encoder is an nn.Module with at least one parameter.
eval_device = next(model.encoder.parameters()).device

decoded_word, attentions = evaluate(model.encoder, model.decoder, word, train_lp, device=eval_device)


IndexError: Dimension out of range (expected to be in range of [-2, 1], but got 2)

In [None]:
# Read the API key from the environment rather than keeping it in the notebook.
# With no key set, wandb falls back to its own stored credentials / prompt.
wandb.login(key=os.environ.get("WANDB_API_KEY"))

# Search space for the sweep. Entries with a single value pin that
# hyper-parameter; the trailing comments list the wider ranges to explore.
sweep_config = {
    'method': 'random',
    'name' : 'PA3 Hyper Sweep GRU',
    'metric': {
      'name': 'Validation accuracy',
      'goal': 'maximize'
    },
    'parameters': {

         'hidden_size':{
            'values':[128]#[64,128,256,512]
        },

        'embedding_size':{
            'values':[128]#[64,128,256,512]
        },

        'rnn_type':{
            'values':["gru"]#['lstm','rnn','gru']
        },

        'batch_size':{
            'values':[64]#[32,64,128,256]
        },

        'optimiser': {
            'values': ["adam"]#,"rmsprop","nadam"]
        },

        'num_layers' :{
            'values' : [2]#[1,2,3,4,5]
        },

        'lr': {
            'values': [1e-3]#[1e-2,1e-3,1e-4,3e-4]
        },

        'dropout' : {

            'values' : [0.1]#[0,0.1,0.2,0.3,0.4]
        },

        'epochs' : {

            'values' : [15]
        },

        'teacher_forcing_ratio' : {
            'values' : [0.3]#[0,0.1,0.2,0.3,0.4,0.5]
        },

        'bidirectional' : {
            'values' : [True]#[True,False]
        },
        'weight_decay': {
            'values': [0,1e-3,]#,1e-3,5e-3,5e-4]
        },

        # setup_and_start_expt reads config['use_attention']; without this
        # entry every sweep run fails on that lookup.
        'use_attention': {
            'values': [True, False]
        },
        }
    }

sweep_id = wandb.sweep(sweep=sweep_config, project='JV_CS23M036_TEJASVI_DL_ASSIGNMENT3')

In [None]:
def _sweep_run_name(cfg):
    """Build a readable run name from the hyper-parameters of one sweep run."""
    fields = [
        ("hl", cfg.num_layers),
        ("hs", cfg.hidden_size),
        # The embedding size is tied to hidden_size inside
        # setup_and_start_expt, so the hidden size is reported for "es" too.
        ("es", cfg.hidden_size),
        ("biDir", cfg.bidirectional),
        ("rnn_type", cfg.rnn_type),
        ("optim", cfg.optimiser),
        ("lr", cfg.lr),
        ("reg", cfg.weight_decay),
        ("epochs", cfg.epochs),
        ("tf_ratio", cfg.teacher_forcing_ratio),
        ("dropout", cfg.dropout),
    ]
    # Each field appears exactly once (the prefix used to be repeated in the
    # middle of the name).
    return "".join(f"-{tag}_{value}" for tag, value in fields)


def main():
    '''
    Entry point for one sweep run.

    WandB calls this function once per run, each time with a different
    combination of hyper-parameters. The combination is read back from
    wandb.config, used to name the run, and passed on to
    setup_and_start_expt for training with wandb logging enabled.
    '''

    with wandb.init() as run:

        run.name = _sweep_run_name(wandb.config)

        setup_and_start_expt(wandb.config,wandb_log = True,kaggle=False)


# Start the sweep agent: it invokes `main` once per run, up to the cap below,
# and the wandb session is closed afterwards.
max_sweep_runs = 400
wandb.agent(sweep_id, function=main, count=max_sweep_runs)
wandb.finish()

In [None]:
"""op1,_ = evaluate(encoder, decoder, word="srirama", language_processor=train_lp,device = device)

op2,_ = evaluate(encoder, decoder, word="tejasvi", language_processor=train_lp,device = device)

op1_string = "".join(op1)
print(op1_string[:8])

op2_string = "".join(op2)
print(op2_string[:7])

print()
"""

### JV

###### To Do:

1. Add LSTM,RNN support : Train
2. Model parameter Initialization.
3. Attention.
4. Now create a seq2seq class, specify attention = True for attention to work.

Hyper Params:

1. Batch size
2. Hidden layer size
3. Embedding size
4. number of encoder layers
5. number of decoder layers
6. bidirectional
7. Teacher forcing (tf) ratio: 0 / 0.1 / 0.2 / 0.3 / 0.5
8. Optimizer
9. Learning Rate
10. Batch size
11. Dropout

In [None]:
"""
Dropout inside the GRU/LSTM/RNN can only be applied when num_layers > 1, so with a single layer a dropout of 0 is used there.
However, the specified dropout is still applied to the embedding.

considering encoder_layers = decoder_layers = num_layers.

considering embedding_size = hidden_size.


"""