In [None]:
import torch
import pandas as pd
import copy
import numpy as np
from torch.autograd import Variable
from tqdm import tqdm
from torch.utils.data import DataLoader, TensorDataset
import torch.nn.functional as F
import random
import heapq
import torch.optim as optim
import torch.nn as nn

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Preprocessing

In [None]:
class lang:
    """Load the train/validation/test CSVs and encode them as padded index tensors.

    Every CSV is read without a header row; column 0 is used as the input
    string and column 1 as the output string.  '<' and '>' are the sequence
    markers (indices 1 and 2) and the empty string (index 0) is the padding
    symbol.  `preparedata` runs the whole pipeline and returns the final
    datasets.
    """

    def __init__(self,path_train,path_val,path_test):
        """Remember the three CSV paths and read each file into a DataFrame."""
        self.path_train = path_train
        self.path_val = path_val
        self.path_test = path_test
        self.trainfile = pd.read_csv(path_train,header=None, encoding='utf-8')
        self.valfile = pd.read_csv(path_val,header=None, encoding='utf-8')
        self.testfile = pd.read_csv(path_test,header=None, encoding='utf-8')

    def datasetencoder(self,file):
        """Add the sequence markers to a frame and measure its longest strings.

        The markers are added to a copy, so the frame passed in (for example
        `self.trainfile`) is left untouched and calling this method or
        `preparedata` again does not stack additional markers.

        Args:
            file: DataFrame whose columns 0 and 1 hold the input / output strings.

        Returns:
            (ipmax, opmax, file): length of the longest marked input, length of
            the longest marked output, and the marked copy of the frame.
        """
        file = file.copy()

        # Inputs only get the end marker; outputs get both start and end markers.
        file[0] = [x + '>' for x in file[0]]
        file[1] = ['<' + x + '>' for x in file[1]]

        ipmax = max((len(x) for x in file[0]), default=0)
        opmax = max((len(x) for x in file[1]), default=0)

        return ipmax,opmax,file

    def dictionary_create(self,data):
        """Build the character <-> index lookup tables for a collection of characters.

        Indices 0, 1 and 2 are reserved for "" (padding), '<' and '>'; every
        other character is numbered from 3 upwards in sorted order.  The
        collection passed in is not modified.

        Returns:
            (chartoint, inttochar): the forward and the inverse mapping.
        """
        symbols = set(data) - {'<', '>'}
        chartoint = {"": 0, '<':1, '>':2}

        for ci, c in enumerate(sorted(symbols), len(chartoint)):
            chartoint[c] = ci
        inttochar = {ci: c for c, ci in chartoint.items()}

        return chartoint,inttochar

    def convert_tensor_element(self,data , length , chartoint):
        """Encode one string as an int64 tensor of exactly `length` entries.

        Strings shorter than `length` are right-padded with 0; longer strings
        are truncated.  A character missing from `chartoint` raises KeyError.
        """
        codes = [chartoint[char] for char in data][:length]

        data_enc = np.zeros(length, dtype=np.int64)
        data_enc[:len(codes)] = codes

        return torch.tensor(data_enc, dtype=torch.int64)

    def convert_tensor_data(self,data,maxlength_ip, chartoint_ip,maxlength_op, chartoint_op):
        """Encode every (input, output) row of `data` and stack the results.

        Returns:
            (inputs, outputs): tensors of shape (rows, maxlength_ip) and
            (rows, maxlength_op).
        """
        tensor_obj_input = []
        tensor_obj_output = []

        for ip, op in zip(data[0], data[1]):
            tensor_obj_input.append(self.convert_tensor_element(ip, maxlength_ip, chartoint_ip))
            tensor_obj_output.append(self.convert_tensor_element(op, maxlength_op, chartoint_op))

        return torch.stack(tensor_obj_input), torch.stack(tensor_obj_output)

    def preparedata(self):
        """Run the full preprocessing pipeline on the three loaded frames.

        The vocabularies and the padding lengths are computed over train,
        validation and test together, so all three splits share one encoding.

        Returns a 12-tuple:
            train inputs, train outputs, validation inputs, validation outputs,
            test inputs, test outputs (each transposed to (length, rows)),
            input char->int, input int->char, output char->int,
            output int->char, the marked validation DataFrame, and
            max(longest input, longest output).
        """
        train_ipmax , train_opmax , train = self.datasetencoder(self.trainfile)
        val_ipmax , val_opmax , val =self.datasetencoder(self.valfile)
        test_ipmax , test_opmax , test =self.datasetencoder(self.testfile)

        input_char_to_int,input_int_to_char  = self.dictionary_create(set(''.join(train[0]) + ''.join(val[0]) + ''.join(test[0])))
        output_char_to_int ,output_int_to_char= self.dictionary_create(set(''.join(train[1]) + ''.join(val[1]) + ''.join(test[1])))

        ipmax = max(train_ipmax ,val_ipmax ,test_ipmax)
        opmax = max(train_opmax ,val_opmax , test_opmax)

        train_tensor_ip, train_tensor_op = self.convert_tensor_data(train,ipmax,input_char_to_int,opmax,output_char_to_int)
        val_tensor_ip, val_tensor_op = self.convert_tensor_data(val,ipmax,input_char_to_int,opmax,output_char_to_int)
        test_tensor_ip, test_tensor_op = self.convert_tensor_data(test,ipmax,input_char_to_int,opmax,output_char_to_int)

        # Transpose so that the sequence position is the first dimension.
        train_tensor_ip, train_tensor_op = train_tensor_ip.t(),train_tensor_op.t()
        val_tensor_ip, val_tensor_op = val_tensor_ip.t(), val_tensor_op.t()
        test_tensor_ip, test_tensor_op = test_tensor_ip.t() , test_tensor_op.t()

        len_max = max(ipmax,opmax)

        return train_tensor_ip, train_tensor_op ,val_tensor_ip, val_tensor_op,test_tensor_ip, test_tensor_op,input_char_to_int,input_int_to_char,output_char_to_int ,output_int_to_char,val,len_max


# pathtrain = "/kaggle/input/akshatantra/aksharantar_sampled/hin/hin_train.csv"
# pathval = "/kaggle/input/akshatantra/aksharantar_sampled/hin/hin_valid.csv"
# pathtest = "/kaggle/input/akshatantra/aksharantar_sampled/hin/hin_test.csv"

# language = lang(pathtrain , pathval, pathtest)
# train_ip, train_op ,val_ip, val_op,test_ip, test_op,input_char_to_int,input_int_to_char,output_char_to_int ,output_int_to_char,df_val,len_max = language.preparedata()

# print(val_ip.shape)


## Modules for encoder and decoder

In [None]:
class Attention(nn.Module):
    """Parameter-free dot-product attention between a decoder state and the encoder outputs."""

    def __init__(self, hidden_size):
        # hidden_size is accepted but not used: this layer has no weights.
        super().__init__()

    def forward(self, hidden, encoder_outputs):
        """Return softmax-normalised attention weights.

        The element-wise product of `hidden` and `encoder_outputs` is summed
        over dim 2, transposed, normalised with a softmax over dim 1 and given
        an extra singleton dim 1.  For `hidden` of shape (1, N, H) and
        `encoder_outputs` of shape (S, N, H) the result is (N, 1, S).
        """
        scores = (hidden * encoder_outputs).sum(dim=2).t()
        weights = torch.softmax(scores, dim=1)
        return weights[:, None, :]

class EncoderModule(nn.Module):
    """Embedding + dropout + recurrent encoder (GRU, LSTM or vanilla RNN).

    Args:
        input_size: number of rows of the embedding table (input vocabulary size).
        embedding_size: width of each embedding vector.
        hidden_size: hidden width of the recurrent layer.
        layers: number of stacked recurrent layers.
        dropout: dropout probability, used on the embeddings and passed to the
            recurrent layer.
        bidirectional: whether the recurrent layer runs in both directions.
        module_type: "GRU", "LSTM" or "RNN".

    Raises:
        ValueError: if `module_type` is not one of the supported names.
    """

    def __init__(self, input_size, embedding_size, hidden_size, layers, dropout, bidirectional, module_type):
        super(EncoderModule, self).__init__()

        rnn_classes = {"GRU": nn.GRU, "LSTM": nn.LSTM, "RNN": nn.RNN}
        if module_type not in rnn_classes:
            # Previously an unknown name left self.rnn undefined and forward()
            # silently returned None; fail at construction time instead.
            raise ValueError(
                f"module_type must be one of {sorted(rnn_classes)}, got {module_type!r}"
            )

        # mng / count / module_dict are not read anywhere in this class; they
        # are kept so code that inspects them keeps working.
        self.mng = -1
        self.hidden_size = hidden_size
        self.bidirectional = bidirectional
        self.dropout = nn.Dropout(dropout)
        self.count = 0
        self.module_dict = ["GRU" , "LSTM" , "RNN"]
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.layers = layers
        self.module_type = module_type

        self.rnn = rnn_classes[module_type](
            embedding_size, hidden_size, layers, dropout=dropout, bidirectional=bidirectional
        )

    def forward(self, x):
        """Encode a batch of index sequences.

        Args:
            x: integer tensor fed to the embedding; the recurrent layer is not
                batch_first, so dim 0 is the sequence position.

        Returns:
            (outputs, hidden) for GRU / RNN, (outputs, hidden, cell) for LSTM.
            When bidirectional, the two directions of `outputs` are summed so
            its last dim is `hidden_size`; `hidden` (and `cell`) are returned
            exactly as the recurrent layer produced them.
        """
        embedding = self.dropout(self.embedding(x))

        is_lstm = self.module_type == "LSTM"
        if is_lstm:
            outputs, (hidden, cell) = self.rnn(embedding)
        else:
            outputs, hidden = self.rnn(embedding)

        if self.bidirectional:
            # Sum the forward and backward halves of the feature dimension.
            # (The old code also re-concatenated hidden[:layers] with
            # hidden[layers:], which reproduces `hidden` unchanged.)
            outputs = outputs[:, :, :self.hidden_size] + outputs[:, :, self.hidden_size:]

        if is_lstm:
            return outputs, hidden, cell
        return outputs, hidden





class DecoderModule(nn.Module):
    """Single-step recurrent decoder with dot-product attention.

    Args:
        input_size: number of rows of the embedding table.
        embedding_size: width of each embedding vector.
        hidden_size: hidden width of the recurrent layer.
        output_size: number of classes produced by the final linear layer.
        layers: number of stacked recurrent layers.
        dropout: dropout probability, used on the embeddings and passed to the
            recurrent layer.
        bidirection_type: stored as `self.bidirectional`; the decoder's own
            recurrent layer is always unidirectional.
        cell_type: "GRU", "LSTM" or "RNN".

    Raises:
        ValueError: if `cell_type` is not one of the supported names.
    """

    def __init__(self, input_size, embedding_size, hidden_size, output_size, layers, dropout, bidirection_type, cell_type):
        super(DecoderModule, self).__init__()

        rnn_classes = {"GRU": nn.GRU, "LSTM": nn.LSTM, "RNN": nn.RNN}
        if cell_type not in rnn_classes:
            raise ValueError(
                f"cell_type must be one of {sorted(rnn_classes)}, got {cell_type!r}"
            )

        # mng / counttemp / module_dict are not read anywhere in this class;
        # they are kept so code that inspects them keeps working.
        self.mng = -1
        self.counttemp = 0
        self.dropout = nn.Dropout(dropout)
        self.module_dict = ["GRU" , "LSTM" , "RNN"]
        self.bidirectional = bidirection_type
        self.hidden_size = hidden_size
        self.cell_type = cell_type
        self.embedding_size = embedding_size
        self.layers = layers

        self.embedding = nn.Embedding(input_size, embedding_size)

        # Select the layer from the cell_type argument.  The old code read a
        # notebook-level variable named module_type here, so the class only
        # worked when that global happened to exist and to agree.
        self.rnn = rnn_classes[cell_type](embedding_size, hidden_size, layers, dropout=dropout)

        self.concat = nn.Linear(hidden_size * 2, hidden_size)
        self.fc = nn.Linear(hidden_size, output_size)
        self.log_softmax = nn.LogSoftmax(dim=1)
        self.attn = Attention(hidden_size)

    def forward(self, x, encoder_outputs, hidden, cell):
        """Run one decoding step.

        Args:
            x: 1-D tensor with the current token index of every batch element.
            encoder_outputs: encoder outputs, sequence position first.
            hidden: recurrent hidden state fed to the layer.
            cell: LSTM cell state; ignored for GRU / RNN (callers pass None).

        Returns:
            (log_probs, hidden, attention) for GRU / RNN and
            (log_probs, hidden, cell, attention) for LSTM, where `log_probs` is
            the log-softmax over the output classes and `attention` holds the
            attention weights with the singleton dim removed.
        """
        embedding = self.dropout(self.embedding(x.unsqueeze(0)))

        is_lstm = self.cell_type == "LSTM"
        if is_lstm:
            outputs, (hidden, cell) = self.rnn(embedding, (hidden, cell))
        else:
            outputs, hidden = self.rnn(embedding, hidden)

        # Weighted sum of the encoder outputs for this step.
        attention_weights = self.attn(outputs, encoder_outputs)
        context = attention_weights.bmm(encoder_outputs.transpose(0, 1))

        # Combine the recurrent output with the context vector.  The RNN branch
        # used to skip this torch.cat and crashed on an unassigned variable.
        concat_input = torch.cat((outputs.squeeze(0), context.squeeze(1)), 1)
        concat_output = torch.tanh(self.concat(concat_input))

        predictions = self.log_softmax(self.fc(concat_output))

        if is_lstm:
            return predictions, hidden, cell, attention_weights.squeeze(1)
        return predictions, hidden, attention_weights.squeeze(1)




## Seq2Seq Class

In [None]:


class Seq2Seq(nn.Module):
    """Ties an encoder and a step-wise decoder together with teacher forcing."""

    def __init__(self, encoder, decoder, output_char_to_int, teacher_forcing, module_type):
        """Store the two sub-modules, the teacher-forcing ratio and the cell type.

        The size of `output_char_to_int` is used as the width of the last
        dimension of the tensor returned by `forward`.
        """
        super().__init__()

        self.encoder = encoder
        self.decoder = decoder
        self.mng = -1
        self.counttemp = 0
        self.teacher_force_ratio = teacher_forcing
        self.module_type = module_type
        self.target_vocab_size = len(output_char_to_int)

    def forward(self, source, target):
        """Decode `target.shape[0] - 1` steps and collect the decoder outputs.

        Row 0 of the result is left at zero; row t (t >= 1) holds the decoder
        output of step t.  At each step the next decoder input is the true
        token `target[t]` unless a fresh `random.random()` draw is
        >= the teacher-forcing ratio, in which case the decoder's own argmax
        is fed back.
        """
        steps = target.shape[0]
        kind = self.module_type

        outputs = torch.zeros(steps, source.shape[1], self.target_vocab_size).to(source.device)

        # The first decoder input is the first row of the target.
        x = target[0]

        # Encode once; only the first `decoder.layers` state slices are kept.
        if kind == 'LSTM':
            encoder_outputs, hidden, cell = self.encoder(source)
            hidden = hidden[:self.decoder.layers]
            cell = cell[:self.decoder.layers]
        elif kind in ('GRU', 'RNN'):
            encoder_outputs, hidden = self.encoder(source)
            hidden = hidden[:self.decoder.layers]

        for t in range(1, steps):
            if kind == 'LSTM':
                output, hidden, cell, _ = self.decoder(x, encoder_outputs, hidden, cell)
            elif kind in ('GRU', 'RNN'):
                output, hidden, _ = self.decoder(x, encoder_outputs, hidden, None)

            outputs[t] = output
            best_guess = output.argmax(1)

            x = best_guess if random.random() >= self.teacher_force_ratio else target[t]

        return outputs

# TRAINING

In [None]:
def beam_search(model, input_seq, max_length, input_char_index, output_char_index, reverse_target_char_index, beam_width, length_penalty, cell_type):
    """Decode one input string with beam search and return the predicted string.

    Args:
        model: object exposing `.encoder`, `.decoder` (with a `.layers`
            attribute) and `.parameters()`.
        input_seq: the raw input string, without the '>' end marker.
        max_length: padded length of the encoder input.
        input_char_index: input character -> index mapping (must contain '>').
        output_char_index: output character -> index mapping (must contain
            '<' and '>').
        reverse_target_char_index: output index -> character mapping.
        beam_width: number of hypotheses kept after each step (>= 1).
        length_penalty: exponent of the per-step length normalisation.
        cell_type: "GRU", "RNN" or "LSTM".

    Returns:
        The characters of the best hypothesis after the start marker; a
        finished hypothesis therefore ends with '>'.  An empty string is
        returned when the input leaves no room for its end marker.

    Raises:
        ValueError: for an unknown `cell_type` or a `beam_width` below 1.
        KeyError: for an input character missing from `input_char_index`.
    """
    if cell_type not in ('GRU', 'RNN', 'LSTM'):
        raise ValueError(f"cell_type must be 'GRU', 'RNN' or 'LSTM', got {cell_type!r}")
    if beam_width < 1:
        raise ValueError(f"beam_width must be at least 1, got {beam_width}")

    # One slot after the word is needed for the '>' end marker, so an input of
    # exactly max_length characters is rejected too (it used to raise IndexError).
    if len(input_seq) >= max_length:
        return ""

    # Run on the device the model lives on instead of relying on a
    # notebook-level `device` variable.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device("cpu")

    # Encode the input as a (max_length, 1) column, zero-padded after '>'.
    input_data = np.zeros((max_length, 1), dtype=int)
    for idx, char in enumerate(input_seq):
        input_data[idx, 0] = input_char_index[char]
    input_data[len(input_seq), 0] = input_char_index[">"]  # also valid for an empty input
    input_tensor = torch.tensor(input_data, dtype=torch.int64).to(run_device)

    is_lstm = cell_type == 'LSTM'
    end_token = output_char_index['>']
    length_norm_divisor = 5  # constant of the length-normalisation term

    # No gradients are needed anywhere during decoding (the decoder steps used
    # to run outside this context and built autograd graphs for nothing).
    with torch.no_grad():
        if is_lstm:
            encoder_outputs, hidden, cell = model.encoder(input_tensor)
            cell = cell[:model.decoder.layers]
        else:
            encoder_outputs, hidden = model.encoder(input_tensor)
            cell = None
        hidden = hidden[:model.decoder.layers]

        start = torch.tensor([output_char_index['<']], dtype=torch.int64).to(run_device)

        # Each hypothesis carries its own recurrent state.  The LSTM cell state
        # used to live in one shared variable, so with beam_width > 1 a
        # hypothesis could continue from another hypothesis' cell.
        beam = [(0.0, start, hidden, cell)]

        # The number of decoding steps is bounded by the output vocabulary
        # size, as before.
        for _ in range(len(output_char_index)):
            # Once every hypothesis has emitted '>' further steps cannot change
            # the beam, so stop early.
            if all(seq[-1].item() == end_token for _, seq, _, _ in beam):
                break

            candidates = []
            for score, seq, hidden, cell in beam:
                if seq[-1].item() == end_token:
                    candidates.append((score, seq, hidden, cell))
                    continue

                x = seq[-1:]
                if is_lstm:
                    output, next_hidden, next_cell, _ = model.decoder(x, encoder_outputs, hidden, cell)
                else:
                    output, next_hidden, _ = model.decoder(x, encoder_outputs, hidden, None)
                    next_cell = None

                probabilities = F.softmax(output, dim=1)
                # topk fails when k exceeds the number of classes.
                k = min(beam_width, probabilities.shape[1])
                topk_probs, topk_tokens = torch.topk(probabilities, k=k)

                for prob, token in zip(topk_probs[0], topk_tokens[0]):
                    new_seq = torch.cat((seq, token.unsqueeze(0)), dim=0)
                    norm = ((len(new_seq) - 1) / length_norm_divisor) ** length_penalty
                    candidate_score = score + torch.log(prob).item() / norm
                    candidates.append((candidate_score, new_seq, next_hidden, next_cell))

            beam = heapq.nlargest(beam_width, candidates, key=lambda item: item[0])

    # max() always yields a hypothesis, even when every score is -inf.
    best_sequence = max(beam, key=lambda item: item[0])[1]

    return ''.join(reverse_target_char_index[token.item()] for token in best_sequence[1:])




# TRAINING FUNCTION
def _masked_batch_stats(criterion, output, target):
    """Loss, number of correct predictions and number of tokens for one batch.

    `output` is flattened to (positions, classes) and `target` to (positions,);
    positions whose target index is 0 (padding) are dropped before the loss and
    the accuracy counts are computed.
    """
    target = target.reshape(-1)
    output = output.reshape(-1, output.shape[2])

    pad_mask = (target != 0)
    target = target[pad_mask]
    output = output[pad_mask]

    loss = criterion(output, target)
    correct = torch.sum(torch.argmax(output, dim=1) == target).item()
    return loss, correct, target.size(0)


def train(model, num_epochs, criterion, optimizer, train_batch_x, train_batch_y, val_batch_x, val_batch_y, df_val, input_char_to_int, output_char_to_int, output_int_to_char, beam_width, length_penalty, module_type, max_length, wandb_log):
    """Train `model` and report character- and word-level metrics every epoch.

    Each epoch runs one pass over the training batches (gradient norm clipped
    to 1), one pass over the validation batches without gradients, and a
    beam-search decode of every row of `df_val` for exact-match word accuracy.

    Args:
        model: the seq2seq model, called as `model(inputs, targets)`.
        num_epochs: number of epochs.
        criterion: loss applied to the non-padding positions.
        optimizer: optimizer stepping the model parameters.
        train_batch_x, train_batch_y: sequences of input / target batches.
        val_batch_x, val_batch_y: the same for validation.
        df_val: validation frame; column 0 ends with '>' and column 1 is
            wrapped in '<' ... '>' (both markers are stripped before comparing).
        input_char_to_int, output_char_to_int, output_int_to_char,
        beam_width, length_penalty, module_type, max_length: forwarded to
            `beam_search`.
        wandb_log: currently unused; the logging block below is disabled.

    Returns:
        (model, beam_val): the trained model and the word accuracy (percent) of
        the last epoch, or 0 when `num_epochs` is 0.
    """
    beam_val = 0  # defined up front so num_epochs == 0 does not hit an unbound name

    for epoch in range(num_epochs):
        total_loss = 0
        total_words = 0
        correct_pred = 0
        model.train()

        for (x, y) in tqdm(zip(train_batch_x, train_batch_y), total=len(train_batch_x)):

            optimizer.zero_grad()
            target = y.to(device)
            output = model(x.to(device), target)

            # Accuracy is counted over the same non-padding positions as the
            # loss (and as validation).  It used to run over every position,
            # padding included, so train and validation accuracy were not
            # comparable.
            # NOTE(review): position 0 is not padding, but Seq2Seq.forward
            # never fills row 0 of its output, so it always counts as a miss.
            loss, hits, words = _masked_batch_stats(criterion, output, target)

            loss.backward()

            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
            optimizer.step()

            total_loss = total_loss + loss.item()
            total_words = total_words + words
            correct_pred = correct_pred + hits

        # max(..., 1) keeps empty inputs from raising ZeroDivisionError.
        avg_loss = total_loss / max(len(train_batch_x), 1)
        acc = 100 * correct_pred / max(total_words, 1)

        model.eval()

        with torch.no_grad():
            val_total_loss = 0
            val_total_words = 0
            val_correct_pred = 0

            for x_val, y_val in tqdm(zip(val_batch_x, val_batch_y),total =len(val_batch_x)):
                target_val = y_val.to(device)
                output_val = model(x_val.to(device), target_val)

                val_loss, hits, words = _masked_batch_stats(criterion, output_val, target_val)

                val_total_loss = val_total_loss + val_loss.item()
                val_total_words = val_total_words + words
                val_correct_pred = val_correct_pred + hits

            val_acc = 100 * val_correct_pred / max(val_total_words, 1)
            val_avg_loss = val_total_loss / max(len(val_batch_x), 1)

        # Word-level accuracy: a row counts only when the decoded string,
        # minus its final character, equals the reference word.
        beam_val_pred = 0
        for i in tqdm(range(df_val.shape[0])):
            input_seq = df_val.iloc[i, 0][:-1]      # drop the trailing '>'
            true_seq = df_val.iloc[i, 1][1:-1]      # drop '<' and '>'
            predicted_output = beam_search(model, input_seq, max_length, input_char_to_int, output_char_to_int, output_int_to_char, beam_width, length_penalty, module_type)
            if true_seq == predicted_output[:-1]:
                beam_val_pred = beam_val_pred + 1
        beam_val = 100 * beam_val_pred / max(df_val.shape[0], 1)

        print("========================================================================")
        print("---------------------------- Epoch : ",epoch+1,"------------------------")
        print("Train accuracy Character: ",acc)
        print("Train Average Loss: ",avg_loss)
        print("Validation accuracy Character: ",val_acc)
        print("Validation Average Loss: ",val_avg_loss)
        print("Beam Val Word accuracy: " ,beam_val)
        print(f"Correct Prediction : {beam_val_pred}/{df_val.shape[0]}")
        print("========================================================================")

        # if wandb_log == 1:
        #     wandb.log({
        #         "train_accuracy_char": acc,
        #         "train_loss": avg_loss,
        #         "val_acc_char": val_acc,
        #         "val_loss": val_avg_loss,
        #         "beam_val_acc_word" : beam_val,
        #     })


    return model, beam_val

In [None]:
def assign_opt(optimizer, params=None, lr=None):
  """Build the torch optimizer selected by name.

  Args:
      optimizer: one of 'adam', 'sgd', 'rmsprop', 'nadam', 'adagrad'.
      params: parameters to optimize; defaults to the notebook-level
          `model.parameters()` so existing one-argument calls keep working.
      lr: learning rate; defaults to the notebook-level `learning_rate`.

  Returns:
      The constructed optimizer.

  Raises:
      ValueError: for an unknown name (this used to return None silently).
  """
  class_names = {
      'adam': 'Adam',
      'sgd': 'SGD',
      'rmsprop': 'RMSprop',
      'nadam': 'NAdam',   # was wired to plain Adam
      'adagrad': 'Adagrad',
  }
  if optimizer not in class_names:
      raise ValueError(f"Unknown optimizer {optimizer!r}; expected one of {sorted(class_names)}")

  if params is None:
      params = model.parameters()
  if lr is None:
      lr = learning_rate

  # Resolved by name so an optimizer class missing from the installed torch
  # only affects the run that asks for it.
  return getattr(optim, class_names[optimizer])(params, lr=lr)



# Locations of the `hin` train / validation / test CSVs on the mounted drive.
pathtrain = "/content/drive/MyDrive/aksharantar_sampled/aksharantar_sampled/hin/hin_train.csv"
pathval = "/content/drive/MyDrive/aksharantar_sampled/aksharantar_sampled/hin/hin_valid.csv"
pathtest = "/content/drive/MyDrive/aksharantar_sampled/aksharantar_sampled/hin/hin_test.csv"

# Read the three files, then build the vocabularies and the padded tensors.
language = lang(path_train=pathtrain, path_val=pathval, path_test=pathtest)
(
    train_ip, train_op,
    val_ip, val_op,
    test_ip, test_op,
    input_char_to_int, input_int_to_char,
    output_char_to_int, output_int_to_char,
    df_val,
    max_length,
) = language.preparedata()

# df_train, train_input_len, train_out_len = load_dataset(pathtrain)
# df_val, val_input_len, val_out_len = load_dataset(pathval)
# df_test, test_input_len, test_out_len = load_dataset(pathtest)

# input_max_len = max(train_input_len, val_input_len, test_input_len)
# output_max_len = max(train_out_len, val_out_len, test_out_len)


# # Create Look Up Table
# input_char_to_int, input_int_to_char = look_up_table(df_train[0], df_val[0], df_test[0])
# output_char_to_int, output_int_to_char = look_up_table(df_train[1], df_val[1], df_test[1])

# # print("Input Lookup Table:", input_char_to_int)
# # print("\n\n Output Lookup Table", output_char_to_int)

# # Data Embedding and Converting them into Tensor
# train_inputs, train_outputs = get_tensor_object(df_train, input_max_len, input_max_len, input_char_to_int, output_char_to_int)
# val_inputs, val_outputs = get_tensor_object(df_val, input_max_len, input_max_len, input_char_to_int, output_char_to_int)
# test_inputs, test_outputs = get_tensor_object(df_test, input_max_len, input_max_len, input_char_to_int, output_char_to_int)

# # Transpose column wise
# train_ip, train_op = torch.transpose(train_inputs, 0, 1), torch.transpose(train_outputs, 0, 1)
# val_ip, val_op = torch.transpose(val_inputs, 0, 1), torch.transpose(val_outputs, 0, 1)
# test_ip, test_op = torch.transpose(test_inputs, 0, 1), torch.transpose(test_outputs, 0, 1)

# max_length = max(input_max_len, output_max_len)

# Reproducibility: seed every RNG before any weights are created.  The Python
# RNG drives teacher forcing; torch drives weight initialisation and dropout.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Initialize Hyperparameters
input_size = len(input_char_to_int)
output_size = len(output_char_to_int)
embedding_size = 64
hidden_size = 512
enc_layers = 2
dec_layers = 2
module_type = "GRU"
dropout = 0.3
learning_rate = 0.01
batch_size = 64
num_epochs = 1
# Kept separate from `optimizer` below so one name never holds both a string
# and an optimizer object.
optimizer_name = "adagrad"
beam_width = 1
bidirectional_type = True
length_penalty = 0.6
teacher_forcing = 0.5

# Split the (length, rows) tensors into batches along the row dimension.
val_batch_x = torch.split(val_ip, batch_size, dim=1)
val_batch_y =  torch.split(val_op, batch_size, dim=1)
train_batch_x = torch.split(train_ip, batch_size, dim=1)
train_batch_y =  torch.split(train_op, batch_size, dim=1)

# Initialize encoder, decoder and seq2seq model
encoder = EncoderModule(input_size, embedding_size, hidden_size, enc_layers, dropout, bidirectional_type, module_type).to(device)
decoder = DecoderModule(output_size, embedding_size, hidden_size, output_size, dec_layers, dropout, bidirectional_type, module_type).to(device)
model = Seq2Seq(encoder, decoder, output_char_to_int, teacher_forcing, module_type).to(device)

total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

print(model)
print(f'Total Trainable Parameters: {total_params}')

# Loss function and Optimizer
criterion = nn.CrossEntropyLoss()
optimizer = assign_opt(optimizer_name)

# TRAINING
# The final positional argument (1) is the wandb_log flag.
model, acc = train(model, num_epochs, criterion, optimizer, train_batch_x, train_batch_y, val_batch_x, val_batch_y, df_val, input_char_to_int, output_char_to_int, output_int_to_char, beam_width, length_penalty, module_type, max_length, 1)


Seq2Seq(
  (encoder): EncoderModule(
    (dropout): Dropout(p=0.3, inplace=False)
    (embedding): Embedding(29, 64)
    (rnn): GRU(64, 512, num_layers=2, dropout=0.3, bidirectional=True)
  )
  (decoder): DecoderModule(
    (dropout): Dropout(p=0.3, inplace=False)
    (embedding): Embedding(68, 64)
    (rnn): GRU(64, 512, num_layers=2, dropout=0.3)
    (concat): Linear(in_features=1024, out_features=512, bias=True)
    (fc): Linear(in_features=512, out_features=68, bias=True)
    (log_softmax): LogSoftmax(dim=1)
    (attn): Attention()
  )
)
Total Trainable Parameters: 9529988


100%|██████████| 800/800 [00:49<00:00, 16.22it/s]
100%|██████████| 64/64 [00:01<00:00, 58.40it/s]
100%|██████████| 4096/4096 [00:47<00:00, 85.77it/s]

---------------------------- Epoch :  1 ------------------------
Train accuracy Character:  20.0126953125
Train Average Loss:  2.085016617923975
Validation accuracy Character:  63.319051699514425
Validation Average Loss:  1.3349453192204237
Beam Val Word accuracy:  20.166015625
Correct Prediction : 826/4096





In [None]:
# Make Google Drive available inside the Colab runtime under /content/drive.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## SWEEP CONFIGURATION

In [None]:
# def main():
#     wandb.init(project='DL_Assignment_3')
#     config = wandb.config
#     wandb.run.name = 'cell_' + config.module_type + '_bs_' + str(config.batch_size) + '_ep_' + str(config.num_epochs) + '_op_' + str(config.optimizer) + '_drop_' + str(config.dropout) + '_bsw_' + str(config.beam_search_width) +'_emb_' + str(config.embedding_size) + '_hs_' + str(config.hidden_size) + '_elayer_' + str(config.layers) + '_dlayer_' + str(config.layers)

#     # Load Dataset
#     df_train, train_input_len, train_out_len = load_dataset('/kaggle/input/hinid-dataset/aksharantar_sampled/hin/hin_train.csv')
#     df_val, val_input_len, val_out_len = load_dataset('/kaggle/input/hinid-dataset/aksharantar_sampled/hin/hin_valid.csv')
#     df_test, test_input_len, test_out_len = load_dataset('/kaggle/input/hinid-dataset/aksharantar_sampled/hin/hin_test.csv')

#     input_max_len = max(train_input_len, val_input_len, test_input_len)
#     output_max_len = max(train_out_len, val_out_len, test_out_len)

#     max_length = max(input_max_len, output_max_len)

#     # Create Look Up Table
#     input_char_to_int, input_int_to_char = look_up_table(df_train[0], df_val[0], df_test[0])
#     output_char_to_int, output_int_to_char = look_up_table(df_train[1], df_val[1], df_test[1])

#     # Data Embedding and Converting them into Tensor
#     train_inputs, train_outputs = get_tensor_object(df_train, max_length, max_length, input_char_to_int, output_char_to_int)
#     val_inputs, val_outputs = get_tensor_object(df_val, max_length, max_length, input_char_to_int, output_char_to_int)
#     test_inputs, test_outputs = get_tensor_object(df_test, max_length, max_length, input_char_to_int, output_char_to_int)

#     # Transpose column wise
#     train_inputs, train_outputs = torch.transpose(train_inputs, 0, 1), torch.transpose(train_outputs, 0, 1)
#     val_inputs, val_outputs = torch.transpose(val_inputs, 0, 1), torch.transpose(val_outputs, 0, 1)
#     test_inputs, test_outputs = torch.transpose(test_inputs, 0, 1), torch.transpose(test_outputs, 0, 1)


#     # Initialize Hyperparameters
#     input_size = len(input_char_to_int)
#     output_size = len(output_char_to_int)
#     embedding_size = config.embedding_size
#     hidden_size = config.hidden_size
#     enc_layers = config.layers
#     dec_layers = config.layers
#     module_type = config.module_type
#     dropout = config.dropout
#     learning_rate = config.learning_rate
#     batch_size = config.batch_size
#     num_epochs = config.num_epochs
#     optimizer = config.optimizer
#     beam_width = config.beam_search_width
#     bidirectional_type = config.bidirectional_type
#     length_penalty = config.length_penalty
#     teacher_forcing = config.teacher_forcing
#     learning_rate = config.learning_rate

#     # Create train data batch
#     train_batch_x, train_batch_y = torch.split(train_inputs, batch_size, dim=1), torch.split(train_outputs, batch_size, dim=1)
#     # Validation data batch
#     val_batch_x, val_batch_y = torch.split(val_inputs, batch_size, dim=1), torch.split(val_outputs, batch_size, dim=1)


#     # Initialize encoder, decoder and seq2seq model
#     encoder = EncoderModule(input_size, embedding_size, hidden_size, enc_layers, dropout, bidirectional_type, module_type).to(device)
#     decoder = DecoderModule(output_size, embedding_size, hidden_size, output_size, dec_layers, dropout, bidirectional_type, module_type).to(device)
#     model = Seq2Seq(encoder, decoder, output_char_to_int, teacher_forcing, module_type).to(device)

#     # Print total number of parameters in the model
#     total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
#     print(model)
#     print(f'Total Trainable Parameters: {total_params}')


#     # Loss function and Optimizer
#     criterion = nn.CrossEntropyLoss()
#     if optimizer == 'adam':
#         optimizer = optim.Adam(model.parameters(), lr=learning_rate)
#     elif optimizer == 'sgd':
#         optimizer = optim.SGD(model.parameters(), lr=learning_rate)
#     elif optimizer == 'rmsprop':
#         optimizer = optim.RMSprop(model.parameters(), lr=learning_rate)
#     elif optimizer == 'nadam':
#         optimizer = optim.Adam(model.parameters(), lr=learning_rate)
#     elif optimizer == 'adagrad':
#         optimizer = optim.Adagrad(model.parameters(), lr=learning_rate)
#     else:
#         print("Incorrect Optmizer !!!!")

#     # TRAINING
#     model, acc = train(model, num_epochs, criterion, optimizer, train_batch_x, train_batch_y, val_batch_x, val_batch_y, df_val, input_char_to_int, output_char_to_int, output_int_to_char, beam_width, length_penalty, module_type, max_length, 1)
#     wandb.log({
#             "accuracy": acc,
#         })

# # SWEEP CONFIG
# sweep_config = {
#     'name': 'sweep_1',
#     'method': 'grid',
#     'metric': {'name': 'accuracy', 'goal': 'maximize'},
#     'parameters': {
#         'embedding_size': {'values': [64, 256]},
#         'hidden_size': {'values': [256, 512]},
#         'layers': {'values': [2, 3]},
#         'module_type': {'values':['LSTM', "GRU", "RNN"]}, # RNN, LSTM, GRU
#         'dropout': {'values': [0.3, 0.5]},
#         'learning_rate': {'values': [0.01, 0.001]},
#         'batch_size': {'values': [32]},
#         'num_epochs': {'values': [10]},
#         'optimizer': {'values': ['sgd', 'rmsprop', 'adam', 'nadam', 'adagrad']}, # ['sgd', 'rmsprop', 'adam', 'nadam']
#         'beam_search_width': {'values': [1, 3, 5]},
#         'length_penalty' : {'values': [0.6]},
#         'bidirectional_type': {'values': [False, True]},
#         'teacher_forcing': {'values': [0.5, 0.7]}
#     }
# }

# # RUN SWEEP ID with agent
# sweep_id = wandb.sweep(sweep_config, project = 'DL_Assignment_3')
# wandb.agent(sweep_id, main, count = 30)
# wandb.finish()

In [None]:
# Mark the current Weights & Biases run as finished.
# NOTE(review): no `import wandb` is visible in this part of the notebook and the
# sweep cell above is fully commented out — confirm an earlier cell imports wandb,
# otherwise this line raises NameError on a fresh kernel.
wandb.finish()

## Prediction

In [None]:
# if __name__ == "__main__":
#     parser.add_argument('-dp', '--data_path', type=str, default='kaggle/input/hinid-dataset/aksharantar_sampled/hin', help='Path to the data folder')
#     parser.add_argument('-l', '--lang', type=str, default='hin', help='Language for which training is to be done')
#     parser.add_argument('-es', '--embedding_size', type=int, default=256, help='Embedding size')
#     parser.add_argument('-hs', '--hidden_size', type=int, default=512, help='Hidden size')
#     parser.add_argument('-nl', '--layers', type=int, default=2, help='Number of layers')
#     parser.add_argument('-ct', '--module_type', type=str, default='LSTM', choices=['RNN', 'LSTM', 'GRU'], help='Cell type (RNN, LSTM, GRU)')
#     parser.add_argument('-dr', '--dropout', type=float, default=0.3, help='Dropout rate')
#     parser.add_argument('-lr', '--learning_rate', type=float, default=0.01, help='Learning rate')
#     parser.add_argument('-bs', '--batch_size', type=int, default=32, help='Batch size')
#     parser.add_argument('-ne', '--num_epochs', type=int, default=10, help='Number of epochs')
#     parser.add_argument('-op', '--optimizer', type=str, default='adagrad', choices=['adam', 'sgd', 'rmsprop', 'nadam', 'adagrad'], help='Optimizer (adam, sgd, rmsprop, nadam, adagrad)')
#     parser.add_argument('-bw', '--beam_search_width', type=int, default=1, help='Beam search width')
#     parser.add_argument('-lp', '--length_penalty', type=float, default=0.6, help='Length penalty')
#     parser.add_argument('-tf', '--teacher_forcing', type=float, default=0.7, help='Teacher forcing ratio')
#     parser.add_argument('-bi', '--bidirectional_type', action='store_true', default=True, help='Use bidirectional_type encoder')
#     parser.add_argument('--wandb_log', type=int, default=0, help='Whether to log to WandB (1 for yes, 0 for no)')


#     config = parser.parse_args()
#     data_path = config.data_path
#     lang = config.lang


#     # Load Dataset
#     df_train, train_input_len, train_out_len = load_dataset(f'/{data_path}/{lang}/{lang}_train.csv')
#     df_val, val_input_len, val_out_len = load_dataset(f'/{data_path}/{lang}/{lang}_valid.csv')
#     df_test, test_input_len, test_out_len = load_dataset(f'/{data_path}/{lang}/{lang}_test.csv')

#     input_max_len = max(train_input_len, val_input_len, test_input_len)
#     output_max_len = max(train_out_len, val_out_len, test_out_len)

#     max_length = max(input_max_len, output_max_len)

#     # Create Look Up Table
#     input_char_to_int, input_int_to_char = look_up_table(df_train[0], df_val[0], df_test[0])
#     output_char_to_int, output_int_to_char = look_up_table(df_train[1], df_val[1], df_test[1])

#     # Data Embedding and Converting them into Tensor
#     train_inputs, train_outputs = get_tensor_object(df_train, max_length, max_length, input_char_to_int, output_char_to_int)
#     val_inputs, val_outputs = get_tensor_object(df_val, max_length, max_length, input_char_to_int, output_char_to_int)
#     test_inputs, test_outputs = get_tensor_object(df_test, max_length, max_length, input_char_to_int, output_char_to_int)

#     # Transpose column wise
#     train_inputs, train_outputs = torch.transpose(train_inputs, 0, 1), torch.transpose(train_outputs, 0, 1)
#     val_inputs, val_outputs = torch.transpose(val_inputs, 0, 1), torch.transpose(val_outputs, 0, 1)
#     test_inputs, test_outputs = torch.transpose(test_inputs, 0, 1), torch.transpose(test_outputs, 0, 1)

#     # Initialize Hyperparameters
#     input_size = len(input_char_to_int)
#     output_size = len(output_char_to_int)
#     embedding_size = config.embedding_size
#     hidden_size = config.hidden_size
#     enc_layers = config.layers
#     dec_layers = config.layers
#     module_type = config.module_type
#     dropout = config.dropout
#     learning_rate = config.learning_rate
#     batch_size = config.batch_size
#     num_epochs = config.num_epochs
#     optimizer = config.optimizer
#     beam_width = config.beam_search_width
#     bidirectional_type = config.bidirectional_type
#     length_penalty = config.length_penalty
#     teacher_forcing = config.teacher_forcing
#     learning_rate = config.learning_rate

#     # Create train data batch
#     train_batch_x, train_batch_y = torch.split(train_inputs, batch_size, dim=1), torch.split(train_outputs, batch_size, dim=1)
#     # Validation data batch
#     val_batch_x, val_batch_y = torch.split(val_inputs, batch_size, dim=1), torch.split(val_outputs, batch_size, dim=1)


#     # Initialize encoder, decoder and seq2seq model
#     encoder = EncoderModule(input_size, embedding_size, hidden_size, enc_layers, dropout, bidirectional_type, module_type).to(device)
#     decoder = DecoderModule(output_size, embedding_size, hidden_size, output_size, dec_layers, dropout, bidirectional_type, module_type).to(device)
#     model = Seq2Seq(encoder, decoder, output_char_to_int, teacher_forcing, module_type).to(device)

#     # Print total number of parameters in the model
#     total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
#     print(model)
#     print(f'Total Trainable Parameters: {total_params}')


#     # Loss function and Optimizer
#     criterion = nn.CrossEntropyLoss()
#     if optimizer == 'adam':
#         optimizer = optim.Adam(model.parameters(), lr=learning_rate)
#     elif optimizer == 'sgd':
#         optimizer = optim.SGD(model.parameters(), lr=learning_rate)
#     elif optimizer == 'rmsprop':
#         optimizer = optim.RMSprop(model.parameters(), lr=learning_rate)
#     elif optimizer == 'nadam':
#         optimizer = optim.Adam(model.parameters(), lr=learning_rate)
#     elif optimizer == 'adagrad':
#         optimizer = optim.Adagrad(model.parameters(), lr=learning_rate)

#     # TRAINING

#     if config.wandb_log == 1:
#         wandb.init(project='DL_Assignment_3')
#         wandb.run.name = 'cell_' + config.module_type + '_bs_' + str(config.batch_size) + '_ep_' + str(config.num_epochs) + '_op_' + str(config.optimizer) + '_drop_' + str(config.dropout) + '_bsw_' + str(config.beam_search_width) +'_emb_' + str(config.embedding_size) + '_hs_' + str(config.hidden_size) + '_elayer_' + str(config.layers) + '_dlayer_' + str(config.layers)

#     model, acc = train(model, num_epochs, criterion, optimizer, train_batch_x, train_batch_y, val_batch_x, val_batch_y, df_val, input_char_to_int, output_char_to_int, output_int_to_char, beam_width, length_penalty, module_type, max_length, config.wandb_log)
#     if config.wandb_log == 1:
#         wandb.log({
#                 "accuracy": acc,
#             })

In [None]:
# # Example usage
# for i in range(10):
#     input_seq = df_train.iloc[i, 0][:-1]
#     predicted_output = beam_search(model, input_seq, input_char_to_int, output_char_to_int, output_int_to_char, 1, 0.6, "RNN")

#     print(f"Input Sequence {i+1}: {input_seq}")
#     print(f"Predicted Output Sequence {i+1}: {predicted_output}\n")
