In [None]:
# Import Lib

import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
import torch
from torch.autograd import Variable
import copy
import torch.nn.functional as F
from tqdm import tqdm
from torch.utils.data import DataLoader, TensorDataset
import random
import heapq

# Set device (CUDA if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


# Preprocessing

In [None]:
class lang:
    def __init__(self,path_train,path_val,path_test):
        self.path_train = path_train
        self.path_val = path_val
        self.path_test = path_test
        self.trainfile = pd.read_csv(path_train,header=None, encoding='utf-8')
        self.valfile = pd.read_csv(path_val,header=None, encoding='utf-8')
        self.testfile = pd.read_csv(path_test,header=None, encoding='utf-8')

    def datasetencoder(self,file):
        file[0] = file[0].apply(lambda x: x + '>')
        file[1] = file[1].apply(lambda x: '<' + x + '>')

        # Calculate maximum length of unique elements in file[0]
        ipmax = 0
        for x in file[0].unique():
            if len(x) > ipmax:
                ipmax = len(x)

        # Calculate maximum length of unique elements in file[1]
        opmax = 0
        for x in file[1].unique():
            if len(x) > opmax:
                opmax = len(x)


        return ipmax,opmax,file

    def dictionary_create(self,data):
        data.discard('<')
        data.discard('>')
        chartoint = {"": 0, '<':1, '>':2}
        inttochar = {}

        for ci, c in enumerate(sorted(data), len(chartoint)):
            chartoint[c] = ci
        for c, ci in chartoint.items():
            inttochar[ci] = c

        return chartoint,inttochar

    def convert_tensor_element(self,data , length , chartoint):

        data_enc = np.zeros(length)
        encoder = []
        for char in data:
            encoder.append(chartoint[char])
        encoder = np.array(encoder)
        length = min(length, len(encoder))
        data_enc[:length] = encoder[:length]

        return torch.tensor(data_enc, dtype=torch.int64)

    def convert_tensor_data(self,data,maxlength_ip, chartoint_ip,maxlength_op, chartoint_op):

        tensor_obj_input = []
        tensor_obj_output = []

        for ip, op in zip(data[0], data[1]):
            # Encode input string
            temp_input = self.convert_tensor_element(ip, maxlength_ip, chartoint_ip)
            tensor_obj_input.append(temp_input)

            # Encode output string
            temp_output = self.convert_tensor_element(op, maxlength_op, chartoint_op)
            tensor_obj_output.append(temp_output)

        tensor_obj_input =  torch.stack(tensor_obj_input)
        tensor_obj_output = torch.stack(tensor_obj_output)


        return tensor_obj_input , tensor_obj_output

    def preparedata(self):

        train_ipmax , train_opmax , train = self.datasetencoder(self.trainfile)
        val_ipmax , val_opmax , val =self.datasetencoder(self.valfile)
        test_ipmax , test_opmax , test =self.datasetencoder(self.testfile)

        input_char_to_int,input_int_to_char  = self.dictionary_create(set(''.join(train[0]) + ''.join(val[0]) + ''.join(test[0])))
        output_char_to_int ,output_int_to_char= self.dictionary_create(set(''.join(train[1]) + ''.join(val[1]) + ''.join(test[1])))

        # print(input_char_to_int)
        # print(output_char_to_int)

        ipmax = max(train_ipmax ,val_ipmax ,test_ipmax)
        opmax = max(train_opmax ,val_opmax , test_opmax)

        train_tensor_ip, train_tensor_op = self.convert_tensor_data(train,ipmax,input_char_to_int,opmax,output_char_to_int)
        val_tensor_ip, val_tensor_op = self.convert_tensor_data(val,ipmax,input_char_to_int,opmax,output_char_to_int)
        test_tensor_ip, test_tensor_op = self.convert_tensor_data(test,ipmax,input_char_to_int,opmax,output_char_to_int)

        #transpose data tensor

        train_tensor_ip, train_tensor_op = train_tensor_ip.t(),train_tensor_op.t()
        val_tensor_ip, val_tensor_op = val_tensor_ip.t(), val_tensor_op.t()
        test_tensor_ip, test_tensor_op = test_tensor_ip.t() , test_tensor_op.t()

        len_max = max(ipmax,opmax)

        return train_tensor_ip, train_tensor_op ,val_tensor_ip, val_tensor_op,test_tensor_ip, test_tensor_op,input_char_to_int,input_int_to_char,output_char_to_int ,output_int_to_char,val,len_max

        #do all func call in this and return final datasets


# pathtrain = "/kaggle/input/akshatantra/aksharantar_sampled/hin/hin_train.csv"
# pathval = "/kaggle/input/akshatantra/aksharantar_sampled/hin/hin_valid.csv"
# pathtest = "/kaggle/input/akshatantra/aksharantar_sampled/hin/hin_test.csv"

# language = lang(pathtrain , pathval, pathtest)
# train_ip, train_op ,val_ip, val_op,test_ip, test_op,input_char_to_int,input_int_to_char,output_char_to_int ,output_int_to_char,df_val,len_max = language.preparedata()

# print(val_ip.shape)


# Create Seq2Seq Model

## encoder and decoder

In [32]:
class EncoderModule(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size, layers, dropout, bidirectional, module_type):
        super(EncoderModule, self).__init__()
        self.bidirectional = bidirectional
        self.dropout = nn.Dropout(dropout)
        self.layers = layers
        self.hidden_size = hidden_size
        self.module_type = module_type


        self.embedding = nn.Embedding(input_size, embedding_size)


        if module_type == 'GRU':
            self.rnn = nn.GRU(embedding_size, hidden_size, layers, dropout=dropout, bidirectional=bidirectional)
        if module_type == 'RNN':
            self.rnn = nn.RNN(embedding_size, hidden_size, layers, dropout=dropout, bidirectional=bidirectional)
        if module_type == 'LSTM':
            self.rnn = nn.LSTM(embedding_size, hidden_size, layers, dropout=dropout, bidirectional=bidirectional)


    def forward(self, x): # x shape: (seq_length, N) where N is batch size
        # Perform dropout on the input
        embedding = self.embedding(x)
        embedding = self.dropout(embedding) # embedding shape: (seq_length, N, embedding_size)

        if self.module_type == "LSTM":
            # Pass through the LSTM layer
            outputs, (hidden, cell) = self.rnn(embedding) # outputs shape: (seq_length, N, hidden_size)
            if self.bidirectional == True:
                # Sum the bidirectional outputs
                outputs = outputs[:, :, :self.hidden_size] + outputs[:, :, self.hidden_size:]
                hidden = torch.cat((hidden[: self.layers], hidden[self.layers:]), dim=0)
            # Return hidden state and cell state
            return hidden, cell

        if self.module_type == "GRU" :
            # Pass through the RNN/GRU layer
            outputs, hidden = self.rnn(embedding) # outputs shape: (seq_length, N, hidden_size)
            if self.bidirectional == True:
                # Sum the bidirectional outputs
                outputs = outputs[:, :, :self.hidden_size] + outputs[:, :, self.hidden_size:]
                hidden = torch.cat((hidden[: self.layers], hidden[self.layers:]), dim=0)

            # Return hidden state and cell state
            return hidden

        if self.module_type == "RNN":
            # Pass through the RNN/GRU layer
            outputs, hidden = self.rnn(embedding) # outputs shape: (seq_length, N, hidden_size)
            if self.bidirectional == True:
                # Sum the bidirectional outputs
                outputs = outputs[:, :, :self.hidden_size] + outputs[:, :, self.hidden_size:]
                hidden = torch.cat((hidden[: self.layers], hidden[self.layers:]), dim=0)

            # Return hidden state and cell state
            return hidden


class DecoderModule(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size, output_size, layers, dropout, bidirectional, module_type):
        super(DecoderModule, self).__init__()
        self.bidirectional = bidirectional
        self.dropout = nn.Dropout(dropout)
        self.layers = layers
        self.hidden_size = hidden_size
        self.embedding_size = embedding_size
        self.module_type = module_type

        # Define embedding layer
        self.embedding = nn.Embedding(input_size, embedding_size)

        # Define RNN layer with specific cell type
        if module_type == 'RNN':
            self.rnn = nn.RNN(embedding_size, hidden_size, layers, dropout=dropout, bidirectional=bidirectional)
        if module_type == 'GRU':
            self.rnn = nn.GRU(embedding_size, hidden_size, layers, dropout=dropout, bidirectional=bidirectional)
        if module_type == 'LSTM':
            self.rnn = nn.LSTM(embedding_size, hidden_size, layers, dropout=dropout, bidirectional=bidirectional)


        if bidirectional:
          input_size = hidden_size * 2
        else:
          input_size = hidden_size

        self.fc = nn.Linear(input_size, output_size)


        self.log_softmax = nn.LogSoftmax(dim=1)

    def forward(self, x, hidden, cell): # x shape: (N) where N is for batch size, we want it to be (1, N), seq_length

        # Ensure x has the shape (1, N)

        x = x.unsqueeze(0)
        # Perform dropout on the input

        embedding = self.dropout(self.embedding(x))  # embedding shape: (1, N, embedding_size)

        if self.module_type == "LSTM":
            # Pass through the LSTM layer
            outputs, (hidden, cell) = self.rnn(embedding, (hidden, cell))  # outputs shape: (1, N, hidden_size * num_directions)

            # Pass through fully connected layer
            out = self.fc(outputs).squeeze(0)
            predictions = self.log_softmax(out)

            return predictions, hidden, cell
        if self.module_type == "GRU":
            # Pass through the RNN/GRU layer
            outputs, hidden = self.rnn(embedding, hidden)  # outputs shape: (1, N, hidden_size * num_directions)

            # Pass through fully connected layer
            out = self.fc(outputs).squeeze(0)
            predictions = self.log_softmax(out)

            return predictions, hidden

        if self.module_type == "RNN":
            # Pass through the RNN/GRU layer
            outputs, hidden = self.rnn(embedding, hidden)  # outputs shape: (1, N, hidden_size * num_directions)

            # Pass through fully connected layer
            out = self.fc(outputs).squeeze(0)
            predictions = self.log_softmax(out)

            return predictions, hidden

## Seq2Seq Class

In [33]:
class Seq2Seq(nn.Module):

    def __init__(self, encoder, decoder, output_char_to_int, teacher_forcing, module_type):

        super(Seq2Seq, self).__init__()
        # Initialize encoder and decoder
        self.decoder = decoder
        self.encoder = encoder
        self.module_type = module_type
        self.target_vocab_size = len(output_char_to_int)
        self.teacher_force_ratio = teacher_forcing

    def forward(self, source, target):
        # Get batch size, target length, and target vocabulary size
        batch_size = source.shape[1]

        target_vocab_size = self.target_vocab_size
        teacher_force_ratio = self.teacher_force_ratio

        # Initialize outputs tensor
        outputs = torch.zeros(target.shape[0], batch_size, target_vocab_size).to(source.device)

        x = target[0]
        # Get hidden state and cell state from encoder
        if self.module_type == 'LSTM':
            hidden, cell = self.encoder(source)
        if self.module_type == 'GRU':
            hidden = self.encoder(source)
        if self.module_type == 'RNN':
            hidden = self.encoder(source)


        for t in range(1, target.shape[0]):
            # Use previous hidden and cell states as context from encoder at start
            if self.module_type == 'LSTM':
                output, hidden, cell = self.decoder(x, hidden, cell)
            if self.module_type == 'GRU':
                output, hidden = self.decoder(x, hidden, None)
            if self.module_type == 'RNN':
                output, hidden = self.decoder(x, hidden, None)

            # Store next output prediction
            outputs[t] = output
            # Get the best word the DecoderModule predicted (index in the vocabulary)
            best_guess = output.argmax(1)
            # Update input for next time step based on teacher forcing ratio
            if random.random() >= teacher_force_ratio:
              x = best_guess
            else:
              x = target[t]


        return outputs

# TRAINING

In [43]:
def beam_search(model, input_seq, max_length, input_char_index, output_char_index, reverse_target_char_index, beam_width, length_penalty, cell_type):
    """
    Perform beam search to generate a sequence using the provided model.

    Args:
    - model (nn.Module): The Seq2Seq model.
    - input_seq (str): The input sequence.
    - max_length (int): Maximum length of the input sequence.
    - input_char_index (dict): Mapping from characters to integers for the input vocabulary.
    - output_char_index (dict): Mapping from characters to integers for the output vocabulary.
    - reverse_target_char_index (dict): Reverse mapping from integers to characters for the output vocabulary.
    - beam_width (int): Beam width for beam search.
    - length_penalty (float): Length penalty for beam search.
    - cell_type (str): Type of RNN cell used in the model ('LSTM', 'GRU', or 'RNN').

    Returns:
    - str: The generated output sequence.
    """
    if len(input_seq) > max_length:
        print("Input Length is exceeding max length!!!!")
        return ""

    # Create np array of zeros of length input
    input_data = np.zeros((max_length, 1), dtype=int)  # (N,1)

    # Encode the input
    for idx, char in enumerate(input_seq):
        input_data[idx, 0] = input_char_index[char]
    input_data[idx + 1, 0] = input_char_index[">"]  # EOS

    # Convert to tensor
    input_tensor = torch.tensor(input_data, dtype=torch.int64).to(device)  # N,1

    with torch.no_grad():
        if cell_type == 'LSTM':
            hidden, cell = model.encoder(input_tensor)

        else:
            hidden = model.encoder(input_tensor)

    # Initialize beam
    out_t = output_char_index['<']
    out_reshape = np.array(out_t).reshape(1,)
    hidden_par = hidden.unsqueeze(0)
    initial_sequence = torch.tensor(out_reshape).to(device)
    beam = [(0.0, initial_sequence, hidden_par)]  # [(score, sequence, hidden)]

    for _ in range(len(output_char_index)):
        candidates = []
        for score, seq, hidden in beam:
            if seq[-1].item() == output_char_index['>']:
                # If the sequence ends with the end token, add it to the candidates
                candidates.append((score, seq, hidden))
                continue

            last_token = np.array(seq[-1].item()).reshape(1,)
            x = torch.tensor(last_token).to(device)

            if cell_type == 'LSTM':
                output, hidden, cell,  = model.decoder(x, hidden.squeeze(0), cell)
            else:
                output, hidden,  = model.decoder(x, hidden.squeeze(0), None)

            probabilities = F.softmax(output, dim=1)
            topk_probs, topk_tokens = torch.topk(probabilities, k=beam_width)

            for prob, token in zip(topk_probs[0], topk_tokens[0]):
                new_seq = torch.cat((seq, token.unsqueeze(0)), dim=0)
                seq_length_norm_factor = (len(new_seq) - 1) / 5
                candidate_score = score + torch.log(prob).item() / (seq_length_norm_factor ** length_penalty)
                candidates.append((candidate_score, new_seq, hidden.unsqueeze(0)))

        # Select top-k candidates based on the accumulated scores
        beam = heapq.nlargest(beam_width, candidates, key=lambda x: x[0])

    best_score, best_sequence, _ = max(beam, key=lambda x: x[0])  # Select the best sequence from the beam as the output

    # Convert the best sequence indices to characters
    return ''.join([reverse_target_char_index[token.item()] for token in best_sequence[1:]])



# TRAINING FUNCTION
def train(model, num_epochs, criterion, optimizer, train_batch_x, train_batch_y, val_batch_x, val_batch_y, df_val, input_char_to_int, output_char_to_int, output_int_to_char, beam_width, length_penalty, module_type, max_length, wandb_log):
    """
    Train the Seq2Seq model.

    Args:
    - model (nn.Module): The Seq2Seq model.
    - num_epochs (int): Number of training epochs.
    - criterion: Loss criterion for training.
    - optimizer: Optimizer for training.
    - train_batch_x: Training input data.
    - train_batch_y: Training target data.
    - val_batch_x: Validation input data.
    - val_batch_y: Validation target data.
    - df_val: DataFrame for validation data.
    - input_char_to_int (dict): Mapping from characters to integers for the input vocabulary.
    - output_char_to_int (dict): Mapping from characters to integers for the output vocabulary.
    - output_int_to_char (dict): Reverse mapping from integers to characters for the output vocabulary.
    - beam_width (int): Beam width for beam search.
    - length_penalty (float): Length penalty for beam search.
    - module_type (str): Type of RNN cell used in the model ('LSTM', 'GRU', or 'RNN').
    - max_length (int): Maximum length of sequences.
    - wandb_log (int): Whether to log to wandb (1 or 0).
    Returns:
    - nn.Module: The trained model.
    - float: Validation accuracy.
    """
    for epoch in range(num_epochs):
        total_words = 0
        correct_pred = 0
        total_loss = 0
        accuracy = 0
        model.train()

        # Use tqdm for progress tracking
        train_data_iterator = tqdm(zip(train_batch_x, train_batch_y), total=len(train_batch_x))

        for (x, y) in train_data_iterator:
            # Get input and targets and move to device
            target, inp_data = y.to(device), x.to(device)

            # Forward propagation
            optimizer.zero_grad()
            output = model(inp_data, target)

            target = target.reshape(-1)
            output = output.reshape(-1, output.shape[2])

            pad_mask = (target != 0)
            target = target[pad_mask] # Select non-padding elements
            output = output[pad_mask]

            # Calculate loss
            loss = criterion(output, target)

            # Backpropagation
            loss.backward()

            # Clip gradients to avoid exploding gradients
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

            # Update parameters
            optimizer.step()

            # Accumulate total loss
            total_loss = total_loss + loss.item()
            # Update total words processed
            total_words = total_words + target.size(0)
            # Calculate number of correct predictions
            correct_pred = correct_pred + torch.sum(torch.argmax(output, dim=1) == target).item()

        # Calculate average loss per batch
        avg_loss = total_loss / len(train_batch_x)
        # Calculate accuracy
        acc = correct_pred / total_words
        acc *= 100

        # Validation
        model.eval()
        with torch.no_grad():
            val_total_loss = 0
            val_total_words = 0
            val_correct_pred = 0

            val_data_iterator = tqdm(zip(val_batch_x, val_batch_y), total=len(val_batch_x))
            for x_val, y_val in val_data_iterator:
                target_val, inp_data_val = y_val.to(device), x_val.to(device)
                output_val = model(inp_data_val, target_val)


                target_val = target_val.reshape(-1)
                output_val = output_val.reshape(-1, output_val.shape[2])

                pad_mask = (target_val != 0)
                target_val = target_val[pad_mask] # Select non-padding elements
                output_val = output_val[pad_mask]

                val_loss = criterion(output_val, target_val)
                val_total_loss = val_total_loss+ val_loss.item()
                val_total_words = val_total_words+ target_val.size(0)
                val_correct_pred = val_correct_pred+ torch.sum(torch.argmax(output_val, dim=1) == target_val).item()

            # Calculate validation statistics
            val_acc = val_correct_pred / val_total_words
            val_acc = 100*val_acc
            val_avg_loss = val_total_loss / len(val_batch_x)



        # Total word predict correct over training
        beam_val_pred = 0
        beam_val = 0
        for i in tqdm(range(df_val.shape[0])):
            input_seq = df_val.iloc[i, 0][:-1]
            true_seq = df_val.iloc[i, 1][1:-1]
            predicted_output = beam_search(model, input_seq, max_length, input_char_to_int, output_char_to_int, output_int_to_char, beam_width, length_penalty, module_type)
            if true_seq == predicted_output[:-1]:
                beam_val_pred+=1
        beam_val = beam_val_pred/df_val.shape[0]
        beam_val = 100*beam_val
        # Print statistics

        # print(f"Train Accuracy Char: {accuracy:.4f}, Train Average Loss: {avg_loss:.4f}")
        # print(f"Validation Accuracy Char: {val_acc:.4f}, Validation Average Loss: {val_avg_loss:.4f}")
        # print(f"Beam Val Word Accuracy: {beam_val:.4f} Correct Prediction : {beam_val_pred}/{df_val.shape[0]}")

        print("========================================================================")
        print(f"---------------------------- Epoch : ",epoch+1,"------------------------")
        print(f"Train accuracy Character: ",acc)
        print(f"Train Average Loss: ",avg_loss)
        print(f"Validation accuracy Character: ",val_acc)
        print(f"Validation Average Loss: ",val_avg_loss)
        print(f"Beam Val Word accuracy: " ,beam_val)
        print(f"Correct Prediction : {beam_val_pred}/{df_val.shape[0]}")
        print("========================================================================")

        # if wandb_log == 1:
        #     wandb.log({
        #         "train_accuracy_char": accuracy,
        #         "train_loss": avg_loss,
        #         "val_acc_char": val_acc,
        #         "val_loss": val_avg_loss,
        #         "beam_val_acc_word" : beam_val,
        #     })


    return model, beam_val

In [44]:
pathtrain = "/content/drive/MyDrive/aksharantar_sampled/aksharantar_sampled/hin/hin_train.csv"
pathval = "/content/drive/MyDrive/aksharantar_sampled/aksharantar_sampled/hin/hin_valid.csv"
pathtest = "/content/drive/MyDrive/aksharantar_sampled/aksharantar_sampled/hin/hin_test.csv"
language = lang(pathtrain , pathval, pathtest)
train_ip, train_op ,val_ip, val_op,test_ip, test_op,input_char_to_int,input_int_to_char,output_char_to_int ,output_int_to_char,df_val,max_length = language.preparedata()

# df_train, train_input_len, train_out_len = load_dataset(pathtrain)
# df_val, val_input_len, val_out_len = load_dataset(pathval)
# df_test, test_input_len, test_out_len = load_dataset(pathtest)

# input_max_len = max(train_input_len, val_input_len, test_input_len)
# output_max_len = max(train_out_len, val_out_len, test_out_len)


# # Create Look Up Table
# input_char_to_int, input_int_to_char = look_up_table(df_train[0], df_val[0], df_test[0])
# output_char_to_int, output_int_to_char = look_up_table(df_train[1], df_val[1], df_test[1])

# # print("Input Lookup Table:", input_char_to_int)
# # print("\n\n Output Lookup Table", output_char_to_int)

# # Data Embedding and Converting them into Tensor
# train_inputs, train_outputs = get_tensor_object(df_train, input_max_len, input_max_len, input_char_to_int, output_char_to_int)
# val_inputs, val_outputs = get_tensor_object(df_val, input_max_len, input_max_len, input_char_to_int, output_char_to_int)
# test_inputs, test_outputs = get_tensor_object(df_test, input_max_len, input_max_len, input_char_to_int, output_char_to_int)

# # Transpose column wise
# train_ip, train_op = torch.transpose(train_inputs, 0, 1), torch.transpose(train_outputs, 0, 1)
# val_ip, val_op = torch.transpose(val_inputs, 0, 1), torch.transpose(val_outputs, 0, 1)
# test_ip, test_op = torch.transpose(test_inputs, 0, 1), torch.transpose(test_outputs, 0, 1)

# max_length = max(input_max_len, output_max_len)

# Initialize Hyperparameters
input_size = len(input_char_to_int)
output_size = len(output_char_to_int)
embedding_size = 64
hidden_size = 256
enc_layers = 2
dec_layers = 2
module_type = "LSTM"
dropout = 0.3
learning_rate = 0.1
batch_size = 64
num_epochs = 1
optimizer = "adagrad"
beam_width = 1
bidirectional_type = True
length_penalty = 0.6
teacher_forcing = 0.5

# Create train data batch
train_batch_x, train_batch_y = torch.split(train_ip, batch_size, dim=1), torch.split(train_op, batch_size, dim=1)
# Validation data batch
val_batch_x, val_batch_y = torch.split(val_ip, batch_size, dim=1), torch.split(val_op, batch_size, dim=1)


# Intialize encoder, decoder and seq2seq model
encoder = EncoderModule(input_size, embedding_size, hidden_size, enc_layers, dropout, bidirectional_type, module_type).to(device)
decoder = DecoderModule(output_size, embedding_size, hidden_size, output_size, dec_layers, dropout, bidirectional_type, module_type).to(device)
model = Seq2Seq(encoder, decoder, output_char_to_int, teacher_forcing, module_type).to(device)

# Print total number of parameters in the model
total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(model)
print(f'Total Trainable Parameters: {total_params}')


# Loss function and Optimizer
criterion = nn.CrossEntropyLoss()
if optimizer == 'adam':
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
elif optimizer == 'sgd':
    optimizer = optim.SGD(model.parameters(), lr=learning_rate)
elif optimizer == 'rmsprop':
    optimizer = optim.RMSprop(model.parameters(), lr=learning_rate)
elif optimizer == 'nadam':
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
elif optimizer == 'adagrad':
    optimizer = optim.Adagrad(model.parameters(), lr=learning_rate)


# TRAINING
model, acc = train(model, num_epochs, criterion, optimizer, train_batch_x, train_batch_y, val_batch_x, val_batch_y, df_val, input_char_to_int, output_char_to_int, output_int_to_char, beam_width, length_penalty, module_type, max_length, 1)


{'': 0, '<': 1, '>': 2, 'a': 3, 'b': 4, 'c': 5, 'd': 6, 'e': 7, 'f': 8, 'g': 9, 'h': 10, 'i': 11, 'j': 12, 'k': 13, 'l': 14, 'm': 15, 'n': 16, 'o': 17, 'p': 18, 'q': 19, 'r': 20, 's': 21, 't': 22, 'u': 23, 'v': 24, 'w': 25, 'x': 26, 'y': 27, 'z': 28}
{'': 0, '<': 1, '>': 2, 'ँ': 3, 'ं': 4, 'ः': 5, 'अ': 6, 'आ': 7, 'इ': 8, 'ई': 9, 'उ': 10, 'ऊ': 11, 'ऋ': 12, 'ए': 13, 'ऐ': 14, 'ऑ': 15, 'ओ': 16, 'औ': 17, 'क': 18, 'ख': 19, 'ग': 20, 'घ': 21, 'ङ': 22, 'च': 23, 'छ': 24, 'ज': 25, 'झ': 26, 'ञ': 27, 'ट': 28, 'ठ': 29, 'ड': 30, 'ढ': 31, 'ण': 32, 'त': 33, 'थ': 34, 'द': 35, 'ध': 36, 'न': 37, 'प': 38, 'फ': 39, 'ब': 40, 'भ': 41, 'म': 42, 'य': 43, 'र': 44, 'ल': 45, 'ळ': 46, 'व': 47, 'श': 48, 'ष': 49, 'स': 50, 'ह': 51, '़': 52, 'ऽ': 53, 'ा': 54, 'ि': 55, 'ी': 56, 'ु': 57, 'ू': 58, 'ृ': 59, 'ॅ': 60, 'े': 61, 'ै': 62, 'ॉ': 63, 'ॊ': 64, 'ो': 65, 'ौ': 66, '्': 67}
Seq2Seq(
  (decoder): DecoderModule(
    (dropout): Dropout(p=0.3, inplace=False)
    (embedding): Embedding(68, 64)
    (rnn): LSTM(64, 256, num_l

100%|██████████| 800/800 [00:37<00:00, 21.08it/s]
100%|██████████| 64/64 [00:00<00:00, 73.16it/s]
100%|██████████| 4096/4096 [00:39<00:00, 103.91it/s]

---------------------------- Epoch :  1 ------------------------
Train accuracy Character:  37.76266441760726
Train Average Loss:  2.370012532174587
Validation accuracy Character:  55.421308197657815
Validation Average Loss:  1.5726487282663584
Beam Val Word accuracy:  12.109375
Correct Prediction : 496/4096





In [45]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## SWEEP CONFIGURATION

In [None]:
# def main():
#     wandb.init(project='DL_Assignment_3')
#     config = wandb.config
#     wandb.run.name = 'cell_' + config.module_type + '_bs_' + str(config.batch_size) + '_ep_' + str(config.num_epochs) + '_op_' + str(config.optimizer) + '_drop_' + str(config.dropout) + '_bsw_' + str(config.beam_search_width) +'_emb_' + str(config.embedding_size) + '_hs_' + str(config.hidden_size) + '_elayer_' + str(config.layers) + '_dlayer_' + str(config.layers)

#     # Load Dataset
#     df_train, train_input_len, train_out_len = load_dataset('/kaggle/input/hinid-dataset/aksharantar_sampled/hin/hin_train.csv')
#     df_val, val_input_len, val_out_len = load_dataset('/kaggle/input/hinid-dataset/aksharantar_sampled/hin/hin_valid.csv')
#     df_test, test_input_len, test_out_len = load_dataset('/kaggle/input/hinid-dataset/aksharantar_sampled/hin/hin_test.csv')

#     input_max_len = max(train_input_len, val_input_len, test_input_len)
#     output_max_len = max(train_out_len, val_out_len, test_out_len)

#     max_length = max(input_max_len, output_max_len)

#     # Create Look Up Table
#     input_char_to_int, input_int_to_char = look_up_table(df_train[0], df_val[0], df_test[0])
#     output_char_to_int, output_int_to_char = look_up_table(df_train[1], df_val[1], df_test[1])

#     # Data Embedding and Converting them into Tensor
#     train_inputs, train_outputs = get_tensor_object(df_train, max_length, max_length, input_char_to_int, output_char_to_int)
#     val_inputs, val_outputs = get_tensor_object(df_val, max_length, max_length, input_char_to_int, output_char_to_int)
#     test_inputs, test_outputs = get_tensor_object(df_test, max_length, max_length, input_char_to_int, output_char_to_int)

#     # Transpose column wise
#     train_inputs, train_outputs = torch.transpose(train_inputs, 0, 1), torch.transpose(train_outputs, 0, 1)
#     val_inputs, val_outputs = torch.transpose(val_inputs, 0, 1), torch.transpose(val_outputs, 0, 1)
#     test_inputs, test_outputs = torch.transpose(test_inputs, 0, 1), torch.transpose(test_outputs, 0, 1)


#     # Initialize Hyperparameters
#     input_size = len(input_char_to_int)
#     output_size = len(output_char_to_int)
#     embedding_size = config.embedding_size
#     hidden_size = config.hidden_size
#     enc_layers = config.layers
#     dec_layers = config.layers
#     module_type = config.module_type
#     dropout = config.dropout
#     learning_rate = config.learning_rate
#     batch_size = config.batch_size
#     num_epochs = config.num_epochs
#     optimizer = config.optimizer
#     beam_width = config.beam_search_width
#     bidirectional_type = config.bidirectional_type
#     length_penalty = config.length_penalty
#     teacher_forcing = config.teacher_forcing
#     learning_rate = config.learning_rate

#     # Create train data batch
#     train_batch_x, train_batch_y = torch.split(train_inputs, batch_size, dim=1), torch.split(train_outputs, batch_size, dim=1)
#     # Validation data batch
#     val_batch_x, val_batch_y = torch.split(val_inputs, batch_size, dim=1), torch.split(val_outputs, batch_size, dim=1)


#     # Intialize encoder, decoder and seq2seq model
#     encoder = EncoderModule(input_size, embedding_size, hidden_size, enc_layers, dropout, bidirectional_type, module_type).to(device)
#     decoder = DecoderModule(output_size, embedding_size, hidden_size, output_size, dec_layers, dropout, bidirectional_type, module_type).to(device)
#     model = Seq2Seq(encoder, decoder, output_char_to_int, teacher_forcing, module_type).to(device)

#     # Print total number of parameters in the model
#     total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
#     print(model)
#     print(f'Total Trainable Parameters: {total_params}')


#     # Loss function and Optimizer
#     criterion = nn.CrossEntropyLoss()
#     if optimizer == 'adam':
#         optimizer = optim.Adam(model.parameters(), lr=learning_rate)
#     elif optimizer == 'sgd':
#         optimizer = optim.SGD(model.parameters(), lr=learning_rate)
#     elif optimizer == 'rmsprop':
#         optimizer = optim.RMSprop(model.parameters(), lr=learning_rate)
#     elif optimizer == 'nadam':
#         optimizer = optim.Adam(model.parameters(), lr=learning_rate)
#     elif optimizer == 'adagrad':
#         optimizer = optim.Adagrad(model.parameters(), lr=learning_rate)
#     else:
#         print("Incorrect Optmizer !!!!")

#     # TRAINING
#     model, acc = train(model, num_epochs, criterion, optimizer, train_batch_x, train_batch_y, val_batch_x, val_batch_y, df_val, input_char_to_int, output_char_to_int, output_int_to_char, beam_width, length_penalty, module_type, max_length, 1)
#     wandb.log({
#             "accuracy": acc,
#         })

# # SWEEP CONFIG
# sweep_config = {
#     'name': 'sweep_1',
#     'method': 'grid',
#     'metric': {'name': 'accuracy', 'goal': 'maximize'},
#     'parameters': {
#         'embedding_size': {'values': [64, 256]},
#         'hidden_size': {'values': [256, 512]},
#         'layers': {'values': [2, 3]},
#         'module_type': {'values':['LSTM', "GRU", "RNN"]}, # RNN, LSTM, GRU
#         'dropout': {'values': [0.3, 0.5]},
#         'learning_rate': {'values': [0.01, 0.001]},
#         'batch_size': {'values': [32]},
#         'num_epochs': {'values': [10]},
#         'optimizer': {'values': ['sgd', 'rmsprop', 'adam', 'nadam', 'adagrad']}, # ['sgd', 'rmsprop', 'adam', 'nadam']
#         'beam_search_width': {'values': [1, 3, 5]},
#         'length_penalty' : {'values': [0.6]},
#         'bidirectional_type': {'values': [False, True]},
#         'teacher_forcing': {'values': [0.5, 0.7]}
#     }
# }

# # RUN SWEEP ID with agent
# sweep_id = wandb.sweep(sweep_config, project = 'DL_Assignment_3')
# wandb.agent(sweep_id, main, count = 30)
# wandb.finish()

In [None]:
wandb.finish()

## Prediction

In [None]:
# if __name__ == "__main__":
#     parser.add_argument('-dp', '--data_path', type=str, default='kaggle/input/hinid-dataset/aksharantar_sampled/hin', help='Path to the data folder')
#     parser.add_argument('-l', '--lang', type=str, default='hin', help='Language for which training is to be done')
#     parser.add_argument('-es', '--embedding_size', type=int, default=256, help='Embedding size')
#     parser.add_argument('-hs', '--hidden_size', type=int, default=512, help='Hidden size')
#     parser.add_argument('-nl', '--layers', type=int, default=2, help='Number of layers')
#     parser.add_argument('-ct', '--module_type', type=str, default='LSTM', choices=['RNN', 'LSTM', 'GRU'], help='Cell type (RNN, LSTM, GRU)')
#     parser.add_argument('-dr', '--dropout', type=float, default=0.3, help='Dropout rate')
#     parser.add_argument('-lr', '--learning_rate', type=float, default=0.01, help='Learning rate')
#     parser.add_argument('-bs', '--batch_size', type=int, default=32, help='Batch size')
#     parser.add_argument('-ne', '--num_epochs', type=int, default=10, help='Number of epochs')
#     parser.add_argument('-op', '--optimizer', type=str, default='adagrad', choices=['adam', 'sgd', 'rmsprop', 'nadam', 'adagrad'], help='Optimizer (adam, sgd, rmsprop, nadam, adagrad)')
#     parser.add_argument('-bw', '--beam_search_width', type=int, default=1, help='Beam search width')
#     parser.add_argument('-lp', '--length_penalty', type=float, default=0.6, help='Length penalty')
#     parser.add_argument('-tf', '--teacher_forcing', type=float, default=0.7, help='Teacher forcing ratio')
#     parser.add_argument('-bi', '--bidirectional_type', action='store_true', default=True, help='Use bidirectional_type encoder')
#     parser.add_argument('--wandb_log', type=int, default=0, help='Whether to log to WandB (1 for yes, 0 for no)')


#     config = parser.parse_args()
#     data_path = config.data_path
#     lang = config.lang


#     # Load Dataset
#     df_train, train_input_len, train_out_len = load_dataset(f'/{data_path}/{lang}/{lang}_train.csv')
#     df_val, val_input_len, val_out_len = load_dataset(f'/{data_path}/{lang}/{lang}_valid.csv')
#     df_test, test_input_len, test_out_len = load_dataset(f'/{data_path}/{lang}/{lang}_test.csv')

#     input_max_len = max(train_input_len, val_input_len, test_input_len)
#     output_max_len = max(train_out_len, val_out_len, test_out_len)

#     max_length = max(input_max_len, output_max_len)

#     # Create Look Up Table
#     input_char_to_int, input_int_to_char = look_up_table(df_train[0], df_val[0], df_test[0])
#     output_char_to_int, output_int_to_char = look_up_table(df_train[1], df_val[1], df_test[1])

#     # Data Embedding and Converting them into Tensor
#     train_inputs, train_outputs = get_tensor_object(df_train, max_length, max_length, input_char_to_int, output_char_to_int)
#     val_inputs, val_outputs = get_tensor_object(df_val, max_length, max_length, input_char_to_int, output_char_to_int)
#     test_inputs, test_outputs = get_tensor_object(df_test, max_length, max_length, input_char_to_int, output_char_to_int)

#     # Transpose column wise
#     train_inputs, train_outputs = torch.transpose(train_inputs, 0, 1), torch.transpose(train_outputs, 0, 1)
#     val_inputs, val_outputs = torch.transpose(val_inputs, 0, 1), torch.transpose(val_outputs, 0, 1)
#     test_inputs, test_outputs = torch.transpose(test_inputs, 0, 1), torch.transpose(test_outputs, 0, 1)

#     # Initialize Hyperparameters
#     input_size = len(input_char_to_int)
#     output_size = len(output_char_to_int)
#     embedding_size = config.embedding_size
#     hidden_size = config.hidden_size
#     enc_layers = config.layers
#     dec_layers = config.layers
#     module_type = config.module_type
#     dropout = config.dropout
#     learning_rate = config.learning_rate
#     batch_size = config.batch_size
#     num_epochs = config.num_epochs
#     optimizer = config.optimizer
#     beam_width = config.beam_search_width
#     bidirectional_type = config.bidirectional_type
#     length_penalty = config.length_penalty
#     teacher_forcing = config.teacher_forcing
#     learning_rate = config.learning_rate

#     # Create train data batch
#     train_batch_x, train_batch_y = torch.split(train_inputs, batch_size, dim=1), torch.split(train_outputs, batch_size, dim=1)
#     # Validation data batch
#     val_batch_x, val_batch_y = torch.split(val_inputs, batch_size, dim=1), torch.split(val_outputs, batch_size, dim=1)


#     # Intialize encoder, decoder and seq2seq model
#     encoder = EncoderModule(input_size, embedding_size, hidden_size, enc_layers, dropout, bidirectional_type, module_type).to(device)
#     decoder = DecoderModule(output_size, embedding_size, hidden_size, output_size, dec_layers, dropout, bidirectional_type, module_type).to(device)
#     model = Seq2Seq(encoder, decoder, output_char_to_int, teacher_forcing, module_type).to(device)

#     # Print total number of parameters in the model
#     total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
#     print(model)
#     print(f'Total Trainable Parameters: {total_params}')


#     # Loss function and Optimizer
#     criterion = nn.CrossEntropyLoss()
#     if optimizer == 'adam':
#         optimizer = optim.Adam(model.parameters(), lr=learning_rate)
#     elif optimizer == 'sgd':
#         optimizer = optim.SGD(model.parameters(), lr=learning_rate)
#     elif optimizer == 'rmsprop':
#         optimizer = optim.RMSprop(model.parameters(), lr=learning_rate)
#     elif optimizer == 'nadam':
#         optimizer = optim.Adam(model.parameters(), lr=learning_rate)
#     elif optimizer == 'adagrad':
#         optimizer = optim.Adagrad(model.parameters(), lr=learning_rate)

#     # TRAINING

#     if config.wandb_log == 1:
#         wandb.init(project='DL_Assignment_3')
#         wandb.run.name = 'cell_' + config.module_type + '_bs_' + str(config.batch_size) + '_ep_' + str(config.num_epochs) + '_op_' + str(config.optimizer) + '_drop_' + str(config.dropout) + '_bsw_' + str(config.beam_search_width) +'_emb_' + str(config.embedding_size) + '_hs_' + str(config.hidden_size) + '_elayer_' + str(config.layers) + '_dlayer_' + str(config.layers)

#     model, acc = train(model, num_epochs, criterion, optimizer, train_batch_x, train_batch_y, val_batch_x, val_batch_y, df_val, input_char_to_int, output_char_to_int, output_int_to_char, beam_width, length_penalty, module_type, max_length, config.wandb_log)
#     if config.wandb_log == 1:
#         wandb.log({
#                 "accuracy": acc,
#             })

In [None]:
# # Example usage
# for i in range(10):
#     input_seq = df_train.iloc[i, 0][:-1]
#     predicted_output = beam_search(model, input_seq, input_char_to_int, output_char_to_int, output_int_to_char, 1, 0.6, "RNN")

#     print(f"Input Sequence {i+1}: {input_seq}")
#     print(f"Predicted Output Sequence {i+1}: {predicted_output}\n")
