# **Importing all the necessary libraries**

In [None]:
# # Create Adam optimizer with default parameters
# optimizer = torch.optim.Adam(model.parameters())

# # Modify learning rate
# new_learning_rate = 0.001  # Set your desired learning rate
# for param_group in optimizer.param_groups:
#     param_group['lr'] = new_learning_rate

# # Modify other parameters
# # For example, to change weight decay
# new_weight_decay = 0.01  # Set your desired weight decay value
# for param_group in optimizer.param_groups:
#     param_group['weight_decay'] = new_weight_decay


In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import pandas as pd
import numpy as np
import pandas as pd

## **Encoder class**

In [2]:
class Encoder(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size, num_layers, rnn_cell='lstm', dropout=0.5, bidirectional=True):
        super(Encoder, self).__init__()
        self.embedding = nn.Embedding(num_embeddings=input_size, embedding_dim=embedding_size)
        self.dropout = nn.Dropout(dropout)
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.bidirectional = bidirectional
        
        rnn_hidden_size = hidden_size // 2 if bidirectional else hidden_size
        
        if rnn_cell.lower() == 'lstm':
            self.rnn = nn.LSTM(embedding_size, rnn_hidden_size, num_layers, batch_first=True, dropout=(0 if num_layers == 1 else dropout), bidirectional=bidirectional)
        elif rnn_cell.lower() == 'gru':
            self.rnn = nn.GRU(embedding_size, rnn_hidden_size, num_layers, batch_first=True, dropout=(0 if num_layers == 1 else dropout), bidirectional=bidirectional)
        else:
            self.rnn = nn.RNN(embedding_size, rnn_hidden_size, num_layers, batch_first=True, dropout=(0 if num_layers == 1 else dropout), bidirectional=bidirectional)
    
    def forward(self, x):
        embedded = self.embedding(x)
        embedded = self.dropout(embedded)
        outputs, hidden = self.rnn(embedded)

        if self.bidirectional:
            if isinstance(hidden, tuple):
                h_n, c_n = hidden
#                 print('enc h bef dir',h_n.shape)
#                 print('enc c bef dir',c_n.shape)
                h_n = torch.cat((h_n[0::2], h_n[1::2]), dim=2)
                c_n = torch.cat((c_n[0::2], c_n[1::2]), dim=2)
#                 print('enc h af dir',h_n.shape)
#                 print('enc c af dir',c_n.shape)
                hidden = (h_n, c_n)
            else:
#                 print('enc hidd bef dir',hidden.shape)
                hidden = torch.cat((hidden[0::2], hidden[1::2]), dim=2)
#                 print('after dir enc:',hidden.shape)

        return outputs, hidden

## **Attention Module**

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class Attention(nn.Module):
    def __init__(self, hidden_size, rnn_cell):
        super(Attention, self).__init__()
        self.attn = nn.Linear(hidden_size * 2, hidden_size)
        self.v = nn.Parameter(torch.rand(hidden_size))
        self.rnn_cell = rnn_cell
    
    def forward(self, hidden, encoder_outputs):
        timestep = encoder_outputs.size(1)
#         print(hidden.shape)
        if self.rnn_cell == 'lstm':
            hidden = hidden[-1]  # shape: z[batch_size, hidden_size]
        else:
            hidden = hidden
        h = hidden.unsqueeze(1).repeat(1, timestep, 1)
#         print('h',h.shape)
        encoder_outputs = encoder_outputs.permute(0, 1, 2)  # Change to [batch_size, seq_len, hidden_size]
#         print('encoder_outputs',encoder_outputs.shape)
        attn_energies = self.score(h, encoder_outputs)
#         print('attn_energies',attn_energies.shape)
        return F.softmax(attn_energies, dim=1).unsqueeze(1)
    
    def score(self, hidden, encoder_outputs):
#         print('hidden',hidden.shape)
#         print('encoder_outputs',encoder_outputs.shape)
        energy = torch.tanh(self.attn(torch.cat([hidden, encoder_outputs], 2)))
#         print('energy',energy.shape)
        energy = energy.permute(0, 2, 1)  # Change to [batch_size, hidden_size, seq_len]
#         print('energy',energy.shape)
        v = self.v.repeat(encoder_outputs.size(0), 1).unsqueeze(1)
#         print('v',v.shape)
        energy = torch.bmm(v, energy)
#         print('energy',energy.shape)
        return energy.squeeze(1)

## **Decoder class**

In [4]:
class Decoder(nn.Module):
    def __init__(self, output_size, embedding_size, hidden_size, num_layers, encoder_num_layers, attention, rnn_cell='lstm', dropout=0.5, bidirectional=True):
        super(Decoder, self).__init__()
        self.embedding = nn.Embedding(num_embeddings=output_size, embedding_dim=embedding_size)
        self.dropout = nn.Dropout(dropout)
        self.output_size = output_size
        self.hidden_size = hidden_size * encoder_num_layers if bidirectional else hidden_size
        self.num_layers = num_layers
        self.attention = attention
        
        if rnn_cell.lower() == 'lstm':
            self.rnn = nn.LSTM(embedding_size + hidden_size * encoder_num_layers, self.hidden_size, num_layers, batch_first=True, dropout=(0 if num_layers == 1 else dropout))
        elif rnn_cell.lower() == 'gru':
            self.rnn = nn.GRU(embedding_size + hidden_size * encoder_num_layers, self.hidden_size, num_layers, batch_first=True, dropout=(0 if num_layers == 1 else dropout))
        else:
            self.rnn = nn.RNN(embedding_size + hidden_size * encoder_num_layers, self.hidden_size, num_layers, batch_first=True, dropout=(0 if num_layers == 1 else dropout))
        
        self.fc = nn.Linear(self.hidden_size, output_size)
        
    def forward(self, x, hidden, encoder_outputs):
        x = x.unsqueeze(1)
        embedded = self.dropout(self.embedding(x))
        
        attn_weights = self.attention(hidden[-1], encoder_outputs)
        context = attn_weights.bmm(encoder_outputs)
        rnn_input = torch.cat((embedded, context), 2)
        
        output, hidden = self.rnn(rnn_input, hidden)
        output = self.fc(self.dropout(output.squeeze(1)))
        
        return output, hidden, attn_weights


## **Sequence to Sequence model for the above encoder and decoder**

In [5]:
class Seq_to_Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super(Seq_to_Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, source, target, teaching_force_ratio=0.5):
        batch_size = source.size(0)
        target_len = target.size(1)
        target_vocab_size = self.decoder.output_size
        outputs = torch.zeros(batch_size, target_len, target_vocab_size).to(source.device)
#         print(source.shape)#########################
        encoder_outputs, encoder_hidden = self.encoder(source)
        
        if isinstance(encoder_hidden, tuple):
            h_n, c_n = encoder_hidden
            if self.encoder.bidirectional:
                h_n = torch.cat([h_n[i:i+1] for i in range(0, h_n.shape[0], 2)] + [h_n[i:i+1] for i in range(1, h_n.shape[0], 2)], dim=2)
                c_n = torch.cat([c_n[i:i+1] for i in range(0, c_n.shape[0], 2)] + [c_n[i:i+1] for i in range(1, c_n.shape[0], 2)], dim=2)
            
            if h_n.size(0) < self.decoder.num_layers:
                zero_h = torch.zeros(self.decoder.num_layers - h_n.size(0), batch_size, self.encoder.num_layers * self.encoder.hidden_size, device=h_n.device)
                zero_c = torch.zeros(self.decoder.num_layers - c_n.size(0), batch_size, self.encoder.num_layers * self.encoder.hidden_size, device=c_n.device)
                h_n = torch.cat([h_n, zero_h], dim=0)
                c_n = torch.cat([c_n, zero_c], dim=0)

            encoder_hidden = (h_n[:self.decoder.num_layers], c_n[:self.decoder.num_layers])
        else:
            h_n = encoder_hidden
            if self.encoder.bidirectional:
                h_n = torch.cat([h_n[i:i+1] for i in range(0, h_n.shape[0], 2)] + [h_n[i:i+1] for i in range(1, h_n.shape[0], 2)], dim=2)
            
            if h_n.size(0) < self.decoder.num_layers:
                zero_h = torch.zeros(self.decoder.num_layers - h_n.size(0), batch_size, self.encoder.num_layers * self.encoder.hidden_size, device=encoder_hidden.device)
                h_n = torch.cat([h_n, zero_h], dim=0)
            encoder_hidden = h_n[:self.decoder.num_layers]
        
        decoder_input = target[:, 0]
                    
        for t in range(1, target_len):
            decoder_output, encoder_hidden, _ = self.decoder(decoder_input, encoder_hidden, encoder_outputs)
            outputs[:, t] = decoder_output
            teacher_force = torch.rand(1) < teaching_force_ratio
            top1 = decoder_output.argmax(1)
            decoder_input = target[:, t] if teacher_force else top1

        return outputs


# **Printing the model**

In [6]:
INPUT_DIM = 100
OUTPUT_DIM = 100
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
HID_DIM = 512
ENC_LAYERS = 1
DEC_LAYERS = 3
ENC_RNN_CELL = 'gru'
DEC_RNN_CELL = 'gru'

encoder = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, ENC_LAYERS, ENC_RNN_CELL,dropout=0.3, bidirectional = True)
decoder = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, DEC_LAYERS, encoder.num_layers, DEC_RNN_CELL, dropout=0.3, bidirectional = True)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
model = Seq_to_Seq(encoder, decoder).to(device)
print(model)

Using device: cuda
Seq_to_Seq(
  (encoder): Encoder(
    (embedding): Embedding(100, 256)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): GRU(256, 256, batch_first=True, bidirectional=True)
  )
  (decoder): Decoder(
    (embedding): Embedding(100, 256)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): LSTM(768, 512, num_layers=3, batch_first=True, dropout=0.3)
    (fc): Linear(in_features=512, out_features=100, bias=True)
  )
)


# **A function to create a vocabulary set from the given text**

In [7]:

# Define a function to create a vocabulary set from a given text
def create_vocab(text):
    # Create a set of unique characters found in the text
    # Each word in the text is processed to extract its characters
    vocab = set(char for word in text for char in word)
    # Add a padding token to the vocabulary
    vocab.add('<pad>')
    # Add a start-of-sequence token to the vocabulary
    vocab.add('<sos>')  # Start of sequence token
    # Add an end-of-sequence token to the vocabulary
    vocab.add('<eos>')  # End of sequence token
    # Return the complete set of vocabulary items
    return vocab

# **A function to load data from a CSV file**

In [8]:
# Define a function to load data from a CSV file
def load_data(path):
    # The file has no header and columns are named as 'latin' and 'bangla'
    df = pd.read_csv(path, header=None, names=['latin', 'bangla'])
#     df = df.head(10)
    # Return the columns as two separate Series objects
    return df['latin'], df['bangla']

# **Load Latin and bangla training data**

In [9]:
# Load Latin and bangla training data from specified path
latin_train, bangla_train = load_data('/kaggle/input/aksharantar/aksharantar_sampled/ben/ben_train.csv')

# **Print the loaded Latin and Bangla training data**

In [10]:
# Print the loaded Latin training data
print(latin_train)
print()
# Print the loaded bangla training data
print(bangla_train)

0        namdharirao
1        hindukusher
2        farajikandi
3           moubarak
4             chiung
            ...     
51195       silmadar
51196        jonnote
51197      handibage
51198         borpar
51199     bideshikei
Name: latin, Length: 51200, dtype: object

0            নামধারীরাও
1           হিন্দুকুশের
2           ফরাজীকান্দি
3                মুবারক
4                চিয়ুং
              ...      
51195          সিলমাদার
51196            জন্যতে
51197    হ্যান্ডিব্যাগে
51198             বরপার
51199         বিদেশীকেই
Name: bangla, Length: 51200, dtype: object


# **Create two vocabularies from the Latin and Bangla training data**

In [11]:
# Create a vocabulary from the Latin training data
latin_vocab = create_vocab(latin_train)
# Create a vocabulary from the bangla training data
bangla_vocab = create_vocab(bangla_train)

# **Print the created Latin and Bangla vocabularies**

In [12]:
# Print the created Latin vocabulary
print(latin_vocab)
print()
# Print the created bangla vocabulary
print(bangla_vocab)

{'g', 'o', '<eos>', 'a', 'r', 'n', 'q', 'k', 'y', 'c', 'x', 'j', 'l', '<pad>', 'p', 'i', '<sos>', 'e', 'b', 's', 'd', 'u', 'w', 'm', 'v', 'f', 'z', 'h', 't'}

{'অ', 'স', 'গ', 'ঠ', 'া', 'ফ', '<eos>', 'ঁ', 'ঈ', 'ড', 'ক', 'ৃ', 'ব', 'ং', 'র', 'হ', 'এ', 'ত', 'ূ', 'ঊ', '্', '়', 'ল', '২', 'ঔ', 'ই', 'ঝ', 'ভ', 'ম', 'দ', 'আ', 'ৎ', 'ও', 'চ', 'শ', 'ট', 'ণ', 'ো', 'প', 'ন', 'ঞ', 'থ', '<pad>', 'জ', 'ু', 'ৌ', 'ঢ', 'ী', 'ঃ', '<sos>', 'ধ', 'ঋ', 'ে', 'ঐ', 'য', 'ি', 'উ', 'খ', 'ৈ', 'ছ', 'ঙ', 'ষ', 'ঘ'}


# **Map each token in the Latin and Bangla vocabularies to a unique index and then Print the dictionaries mapping (Latin tokens to indices) and (Bangla tokens to indices)**


In [13]:
# Map each token in the Latin vocabulary to a unique index
latin_token_to_index = {token: index for index, token in enumerate(sorted(latin_vocab))}
# Map each token in the bangla vocabulary to a unique index
bangla_token_to_index = {token: index for index, token in enumerate(sorted(bangla_vocab))}

# Print the dictionary mapping Latin tokens to indices
print(latin_token_to_index)
print()

# Print the dictionary mapping bangla tokens to indices
print(bangla_token_to_index)

{'<eos>': 0, '<pad>': 1, '<sos>': 2, 'a': 3, 'b': 4, 'c': 5, 'd': 6, 'e': 7, 'f': 8, 'g': 9, 'h': 10, 'i': 11, 'j': 12, 'k': 13, 'l': 14, 'm': 15, 'n': 16, 'o': 17, 'p': 18, 'q': 19, 'r': 20, 's': 21, 't': 22, 'u': 23, 'v': 24, 'w': 25, 'x': 26, 'y': 27, 'z': 28}

{'<eos>': 0, '<pad>': 1, '<sos>': 2, 'ঁ': 3, 'ং': 4, 'ঃ': 5, 'অ': 6, 'আ': 7, 'ই': 8, 'ঈ': 9, 'উ': 10, 'ঊ': 11, 'ঋ': 12, 'এ': 13, 'ঐ': 14, 'ও': 15, 'ঔ': 16, 'ক': 17, 'খ': 18, 'গ': 19, 'ঘ': 20, 'ঙ': 21, 'চ': 22, 'ছ': 23, 'জ': 24, 'ঝ': 25, 'ঞ': 26, 'ট': 27, 'ঠ': 28, 'ড': 29, 'ঢ': 30, 'ণ': 31, 'ত': 32, 'থ': 33, 'দ': 34, 'ধ': 35, 'ন': 36, 'প': 37, 'ফ': 38, 'ব': 39, 'ভ': 40, 'ম': 41, 'য': 42, 'র': 43, 'ল': 44, 'শ': 45, 'ষ': 46, 'স': 47, 'হ': 48, '়': 49, 'া': 50, 'ি': 51, 'ী': 52, 'ু': 53, 'ূ': 54, 'ৃ': 55, 'ে': 56, 'ৈ': 57, 'ো': 58, 'ৌ': 59, '্': 60, 'ৎ': 61, '২': 62}


# **Defining a Dataset class for handling Latin and Bangla word pairs**

In [14]:
# Define a Dataset class for handling Latin and Bangla word pairs
class AksharantarDataset(Dataset):
    def __init__(self, latin_words, bangla_words, latin_token_to_index, bangla_token_to_index):
        # Store the lists of Latin and Bangla words
        self.latin_words = latin_words
        self.bangla_words = bangla_words
        # Store the dictionaries that map characters to indices for both languages
        self.latin_token_to_index = latin_token_to_index
        self.bangla_token_to_index = bangla_token_to_index

    def __len__(self):
        # Return the number of word pairs in the dataset
        return len(self.latin_words)

    def __getitem__(self, index):
        # Fetching the Latin and Bangla words at the specified index
        latin_word = self.latin_words.iloc[index]
#         print(latin_word)
        bangla_word = self.bangla_words.iloc[index]
#         print(bangla_word)
        # Convert the Latin word into indices using the latin_token_to_index mapping
        latin_indices = [latin_token_to_index[char] for char in latin_word]
#         print(latin_indices)
        # Convert the Bangla word into indices, adding <sos> and <eos> tokens
        bangla_indices = [bangla_token_to_index['<sos>']] + [bangla_token_to_index[char] for char in bangla_word] + [bangla_token_to_index['<eos>']]
#         print(bangla_indices)
        # Return the indices as tensor objects
        return torch.tensor(latin_indices, dtype=torch.long), torch.tensor(bangla_indices, dtype=torch.long)

# **Defining a function for padding sequences and packing batches**

In [15]:
# Define a function for padding sequences and packing batches
# packet_fn specifies a function to control how batches are created from the individual data items
def packet_fn(batch):
    # Unzip the batch to separate Latin and Bangla indices
    latin, bangla = zip(*batch)
#     print(latin, bangla)
    # Pad the sequences of Latin indices
    latin_padded = pad_sequence(latin, batch_first=True, padding_value=latin_token_to_index['<pad>'])
#     print(latin_padded)
    # Pad the sequences of Bangla indices
    bangla_padded = pad_sequence(bangla, batch_first=True, padding_value=bangla_token_to_index['<pad>'])
#     print(bangla_padded)
    # Return the padded batches
    return latin_padded, bangla_padded

# **Load training data into the AksharantarDataset and then creating the train_loader by Dataloader function**

In [16]:
# Load training data into the AksharantarDataset
train_dataset = AksharantarDataset(latin_train, bangla_train, latin_token_to_index, bangla_token_to_index)
# Create a DataLoader to batch and shuffle the dataset
# packet_fn specifies a function to control how batches are created from the individual data items
train_loader = DataLoader(train_dataset, batch_size = 64, collate_fn=packet_fn, shuffle=True)

# **Print an example from the dataset**

In [17]:
# Print an example from the dataset
print(train_dataset[4000])
# for i,j in train_loader:
#     print(i,'\n\n\n',j)

(tensor([19, 23,  7, 20,  5,  7, 22, 11, 16]), tensor([ 2, 17, 50, 43, 47, 56, 32, 51, 36,  0]))



# **A function for calculating word accuracy per batch, ignoring the padding token**

In [18]:
# Define a word accuracy function for word-level accuracy
def word_accuracy(outputs, targets, ignore_index):
    # Assuming outputs and targets are batched sequences of token indices
    # Ignoring <pad> tokens as specified by `ignore_index`
    correct = 0
    total = 0
    for out, tar in zip(outputs, targets):
        # Ignoring padding in accuracy calculation
#         print('out bef pad:',out)
#         print('tar:',tar)
        out = out[out != ignore_index]
        tar = tar[tar != ignore_index]
        ignore_index_eos = 0
        out = out[out != ignore_index_eos]
        tar = tar[tar != ignore_index_eos]
#         print('out aft pad:',out)
#         print('tar:',tar)
        if torch.equal(out, tar):
            correct += 1
#             print('correct:',correct)
        total += 1
#         print('total:',total)
    return correct / total if total > 0 else 0



# **Defining the Training function**

In [19]:

def train(model, iterator, optimizer, criterion, clip, device, ignore_index):
    model.train()
    epoch_loss = 0
    epoch_acc = 0
    
    for source, target in iterator:
        source = source.to(device)
        target = target.to(device)
        
        optimizer.zero_grad()
        output = model(source, target)
        
        output_dim = output.shape[-1]
        # Slice to ignore the <sos> token and keep sequence structure
        output = output[:, 1:, :]
        target = target[:, 1:]
        
        # Flatten all dimensions except for the batch dimension for loss calculation
        output_flat = output.reshape(-1, output_dim)
        target_flat = target.reshape(-1)
        
#         print('trainnnnnnnn')
        
        loss = criterion(output_flat, target_flat)
        # Calculate word-by-word accuracy
        acc = word_accuracy(output.argmax(dim=2), target, ignore_index)
        
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc
    
    return epoch_loss / len(iterator), epoch_acc / len(iterator)
######################

# **Defining the Evaluation function**

In [20]:
def evaluate(model, iterator, criterion, device, ignore_index):
    model.eval()
    epoch_loss = 0
    epoch_acc = 0
    
    with torch.no_grad():
        for source, target in iterator:
            source = source.to(device)
            target = target.to(device)
            
            output = model(source, target, 0)
            output_dim = output.shape[-1]
            output = output[:, 1:, :]
            target = target[:, 1:]
            
            output_flat = output.reshape(-1, output_dim)
            target_flat = target.reshape(-1)
#             print('vallllllll')
            loss = criterion(output_flat, target_flat)
            acc = word_accuracy(output.argmax(dim=2), target, ignore_index)
            
            epoch_loss += loss.item()
            epoch_acc += acc
            
#             break
    
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

#######################

# **Load validation data into the AksharantarDataset and then creating the valid_loader by Dataloader function**

In [21]:
# Load validation data by reading a CSV file
latin_valid, bangla_valid = load_data('/kaggle/input/aksharantar/aksharantar_sampled/ben/ben_valid.csv')

# Create a validation dataset using the AksharantarDataset class.
valid_dataset = AksharantarDataset(latin_valid, bangla_valid, latin_token_to_index, bangla_token_to_index)

# Create a DataLoader to batch and shuffle the dataset
# 'collate_fn=packet_fn' specifies a function to control how batches are created from the individual data items.
# 'shuffle=True' ensures that the data is shuffled at every epoch which helps to reduce model overfitting
valid_loader = DataLoader(valid_dataset, batch_size=64, collate_fn=packet_fn, shuffle=True)

# **The training process for specified number of epochs**

In [22]:
INPUT_DIM = 100
OUTPUT_DIM = 100
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
HID_DIM = 512
ENC_LAYERS = 1
DEC_LAYERS = 1
ENC_RNN_CELL = 'gru'
DEC_RNN_CELL = 'gru'

# Adjust the model initialization and training process as needed
encoder = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, ENC_LAYERS, ENC_RNN_CELL, dropout=0.3, bidirectional=True)
attention = Attention(HID_DIM, ENC_RNN_CELL)
decoder = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, DEC_LAYERS, encoder.num_layers, attention, DEC_RNN_CELL, dropout=0.3, bidirectional = True)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
model = Seq_to_Seq(encoder, decoder).to(device)
print(model)

NUM_EPOCHS = 1
CLIP = 1
optimizer = torch.optim.Adam(model.parameters())
ignore_index = bangla_token_to_index['<pad>']
criterion = nn.CrossEntropyLoss(ignore_index=ignore_index).to(device)

for epoch in range(NUM_EPOCHS):
    train_loss, train_accuracy = train(model, train_loader, optimizer, criterion, CLIP, device, ignore_index)
    val_loss, val_accuracy = evaluate(model, valid_loader, criterion, device, ignore_index)
    
    print(f'Epoch: {epoch+1}')
    print(f'\tTrain_Loss: {train_loss:.3f}, Train_Accuracy: {train_accuracy*100:.2f}%')
    print(f'\tVal_Loss: {val_loss:.3f},  Val_Accuracy: {val_accuracy*100:.2f}%')


Using device: cuda
Seq_to_Seq(
  (encoder): Encoder(
    (embedding): Embedding(100, 256)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): GRU(256, 256, batch_first=True, bidirectional=True)
  )
  (decoder): Decoder(
    (embedding): Embedding(100, 256)
    (dropout): Dropout(p=0.3, inplace=False)
    (attention): Attention(
      (attn): Linear(in_features=1024, out_features=512, bias=True)
    )
    (rnn): GRU(768, 512, batch_first=True)
    (fc): Linear(in_features=512, out_features=100, bias=True)
  )
)


KeyboardInterrupt: 

# **Load the Test data into the AksharantarDataset and then creating the test_loader by Dataloader function**

In [23]:
# Load the test data from the specified CSV file location
latin_test, bangla_test = load_data('/kaggle/input/aksharantar/aksharantar_sampled/ben/ben_test.csv')

# Create test_dataset using the AksharantarDataset class, initializing it with test data
# and corresponding token-to-index mappings for both Latin and Bangla scripts
test_dataset = AksharantarDataset(latin_test, bangla_test, latin_token_to_index, bangla_token_to_index)

# A DataLoader for the test dataset. Here, the batch size is set to 1, indicates
# that the model will process one item at a time. This is for testing to make
# detailed predictions per sample without batching effects.
test_loader = DataLoader(test_dataset, batch_size=32, collate_fn=packet_fn, shuffle=False)
# print(test_dataset[0])



# **A function to convert an array of indices back into a string, excluding any indices corresponding to special tokens like padding, start, or end of sequence tokens, which should not appear in the final output string**

In [24]:
def decode_indices(indices, index_to_token):
    # Filter out indices for padding, start-of-sequence, and end-of-sequence tokens to ensure only valid character indices are decoded
    valid_indices = [index for index in indices if index in index_to_token and index not in (bangla_token_to_index['<pad>'], bangla_token_to_index['<sos>'], bangla_token_to_index['<eos>'])]
    # Convert each index to its corresponding character and join them to form the decoded string
    return ''.join([index_to_token[index] for index in valid_indices])

# **Creating the prediction function to generate outputs for all samples in the test_loader**

In [25]:
def predict(model, iterator, device):
    # Set the model to evaluation mode to disable dropout or batch normalization effects during inference
    model.eval()
    predictions = []
    # Disables gradient calculations for performance improvement since they are not needed in inference
    with torch.no_grad():
        for source, target in iterator:
            # Ensure the source and target tensors are on the correct device (GPU or CPU)
            source = source.to(device)
            target = target.to(device)
            # Obtain model output without teacher forcing (i.e., the model relies entirely on its predictions)
            output = model(source, target, 0)
            # Get the index with the highest probability from output predictions
            output = output.argmax(2)
            # Convert tensors to CPU numpy arrays for easier manipulation and extraction
            source = source.cpu().numpy()
            output = output.cpu().numpy()
            target = target.cpu().numpy()
            # Store the tuple of source and decoded output predictions
            predictions.append((source, target, output))
    # Return all predictions made over the iterator
    return predictions

# **Creating dictionaries to map indices back to its corresponding characters**

In [26]:
# Create dictionaries to map indices back to characters, observing the interpretation of prediction outputs
latin_index_to_token = {index: char for char, index in latin_token_to_index.items()}
bangla_index_to_token = {index: char for char, index in bangla_token_to_index.items()}
# print(latin_index_to_token)
# print(bangla_index_to_token)

# **Displaying results: Each input text from the test dataset and its corresponding predicted output text are printed. This helps in visually assessing the accuracy and quality of the transliterations produced by the model**

In [27]:
# Taking the prediction function to generate outputs for all samples in the test_loader
test_predictions = predict(model, test_loader, device)
# print(test_predictions[1])
# Loop through the list of tuples containing source and output indices from the test predictions
for source_indices, target_indices, output_indices in test_predictions:
    # Iterate through each example in the batch. This is necessary as batches may contain multiple examples
    for i in range(source_indices.shape[0]):
        # Decode the source indices to their corresponding text using the mapping dictionary for Latin script
        input_text = decode_indices(source_indices[i], latin_index_to_token)
        
        target_text = decode_indices(target_indices[i], bangla_index_to_token)

        # Decode the output indices to their corresponding text using the mapping dictionary for Bangla script
        predicted_text = decode_indices(output_indices[i], bangla_index_to_token)
        # Print the original input text and its corresponding predicted transliteration
        print(f'Input Text: {input_text} -> Actual Text: {target_text} -> Predicted Text: {predicted_text}')

RuntimeError: Sizes of tensors must match except in dimension 2. Expected size 1 but got size 32 for tensor number 1 in the list.

In [23]:
import wandb
import numpy as np
from types import SimpleNamespace
import random

# key = input('Enter your API:')
wandb.login(key='25c2257eaf6c22aa056893db14da4ee2bf0a531a')  #25c2257eaf6c22aa056893db14da4ee2bf0a531a

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [24]:
sweep_config = {
    'method': 'bayes',
    'name' : 'sweep attn 1',
    'metric': {
        'name': 'Val_Accuracy',
        'goal': 'maximize'
    },
    'parameters': {
        'input_embed_size': {
            'values': [16,32,64,256,512]
        },
        'num_enc_layers':{
            'values': [1]
        },
        'num_dec_layers':{
            'values': [1]
        },
        'hid_layer_size': {
            'values': [16,32,64,256,512]
        },
        'cell_type': {
            'values': ['rnn', 'gru', 'lstm']
        },
        'bidirectional':{
            'values': [True, False]
        },
        'dropout': {
            'values': [0.2, 0.3]
        },
#       'beam search in decoder with different beam sizes': 
    }
}

sweep_id = wandb.sweep(sweep = sweep_config, project="Deep_Learning_A3")


Create sweep with ID: vmcuui5c
Sweep URL: https://wandb.ai/parthasakhapaul/Deep_Learning_A3/sweeps/vmcuui5c


In [None]:
# import wandb

def main():
    # Initialize a new wandb run
    with wandb.init() as run:
        # Construct run name from configuration
        run_name = "-embed_size-"+str(wandb.config.input_embed_size)+"-layers_enc-"+str(wandb.config.num_enc_layers)+"-layers_dec-"+str(wandb.config.num_dec_layers)+"-hid_size-"+str(wandb.config.hid_layer_size)+"-cell_type-"+wandb.config.cell_type+"-bidirectional-"+str(wandb.config.bidirectional)+"-dropout-"+str(wandb.config.dropout)
        wandb.run.name = run_name

        # Constants defining the dimensions of the input and output character sets
        INPUT_DIM = 100  # size of the Latin character set
        OUTPUT_DIM = 100  # size of the Bangla character set

        # Constants defining the dimensions of the embeddings for encoder and decoder
        ENC_EMB_DIM = wandb.config.input_embed_size  # Encoder embedding dimension
        DEC_EMB_DIM = wandb.config.input_embed_size  # Decoder embedding dimension

        # Constants defining the dimension of the hidden layers for encoder and decoder
        HID_DIM = wandb.config.hid_layer_size  # Hidden dimension size

        # Constants defining the number of layers for encoder and decoder
        ENC_LAYERS = wandb.config.num_enc_layers  # Number of layers in the encoder
        DEC_LAYERS = wandb.config.num_dec_layers  # Number of layers in the decoder
        

        # Constants defining the type of RNN cell to use for encoder and decoder
        ENC_RNN_CELL = wandb.config.cell_type  # RNN cell type for the encoder
        DEC_RNN_CELL = wandb.config.cell_type  # RNN cell type for the decoder
        
#         encoder = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, ENC_LAYERS, ENC_RNN_CELL, dropout=0.3, bidirectional=True)
#         decoder = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, DEC_LAYERS, encoder.num_layers, attention, DEC_RNN_CELL, dropout=0.3, bidirectional = True)
        # Instantiate the encoder with specified configurations
        encoder = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, ENC_LAYERS, ENC_RNN_CELL, dropout = wandb.config.dropout, bidirectional = wandb.config.bidirectional)
        attention = Attention(HID_DIM, ENC_RNN_CELL)
        # Instantiate the decoder with specified configurations
        decoder = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, DEC_LAYERS, encoder.num_layers, attention, DEC_RNN_CELL, dropout = wandb.config.dropout, bidirectional = wandb.config.bidirectional)

        # Determine the computing device (CUDA if available, otherwise CPU)
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        # Print the device will be used
        print(f"Using device: {device}")

        # Instantiate the Seq_to_Seq model and move it to the chosen computing device
        model = Seq_to_Seq(encoder, decoder).to(device)
        print(model)
        
        
        # Setting the number of epochs the training process should run
        NUM_EPOCHS = 7
        # Set the maximum norm of the gradients to 1 to prevent exploding gradients
        CLIP = 1
        # Initialize the optimizer, Adam
        optimizer = torch.optim.Adam(model.parameters())
        # Padding token index should be ignored in loss calculation
        ignore_index = bangla_token_to_index['<pad>']
        # Define the loss function with 'ignore_index' to avoid affecting loss calculation with padding tokens
        criterion = nn.CrossEntropyLoss(ignore_index=ignore_index).to(device)

        # Start the training process for the defined number of epochs
        for epoch in range(NUM_EPOCHS):
            # Doing training on the train dataset and return average loss and accuracy
            train_loss, train_accuracy = train(model, train_loader, optimizer, criterion, CLIP, device, ignore_index)
            # Evaluating the model on the validation dataset and return average loss and accuracy
            val_loss, val_accuracy = evaluate(model, valid_loader, criterion, device, ignore_index)

            # Print the loss and accuracy for each epoch
            print(f'Epoch: {epoch+1}')
            print(f'\tTrain_Loss: {train_loss:.3f}, Train_Accuracy: {train_accuracy*100:.2f}%')
            print(f'\tVal_Loss: {val_loss:.3f},  Val_Accuracy: {val_accuracy*100:.2f}%')
            wandb.log({"train_accuracy": train_accuracy * 100, "training_loss": train_loss})
            wandb.log({"Val_Accuracy": val_accuracy * 100, "Val_Loss": val_loss})


wandb.agent(sweep_id, function=main, count=1)
wandb.finish()

[34m[1mwandb[0m: Agent Starting Run: yq1ebshq with config:
[34m[1mwandb[0m: 	bidirectional: True
[34m[1mwandb[0m: 	cell_type: rnn
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	hid_layer_size: 512
[34m[1mwandb[0m: 	input_embed_size: 16
[34m[1mwandb[0m: 	num_dec_layers: 1
[34m[1mwandb[0m: 	num_enc_layers: 1
[34m[1mwandb[0m: Currently logged in as: [33mparthasakhapaul[0m. Use [1m`wandb login --relogin`[0m to force relogin


Using device: cuda
Seq_to_Seq(
  (encoder): Encoder(
    (embedding): Embedding(100, 16)
    (dropout): Dropout(p=0.2, inplace=False)
    (rnn): RNN(16, 256, batch_first=True, bidirectional=True)
  )
  (decoder): Decoder(
    (embedding): Embedding(100, 16)
    (dropout): Dropout(p=0.2, inplace=False)
    (attention): Attention(
      (attn): Linear(in_features=1024, out_features=512, bias=True)
    )
    (rnn): RNN(528, 512, batch_first=True)
    (fc): Linear(in_features=512, out_features=100, bias=True)
  )
)
Epoch: 1
	Train_Loss: 1.814, Train_Accuracy: 2.80%
	Val_Loss: 1.551,  Val_Accuracy: 9.45%
Epoch: 2
	Train_Loss: 1.244, Train_Accuracy: 5.73%
	Val_Loss: 1.445,  Val_Accuracy: 12.45%
Epoch: 3
	Train_Loss: 1.084, Train_Accuracy: 7.55%
	Val_Loss: 1.415,  Val_Accuracy: 15.53%
Epoch: 4
	Train_Loss: 0.978, Train_Accuracy: 9.11%
	Val_Loss: 1.351,  Val_Accuracy: 13.55%
Epoch: 5
	Train_Loss: 0.924, Train_Accuracy: 10.20%
	Val_Loss: 1.296,  Val_Accuracy: 13.79%
