# **Importing all the necessary libraries**

In [None]:
# # Create Adam optimizer with default parameters
# optimizer = torch.optim.Adam(model.parameters())

# # Modify learning rate
# new_learning_rate = 0.001  # Set your desired learning rate
# for param_group in optimizer.param_groups:
#     param_group['lr'] = new_learning_rate

# # Modify other parameters
# # For example, to change weight decay
# new_weight_decay = 0.01  # Set your desired weight decay value
# for param_group in optimizer.param_groups:
#     param_group['weight_decay'] = new_weight_decay


In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import pandas as pd
import numpy as np
import pandas as pd

## **Encoder class**

In [18]:
# # Define a class Encoder, which is a subclass of nn.Module
# class Encoder(nn.Module):
#     # Constructor with parameters for initialization
#     def __init__(self, input_size, embedding_size, hidden_size, num_layers, rnn_cell='lstm', dropout=0.5, bidirectional=True):
#         # Initialize the base class
#         super(Encoder, self).__init__()
#         # Embedding layer that transforms inputs (word indices) into embeddings of a specified size
#         self.embedding = nn.Embedding(num_embeddings=input_size, embedding_dim=embedding_size)
#         # Dropout layer for regularizing and preventing overfitting
#         self.dropout = nn.Dropout(dropout)
#         # Store RNN configuration parameters
#         self.hidden_size = hidden_size
#         self.num_layers = num_layers
#         self.bidirectional = bidirectional
        
#         # Calculate the actual size of the hidden layer based on bidirectionality
#         rnn_hidden_size = hidden_size // 2 if bidirectional else hidden_size
        
#         # Construction of RNN layers based on specified cell type
#         if rnn_cell.lower() == 'lstm':
#             self.rnn = nn.LSTM(embedding_size, rnn_hidden_size, num_layers, batch_first=True, dropout=(0 if num_layers == 1 else dropout), bidirectional=bidirectional)
#         elif rnn_cell.lower() == 'gru':
#             self.rnn = nn.GRU(embedding_size, rnn_hidden_size, num_layers, batch_first=True, dropout=(0 if num_layers == 1 else dropout), bidirectional=bidirectional)
#         else:
#             self.rnn = nn.RNN(embedding_size, rnn_hidden_size, num_layers, batch_first=True, dropout=(0 if num_layers == 1 else dropout), bidirectional=bidirectional)
    
#     # Define the forward pass which will be called with input x
#     def forward(self, x):
# #         print(x)
#         # Apply the embedding layer to the input x, which transforms word indices into dense vectors
#         embedded = self.embedding(x)
# #         print(embedded)
# #         print('encoder :::after emb',embedded.shape)
# #         print('emb0 shape',embedded[0].shape)
#         # Apply dropout to the embeddings
#         embedded = self.dropout(embedded)
# #         print('drop',embedded.shape)
#         # Pass the embedded and dropout-applied inputs through the RNN layer
#         # Returns the output and the last hidden state
#         outputs, hidden = self.rnn(embedded)
# #         print('after rnn embedded turns into : out and hid ',outputs[0],hidden[0])
#         # Handling hidden states for bidirectional RNNs especially LSTM
#         if self.bidirectional:
#             # For LSTM, hidden is a tuple (h_n, c_n) where each is of shape
#             # (num_layers * num_directions, batch, hidden_size)
#             if isinstance(hidden, tuple):
#                 # We concatenate the hidden states for both directions
#                 h_n, c_n = hidden
# #                 h_n = torch.cat([h_n[i::2] for i in range(2)], dim=2)
# #                 c_n = torch.cat([c_n[i::2] for i in range(2)], dim=2)
#                 h_n = torch.cat([h_n[i:i+1] for i in range(0, h_n.shape[0], 2)] + [h_n[i:i+1] for i in range(1, h_n.shape[0], 2)], dim=2)
#                 c_n = torch.cat([c_n[i:i+1] for i in range(0, c_n.shape[0], 2)] + [c_n[i:i+1] for i in range(1, c_n.shape[0], 2)], dim=2)
#                 hidden = (h_n, c_n)
# #                 print(f'h_n: {h_n.shape},  c_n: {c_n.shape}')
#             else:
#                 # For GRU and RNN, just h_n is returned
#                 hidden = torch.cat([hidden[i::2] for i in range(2)], dim=2)
# #                 print(hidden.shape)
        
#         # Return only the hidden state; outputs are not needed in this implementation
#         return hidden

In [2]:
class Encoder(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size, num_layers, rnn_cell='lstm', dropout=0.5, bidirectional=True):
        super(Encoder, self).__init__()
        self.embedding = nn.Embedding(num_embeddings=input_size, embedding_dim=embedding_size)
        self.dropout = nn.Dropout(dropout)
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.bidirectional = bidirectional
        
        rnn_hidden_size = hidden_size // 2 if bidirectional else hidden_size
        
        if rnn_cell.lower() == 'lstm':
            self.rnn = nn.LSTM(embedding_size, rnn_hidden_size, num_layers, batch_first=True, dropout=(0 if num_layers == 1 else dropout), bidirectional=bidirectional)
        elif rnn_cell.lower() == 'gru':
            self.rnn = nn.GRU(embedding_size, rnn_hidden_size, num_layers, batch_first=True, dropout=(0 if num_layers == 1 else dropout), bidirectional=bidirectional)
        else:
            self.rnn = nn.RNN(embedding_size, rnn_hidden_size, num_layers, batch_first=True, dropout=(0 if num_layers == 1 else dropout), bidirectional=bidirectional)
    
    def forward(self, x):
        embedded = self.embedding(x)
        embedded = self.dropout(embedded)
        outputs, hidden = self.rnn(embedded)

        if self.bidirectional:
            # Simplify concatenation for bidirectional states
            if isinstance(hidden, tuple):  # LSTM
                h_n, c_n = hidden
                # Combine every two layers (forward and backward)
                h_n = torch.cat((h_n[0::2], h_n[1::2]), dim=2)
                c_n = torch.cat((c_n[0::2], c_n[1::2]), dim=2)
                hidden = (h_n, c_n)
            else:  # GRU or RNN
                hidden = torch.cat((hidden[0::2], hidden[1::2]), dim=2)

        return hidden


# **Decoder class**

In [3]:
import torch.nn as nn

class Decoder(nn.Module):
    def __init__(self, output_size, embedding_size, hidden_size, num_layers, rnn_cell='lstm', dropout=0.5, bidirectional=True):
        super(Decoder, self).__init__()
        
        self.embedding = nn.Embedding(num_embeddings=output_size, embedding_dim=embedding_size)
        self.dropout = nn.Dropout(dropout)
        self.output_size = output_size
        # If the encoder is bidirectional, we assume the hidden state size will be doubled
        self.hidden_size = hidden_size * 2 if bidirectional else hidden_size
        self.num_layers = num_layers
        
        # Select the type of RNN Cell
        if rnn_cell.lower() == 'lstm':
            self.rnn = nn.LSTM(embedding_size, self.hidden_size, num_layers, batch_first=True, dropout=(0 if num_layers == 1 else dropout))
        elif rnn_cell.lower() == 'gru':
            self.rnn = nn.GRU(embedding_size, self.hidden_size, num_layers, batch_first=True, dropout=(0 if num_layers == 1 else dropout))
        else:
            self.rnn = nn.RNN(embedding_size, self.hidden_size, num_layers, batch_first=True, dropout=(0 if num_layers == 1 else dropout))
        
        # Output fully connected layer
        self.fc = nn.Linear(self.hidden_size, output_size)

    def forward(self, x, hidden):
        x = x.unsqueeze(1)
        embedded = self.dropout(self.embedding(x))
        output, hidden = self.rnn(embedded, hidden)
        output = self.fc(self.dropout(output.squeeze(1)))
        return output, hidden


In [4]:
# # Define a class Decoder, which is a subclass of nn.Module
# class Decoder(nn.Module):
#     # Constructor with parameters for initialization
#     def __init__(self, output_size, embedding_size, hidden_size, num_layers, rnn_cell='lstm', dropout=0.5):
#         # Initialize the base class
#         super(Decoder, self).__init__()
        
#         # Embedding layer that maps indices in the target vocabulary to vectors of a specified size
#         self.embedding = nn.Embedding(num_embeddings=output_size, embedding_dim=embedding_size)
        
#         # Dropout layer for regularizing and preventing overfitting
#         self.dropout = nn.Dropout(dropout)
        
#         # Store parameters for configuration of the RNN
#         self.output_size = output_size
#         self.hidden_size = hidden_size
#         self.num_layers = num_layers
        
#         # Construction of RNN layers based on specified cell type
#         if rnn_cell.lower() == 'lstm':
#             # LSTM layer
#             self.rnn = nn.LSTM(embedding_size, hidden_size, num_layers, batch_first=True, dropout=(0 if num_layers == 1 else dropout))
#         elif rnn_cell.lower() == 'gru':
#             # GRU layer
#             self.rnn = nn.GRU(embedding_size, hidden_size, num_layers, batch_first=True, dropout=(0 if num_layers == 1 else dropout))
#         else:
#             # Basic RNN layer 
#             self.rnn = nn.RNN(embedding_size, hidden_size, num_layers, batch_first=True, dropout=(0 if num_layers == 1 else dropout))
        
#         # Fully connected layer to transform the output of the RNN into the size of the output vocabulary
#         self.fc = nn.Linear(hidden_size, output_size)
# #         self.fc = nn.Linear(2 * hidden_size, output_size)

#     # Define the forward pass method which will be called with input x and the initial hidden state
#     def forward(self, x, hidden):
#         # Prepare input data for the RNN by adding an extra dimension at index 1 (for batch handling)
# #         print('DEcoder:::befunsq',x.shape)
#         x = x.unsqueeze(1)  # Change shape from (batch_size) to (batch_size, 1)
# #         print('afunsq',x.shape)
        
#         # Apply the embedding layer to x and then apply dropout
#         embedded = self.dropout(self.embedding(x))
# #         print('dec_emb',embedded.shape)
        
#         # Pass the embedded, dropout-applied input and the previous hidden state into the RNN
#         output, hidden = self.rnn(embedded, hidden)
# #         print('dec_out hid',output[0],hidden[0])

#         # Squeeze the output from RNN to remove the middle dimension (batch_first=True makes it (batch_size, 1, hidden_size))
#         # Apply dropout again before the final transformation
#         output = self.fc(self.dropout(output.squeeze(1)))
# #         print('after flatten dec output',output.shape)
        
#         # Return the output predictions and the hidden state to be used in the next time step
#         return output, hidden


# **Sequence to Sequence model for the above encoder and decoder**

In [5]:
import torch
import torch.nn as nn

class Seq_to_Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super(Seq_to_Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, source, target, teaching_force_ratio=0.5):
        batch_size = source.size(0)
        target_len = target.size(1)
        target_vocab_size = self.decoder.output_size
        outputs = torch.zeros(batch_size, target_len, target_vocab_size).to(source.device)
        
        encoder_hidden = self.encoder(source)

        # Adjust for bidirectional and layer mismatch:
        if isinstance(encoder_hidden, tuple):  # LSTM case
            h_n, c_n = encoder_hidden
            if self.encoder.bidirectional:
                # Combine the bidirectional states
                h_n = torch.cat([h_n[i:i+1] for i in range(0, h_n.shape[0], 2)] + [h_n[i:i+1] for i in range(1, h_n.shape[0], 2)], dim=2)
                c_n = torch.cat([c_n[i:i+1] for i in range(0, c_n.shape[0], 2)] + [c_n[i:i+1] for i in range(1, c_n.shape[0], 2)], dim=2)
            
            # Adjust layer dimension to match decoder's expected number of layers
            if h_n.size(0) < self.decoder.num_layers:
                zero_h = torch.zeros(self.decoder.num_layers - h_n.size(0), batch_size, self.decoder.hidden_size, device=h_n.device)
                zero_c = torch.zeros(self.decoder.num_layers - c_n.size(0), batch_size, self.decoder.hidden_size, device=c_n.device)
                h_n = torch.cat([h_n, zero_h], dim=0)
                c_n = torch.cat([c_n, zero_c], dim=0)
            encoder_hidden = (h_n[:self.decoder.num_layers], c_n[:self.decoder.num_layers])
        else:  # GRU or RNN
            if self.encoder.bidirectional:
                # Combine the bidirectional states
                encoder_hidden = torch.cat([encoder_hidden[i::2] for i in range(2)], dim=2)
            
            # Adjust layer dimension to match decoder's expected number of layers
            if encoder_hidden.size(0) < self.decoder.num_layers:
                zero_h = torch.zeros(self.decoder.num_layers - encoder_hidden.size(0), batch_size, self.decoder.hidden_size, device=encoder_hidden.device)
                encoder_hidden = torch.cat([encoder_hidden, zero_h], dim=0)
            encoder_hidden = encoder_hidden[:self.decoder.num_layers]
        
        decoder_input = target[:, 0]
        
        for t in range(1, target_len):
            decoder_output, encoder_hidden = self.decoder(decoder_input, encoder_hidden)
            outputs[:, t] = decoder_output
            teacher_force = torch.rand(1) < teaching_force_ratio
            top1 = decoder_output.argmax(1)
            decoder_input = target[:, t] if teacher_force else top1

        return outputs

In [188]:
class Encoder(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size, num_layers, rnn_cell='lstm', dropout=0.5, bidirectional=True):
        super(Encoder, self).__init__()
        self.embedding = nn.Embedding(num_embeddings=input_size, embedding_dim=embedding_size)
        self.dropout = nn.Dropout(dropout)
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.bidirectional = bidirectional
        
        rnn_hidden_size = hidden_size // 2 if bidirectional else hidden_size
        
        if rnn_cell.lower() == 'lstm':
            self.rnn = nn.LSTM(embedding_size, rnn_hidden_size, num_layers, batch_first=True, dropout=(0 if num_layers == 1 else dropout), bidirectional=bidirectional)
        elif rnn_cell.lower() == 'gru':
            self.rnn = nn.GRU(embedding_size, rnn_hidden_size, num_layers, batch_first=True, dropout=(0 if num_layers == 1 else dropout), bidirectional=bidirectional)
        else:
            self.rnn = nn.RNN(embedding_size, rnn_hidden_size, num_layers, batch_first=True, dropout=(0 if num_layers == 1 else dropout), bidirectional=bidirectional)
    
    def forward(self, x):
        embedded = self.embedding(x)
        embedded = self.dropout(embedded)
        outputs, hidden = self.rnn(embedded)

        if self.bidirectional:
            if isinstance(hidden, tuple):
                h_n, c_n = hidden
#                 print('enc h bef dir',h_n.shape)
#                 print('enc c bef dir',c_n.shape)
                h_n = torch.cat((h_n[0::2], h_n[1::2]), dim=2)
                c_n = torch.cat((c_n[0::2], c_n[1::2]), dim=2)
#                 print('enc h af dir',h_n.shape)
#                 print('enc c af dir',c_n.shape)
                hidden = (h_n, c_n)
            else:
#                 print('enc hidd bef dir',hidden.shape)
                hidden = torch.cat((hidden[0::2], hidden[1::2]), dim=2)
#                 print('after dir enc:',hidden.shape)

        return hidden

class Decoder(nn.Module):
    def __init__(self, output_size, embedding_size, hidden_size, num_layers, encoder_num_layers, rnn_cell='lstm', dropout=0.5, bidirectional=True):
        super(Decoder, self).__init__()
        self.embedding = nn.Embedding(num_embeddings=output_size, embedding_dim=embedding_size)
        self.dropout = nn.Dropout(dropout)
        self.output_size = output_size
        self.hidden_size = hidden_size * encoder_num_layers if bidirectional else hidden_size
        self.num_layers = num_layers
        
        if rnn_cell.lower() == 'lstm':
            self.rnn = nn.LSTM(embedding_size, self.hidden_size, num_layers, batch_first=True, dropout=(0 if num_layers == 1 else dropout))
        elif rnn_cell.lower() == 'gru':
            self.rnn = nn.GRU(embedding_size, self.hidden_size, num_layers, batch_first=True, dropout=(0 if num_layers == 1 else dropout))
        else:
            self.rnn = nn.RNN(embedding_size, self.hidden_size, num_layers, batch_first=True, dropout=(0 if num_layers == 1 else dropout))
        
        self.fc = nn.Linear(self.hidden_size, output_size)

    def forward(self, x, hidden):
        x = x.unsqueeze(1)
        embedded = self.dropout(self.embedding(x))
        output, hidden = self.rnn(embedded, hidden)
        output = self.fc(self.dropout(output.squeeze(1)))
        return output, hidden

class Seq_to_Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super(Seq_to_Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, source, target, teaching_force_ratio=0.5):
        batch_size = source.size(0)
        target_len = target.size(1)
        target_vocab_size = self.decoder.output_size
        outputs = torch.zeros(batch_size, target_len, target_vocab_size).to(source.device)
        
        encoder_hidden = self.encoder(source)
        
        if isinstance(encoder_hidden, tuple):
            h_n, c_n = encoder_hidden
#             print('enc h bef dir',h_n.shape)
#             print('enc c bef dir',c_n.shape)
            if self.encoder.bidirectional:
#                 print('enc h bef dir',h_n.shape)
#                 print('enc c bef dir',c_n.shape)
                h_n = torch.cat([h_n[i:i+1] for i in range(0, h_n.shape[0], 2)] + [h_n[i:i+1] for i in range(1, h_n.shape[0], 2)], dim=2)
                c_n = torch.cat([c_n[i:i+1] for i in range(0, c_n.shape[0], 2)] + [c_n[i:i+1] for i in range(1, c_n.shape[0], 2)], dim=2)
#                 print('enc h af dir',h_n.shape)
#                 print('enc c af dir',c_n.shape)
            
            if h_n.size(0) < self.decoder.num_layers:
#                 zero_h = torch.zeros(self.decoder.num_layers - h_n.size(0), batch_size, self.decoder.hidden_size, device=h_n.device)
#                 zero_c = torch.zeros(self.decoder.num_layers - c_n.size(0), batch_size, self.decoder.hidden_size, device=c_n.device)
                zero_h = torch.zeros(self.decoder.num_layers - h_n.size(0), batch_size, self.encoder.num_layers * self.encoder.hidden_size, device=h_n.device)
                zero_c = torch.zeros(self.decoder.num_layers - c_n.size(0), batch_size, self.encoder.num_layers * self.encoder.hidden_size, device=c_n.device)
#                 print(zero_h.shape)
#                 print(zero_c.shape)
                h_n = torch.cat([h_n, zero_h], dim=0)
                c_n = torch.cat([c_n, zero_c], dim=0)
#                 print(h_n.shape)
#                 print(c_n.shape)

            encoder_hidden = (h_n[:self.decoder.num_layers], c_n[:self.decoder.num_layers])
#             print('encoder_hidden h_n shape',encoder_hidden[0].shape)
        else:
            h_n = encoder_hidden
#             print(h_n.shape)
            if self.encoder.bidirectional:
                h_n = torch.cat([h_n[i:i+1] for i in range(0, h_n.shape[0], 2)] + [h_n[i:i+1] for i in range(1, h_n.shape[0], 2)], dim=2)
#                 print('encoder_hidden shape is: ',h_n.shape)
#             encoder_hidden = h_n[:self.decoder.num_layers]
#             print(encoder_hidden.shape)
#             if self.encoder.bidirectional:
#                 print('encoder_hidden shape is: ',encoder_hidden.shape)
# #                 encoder_hidden = torch.cat((encoder_hidden[0:encoder_hidden.size(0):2], encoder_hidden[1:encoder_hidden.size(0):2]), dim=2)
# #                 encoder_hidden = torch.cat([encoder_hidden[i::2] for i in range(2)], dim=2)
# #                 encoder_hidden = encoder_hidden.view(self.encoder.num_layers, 2, batch_size, self.encoder.hidden_size)
#                 print(encoder_hidden.shape)        
# #                 encoder_hidden = torch.cat((encoder_hidden[:, 0, :, :], encoder_hidden[:, 1, :, :]), dim=2)
                
# enc hidd bef dir torch.Size([4, 64, 256])
# after dir enc: torch.Size([2, 64, 512])
# torch.Size([2, 64, 512])
# torch.Size([1, 64, 1024])
# torch.Size([1, 64, 1024])
            if h_n.size(0) < self.decoder.num_layers:
                zero_h = torch.zeros(self.decoder.num_layers - h_n.size(0), batch_size, self.encoder.num_layers * self.encoder.hidden_size, device=encoder_hidden.device)
#                 print('zero',zero_h.shape)
                h_n = torch.cat([h_n, zero_h], dim=0)
            encoder_hidden = h_n[:self.decoder.num_layers]
#             print('encoder_hidden shape is: ',encoder_hidden.shape)
        
        decoder_input = target[:, 0]
#         print('decoder_input',decoder_input.shape)
                    
        for t in range(1, target_len):
            decoder_output, encoder_hidden = self.decoder(decoder_input, encoder_hidden)
#             print('hghg')
            outputs[:, t] = decoder_output
            teacher_force = torch.rand(1) < teaching_force_ratio
            top1 = decoder_output.argmax(1)
            decoder_input = target[:, t] if teacher_force else top1

        return outputs


In [189]:
def create_vocab(text):
    vocab = set(char for word in text for char in word)
    vocab.add('<pad>')
    vocab.add('<sos>')
    vocab.add('<eos>')
    return vocab

def load_data(path):
    df = pd.read_csv(path, header=None, names=['latin', 'bangla'])
    return df['latin'], df['bangla']

latin_train, bangla_train = load_data('/kaggle/input/aksharantar/aksharantar_sampled/ben/ben_train.csv')
latin_vocab = create_vocab(latin_train)
bangla_vocab = create_vocab(bangla_train)
latin_token_to_index = {token: index for index, token in enumerate(sorted(latin_vocab))}
bangla_token_to_index = {token: index for index, token in enumerate(sorted(bangla_vocab))}

print(latin_token_to_index)
print()
print(bangla_token_to_index)

class AksharantarDataset(Dataset):
    def __init__(self, latin_words, bangla_words, latin_token_to_index, bangla_token_to_index):
        self.latin_words = latin_words
        self.bangla_words = bangla_words
        self.latin_token_to_index = latin_token_to_index
        self.bangla_token_to_index = bangla_token_to_index

    def __len__(self):
        return len(self.latin_words)

    def __getitem__(self, index):
        latin_word = self.latin_words.iloc[index]
        bangla_word = self.bangla_words.iloc[index]
        latin_indices = [latin_token_to_index[char] for char in latin_word]
        bangla_indices = [bangla_token_to_index['<sos>']] + [bangla_token_to_index[char] for char in bangla_word] + [bangla_token_to_index['<eos>']]
        return torch.tensor(latin_indices, dtype=torch.long), torch.tensor(bangla_indices, dtype=torch.long)

def packet_fn(batch):
    latin, bangla = zip(*batch)
    latin_padded = pad_sequence(latin, batch_first=True, padding_value=latin_token_to_index['<pad>'])
    bangla_padded = pad_sequence(bangla, batch_first=True, padding_value=bangla_token_to_index['<pad>'])
    return latin_padded, bangla_padded

train_dataset = AksharantarDataset(latin_train, bangla_train, latin_token_to_index, bangla_token_to_index)
train_loader = DataLoader(train_dataset, batch_size = 64, collate_fn=packet_fn, shuffle=True)

def word_accuracy(outputs, targets, ignore_index):
    correct = 0
    total = 0
    for out, tar in zip(outputs, targets):
        out = out[out != ignore_index]
        tar = tar[tar != ignore_index]
        ignore_index_eos = 0
        out = out[out != ignore_index_eos]
        tar = tar[tar != ignore_index_eos]
        if torch.equal(out, tar):
            correct += 1
        total += 1
    return correct / total if total > 0 else 0


def train(model, iterator, optimizer, criterion, clip, device, ignore_index):
    model.train()
    epoch_loss = 0
    epoch_acc = 0
    
    for source, target in iterator:
        source = source.to(device)
        target = target.to(device)
        
        optimizer.zero_grad()
        output = model(source, target)
        
        output_dim = output.shape[-1]
        output = output[:, 1:, :]
        target = target[:, 1:]
        
        output_flat = output.reshape(-1, output_dim)
        target_flat = target.reshape(-1)
        
        loss = criterion(output_flat, target_flat)
        acc = word_accuracy(output.argmax(dim=2), target, ignore_index)
        
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc
    
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

def evaluate(model, iterator, criterion, device, ignore_index):
    model.eval()
    epoch_loss = 0
    epoch_acc = 0
    
    with torch.no_grad():
        for source, target in iterator:
            source = source.to(device)
            target = target.to(device)
            
            output = model(source, target, 0)
            output_dim = output.shape[-1]
            output = output[:, 1:, :]
            target = target[:, 1:]
            
            output_flat = output.reshape(-1, output_dim)
            target_flat = target.reshape(-1)
            loss = criterion(output_flat, target_flat)
            acc = word_accuracy(output.argmax(dim=2), target, ignore_index)
            
            epoch_loss += loss.item()
            epoch_acc += acc
            
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

latin_valid, bangla_valid = load_data('/kaggle/input/aksharantar/aksharantar_sampled/ben/ben_valid.csv')
valid_dataset = AksharantarDataset(latin_valid, bangla_valid, latin_token_to_index, bangla_token_to_index)
valid_loader = DataLoader(valid_dataset, batch_size=64, collate_fn=packet_fn, shuffle=True)

INPUT_DIM = 100
OUTPUT_DIM = 100
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
HID_DIM = 512
ENC_LAYERS = 1
DEC_LAYERS = 3
ENC_RNN_CELL = 'rnn'
DEC_RNN_CELL = 'rnn'

encoder = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, ENC_LAYERS, ENC_RNN_CELL,dropout=0.3, bidirectional = True)
decoder = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, DEC_LAYERS, encoder.num_layers, DEC_RNN_CELL, dropout=0.3, bidirectional = True)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
model = Seq_to_Seq(encoder, decoder).to(device)
print(model)

NUM_EPOCHS = 1
CLIP = 1
optimizer = torch.optim.Adam(model.parameters())
ignore_index = bangla_token_to_index['<pad>']
criterion = nn.CrossEntropyLoss(ignore_index=ignore_index).to(device)

for epoch in range(NUM_EPOCHS):
    train_loss, train_accuracy = train(model, train_loader, optimizer, criterion, CLIP, device, ignore_index)
    val_loss, val_accuracy = evaluate(model, valid_loader, criterion, device, ignore_index)
    
    print(f'Epoch: {epoch+1}')
    print(f'\tTrain_Loss: {train_loss:.3f}, Train_Accuracy: {train_accuracy*100:.2f}%')
    print(f'\tVal_Loss: {val_loss:.3f},  Val_Accuracy: {val_accuracy*100:.2f}%')


{'<eos>': 0, '<pad>': 1, '<sos>': 2, 'a': 3, 'b': 4, 'c': 5, 'd': 6, 'e': 7, 'f': 8, 'g': 9, 'h': 10, 'i': 11, 'j': 12, 'k': 13, 'l': 14, 'm': 15, 'n': 16, 'o': 17, 'p': 18, 'q': 19, 'r': 20, 's': 21, 't': 22, 'u': 23, 'v': 24, 'w': 25, 'x': 26, 'y': 27, 'z': 28}

{'<eos>': 0, '<pad>': 1, '<sos>': 2, 'ঁ': 3, 'ং': 4, 'ঃ': 5, 'অ': 6, 'আ': 7, 'ই': 8, 'ঈ': 9, 'উ': 10, 'ঊ': 11, 'ঋ': 12, 'এ': 13, 'ঐ': 14, 'ও': 15, 'ঔ': 16, 'ক': 17, 'খ': 18, 'গ': 19, 'ঘ': 20, 'ঙ': 21, 'চ': 22, 'ছ': 23, 'জ': 24, 'ঝ': 25, 'ঞ': 26, 'ট': 27, 'ঠ': 28, 'ড': 29, 'ঢ': 30, 'ণ': 31, 'ত': 32, 'থ': 33, 'দ': 34, 'ধ': 35, 'ন': 36, 'প': 37, 'ফ': 38, 'ব': 39, 'ভ': 40, 'ম': 41, 'য': 42, 'র': 43, 'ল': 44, 'শ': 45, 'ষ': 46, 'স': 47, 'হ': 48, '়': 49, 'া': 50, 'ি': 51, 'ী': 52, 'ু': 53, 'ূ': 54, 'ৃ': 55, 'ে': 56, 'ৈ': 57, 'ো': 58, 'ৌ': 59, '্': 60, 'ৎ': 61, '২': 62}
Using device: cuda
Seq_to_Seq(
  (encoder): Encoder(
    (embedding): Embedding(100, 256)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): RNN(256, 256, batch

In [6]:
# # Define the Seq_to_Seq model which is a subclass of nn.Module
# class Seq_to_Seq(nn.Module):
#     # Constructor with parameters for initialization
#     def __init__(self, encoder, decoder):
#         # Initialize the base class
#         super(Seq_to_Seq, self).__init__()
#         # Assign the encoder instance
#         self.encoder = encoder
#         # Assign the decoder instance
#         self.decoder = decoder
        
#     # Define the forward pass method that takes source data, target data, and a teaching force ratio
#     def forward(self, source, target, teaching_force_ratio=0.5):
#         # Determine the batch size from the source input
#         batch_size = source.size(0)
# #         print('seq:: batch_size',batch_size)
#         # Determine the target sequence length from the target input
#         target_len = target.size(1)
# #         print('target_len',target_len)
#         # Get the target vocabulary size from the decoder
#         target_vocab_size = self.decoder.output_size
# #         print('target_vocab_size',target_vocab_size)
        
#         # Initialize a tensor to store the outputs from the decoder
#         outputs = torch.zeros(batch_size, target_len, target_vocab_size).to(source.device)
# #         print('outputs',outputs.shape)
        
#         # Encode the source input to get the initial hidden state for the decoder
#         encoder_hidden = self.encoder(source)
# #         print('encoder_hidden',encoder_hidden[0])
#         # The first input to the decoder is typically a start token; here, it's the first target token
#         decoder_input = target[:, 0]
# #         print('decoder_input',decoder_input[0])
        
#         # Reshape or transform the encoder's hidden state to match the decoder's expected initial state
#         if isinstance(encoder_hidden, tuple):  # LSTM case
#             encoder_hidden = tuple([h[:self.decoder.num_layers] for h in encoder_hidden])
# #             print(f" encoder_hidden: {encoder_hidden}")
#         else:  # GRU or RNN
#             encoder_hidden = encoder_hidden[:self.decoder.num_layers]
# #             print(f" encoder_hidden: {encoder_hidden}")
        
#         # Iterate through each position in the target sequence
#         for t in range(1, target_len):
#             # Generate output and update the hidden state from the decoder
#             decoder_output, encoder_hidden = self.decoder(decoder_input, encoder_hidden)
# #             print(f"decoder_output:  {decoder_output[0]} , encoder_hidden: {encoder_hidden[0]}")
#             # Store the output of the decoder at the corresponding position in the output tensor
#             outputs[:, t] = decoder_output
# #             print(outputs[0])
            
#             # Determine whether to use teacher forcing based on a random probability compared to the ratio
#             teacher_force = torch.rand(1) < teaching_force_ratio
#             # Get the highest probability token from the decoder's output
#             top1 = decoder_output.argmax(1)
# #             print('top1',top1)
#             # Depending on teacher forcing, use either the true next token or the predicted token as the next input
#             decoder_input = target[:, t] if teacher_force else top1
# #             print('decoder_input',decoder_input[0])
# #         print(f"final decoder outputs: {outputs[0]}")
#         # Return the tensor holding all the decoder outputs
#         return outputs

# **Printing the model**

In [26]:
# Constants defining the dimensions of the input and output character sets
INPUT_DIM = 50  # size of the Latin character set
OUTPUT_DIM = 100  # size of the Bangla character set

# Constants defining the dimensions of the embeddings for encoder and decoder
ENC_EMB_DIM = 64  # Encoder embedding dimension
DEC_EMB_DIM = 64  # Decoder embedding dimension

# Constants defining the dimension of the hidden layers for encoder and decoder
HID_DIM = 512  # Hidden dimension size

# Constants defining the number of layers for encoder and decoder
ENC_LAYERS = 2  # Number of layers in the encoder
DEC_LAYERS = 2  # Number of layers in the decoder

# Constants defining the type of RNN cell to use for encoder and decoder
ENC_RNN_CELL = 'lstm'  # RNN cell type for the encoder
DEC_RNN_CELL = 'lstm'  # RNN cell type for the decoder

# Instantiate the encoder with specified configurations
encoder = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, ENC_LAYERS, ENC_RNN_CELL)
# Instantiate the decoder with specified configurations
decoder = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, DEC_LAYERS, DEC_RNN_CELL)

# Determine the computing device (CUDA if available, otherwise CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Print the device will be used
print(f"Using device: {device}")

# Instantiate the Seq_to_Seq model and move it to the chosen computing device
model = Seq_to_Seq(encoder, decoder).to(device)
# Print the model architecture
print(model)

Using device: cuda
Seq_to_Seq(
  (encoder): Encoder(
    (embedding): Embedding(50, 64)
    (dropout): Dropout(p=0.5, inplace=False)
    (rnn): LSTM(64, 256, num_layers=2, batch_first=True, dropout=0.5, bidirectional=True)
  )
  (decoder): Decoder(
    (embedding): Embedding(100, 64)
    (dropout): Dropout(p=0.5, inplace=False)
    (rnn): LSTM(64, 1024, num_layers=2, batch_first=True, dropout=0.5)
    (fc): Linear(in_features=1024, out_features=100, bias=True)
  )
)


# **A function to create a vocabulary set from the given text**

In [4]:

# Define a function to create a vocabulary set from a given text
def create_vocab(text):
    # Create a set of unique characters found in the text
    # Each word in the text is processed to extract its characters
    vocab = set(char for word in text for char in word)
    # Add a padding token to the vocabulary
    vocab.add('<pad>')
    # Add a start-of-sequence token to the vocabulary
    vocab.add('<sos>')  # Start of sequence token
    # Add an end-of-sequence token to the vocabulary
    vocab.add('<eos>')  # End of sequence token
    # Return the complete set of vocabulary items
    return vocab

# **A function to load data from a CSV file**

In [5]:
# Define a function to load data from a CSV file
def load_data(path):
    # The file has no header and columns are named as 'latin' and 'bangla'
    df = pd.read_csv(path, header=None, names=['latin', 'bangla'])
#     df = df.head(10)
    # Return the columns as two separate Series objects
    return df['latin'], df['bangla']

# **Load Latin and bangla training data**

In [6]:
# Load Latin and bangla training data from specified path
latin_train, bangla_train = load_data('/kaggle/input/aksharantar/aksharantar_sampled/ben/ben_train.csv')

# **Print the loaded Latin and Bangla training data**

In [7]:
# Print the loaded Latin training data
print(latin_train)
print()
# Print the loaded bangla training data
print(bangla_train)

0        namdharirao
1        hindukusher
2        farajikandi
3           moubarak
4             chiung
            ...     
51195       silmadar
51196        jonnote
51197      handibage
51198         borpar
51199     bideshikei
Name: latin, Length: 51200, dtype: object

0            নামধারীরাও
1           হিন্দুকুশের
2           ফরাজীকান্দি
3                মুবারক
4                চিয়ুং
              ...      
51195          সিলমাদার
51196            জন্যতে
51197    হ্যান্ডিব্যাগে
51198             বরপার
51199         বিদেশীকেই
Name: bangla, Length: 51200, dtype: object


# **Create two vocabularies from the Latin and Bangla training data**

In [8]:
# Create a vocabulary from the Latin training data
latin_vocab = create_vocab(latin_train)
# Create a vocabulary from the bangla training data
bangla_vocab = create_vocab(bangla_train)

# **Print the created Latin and Bangla vocabularies**

In [9]:
# Print the created Latin vocabulary
print(latin_vocab)
print()
# Print the created bangla vocabulary
print(bangla_vocab)

{'a', 'n', 'j', 'b', 'i', '<eos>', 'l', '<pad>', 'o', 's', 't', 'y', 'x', 'g', 'm', 'h', 'c', 'v', 'z', 'u', 'e', 'f', 'd', 'q', 'k', '<sos>', 'r', 'p', 'w'}

{'ঃ', 'গ', 'হ', '২', 'এ', 'ল', 'খ', 'ৃ', 'া', 'প', 'ি', 'ম', 'ু', 'ঐ', 'ী', 'স', '<eos>', 'ঈ', 'ঞ', 'ধ', 'ছ', 'ঔ', 'ঙ', 'থ', 'অ', '<pad>', '<sos>', 'ফ', 'ঢ', 'ব', 'ৌ', 'ষ', 'ৎ', 'ই', 'ন', 'আ', 'চ', 'ঋ', 'ক', 'ঝ', 'ঊ', 'ঘ', 'ে', 'র', 'ো', 'ট', 'ণ', '্', '়', 'জ', 'য', 'ও', 'ং', 'ভ', 'ড', 'দ', 'উ', 'ূ', 'ঁ', 'ঠ', 'শ', 'ত', 'ৈ'}


# **Map each token in the Latin and Bangla vocabularies to a unique index and then Print the dictionaries mapping (Latin tokens to indices) and (Bangla tokens to indices)**


In [10]:
# Map each token in the Latin vocabulary to a unique index
latin_token_to_index = {token: index for index, token in enumerate(sorted(latin_vocab))}
# Map each token in the bangla vocabulary to a unique index
bangla_token_to_index = {token: index for index, token in enumerate(sorted(bangla_vocab))}

# Print the dictionary mapping Latin tokens to indices
print(latin_token_to_index)
print()

# Print the dictionary mapping bangla tokens to indices
print(bangla_token_to_index)

{'<eos>': 0, '<pad>': 1, '<sos>': 2, 'a': 3, 'b': 4, 'c': 5, 'd': 6, 'e': 7, 'f': 8, 'g': 9, 'h': 10, 'i': 11, 'j': 12, 'k': 13, 'l': 14, 'm': 15, 'n': 16, 'o': 17, 'p': 18, 'q': 19, 'r': 20, 's': 21, 't': 22, 'u': 23, 'v': 24, 'w': 25, 'x': 26, 'y': 27, 'z': 28}

{'<eos>': 0, '<pad>': 1, '<sos>': 2, 'ঁ': 3, 'ং': 4, 'ঃ': 5, 'অ': 6, 'আ': 7, 'ই': 8, 'ঈ': 9, 'উ': 10, 'ঊ': 11, 'ঋ': 12, 'এ': 13, 'ঐ': 14, 'ও': 15, 'ঔ': 16, 'ক': 17, 'খ': 18, 'গ': 19, 'ঘ': 20, 'ঙ': 21, 'চ': 22, 'ছ': 23, 'জ': 24, 'ঝ': 25, 'ঞ': 26, 'ট': 27, 'ঠ': 28, 'ড': 29, 'ঢ': 30, 'ণ': 31, 'ত': 32, 'থ': 33, 'দ': 34, 'ধ': 35, 'ন': 36, 'প': 37, 'ফ': 38, 'ব': 39, 'ভ': 40, 'ম': 41, 'য': 42, 'র': 43, 'ল': 44, 'শ': 45, 'ষ': 46, 'স': 47, 'হ': 48, '়': 49, 'া': 50, 'ি': 51, 'ী': 52, 'ু': 53, 'ূ': 54, 'ৃ': 55, 'ে': 56, 'ৈ': 57, 'ো': 58, 'ৌ': 59, '্': 60, 'ৎ': 61, '২': 62}


# **Defining a Dataset class for handling Latin and Bangla word pairs**

In [11]:
# Define a Dataset class for handling Latin and Bangla word pairs
class AksharantarDataset(Dataset):
    def __init__(self, latin_words, bangla_words, latin_token_to_index, bangla_token_to_index):
        # Store the lists of Latin and Bangla words
        self.latin_words = latin_words
        self.bangla_words = bangla_words
        # Store the dictionaries that map characters to indices for both languages
        self.latin_token_to_index = latin_token_to_index
        self.bangla_token_to_index = bangla_token_to_index

    def __len__(self):
        # Return the number of word pairs in the dataset
        return len(self.latin_words)

    def __getitem__(self, index):
        # Fetching the Latin and Bangla words at the specified index
        latin_word = self.latin_words.iloc[index]
#         print(latin_word)
        bangla_word = self.bangla_words.iloc[index]
#         print(bangla_word)
        # Convert the Latin word into indices using the latin_token_to_index mapping
        latin_indices = [latin_token_to_index[char] for char in latin_word]
#         print(latin_indices)
        # Convert the Bangla word into indices, adding <sos> and <eos> tokens
        bangla_indices = [bangla_token_to_index['<sos>']] + [bangla_token_to_index[char] for char in bangla_word] + [bangla_token_to_index['<eos>']]
#         print(bangla_indices)
        # Return the indices as tensor objects
        return torch.tensor(latin_indices, dtype=torch.long), torch.tensor(bangla_indices, dtype=torch.long)

# **Defining a function for padding sequences and packing batches**

In [12]:
# Define a function for padding sequences and packing batches
# packet_fn specifies a function to control how batches are created from the individual data items
def packet_fn(batch):
    # Unzip the batch to separate Latin and Bangla indices
    latin, bangla = zip(*batch)
#     print(latin, bangla)
    # Pad the sequences of Latin indices
    latin_padded = pad_sequence(latin, batch_first=True, padding_value=latin_token_to_index['<pad>'])
#     print(latin_padded)
    # Pad the sequences of Bangla indices
    bangla_padded = pad_sequence(bangla, batch_first=True, padding_value=bangla_token_to_index['<pad>'])
#     print(bangla_padded)
    # Return the padded batches
    return latin_padded, bangla_padded

# **Load training data into the AksharantarDataset and then creating the train_loader by Dataloader function**

In [13]:
# Load training data into the AksharantarDataset
train_dataset = AksharantarDataset(latin_train, bangla_train, latin_token_to_index, bangla_token_to_index)
# Create a DataLoader to batch and shuffle the dataset
# packet_fn specifies a function to control how batches are created from the individual data items
train_loader = DataLoader(train_dataset, batch_size = 64, collate_fn=packet_fn, shuffle=True)

# **Print an example from the dataset**

In [14]:
# Print an example from the dataset
print(train_dataset[4000])
# for i,j in train_loader:
#     print(i,'\n\n\n',j)

(tensor([19, 23,  7, 20,  5,  7, 22, 11, 16]), tensor([ 2, 17, 50, 43, 47, 56, 32, 51, 36,  0]))



# **A function for calculating accuracy per batch, ignoring the padding token**

In [15]:
# # A function for calculating accuracy per batch, ignoring a specific index, typically the padding token
# def categorical_accuracy(preds, y, ignore_index):
#     # Get the index of the maximum probability to predict the class
#     max_preds = preds.argmax(dim=1, keepdim=True)
#     # Identify positions that do not correspond to the ignore_index (typically padding)
#     non_pad_elements = (y != ignore_index).nonzero(as_tuple=True)
#     # Check if predicted classes match the actual classes, excluding the ignore index
#     correct = max_preds[non_pad_elements].squeeze(1).eq(y[non_pad_elements])
#     # Compute accuracy as the number of correct predictions over the number of non-ignored positions
#     return correct.sum() / torch.FloatTensor([y[non_pad_elements].shape[0]]).to(y.device)


# Define a new accuracy function for word-level accuracy
def word_accuracy(outputs, targets, ignore_index):
    # Assuming outputs and targets are batched sequences of token indices
    # Ignoring <pad> tokens as specified by `ignore_index`
    correct = 0
    total = 0
    for out, tar in zip(outputs, targets):
        # Ignoring padding in accuracy calculation
#         print('out bef pad:',out)
#         print('tar:',tar)
        out = out[out != ignore_index]
        tar = tar[tar != ignore_index]
        ignore_index_eos = 0
        out = out[out != ignore_index_eos]
        tar = tar[tar != ignore_index_eos]
#         print('out aft pad:',out)
#         print('tar:',tar)
        if torch.equal(out, tar):
            correct += 1
#             print('correct:',correct)
        total += 1
#         print('total:',total)
    return correct / total if total > 0 else 0



# **Defining the Training function**

In [16]:

def train(model, iterator, optimizer, criterion, clip, device, ignore_index):
    model.train()
    epoch_loss = 0
    epoch_acc = 0
    
    for source, target in iterator:
        source = source.to(device)
        target = target.to(device)
        
        optimizer.zero_grad()
        output = model(source, target)
        
        output_dim = output.shape[-1]
        # Slice to ignore the <sos> token and keep sequence structure
        output = output[:, 1:, :]
        target = target[:, 1:]
        
        # Flatten all dimensions except for the batch dimension for loss calculation
        output_flat = output.reshape(-1, output_dim)
        target_flat = target.reshape(-1)
        
#         print('trainnnnnnnn')
        
        loss = criterion(output_flat, target_flat)
        # Calculate word-by-word accuracy
        acc = word_accuracy(output.argmax(dim=2), target, ignore_index)
        
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc
    
    return epoch_loss / len(iterator), epoch_acc / len(iterator)
######################

# **Defining the Evaluation function**

In [17]:
def evaluate(model, iterator, criterion, device, ignore_index):
    model.eval()
    epoch_loss = 0
    epoch_acc = 0
    
    with torch.no_grad():
        for source, target in iterator:
            source = source.to(device)
            target = target.to(device)
            
            output = model(source, target, 0)
            output_dim = output.shape[-1]
            output = output[:, 1:, :]
            target = target[:, 1:]
            
            output_flat = output.reshape(-1, output_dim)
            target_flat = target.reshape(-1)
#             print('vallllllll')
            loss = criterion(output_flat, target_flat)
            acc = word_accuracy(output.argmax(dim=2), target, ignore_index)
            
            epoch_loss += loss.item()
            epoch_acc += acc
            
#             break
    
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

#######################

# **Load validation data into the AksharantarDataset and then creating the valid_loader by Dataloader function**

In [18]:
# Load validation data by reading a CSV file
latin_valid, bangla_valid = load_data('/kaggle/input/aksharantar/aksharantar_sampled/ben/ben_valid.csv')

# Create a validation dataset using the AksharantarDataset class.
valid_dataset = AksharantarDataset(latin_valid, bangla_valid, latin_token_to_index, bangla_token_to_index)

# Create a DataLoader to batch and shuffle the dataset
# 'collate_fn=packet_fn' specifies a function to control how batches are created from the individual data items.
# 'shuffle=True' ensures that the data is shuffled at every epoch which helps to reduce model overfitting
valid_loader = DataLoader(valid_dataset, batch_size=64, collate_fn=packet_fn, shuffle=True)

# **The training process for specified number of epochs**

In [109]:
# Constants defining the dimensions of the input and output character sets
INPUT_DIM = 100  # size of the Latin character set
OUTPUT_DIM = 100  # size of the Bangla character set

# Constants defining the dimensions of the embeddings for encoder and decoder
ENC_EMB_DIM = 256  # Encoder embedding dimension
DEC_EMB_DIM = 256  # Decoder embedding dimension

# Constants defining the dimension of the hidden layers for encoder and decoder
HID_DIM = 512  # Hidden dimension size

# Constants defining the number of layers for encoder and decoder
ENC_LAYERS = 2  # Number of layers in the encoder
DEC_LAYERS = 1  # Number of layers in the decoder

# Constants defining the type of RNN cell to use for encoder and decoder
ENC_RNN_CELL = 'lstm'  # RNN cell type for the encoder
DEC_RNN_CELL = 'lstm'  # RNN cell type for the decoder

# Instantiate the encoder with specified configurations
encoder = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, ENC_LAYERS, ENC_RNN_CELL,dropout=0.3, bidirectional = True)
# Instantiate the decoder with specified configurations
decoder = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, DEC_LAYERS, DEC_RNN_CELL, dropout=0.3, bidirectional = True)

# Determine the computing device (CUDA if available, otherwise CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Print the device will be used
print(f"Using device: {device}")

# Instantiate the Seq_to_Seq model and move it to the chosen computing device
model = Seq_to_Seq(encoder, decoder).to(device)
# Print the model architecture
print(model)

Using device: cuda
Seq_to_Seq(
  (encoder): Encoder(
    (embedding): Embedding(100, 256)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): LSTM(256, 256, num_layers=2, batch_first=True, dropout=0.3, bidirectional=True)
  )
  (decoder): Decoder(
    (embedding): Embedding(100, 256)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): LSTM(256, 1024, batch_first=True)
    (fc): Linear(in_features=1024, out_features=100, bias=True)
  )
)


In [110]:
# Setting the number of epochs the training process should run
NUM_EPOCHS = 1
# Set the maximum norm of the gradients to 1 to prevent exploding gradients
CLIP = 1
# Initialize the optimizer, Adam
optimizer = torch.optim.Adam(model.parameters())
# Padding token index should be ignored in loss calculation
ignore_index = bangla_token_to_index['<pad>']
# Define the loss function with 'ignore_index' to avoid affecting loss calculation with padding tokens
criterion = nn.CrossEntropyLoss(ignore_index=ignore_index).to(device)

# Start the training process for the defined number of epochs
for epoch in range(NUM_EPOCHS):
    # Doing training on the train dataset and return average loss and accuracy
    train_loss, train_accuracy = train(model, train_loader, optimizer, criterion, CLIP, device, ignore_index)
    # Evaluating the model on the validation dataset and return average loss and accuracy
    val_loss, val_accuracy = evaluate(model, valid_loader, criterion, device, ignore_index)
    
    # Print the loss and accuracy for each epoch
    print(f'Epoch: {epoch+1}')
    print(f'\tTrain_Loss: {train_loss:.3f}, Train_Accuracy: {train_accuracy*100:.2f}%')
    print(f'\tVal_Loss: {val_loss:.3f},  Val_Accuracy: {val_accuracy*100:.2f}%')

Epoch: 1
	Train_Loss: 1.402, Train_Accuracy: 6.92%
	Val_Loss: 1.304,  Val_Accuracy: 15.06%


# **Load the Test data into the AksharantarDataset and then creating the test_loader by Dataloader function**

In [53]:
# Load the test data from the specified CSV file location
latin_test, bangla_test = load_data('/kaggle/input/aksharantar/aksharantar_sampled/ben/ben_test.csv')

# Create test_dataset using the AksharantarDataset class, initializing it with test data
# and corresponding token-to-index mappings for both Latin and Bangla scripts
test_dataset = AksharantarDataset(latin_test, bangla_test, latin_token_to_index, bangla_token_to_index)

# A DataLoader for the test dataset. Here, the batch size is set to 1, indicates
# that the model will process one item at a time. This is for testing to make
# detailed predictions per sample without batching effects.
test_loader = DataLoader(test_dataset, batch_size=32, collate_fn=packet_fn, shuffle=False)
# print(test_dataset[0])



# **A function to convert an array of indices back into a string, excluding any indices corresponding to special tokens like padding, start, or end of sequence tokens, which should not appear in the final output string**

In [54]:
def decode_indices(indices, index_to_token):
    # Filter out indices for padding, start-of-sequence, and end-of-sequence tokens to ensure only valid character indices are decoded
    valid_indices = [index for index in indices if index in index_to_token and index not in (bangla_token_to_index['<pad>'], bangla_token_to_index['<sos>'], bangla_token_to_index['<eos>'])]
    # Convert each index to its corresponding character and join them to form the decoded string
    return ''.join([index_to_token[index] for index in valid_indices])

# **Creating the prediction function to generate outputs for all samples in the test_loader**

In [55]:
def predict(model, iterator, device):
    # Set the model to evaluation mode to disable dropout or batch normalization effects during inference
    model.eval()
    predictions = []
    # Disables gradient calculations for performance improvement since they are not needed in inference
    with torch.no_grad():
        for source, target in iterator:
            # Ensure the source and target tensors are on the correct device (GPU or CPU)
            source = source.to(device)
            target = target.to(device)
            # Obtain model output without teacher forcing (i.e., the model relies entirely on its predictions)
            output = model(source, target, 0)
            # Get the index with the highest probability from output predictions
            output = output.argmax(2)
            # Convert tensors to CPU numpy arrays for easier manipulation and extraction
            source = source.cpu().numpy()
            output = output.cpu().numpy()
            target = target.cpu().numpy()
            # Store the tuple of source and decoded output predictions
            predictions.append((source, target, output))
    # Return all predictions made over the iterator
    return predictions

# **Creating dictionaries to map indices back to its corresponding characters**

In [56]:
# Create dictionaries to map indices back to characters, observing the interpretation of prediction outputs
latin_index_to_token = {index: char for char, index in latin_token_to_index.items()}
bangla_index_to_token = {index: char for char, index in bangla_token_to_index.items()}
# print(latin_index_to_token)
# print(bangla_index_to_token)

# **Displaying results: Each input text from the test dataset and its corresponding predicted output text are printed. This helps in visually assessing the accuracy and quality of the transliterations produced by the model**

In [187]:
# Taking the prediction function to generate outputs for all samples in the test_loader
test_predictions = predict(model, test_loader, device)
# print(test_predictions[1])
# Loop through the list of tuples containing source and output indices from the test predictions
for source_indices, target_indices, output_indices in test_predictions:
    # Iterate through each example in the batch. This is necessary as batches may contain multiple examples
    for i in range(source_indices.shape[0]):
        # Decode the source indices to their corresponding text using the mapping dictionary for Latin script
        input_text = decode_indices(source_indices[i], latin_index_to_token)
        
        target_text = decode_indices(target_indices[i], bangla_index_to_token)

        # Decode the output indices to their corresponding text using the mapping dictionary for Bangla script
        predicted_text = decode_indices(output_indices[i], bangla_index_to_token)
        # Print the original input text and its corresponding predicted transliteration
        print(f'Input Text: {input_text} -> Actual Text: {target_text} -> Predicted Text: {predicted_text}')

Input Text: saphallya -> Actual Text: সাফল্য -> Predicted Text: সাপল্যা
Input Text: kaarentabaahee -> Actual Text: কারেন্টবাহী -> Predicted Text: কারেতনাবাহেে
Input Text: mashterpiece -> Actual Text: মাস্টারপিস -> Predicted Text: মাশ্টেরিপিকক
Input Text: cheeken -> Actual Text: চিকেন -> Predicted Text: চেকেন
Input Text: ekdaala -> Actual Text: একডালা -> Predicted Text: একদালা
Input Text: neerbachokra -> Actual Text: নির্বাচকরা -> Predicted Text: নির্বারকার
Input Text: neture -> Actual Text: নেচার -> Predicted Text: নেতুরে
Input Text: michilkey -> Actual Text: মিছিলকে -> Predicted Text: মিছিলকে
Input Text: chitfund -> Actual Text: চিটফান্ড -> Predicted Text: চিটিউন্ড
Input Text: panchanan -> Actual Text: পঞ্চানন -> Predicted Text: পাঁচানান
Input Text: manna -> Actual Text: মন্ন -> Predicted Text: মান্ানা
Input Text: portillo -> Actual Text: পর্টিল্লো -> Predicted Text: পর্তিলোলো
Input Text: quess -> Actual Text: কুয়েস -> Predicted Text: কেউসস
Input Text: budh -> Actual Text: বুধ -> Pre

In [25]:
import wandb
import numpy as np
from types import SimpleNamespace
import random

# key = input('Enter your API:')
wandb.login(key='25c2257eaf6c22aa056893db14da4ee2bf0a531a')  #25c2257eaf6c22aa056893db14da4ee2bf0a531a

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [28]:
sweep_config = {
    'method': 'random',
    'name' : 'sweep 4 bayes false',
    'metric': {
        'name': 'Val_Accuracy',
        'goal': 'maximize'
    },
    'parameters': {
        'input_embed_size': {
            'values': [16,32,64,256,512]
        },
        'num_enc_layers':{
            'values': [1,2,3]
        },
        'num_dec_layers':{
            'values': [1,2,3]
        },
        'hid_layer_size': {
            'values': [16,32,64,256,512]
        },
        'cell_type': {
            'values': ['rnn', 'gru', 'lstm']
        },
        'bidirectional':{
            'values': [False]
        },
        'dropout': {
            'values': [0.2, 0.3]
        },
#       'beam search in decoder with different beam sizes': 
    }
}

sweep_id = wandb.sweep(sweep = sweep_config, project="Deep_Learning_A3")


Create sweep with ID: anrkraoj
Sweep URL: https://wandb.ai/parthasakhapaul/Deep_Learning_A3/sweeps/anrkraoj


In [29]:
import wandb

def main():
    # Initialize a new wandb run
    with wandb.init() as run:
        # Construct run name from configuration
        run_name = "-embed_size-"+str(wandb.config.input_embed_size)+"-layers_enc-"+str(wandb.config.num_enc_layers)+"-layers_dec-"+str(wandb.config.num_dec_layers)+"-hid_size-"+str(wandb.config.hid_layer_size)+"-cell_type-"+wandb.config.cell_type+"-bidirectional-"+str(wandb.config.bidirectional)+"-dropout-"+str(wandb.config.dropout)
        wandb.run.name = run_name

        # Constants defining the dimensions of the input and output character sets
        INPUT_DIM = 100  # size of the Latin character set
        OUTPUT_DIM = 100  # size of the Bangla character set

        # Constants defining the dimensions of the embeddings for encoder and decoder
        ENC_EMB_DIM = wandb.config.input_embed_size  # Encoder embedding dimension
        DEC_EMB_DIM = wandb.config.input_embed_size  # Decoder embedding dimension

        # Constants defining the dimension of the hidden layers for encoder and decoder
        HID_DIM = wandb.config.hid_layer_size  # Hidden dimension size

        # Constants defining the number of layers for encoder and decoder
        ENC_LAYERS = wandb.config.num_enc_layers  # Number of layers in the encoder
        DEC_LAYERS = wandb.config.num_dec_layers  # Number of layers in the decoder
        

        # Constants defining the type of RNN cell to use for encoder and decoder
        ENC_RNN_CELL = wandb.config.cell_type  # RNN cell type for the encoder
        DEC_RNN_CELL = wandb.config.cell_type  # RNN cell type for the decoder

        # Instantiate the encoder with specified configurations
        encoder = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, ENC_LAYERS, ENC_RNN_CELL, dropout = wandb.config.dropout, bidirectional = wandb.config.bidirectional)
        # Instantiate the decoder with specified configurations
        decoder = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, DEC_LAYERS, DEC_RNN_CELL, dropout = wandb.config.dropout, bidirectional = wandb.config.bidirectional)

        # Determine the computing device (CUDA if available, otherwise CPU)
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        # Print the device will be used
        print(f"Using device: {device}")

        # Instantiate the Seq_to_Seq model and move it to the chosen computing device
        model = Seq_to_Seq(encoder, decoder).to(device)
        print(model)
        
        
        # Setting the number of epochs the training process should run
        NUM_EPOCHS = 10
        # Set the maximum norm of the gradients to 1 to prevent exploding gradients
        CLIP = 1
        # Initialize the optimizer, Adam
        optimizer = torch.optim.Adam(model.parameters())
        # Padding token index should be ignored in loss calculation
        ignore_index = bangla_token_to_index['<pad>']
        # Define the loss function with 'ignore_index' to avoid affecting loss calculation with padding tokens
        criterion = nn.CrossEntropyLoss(ignore_index=ignore_index).to(device)

        # Start the training process for the defined number of epochs
        for epoch in range(NUM_EPOCHS):
            # Doing training on the train dataset and return average loss and accuracy
            train_loss, train_accuracy = train(model, train_loader, optimizer, criterion, CLIP, device, ignore_index)
            # Evaluating the model on the validation dataset and return average loss and accuracy
            val_loss, val_accuracy = evaluate(model, valid_loader, criterion, device, ignore_index)

            # Print the loss and accuracy for each epoch
            print(f'Epoch: {epoch+1}')
            print(f'\tTrain_Loss: {train_loss:.3f}, Train_Accuracy: {train_accuracy*100:.2f}%')
            print(f'\tVal_Loss: {val_loss:.3f},  Val_Accuracy: {val_accuracy*100:.2f}%')
            wandb.log({"train_accuracy": train_accuracy * 100, "training_loss": train_loss})
            wandb.log({"Val_Accuracy": val_accuracy * 100, "Val_Loss": val_loss})


wandb.agent(sweep_id, function=main, count=30)
wandb.finish()

[34m[1mwandb[0m: Agent Starting Run: z9ipjc7j with config:
[34m[1mwandb[0m: 	bidirectional: False
[34m[1mwandb[0m: 	cell_type: rnn
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	hid_layer_size: 256
[34m[1mwandb[0m: 	input_embed_size: 16
[34m[1mwandb[0m: 	num_dec_layers: 3
[34m[1mwandb[0m: 	num_enc_layers: 1


Using device: cuda
Seq_to_Seq(
  (encoder): Encoder(
    (embedding): Embedding(100, 16)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): RNN(16, 256, batch_first=True)
  )
  (decoder): Decoder(
    (embedding): Embedding(100, 16)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): RNN(16, 256, num_layers=3, batch_first=True, dropout=0.3)
    (fc): Linear(in_features=256, out_features=100, bias=True)
  )
)
Epoch: 1
	Train_Loss: 3.074, Train_Accuracy: 0.00%
	Val_Loss: 3.059,  Val_Accuracy: 0.00%
Epoch: 2
	Train_Loss: 2.928, Train_Accuracy: 0.00%
	Val_Loss: 3.058,  Val_Accuracy: 0.00%
Epoch: 3
	Train_Loss: 2.880, Train_Accuracy: 0.00%
	Val_Loss: 3.055,  Val_Accuracy: 0.00%
Epoch: 4
	Train_Loss: 2.857, Train_Accuracy: 0.00%
	Val_Loss: 3.089,  Val_Accuracy: 0.00%
Epoch: 5
	Train_Loss: 2.846, Train_Accuracy: 0.00%
	Val_Loss: 3.099,  Val_Accuracy: 0.00%
Epoch: 6
	Train_Loss: 2.832, Train_Accuracy: 0.00%
	Val_Loss: 3.070,  Val_Accuracy: 0.00%
Epoch: 7
	Train_Loss: 2.831, Train_Accu

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
Val_Accuracy,▁▁▁▁▁▁▁▁▁▁
Val_Loss,▄▄▄▇█▅▅▆▂▁
train_accuracy,▁▁▁▁▁██▁▁▁
training_loss,█▄▃▂▂▁▁▁▁▁

0,1
Val_Accuracy,0.0
Val_Loss,3.02591
train_accuracy,0.0
training_loss,2.8176


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 57pbvw9r with config:
[34m[1mwandb[0m: 	bidirectional: False
[34m[1mwandb[0m: 	cell_type: lstm
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	hid_layer_size: 16
[34m[1mwandb[0m: 	input_embed_size: 256
[34m[1mwandb[0m: 	num_dec_layers: 1
[34m[1mwandb[0m: 	num_enc_layers: 3


Using device: cuda
Seq_to_Seq(
  (encoder): Encoder(
    (embedding): Embedding(100, 256)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): LSTM(256, 16, num_layers=3, batch_first=True, dropout=0.3)
  )
  (decoder): Decoder(
    (embedding): Embedding(100, 256)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): LSTM(256, 16, batch_first=True)
    (fc): Linear(in_features=16, out_features=100, bias=True)
  )
)
Epoch: 1
	Train_Loss: 3.336, Train_Accuracy: 0.00%
	Val_Loss: 3.227,  Val_Accuracy: 0.00%
Epoch: 2
	Train_Loss: 3.032, Train_Accuracy: 0.00%
	Val_Loss: 3.086,  Val_Accuracy: 0.00%
Epoch: 3
	Train_Loss: 2.968, Train_Accuracy: 0.00%
	Val_Loss: 3.041,  Val_Accuracy: 0.00%
Epoch: 4
	Train_Loss: 2.940, Train_Accuracy: 0.00%
	Val_Loss: 2.999,  Val_Accuracy: 0.00%
Epoch: 5
	Train_Loss: 2.907, Train_Accuracy: 0.00%
	Val_Loss: 2.960,  Val_Accuracy: 0.00%
Epoch: 6
	Train_Loss: 2.885, Train_Accuracy: 0.00%
	Val_Loss: 2.947,  Val_Accuracy: 0.00%
Epoch: 7
	Train_Loss: 2.868, Train_A

VBox(children=(Label(value='0.019 MB of 0.019 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
Val_Accuracy,▁▁▁▁▁▁▁██▅
Val_Loss,█▅▄▄▃▃▂▂▁▁
train_accuracy,▁▁▁▁█▁▁▁▁▁
training_loss,█▄▃▃▂▂▂▁▁▁

0,1
Val_Accuracy,0.02441
Val_Loss,2.85599
train_accuracy,0.0
training_loss,2.80878


[34m[1mwandb[0m: Agent Starting Run: ziz5hjl7 with config:
[34m[1mwandb[0m: 	bidirectional: False
[34m[1mwandb[0m: 	cell_type: rnn
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	hid_layer_size: 32
[34m[1mwandb[0m: 	input_embed_size: 512
[34m[1mwandb[0m: 	num_dec_layers: 1
[34m[1mwandb[0m: 	num_enc_layers: 3


Using device: cuda
Seq_to_Seq(
  (encoder): Encoder(
    (embedding): Embedding(100, 512)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): RNN(512, 32, num_layers=3, batch_first=True, dropout=0.3)
  )
  (decoder): Decoder(
    (embedding): Embedding(100, 512)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): RNN(512, 32, batch_first=True)
    (fc): Linear(in_features=32, out_features=100, bias=True)
  )
)
Epoch: 1
	Train_Loss: 3.234, Train_Accuracy: 0.00%
	Val_Loss: 3.139,  Val_Accuracy: 0.00%
Epoch: 2
	Train_Loss: 3.049, Train_Accuracy: 0.00%
	Val_Loss: 3.137,  Val_Accuracy: 0.00%
Epoch: 3
	Train_Loss: 3.018, Train_Accuracy: 0.00%
	Val_Loss: 3.122,  Val_Accuracy: 0.00%
Epoch: 4
	Train_Loss: 3.002, Train_Accuracy: 0.00%
	Val_Loss: 3.109,  Val_Accuracy: 0.00%
Epoch: 5
	Train_Loss: 3.000, Train_Accuracy: 0.00%
	Val_Loss: 3.124,  Val_Accuracy: 0.00%
Epoch: 6
	Train_Loss: 2.998, Train_Accuracy: 0.00%
	Val_Loss: 3.143,  Val_Accuracy: 0.00%
Epoch: 7
	Train_Loss: 2.992, Train_Acc

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
Val_Accuracy,▁▁▁▁▁▁▁▁▁▁
Val_Loss,▇▇▄▁▄█▂▄▃▃
train_accuracy,▁▁▁▁▁▁▁▁▁▁
training_loss,█▃▂▂▁▁▁▁▁▁

0,1
Val_Accuracy,0.0
Val_Loss,3.11796
train_accuracy,0.0
training_loss,2.98399


[34m[1mwandb[0m: Agent Starting Run: t7xzmdhx with config:
[34m[1mwandb[0m: 	bidirectional: False
[34m[1mwandb[0m: 	cell_type: rnn
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	hid_layer_size: 64
[34m[1mwandb[0m: 	input_embed_size: 32
[34m[1mwandb[0m: 	num_dec_layers: 2
[34m[1mwandb[0m: 	num_enc_layers: 1


Using device: cuda
Seq_to_Seq(
  (encoder): Encoder(
    (embedding): Embedding(100, 32)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): RNN(32, 64, batch_first=True)
  )
  (decoder): Decoder(
    (embedding): Embedding(100, 32)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): RNN(32, 64, num_layers=2, batch_first=True, dropout=0.3)
    (fc): Linear(in_features=64, out_features=100, bias=True)
  )
)
Epoch: 1
	Train_Loss: 3.121, Train_Accuracy: 0.00%
	Val_Loss: 3.112,  Val_Accuracy: 0.00%
Epoch: 2
	Train_Loss: 2.947, Train_Accuracy: 0.00%
	Val_Loss: 3.133,  Val_Accuracy: 0.00%
Epoch: 3
	Train_Loss: 2.897, Train_Accuracy: 0.00%
	Val_Loss: 3.046,  Val_Accuracy: 0.00%
Epoch: 4
	Train_Loss: 2.870, Train_Accuracy: 0.00%
	Val_Loss: 3.015,  Val_Accuracy: 0.00%
Epoch: 5
	Train_Loss: 2.860, Train_Accuracy: 0.00%
	Val_Loss: 3.024,  Val_Accuracy: 0.00%
Epoch: 6
	Train_Loss: 2.851, Train_Accuracy: 0.00%
	Val_Loss: 3.047,  Val_Accuracy: 0.00%
Epoch: 7
	Train_Loss: 2.845, Train_Accurac

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
Val_Accuracy,▁▁▁▁▁▁▁▁▁▁
Val_Loss,▇█▄▂▃▄▃▁▂▂
train_accuracy,▁▁▁▁▁▁▁▁▁▁
training_loss,█▄▃▂▂▂▁▁▁▁

0,1
Val_Accuracy,0.0
Val_Loss,3.00323
train_accuracy,0.0
training_loss,2.82918


[34m[1mwandb[0m: Agent Starting Run: j5zaxjmr with config:
[34m[1mwandb[0m: 	bidirectional: False
[34m[1mwandb[0m: 	cell_type: rnn
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	hid_layer_size: 16
[34m[1mwandb[0m: 	input_embed_size: 32
[34m[1mwandb[0m: 	num_dec_layers: 1
[34m[1mwandb[0m: 	num_enc_layers: 3


Using device: cuda
Seq_to_Seq(
  (encoder): Encoder(
    (embedding): Embedding(100, 32)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): RNN(32, 16, num_layers=3, batch_first=True, dropout=0.3)
  )
  (decoder): Decoder(
    (embedding): Embedding(100, 32)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): RNN(32, 16, batch_first=True)
    (fc): Linear(in_features=16, out_features=100, bias=True)
  )
)
Epoch: 1
	Train_Loss: 3.443, Train_Accuracy: 0.00%
	Val_Loss: 3.210,  Val_Accuracy: 0.00%
Epoch: 2
	Train_Loss: 3.127, Train_Accuracy: 0.00%
	Val_Loss: 3.117,  Val_Accuracy: 0.00%
Epoch: 3
	Train_Loss: 3.055, Train_Accuracy: 0.00%
	Val_Loss: 3.098,  Val_Accuracy: 0.00%
Epoch: 4
	Train_Loss: 3.032, Train_Accuracy: 0.00%
	Val_Loss: 3.145,  Val_Accuracy: 0.00%
Epoch: 5
	Train_Loss: 3.022, Train_Accuracy: 0.00%
	Val_Loss: 3.276,  Val_Accuracy: 0.00%
Epoch: 6
	Train_Loss: 3.011, Train_Accuracy: 0.00%
	Val_Loss: 3.342,  Val_Accuracy: 0.00%
Epoch: 7
	Train_Loss: 3.005, Train_Accurac

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
Val_Accuracy,▁▁▁▁▁▁▁▁▁▁
Val_Loss,▅▃▂▃▆█▁▆▃▁
train_accuracy,▁▁▁▁▁▁▁▁▁▁
training_loss,█▃▂▂▂▂▁▁▁▁

0,1
Val_Accuracy,0.0
Val_Loss,3.04812
train_accuracy,0.0
training_loss,2.97668


[34m[1mwandb[0m: Agent Starting Run: 7mox2dam with config:
[34m[1mwandb[0m: 	bidirectional: False
[34m[1mwandb[0m: 	cell_type: rnn
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	hid_layer_size: 256
[34m[1mwandb[0m: 	input_embed_size: 32
[34m[1mwandb[0m: 	num_dec_layers: 2
[34m[1mwandb[0m: 	num_enc_layers: 1


Using device: cuda
Seq_to_Seq(
  (encoder): Encoder(
    (embedding): Embedding(100, 32)
    (dropout): Dropout(p=0.2, inplace=False)
    (rnn): RNN(32, 256, batch_first=True)
  )
  (decoder): Decoder(
    (embedding): Embedding(100, 32)
    (dropout): Dropout(p=0.2, inplace=False)
    (rnn): RNN(32, 256, num_layers=2, batch_first=True, dropout=0.2)
    (fc): Linear(in_features=256, out_features=100, bias=True)
  )
)
Epoch: 1
	Train_Loss: 2.986, Train_Accuracy: 0.00%
	Val_Loss: 3.112,  Val_Accuracy: 0.00%
Epoch: 2
	Train_Loss: 2.862, Train_Accuracy: 0.00%
	Val_Loss: 3.086,  Val_Accuracy: 0.00%
Epoch: 3
	Train_Loss: 2.822, Train_Accuracy: 0.00%
	Val_Loss: 3.110,  Val_Accuracy: 0.00%
Epoch: 4
	Train_Loss: 2.803, Train_Accuracy: 0.00%
	Val_Loss: 3.080,  Val_Accuracy: 0.00%
Epoch: 5
	Train_Loss: 2.782, Train_Accuracy: 0.00%
	Val_Loss: 3.062,  Val_Accuracy: 0.00%
Epoch: 6
	Train_Loss: 2.783, Train_Accuracy: 0.00%
	Val_Loss: 3.019,  Val_Accuracy: 0.00%
Epoch: 7
	Train_Loss: 2.779, Train_Accu

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
Val_Accuracy,▁▁▁▁▁▁▁▁▁▁
Val_Loss,█▆█▆▄▁▂▅▃▂
train_accuracy,▁▁▁▁▁▁▁▁▁▁
training_loss,█▄▃▃▂▂▂▁▁▁

0,1
Val_Accuracy,0.0
Val_Loss,3.03758
train_accuracy,0.0
training_loss,2.74968


[34m[1mwandb[0m: Agent Starting Run: 6yrtpwhl with config:
[34m[1mwandb[0m: 	bidirectional: False
[34m[1mwandb[0m: 	cell_type: lstm
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	hid_layer_size: 16
[34m[1mwandb[0m: 	input_embed_size: 256
[34m[1mwandb[0m: 	num_dec_layers: 2
[34m[1mwandb[0m: 	num_enc_layers: 1


Using device: cuda
Seq_to_Seq(
  (encoder): Encoder(
    (embedding): Embedding(100, 256)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): LSTM(256, 16, batch_first=True)
  )
  (decoder): Decoder(
    (embedding): Embedding(100, 256)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): LSTM(256, 16, num_layers=2, batch_first=True, dropout=0.3)
    (fc): Linear(in_features=16, out_features=100, bias=True)
  )
)
Epoch: 1
	Train_Loss: 3.410, Train_Accuracy: 0.00%
	Val_Loss: 3.162,  Val_Accuracy: 0.00%
Epoch: 2
	Train_Loss: 3.130, Train_Accuracy: 0.00%
	Val_Loss: 3.066,  Val_Accuracy: 0.00%
Epoch: 3
	Train_Loss: 3.023, Train_Accuracy: 0.00%
	Val_Loss: 3.071,  Val_Accuracy: 0.00%
Epoch: 4
	Train_Loss: 2.991, Train_Accuracy: 0.00%
	Val_Loss: 3.059,  Val_Accuracy: 0.00%
Epoch: 5
	Train_Loss: 2.972, Train_Accuracy: 0.00%
	Val_Loss: 3.033,  Val_Accuracy: 0.00%
Epoch: 6
	Train_Loss: 2.952, Train_Accuracy: 0.00%
	Val_Loss: 3.005,  Val_Accuracy: 0.00%
Epoch: 7
	Train_Loss: 2.933, Train_A

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
Val_Accuracy,▁▁▁▁▁▁▁▁▁▁
Val_Loss,█▅▅▅▄▃▃▂▁▁
train_accuracy,▁▁█▁▁▁█▁▁▁
training_loss,█▄▃▂▂▂▂▁▁▁

0,1
Val_Accuracy,0.0
Val_Loss,2.96336
train_accuracy,0.0
training_loss,2.89434


[34m[1mwandb[0m: Agent Starting Run: wdje9s51 with config:
[34m[1mwandb[0m: 	bidirectional: False
[34m[1mwandb[0m: 	cell_type: gru
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	hid_layer_size: 64
[34m[1mwandb[0m: 	input_embed_size: 16
[34m[1mwandb[0m: 	num_dec_layers: 1
[34m[1mwandb[0m: 	num_enc_layers: 3


Using device: cuda
Seq_to_Seq(
  (encoder): Encoder(
    (embedding): Embedding(100, 16)
    (dropout): Dropout(p=0.2, inplace=False)
    (rnn): GRU(16, 64, num_layers=3, batch_first=True, dropout=0.2)
  )
  (decoder): Decoder(
    (embedding): Embedding(100, 16)
    (dropout): Dropout(p=0.2, inplace=False)
    (rnn): GRU(16, 64, batch_first=True)
    (fc): Linear(in_features=64, out_features=100, bias=True)
  )
)
Epoch: 1
	Train_Loss: 3.076, Train_Accuracy: 0.00%
	Val_Loss: 3.002,  Val_Accuracy: 0.00%
Epoch: 2
	Train_Loss: 2.728, Train_Accuracy: 0.00%
	Val_Loss: 2.741,  Val_Accuracy: 0.02%
Epoch: 3
	Train_Loss: 2.514, Train_Accuracy: 0.01%
	Val_Loss: 2.520,  Val_Accuracy: 0.15%
Epoch: 4
	Train_Loss: 2.319, Train_Accuracy: 0.05%
	Val_Loss: 2.337,  Val_Accuracy: 0.54%
Epoch: 5
	Train_Loss: 2.186, Train_Accuracy: 0.17%
	Val_Loss: 2.218,  Val_Accuracy: 1.15%
Epoch: 6
	Train_Loss: 2.064, Train_Accuracy: 0.34%
	Val_Loss: 2.068,  Val_Accuracy: 1.88%
Epoch: 7
	Train_Loss: 1.967, Train_Accurac

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
Val_Accuracy,▁▁▁▂▃▄▅▅▆█
Val_Loss,█▆▅▄▃▃▂▂▁▁
train_accuracy,▁▁▁▁▂▃▄▅▇█
training_loss,█▆▅▄▃▃▂▂▁▁

0,1
Val_Accuracy,5.00488
Val_Loss,1.78904
train_accuracy,1.33594
training_loss,1.77974


[34m[1mwandb[0m: Agent Starting Run: htedkq6n with config:
[34m[1mwandb[0m: 	bidirectional: False
[34m[1mwandb[0m: 	cell_type: lstm
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	hid_layer_size: 256
[34m[1mwandb[0m: 	input_embed_size: 32
[34m[1mwandb[0m: 	num_dec_layers: 1
[34m[1mwandb[0m: 	num_enc_layers: 2


Using device: cuda
Seq_to_Seq(
  (encoder): Encoder(
    (embedding): Embedding(100, 32)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): LSTM(32, 256, num_layers=2, batch_first=True, dropout=0.3)
  )
  (decoder): Decoder(
    (embedding): Embedding(100, 32)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): LSTM(32, 256, batch_first=True)
    (fc): Linear(in_features=256, out_features=100, bias=True)
  )
)
Epoch: 1
	Train_Loss: 2.862, Train_Accuracy: 0.00%
	Val_Loss: 2.580,  Val_Accuracy: 0.02%
Epoch: 2
	Train_Loss: 1.985, Train_Accuracy: 0.61%
	Val_Loss: 1.717,  Val_Accuracy: 6.01%
Epoch: 3
	Train_Loss: 1.461, Train_Accuracy: 2.81%
	Val_Loss: 1.461,  Val_Accuracy: 8.91%
Epoch: 4
	Train_Loss: 1.228, Train_Accuracy: 5.52%
	Val_Loss: 1.398,  Val_Accuracy: 11.40%
Epoch: 5
	Train_Loss: 1.092, Train_Accuracy: 8.25%
	Val_Loss: 1.299,  Val_Accuracy: 16.04%
Epoch: 6
	Train_Loss: 0.992, Train_Accuracy: 9.79%
	Val_Loss: 1.277,  Val_Accuracy: 17.07%
Epoch: 7
	Train_Loss: 0.928, Train

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
Val_Accuracy,▁▃▄▅▆▆█▇██
Val_Loss,█▄▂▂▁▁▁▁▁▁
train_accuracy,▁▁▂▄▅▆▇▇▇█
training_loss,█▅▃▂▂▂▁▁▁▁

0,1
Val_Accuracy,21.80176
Val_Loss,1.20314
train_accuracy,14.59375
training_loss,0.78428


[34m[1mwandb[0m: Agent Starting Run: vt24nyl5 with config:
[34m[1mwandb[0m: 	bidirectional: False
[34m[1mwandb[0m: 	cell_type: rnn
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	hid_layer_size: 256
[34m[1mwandb[0m: 	input_embed_size: 32
[34m[1mwandb[0m: 	num_dec_layers: 3
[34m[1mwandb[0m: 	num_enc_layers: 1


Using device: cuda
Seq_to_Seq(
  (encoder): Encoder(
    (embedding): Embedding(100, 32)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): RNN(32, 256, batch_first=True)
  )
  (decoder): Decoder(
    (embedding): Embedding(100, 32)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): RNN(32, 256, num_layers=3, batch_first=True, dropout=0.3)
    (fc): Linear(in_features=256, out_features=100, bias=True)
  )
)
Epoch: 1
	Train_Loss: 3.032, Train_Accuracy: 0.00%
	Val_Loss: 3.126,  Val_Accuracy: 0.00%
Epoch: 2
	Train_Loss: 2.895, Train_Accuracy: 0.00%
	Val_Loss: 3.079,  Val_Accuracy: 0.00%
Epoch: 3
	Train_Loss: 2.880, Train_Accuracy: 0.00%
	Val_Loss: 3.046,  Val_Accuracy: 0.00%
Epoch: 4
	Train_Loss: 2.844, Train_Accuracy: 0.00%
	Val_Loss: 3.047,  Val_Accuracy: 0.00%
Epoch: 5
	Train_Loss: 2.829, Train_Accuracy: 0.00%
	Val_Loss: 3.028,  Val_Accuracy: 0.00%
Epoch: 6
	Train_Loss: 2.814, Train_Accuracy: 0.00%
	Val_Loss: 3.028,  Val_Accuracy: 0.00%
Epoch: 7
	Train_Loss: 2.813, Train_Accu

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
Val_Accuracy,▁▁▁▁▁▁▁▁▁▁
Val_Loss,█▅▃▃▂▂▆▁▄▃
train_accuracy,▁▁▁▁▁▁▁▁▁▁
training_loss,█▄▃▂▂▁▁▁▁▁

0,1
Val_Accuracy,0.0
Val_Loss,3.04821
train_accuracy,0.0
training_loss,2.80405


[34m[1mwandb[0m: Agent Starting Run: p86lg6ni with config:
[34m[1mwandb[0m: 	bidirectional: False
[34m[1mwandb[0m: 	cell_type: rnn
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	hid_layer_size: 256
[34m[1mwandb[0m: 	input_embed_size: 32
[34m[1mwandb[0m: 	num_dec_layers: 3
[34m[1mwandb[0m: 	num_enc_layers: 3


Using device: cuda
Seq_to_Seq(
  (encoder): Encoder(
    (embedding): Embedding(100, 32)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): RNN(32, 256, num_layers=3, batch_first=True, dropout=0.3)
  )
  (decoder): Decoder(
    (embedding): Embedding(100, 32)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): RNN(32, 256, num_layers=3, batch_first=True, dropout=0.3)
    (fc): Linear(in_features=256, out_features=100, bias=True)
  )
)
Epoch: 1
	Train_Loss: 3.041, Train_Accuracy: 0.00%
	Val_Loss: 3.083,  Val_Accuracy: 0.00%
Epoch: 2
	Train_Loss: 2.902, Train_Accuracy: 0.00%
	Val_Loss: 3.078,  Val_Accuracy: 0.00%
Epoch: 3
	Train_Loss: 2.854, Train_Accuracy: 0.00%
	Val_Loss: 3.084,  Val_Accuracy: 0.00%
Epoch: 4
	Train_Loss: 2.836, Train_Accuracy: 0.00%
	Val_Loss: 3.042,  Val_Accuracy: 0.00%
Epoch: 5
	Train_Loss: 2.827, Train_Accuracy: 0.00%
	Val_Loss: 3.078,  Val_Accuracy: 0.00%
Epoch: 6
	Train_Loss: 2.824, Train_Accuracy: 0.00%
	Val_Loss: 3.066,  Val_Accuracy: 0.00%
Epoch: 7
	Tr

VBox(children=(Label(value='0.019 MB of 0.019 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
Val_Accuracy,▁▁▁▁▁▁▁▁▁▁
Val_Loss,█▇█▄▇▆▆▁▄▃
train_accuracy,▁▁▁▁▁▁█▁▁▁
training_loss,█▄▂▂▂▂▁▁▁▁

0,1
Val_Accuracy,0.0
Val_Loss,3.03717
train_accuracy,0.0
training_loss,2.8045


[34m[1mwandb[0m: Agent Starting Run: tmihtfhy with config:
[34m[1mwandb[0m: 	bidirectional: False
[34m[1mwandb[0m: 	cell_type: lstm
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	hid_layer_size: 256
[34m[1mwandb[0m: 	input_embed_size: 64
[34m[1mwandb[0m: 	num_dec_layers: 1
[34m[1mwandb[0m: 	num_enc_layers: 1


Using device: cuda
Seq_to_Seq(
  (encoder): Encoder(
    (embedding): Embedding(100, 64)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): LSTM(64, 256, batch_first=True)
  )
  (decoder): Decoder(
    (embedding): Embedding(100, 64)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): LSTM(64, 256, batch_first=True)
    (fc): Linear(in_features=256, out_features=100, bias=True)
  )
)
Epoch: 1
	Train_Loss: 2.710, Train_Accuracy: 0.02%
	Val_Loss: 2.317,  Val_Accuracy: 1.22%
Epoch: 2
	Train_Loss: 1.759, Train_Accuracy: 1.40%
	Val_Loss: 1.622,  Val_Accuracy: 8.74%
Epoch: 3
	Train_Loss: 1.357, Train_Accuracy: 2.53%
	Val_Loss: 1.433,  Val_Accuracy: 12.08%
Epoch: 4
	Train_Loss: 1.158, Train_Accuracy: 5.85%
	Val_Loss: 1.355,  Val_Accuracy: 17.58%
Epoch: 5
	Train_Loss: 1.043, Train_Accuracy: 6.62%
	Val_Loss: 1.312,  Val_Accuracy: 18.46%
Epoch: 6
	Train_Loss: 0.949, Train_Accuracy: 9.18%
	Val_Loss: 1.286,  Val_Accuracy: 17.72%
Epoch: 7
	Train_Loss: 0.888, Train_Accuracy: 8.94%
	Val_Loss

VBox(children=(Label(value='0.001 MB of 0.019 MB uploaded\r'), FloatProgress(value=0.07075543782792522, max=1.…

0,1
Val_Accuracy,▁▃▄▆▆▆▇▇██
Val_Loss,█▄▂▂▂▂▁▁▁▁
train_accuracy,▁▂▃▅▆▇▇██▇
training_loss,█▅▃▂▂▂▁▁▁▁

0,1
Val_Accuracy,23.21777
Val_Loss,1.19794
train_accuracy,8.64453
training_loss,0.74878


[34m[1mwandb[0m: Agent Starting Run: va23t941 with config:
[34m[1mwandb[0m: 	bidirectional: False
[34m[1mwandb[0m: 	cell_type: rnn
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	hid_layer_size: 512
[34m[1mwandb[0m: 	input_embed_size: 64
[34m[1mwandb[0m: 	num_dec_layers: 1
[34m[1mwandb[0m: 	num_enc_layers: 2


Using device: cuda
Seq_to_Seq(
  (encoder): Encoder(
    (embedding): Embedding(100, 64)
    (dropout): Dropout(p=0.2, inplace=False)
    (rnn): RNN(64, 512, num_layers=2, batch_first=True, dropout=0.2)
  )
  (decoder): Decoder(
    (embedding): Embedding(100, 64)
    (dropout): Dropout(p=0.2, inplace=False)
    (rnn): RNN(64, 512, batch_first=True)
    (fc): Linear(in_features=512, out_features=100, bias=True)
  )
)
Epoch: 1
	Train_Loss: 2.974, Train_Accuracy: 0.00%
	Val_Loss: 3.279,  Val_Accuracy: 0.00%
Epoch: 2
	Train_Loss: 2.889, Train_Accuracy: 0.00%
	Val_Loss: 3.139,  Val_Accuracy: 0.00%
Epoch: 3
	Train_Loss: 2.858, Train_Accuracy: 0.00%
	Val_Loss: 3.113,  Val_Accuracy: 0.00%
Epoch: 4
	Train_Loss: 2.850, Train_Accuracy: 0.00%
	Val_Loss: 3.081,  Val_Accuracy: 0.00%
Epoch: 5
	Train_Loss: 2.840, Train_Accuracy: 0.00%
	Val_Loss: 3.115,  Val_Accuracy: 0.00%
Epoch: 6
	Train_Loss: 2.837, Train_Accuracy: 0.00%
	Val_Loss: 3.103,  Val_Accuracy: 0.00%
Epoch: 7
	Train_Loss: 2.839, Train_Accu

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
Val_Accuracy,▁▁▁▁▁▁▁▁▁▁
Val_Loss,█▃▂▁▂▂▃▂▃▂
train_accuracy,▁▁▁▁▁▁▁▁▁▁
training_loss,█▄▂▂▁▁▁▁▁▁

0,1
Val_Accuracy,0.0
Val_Loss,3.09946
train_accuracy,0.0
training_loss,2.83296


[34m[1mwandb[0m: Agent Starting Run: 4vwbpn0b with config:
[34m[1mwandb[0m: 	bidirectional: False
[34m[1mwandb[0m: 	cell_type: rnn
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	hid_layer_size: 16
[34m[1mwandb[0m: 	input_embed_size: 16
[34m[1mwandb[0m: 	num_dec_layers: 2
[34m[1mwandb[0m: 	num_enc_layers: 2


Using device: cuda
Seq_to_Seq(
  (encoder): Encoder(
    (embedding): Embedding(100, 16)
    (dropout): Dropout(p=0.2, inplace=False)
    (rnn): RNN(16, 16, num_layers=2, batch_first=True, dropout=0.2)
  )
  (decoder): Decoder(
    (embedding): Embedding(100, 16)
    (dropout): Dropout(p=0.2, inplace=False)
    (rnn): RNN(16, 16, num_layers=2, batch_first=True, dropout=0.2)
    (fc): Linear(in_features=16, out_features=100, bias=True)
  )
)
Epoch: 1
	Train_Loss: 3.312, Train_Accuracy: 0.00%
	Val_Loss: 3.117,  Val_Accuracy: 0.00%
Epoch: 2
	Train_Loss: 3.050, Train_Accuracy: 0.00%
	Val_Loss: 3.158,  Val_Accuracy: 0.00%
Epoch: 3
	Train_Loss: 3.010, Train_Accuracy: 0.00%
	Val_Loss: 3.208,  Val_Accuracy: 0.00%
Epoch: 4
	Train_Loss: 2.989, Train_Accuracy: 0.00%
	Val_Loss: 3.161,  Val_Accuracy: 0.00%
Epoch: 5
	Train_Loss: 2.982, Train_Accuracy: 0.00%
	Val_Loss: 3.131,  Val_Accuracy: 0.00%
Epoch: 6
	Train_Loss: 2.970, Train_Accuracy: 0.00%
	Val_Loss: 3.042,  Val_Accuracy: 0.00%
Epoch: 7
	Train

VBox(children=(Label(value='0.019 MB of 0.019 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
Val_Accuracy,▁▁▁▁▁▁▁▁▁▁
Val_Loss,▅▆█▆▅▃▂▁▃▂
train_accuracy,▁▁▁▁▁█▁▁▁▁
training_loss,█▃▃▂▂▂▂▁▁▁

0,1
Val_Accuracy,0.0
Val_Loss,3.011
train_accuracy,0.0
training_loss,2.92291


[34m[1mwandb[0m: Agent Starting Run: 2f34auc4 with config:
[34m[1mwandb[0m: 	bidirectional: False
[34m[1mwandb[0m: 	cell_type: lstm
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	hid_layer_size: 64
[34m[1mwandb[0m: 	input_embed_size: 512
[34m[1mwandb[0m: 	num_dec_layers: 3
[34m[1mwandb[0m: 	num_enc_layers: 3


Using device: cuda
Seq_to_Seq(
  (encoder): Encoder(
    (embedding): Embedding(100, 512)
    (dropout): Dropout(p=0.2, inplace=False)
    (rnn): LSTM(512, 64, num_layers=3, batch_first=True, dropout=0.2)
  )
  (decoder): Decoder(
    (embedding): Embedding(100, 512)
    (dropout): Dropout(p=0.2, inplace=False)
    (rnn): LSTM(512, 64, num_layers=3, batch_first=True, dropout=0.2)
    (fc): Linear(in_features=64, out_features=100, bias=True)
  )
)
Epoch: 1
	Train_Loss: 3.048, Train_Accuracy: 0.00%
	Val_Loss: 2.898,  Val_Accuracy: 0.00%
Epoch: 2
	Train_Loss: 2.520, Train_Accuracy: 0.02%
	Val_Loss: 2.346,  Val_Accuracy: 0.83%
Epoch: 3
	Train_Loss: 2.112, Train_Accuracy: 0.25%
	Val_Loss: 1.962,  Val_Accuracy: 2.54%
Epoch: 4
	Train_Loss: 1.832, Train_Accuracy: 1.01%
	Val_Loss: 1.751,  Val_Accuracy: 5.74%
Epoch: 5
	Train_Loss: 1.641, Train_Accuracy: 1.28%
	Val_Loss: 1.648,  Val_Accuracy: 7.45%
Epoch: 6
	Train_Loss: 1.537, Train_Accuracy: 1.30%
	Val_Loss: 1.581,  Val_Accuracy: 8.57%
Epoch: 7


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
Val_Accuracy,▁▁▂▄▅▆▆▆▇█
Val_Loss,█▅▄▃▂▂▁▁▁▁
train_accuracy,▁▁▂▄▅▅▄▄▆█
training_loss,█▆▄▃▂▂▂▁▁▁

0,1
Val_Accuracy,12.98828
Val_Loss,1.43284
train_accuracy,2.36523
training_loss,1.29774


[34m[1mwandb[0m: Agent Starting Run: 6ai9cpqm with config:
[34m[1mwandb[0m: 	bidirectional: False
[34m[1mwandb[0m: 	cell_type: lstm
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	hid_layer_size: 256
[34m[1mwandb[0m: 	input_embed_size: 32
[34m[1mwandb[0m: 	num_dec_layers: 1
[34m[1mwandb[0m: 	num_enc_layers: 3


Using device: cuda
Seq_to_Seq(
  (encoder): Encoder(
    (embedding): Embedding(100, 32)
    (dropout): Dropout(p=0.2, inplace=False)
    (rnn): LSTM(32, 256, num_layers=3, batch_first=True, dropout=0.2)
  )
  (decoder): Decoder(
    (embedding): Embedding(100, 32)
    (dropout): Dropout(p=0.2, inplace=False)
    (rnn): LSTM(32, 256, batch_first=True)
    (fc): Linear(in_features=256, out_features=100, bias=True)
  )
)
Epoch: 1
	Train_Loss: 2.860, Train_Accuracy: 0.00%
	Val_Loss: 2.639,  Val_Accuracy: 0.15%
Epoch: 2
	Train_Loss: 2.018, Train_Accuracy: 0.54%
	Val_Loss: 1.792,  Val_Accuracy: 4.22%
Epoch: 3
	Train_Loss: 1.460, Train_Accuracy: 3.13%
	Val_Loss: 1.527,  Val_Accuracy: 7.59%
Epoch: 4
	Train_Loss: 1.205, Train_Accuracy: 7.00%
	Val_Loss: 1.413,  Val_Accuracy: 14.23%
Epoch: 5
	Train_Loss: 1.057, Train_Accuracy: 9.66%
	Val_Loss: 1.345,  Val_Accuracy: 15.55%
Epoch: 6
	Train_Loss: 0.953, Train_Accuracy: 10.30%
	Val_Loss: 1.275,  Val_Accuracy: 15.99%
Epoch: 7
	Train_Loss: 0.878, Trai

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
Val_Accuracy,▁▂▃▅▆▆██▇▄
Val_Loss,█▄▃▂▂▁▁▁▁▁
train_accuracy,▁▁▃▅▆▇▇█▇▇
training_loss,█▅▃▃▂▂▁▁▁▁

0,1
Val_Accuracy,9.74121
Val_Loss,1.20658
train_accuracy,11.56641
training_loss,0.73327


[34m[1mwandb[0m: Agent Starting Run: 0bcntuv8 with config:
[34m[1mwandb[0m: 	bidirectional: False
[34m[1mwandb[0m: 	cell_type: rnn
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	hid_layer_size: 64
[34m[1mwandb[0m: 	input_embed_size: 512
[34m[1mwandb[0m: 	num_dec_layers: 3
[34m[1mwandb[0m: 	num_enc_layers: 2


Using device: cuda
Seq_to_Seq(
  (encoder): Encoder(
    (embedding): Embedding(100, 512)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): RNN(512, 64, num_layers=2, batch_first=True, dropout=0.3)
  )
  (decoder): Decoder(
    (embedding): Embedding(100, 512)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): RNN(512, 64, num_layers=3, batch_first=True, dropout=0.3)
    (fc): Linear(in_features=64, out_features=100, bias=True)
  )
)
Epoch: 1
	Train_Loss: 3.090, Train_Accuracy: 0.00%
	Val_Loss: 3.189,  Val_Accuracy: 0.00%
Epoch: 2
	Train_Loss: 2.952, Train_Accuracy: 0.00%
	Val_Loss: 3.097,  Val_Accuracy: 0.00%
Epoch: 3
	Train_Loss: 2.934, Train_Accuracy: 0.00%
	Val_Loss: 3.172,  Val_Accuracy: 0.00%
Epoch: 4
	Train_Loss: 2.917, Train_Accuracy: 0.00%
	Val_Loss: 3.154,  Val_Accuracy: 0.00%
Epoch: 5
	Train_Loss: 2.914, Train_Accuracy: 0.00%
	Val_Loss: 3.144,  Val_Accuracy: 0.00%
Epoch: 6
	Train_Loss: 2.902, Train_Accuracy: 0.00%
	Val_Loss: 3.071,  Val_Accuracy: 0.00%
Epoch: 7
	T

VBox(children=(Label(value='0.001 MB of 0.019 MB uploaded\r'), FloatProgress(value=0.07145429291630227, max=1.…

0,1
Val_Accuracy,▁▁▁▁▁▁▁▁▁▁
Val_Loss,█▅▇▇▆▄▄▃▁▃
train_accuracy,▁▁▁▁▁▁▁▁▁▁
training_loss,█▄▃▃▃▂▂▂▁▁

0,1
Val_Accuracy,0.0
Val_Loss,3.04827
train_accuracy,0.0
training_loss,2.85306


[34m[1mwandb[0m: Agent Starting Run: psxz6jlg with config:
[34m[1mwandb[0m: 	bidirectional: False
[34m[1mwandb[0m: 	cell_type: lstm
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	hid_layer_size: 64
[34m[1mwandb[0m: 	input_embed_size: 512
[34m[1mwandb[0m: 	num_dec_layers: 1
[34m[1mwandb[0m: 	num_enc_layers: 3


Using device: cuda
Seq_to_Seq(
  (encoder): Encoder(
    (embedding): Embedding(100, 512)
    (dropout): Dropout(p=0.2, inplace=False)
    (rnn): LSTM(512, 64, num_layers=3, batch_first=True, dropout=0.2)
  )
  (decoder): Decoder(
    (embedding): Embedding(100, 512)
    (dropout): Dropout(p=0.2, inplace=False)
    (rnn): LSTM(512, 64, batch_first=True)
    (fc): Linear(in_features=64, out_features=100, bias=True)
  )
)
Epoch: 1
	Train_Loss: 2.938, Train_Accuracy: 0.00%
	Val_Loss: 2.902,  Val_Accuracy: 0.00%
Epoch: 2
	Train_Loss: 2.615, Train_Accuracy: 0.01%
	Val_Loss: 2.681,  Val_Accuracy: 0.22%
Epoch: 3
	Train_Loss: 2.441, Train_Accuracy: 0.03%
	Val_Loss: 2.508,  Val_Accuracy: 0.66%
Epoch: 4
	Train_Loss: 2.328, Train_Accuracy: 0.12%
	Val_Loss: 2.396,  Val_Accuracy: 0.98%
Epoch: 5
	Train_Loss: 2.244, Train_Accuracy: 0.16%
	Val_Loss: 2.341,  Val_Accuracy: 1.59%
Epoch: 6
	Train_Loss: 2.178, Train_Accuracy: 0.23%
	Val_Loss: 2.279,  Val_Accuracy: 1.76%
Epoch: 7
	Train_Loss: 2.126, Train_A

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
Val_Accuracy,▁▂▃▄▅▆▆▇██
Val_Loss,█▆▅▄▃▃▂▂▁▁
train_accuracy,▁▁▁▂▃▄▆▆▆█
training_loss,█▆▄▃▃▂▂▂▁▁

0,1
Val_Accuracy,2.63672
Val_Loss,2.09588
train_accuracy,0.59961
training_loss,2.0165


[34m[1mwandb[0m: Agent Starting Run: vw7b8mbw with config:
[34m[1mwandb[0m: 	bidirectional: False
[34m[1mwandb[0m: 	cell_type: rnn
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	hid_layer_size: 512
[34m[1mwandb[0m: 	input_embed_size: 32
[34m[1mwandb[0m: 	num_dec_layers: 3
[34m[1mwandb[0m: 	num_enc_layers: 1


Using device: cuda
Seq_to_Seq(
  (encoder): Encoder(
    (embedding): Embedding(100, 32)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): RNN(32, 512, batch_first=True)
  )
  (decoder): Decoder(
    (embedding): Embedding(100, 32)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): RNN(32, 512, num_layers=3, batch_first=True, dropout=0.3)
    (fc): Linear(in_features=512, out_features=100, bias=True)
  )
)
Epoch: 1
	Train_Loss: 3.070, Train_Accuracy: 0.00%
	Val_Loss: 3.124,  Val_Accuracy: 0.00%
Epoch: 2
	Train_Loss: 2.938, Train_Accuracy: 0.00%
	Val_Loss: 3.172,  Val_Accuracy: 0.00%
Epoch: 3
	Train_Loss: 2.931, Train_Accuracy: 0.00%
	Val_Loss: 3.101,  Val_Accuracy: 0.00%
Epoch: 4
	Train_Loss: 2.900, Train_Accuracy: 0.00%
	Val_Loss: 3.167,  Val_Accuracy: 0.00%
Epoch: 5
	Train_Loss: 2.891, Train_Accuracy: 0.00%
	Val_Loss: 3.072,  Val_Accuracy: 0.00%
Epoch: 6
	Train_Loss: 2.887, Train_Accuracy: 0.00%
	Val_Loss: 3.104,  Val_Accuracy: 0.00%
Epoch: 7
	Train_Loss: 2.883, Train_Accu

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
Val_Accuracy,▁▁▁▁▁▁▁▁▁▁
Val_Loss,▅█▃█▁▃▁▂▂▃
train_accuracy,▁▁▁▁▁▁▁▁▁█
training_loss,█▃▃▂▂▂▂▁▁▁

0,1
Val_Accuracy,0.0
Val_Loss,3.09467
train_accuracy,0.00195
training_loss,2.86835


[34m[1mwandb[0m: Agent Starting Run: j2cdygay with config:
[34m[1mwandb[0m: 	bidirectional: False
[34m[1mwandb[0m: 	cell_type: rnn
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	hid_layer_size: 256
[34m[1mwandb[0m: 	input_embed_size: 512
[34m[1mwandb[0m: 	num_dec_layers: 1
[34m[1mwandb[0m: 	num_enc_layers: 3


Using device: cuda
Seq_to_Seq(
  (encoder): Encoder(
    (embedding): Embedding(100, 512)
    (dropout): Dropout(p=0.2, inplace=False)
    (rnn): RNN(512, 256, num_layers=3, batch_first=True, dropout=0.2)
  )
  (decoder): Decoder(
    (embedding): Embedding(100, 512)
    (dropout): Dropout(p=0.2, inplace=False)
    (rnn): RNN(512, 256, batch_first=True)
    (fc): Linear(in_features=256, out_features=100, bias=True)
  )
)
Epoch: 1
	Train_Loss: 2.957, Train_Accuracy: 0.00%
	Val_Loss: 3.197,  Val_Accuracy: 0.00%
Epoch: 2
	Train_Loss: 2.895, Train_Accuracy: 0.00%
	Val_Loss: 3.127,  Val_Accuracy: 0.00%
Epoch: 3
	Train_Loss: 2.877, Train_Accuracy: 0.00%
	Val_Loss: 3.192,  Val_Accuracy: 0.00%
Epoch: 4
	Train_Loss: 2.867, Train_Accuracy: 0.00%
	Val_Loss: 3.133,  Val_Accuracy: 0.00%
Epoch: 5
	Train_Loss: 2.862, Train_Accuracy: 0.00%
	Val_Loss: 3.127,  Val_Accuracy: 0.00%
Epoch: 6
	Train_Loss: 2.855, Train_Accuracy: 0.00%
	Val_Loss: 3.198,  Val_Accuracy: 0.00%
Epoch: 7
	Train_Loss: 2.857, Train_

VBox(children=(Label(value='0.001 MB of 0.019 MB uploaded\r'), FloatProgress(value=0.0715389369592089, max=1.0…

0,1
Val_Accuracy,▁▁▁▁▁▁▁▁▁▁
Val_Loss,█▂▇▃▂█▇▅▅▁
train_accuracy,▁▁▁▁▁▁▁▁▁▁
training_loss,█▄▃▂▂▂▂▂▂▁

0,1
Val_Accuracy,0.0
Val_Loss,3.10962
train_accuracy,0.0
training_loss,2.84604


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 97d66w7n with config:
[34m[1mwandb[0m: 	bidirectional: False
[34m[1mwandb[0m: 	cell_type: rnn
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	hid_layer_size: 16
[34m[1mwandb[0m: 	input_embed_size: 256
[34m[1mwandb[0m: 	num_dec_layers: 3
[34m[1mwandb[0m: 	num_enc_layers: 2


Using device: cuda
Seq_to_Seq(
  (encoder): Encoder(
    (embedding): Embedding(100, 256)
    (dropout): Dropout(p=0.2, inplace=False)
    (rnn): RNN(256, 16, num_layers=2, batch_first=True, dropout=0.2)
  )
  (decoder): Decoder(
    (embedding): Embedding(100, 256)
    (dropout): Dropout(p=0.2, inplace=False)
    (rnn): RNN(256, 16, num_layers=3, batch_first=True, dropout=0.2)
    (fc): Linear(in_features=16, out_features=100, bias=True)
  )
)
Epoch: 1
	Train_Loss: 3.374, Train_Accuracy: 0.00%
	Val_Loss: 3.170,  Val_Accuracy: 0.00%
Epoch: 2
	Train_Loss: 3.075, Train_Accuracy: 0.00%
	Val_Loss: 3.149,  Val_Accuracy: 0.00%
Epoch: 3
	Train_Loss: 3.018, Train_Accuracy: 0.00%
	Val_Loss: 3.244,  Val_Accuracy: 0.00%
Epoch: 4
	Train_Loss: 2.994, Train_Accuracy: 0.00%
	Val_Loss: 3.136,  Val_Accuracy: 0.00%
Epoch: 5
	Train_Loss: 2.975, Train_Accuracy: 0.00%
	Val_Loss: 3.206,  Val_Accuracy: 0.00%
Epoch: 6
	Train_Loss: 2.961, Train_Accuracy: 0.00%
	Val_Loss: 3.101,  Val_Accuracy: 0.00%
Epoch: 7
	T

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
Val_Accuracy,▁▁▁▁▁▁▁▁▁▁
Val_Loss,▅▅█▄▇▃▄▃▁▁
train_accuracy,▁▁▁▁▁█▁█▁▁
training_loss,█▃▂▂▂▁▁▁▁▁

0,1
Val_Accuracy,0.0
Val_Loss,3.04685
train_accuracy,0.0
training_loss,2.93012


[34m[1mwandb[0m: Agent Starting Run: v7cmtoae with config:
[34m[1mwandb[0m: 	bidirectional: False
[34m[1mwandb[0m: 	cell_type: rnn
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	hid_layer_size: 32
[34m[1mwandb[0m: 	input_embed_size: 16
[34m[1mwandb[0m: 	num_dec_layers: 2
[34m[1mwandb[0m: 	num_enc_layers: 1


Using device: cuda
Seq_to_Seq(
  (encoder): Encoder(
    (embedding): Embedding(100, 16)
    (dropout): Dropout(p=0.2, inplace=False)
    (rnn): RNN(16, 32, batch_first=True)
  )
  (decoder): Decoder(
    (embedding): Embedding(100, 16)
    (dropout): Dropout(p=0.2, inplace=False)
    (rnn): RNN(16, 32, num_layers=2, batch_first=True, dropout=0.2)
    (fc): Linear(in_features=32, out_features=100, bias=True)
  )
)
Epoch: 1
	Train_Loss: 3.180, Train_Accuracy: 0.00%
	Val_Loss: 3.105,  Val_Accuracy: 0.00%
Epoch: 2
	Train_Loss: 2.978, Train_Accuracy: 0.00%
	Val_Loss: 3.014,  Val_Accuracy: 0.00%
Epoch: 3
	Train_Loss: 2.926, Train_Accuracy: 0.00%
	Val_Loss: 3.048,  Val_Accuracy: 0.00%
Epoch: 4
	Train_Loss: 2.900, Train_Accuracy: 0.00%
	Val_Loss: 3.016,  Val_Accuracy: 0.00%
Epoch: 5
	Train_Loss: 2.887, Train_Accuracy: 0.00%
	Val_Loss: 3.034,  Val_Accuracy: 0.00%
Epoch: 6
	Train_Loss: 2.880, Train_Accuracy: 0.00%
	Val_Loss: 3.010,  Val_Accuracy: 0.00%
Epoch: 7
	Train_Loss: 2.864, Train_Accurac

VBox(children=(Label(value='0.001 MB of 0.019 MB uploaded\r'), FloatProgress(value=0.07076985784888164, max=1.…

0,1
Val_Accuracy,▁▁▁▁▁▁▁▁▁▁
Val_Loss,█▃▅▃▄▂▂▅▄▁
train_accuracy,█▁▁▁▁▁█▁▁▁
training_loss,█▄▃▂▂▂▁▁▁▁

0,1
Val_Accuracy,0.0
Val_Loss,2.98698
train_accuracy,0.0
training_loss,2.85346


[34m[1mwandb[0m: Agent Starting Run: y8eamp06 with config:
[34m[1mwandb[0m: 	bidirectional: False
[34m[1mwandb[0m: 	cell_type: gru
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	hid_layer_size: 64
[34m[1mwandb[0m: 	input_embed_size: 512
[34m[1mwandb[0m: 	num_dec_layers: 2
[34m[1mwandb[0m: 	num_enc_layers: 3


Using device: cuda
Seq_to_Seq(
  (encoder): Encoder(
    (embedding): Embedding(100, 512)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): GRU(512, 64, num_layers=3, batch_first=True, dropout=0.3)
  )
  (decoder): Decoder(
    (embedding): Embedding(100, 512)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): GRU(512, 64, num_layers=2, batch_first=True, dropout=0.3)
    (fc): Linear(in_features=64, out_features=100, bias=True)
  )
)
Epoch: 1
	Train_Loss: 2.893, Train_Accuracy: 0.01%
	Val_Loss: 2.530,  Val_Accuracy: 0.20%
Epoch: 2
	Train_Loss: 2.330, Train_Accuracy: 0.07%
	Val_Loss: 2.171,  Val_Accuracy: 1.07%
Epoch: 3
	Train_Loss: 2.096, Train_Accuracy: 0.27%
	Val_Loss: 1.966,  Val_Accuracy: 2.56%
Epoch: 4
	Train_Loss: 1.952, Train_Accuracy: 0.51%
	Val_Loss: 1.841,  Val_Accuracy: 3.71%
Epoch: 5
	Train_Loss: 1.845, Train_Accuracy: 0.59%
	Val_Loss: 1.749,  Val_Accuracy: 4.42%
Epoch: 6
	Train_Loss: 1.769, Train_Accuracy: 0.88%
	Val_Loss: 1.678,  Val_Accuracy: 5.54%
Epoch: 7
	T

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
Val_Accuracy,▁▂▃▄▅▆▇▆██
Val_Loss,█▅▄▃▂▂▁▁▁▁
train_accuracy,▁▁▂▃▄▅▆███
training_loss,█▅▄▃▂▂▂▁▁▁

0,1
Val_Accuracy,8.30078
Val_Loss,1.56981
train_accuracy,1.54883
training_loss,1.57357


[34m[1mwandb[0m: Agent Starting Run: xfdionq8 with config:
[34m[1mwandb[0m: 	bidirectional: False
[34m[1mwandb[0m: 	cell_type: rnn
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	hid_layer_size: 512
[34m[1mwandb[0m: 	input_embed_size: 16
[34m[1mwandb[0m: 	num_dec_layers: 3
[34m[1mwandb[0m: 	num_enc_layers: 2


Using device: cuda
Seq_to_Seq(
  (encoder): Encoder(
    (embedding): Embedding(100, 16)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): RNN(16, 512, num_layers=2, batch_first=True, dropout=0.3)
  )
  (decoder): Decoder(
    (embedding): Embedding(100, 16)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): RNN(16, 512, num_layers=3, batch_first=True, dropout=0.3)
    (fc): Linear(in_features=512, out_features=100, bias=True)
  )
)
Epoch: 1
	Train_Loss: 3.084, Train_Accuracy: 0.00%
	Val_Loss: 3.099,  Val_Accuracy: 0.00%
Epoch: 2
	Train_Loss: 2.965, Train_Accuracy: 0.00%
	Val_Loss: 3.090,  Val_Accuracy: 0.00%
Epoch: 3
	Train_Loss: 2.944, Train_Accuracy: 0.00%
	Val_Loss: 3.164,  Val_Accuracy: 0.00%
Epoch: 4
	Train_Loss: 2.932, Train_Accuracy: 0.00%
	Val_Loss: 3.109,  Val_Accuracy: 0.00%
Epoch: 5
	Train_Loss: 2.927, Train_Accuracy: 0.00%
	Val_Loss: 3.061,  Val_Accuracy: 0.00%
Epoch: 6
	Train_Loss: 2.918, Train_Accuracy: 0.00%
	Val_Loss: 3.104,  Val_Accuracy: 0.00%
Epoch: 7
	Tr

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
Val_Accuracy,▁▁▁▁▁▁▁▁▁▁
Val_Loss,▄▃█▄▁▄▃▅▃▄
train_accuracy,▁▁▁▁▁▁▁▁▁▁
training_loss,█▃▂▂▂▁▁▁▁▁

0,1
Val_Accuracy,0.0
Val_Loss,3.10532
train_accuracy,0.0
training_loss,2.90788


[34m[1mwandb[0m: Agent Starting Run: xujubw64 with config:
[34m[1mwandb[0m: 	bidirectional: False
[34m[1mwandb[0m: 	cell_type: rnn
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	hid_layer_size: 256
[34m[1mwandb[0m: 	input_embed_size: 512
[34m[1mwandb[0m: 	num_dec_layers: 1
[34m[1mwandb[0m: 	num_enc_layers: 2


Using device: cuda
Seq_to_Seq(
  (encoder): Encoder(
    (embedding): Embedding(100, 512)
    (dropout): Dropout(p=0.2, inplace=False)
    (rnn): RNN(512, 256, num_layers=2, batch_first=True, dropout=0.2)
  )
  (decoder): Decoder(
    (embedding): Embedding(100, 512)
    (dropout): Dropout(p=0.2, inplace=False)
    (rnn): RNN(512, 256, batch_first=True)
    (fc): Linear(in_features=256, out_features=100, bias=True)
  )
)
Epoch: 1
	Train_Loss: 2.961, Train_Accuracy: 0.00%
	Val_Loss: 3.125,  Val_Accuracy: 0.00%
Epoch: 2
	Train_Loss: 2.890, Train_Accuracy: 0.00%
	Val_Loss: 3.172,  Val_Accuracy: 0.00%
Epoch: 3
	Train_Loss: 2.859, Train_Accuracy: 0.00%
	Val_Loss: 3.118,  Val_Accuracy: 0.00%
Epoch: 4
	Train_Loss: 2.840, Train_Accuracy: 0.00%
	Val_Loss: 3.102,  Val_Accuracy: 0.00%
Epoch: 5
	Train_Loss: 2.826, Train_Accuracy: 0.00%
	Val_Loss: 3.093,  Val_Accuracy: 0.00%
Epoch: 6
	Train_Loss: 2.824, Train_Accuracy: 0.00%
	Val_Loss: 3.072,  Val_Accuracy: 0.00%
Epoch: 7
	Train_Loss: 2.813, Train_

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
Val_Accuracy,▁▁▁▁▁▁▁▁▁▁
Val_Loss,▅█▅▄▄▂▁▂▇▃
train_accuracy,▁▁▁▁▁▁▁▁▁▁
training_loss,█▅▃▃▂▂▁▁▁▁

0,1
Val_Accuracy,0.0
Val_Loss,3.08408
train_accuracy,0.0
training_loss,2.81206


[34m[1mwandb[0m: Agent Starting Run: om2dd0mg with config:
[34m[1mwandb[0m: 	bidirectional: False
[34m[1mwandb[0m: 	cell_type: gru
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	hid_layer_size: 256
[34m[1mwandb[0m: 	input_embed_size: 32
[34m[1mwandb[0m: 	num_dec_layers: 2
[34m[1mwandb[0m: 	num_enc_layers: 3


Using device: cuda
Seq_to_Seq(
  (encoder): Encoder(
    (embedding): Embedding(100, 32)
    (dropout): Dropout(p=0.2, inplace=False)
    (rnn): GRU(32, 256, num_layers=3, batch_first=True, dropout=0.2)
  )
  (decoder): Decoder(
    (embedding): Embedding(100, 32)
    (dropout): Dropout(p=0.2, inplace=False)
    (rnn): GRU(32, 256, num_layers=2, batch_first=True, dropout=0.2)
    (fc): Linear(in_features=256, out_features=100, bias=True)
  )
)
Epoch: 1
	Train_Loss: 2.216, Train_Accuracy: 0.74%
	Val_Loss: 1.642,  Val_Accuracy: 6.10%
Epoch: 2
	Train_Loss: 1.287, Train_Accuracy: 5.45%
	Val_Loss: 1.395,  Val_Accuracy: 14.11%
Epoch: 3
	Train_Loss: 1.032, Train_Accuracy: 9.35%
	Val_Loss: 1.315,  Val_Accuracy: 15.92%
Epoch: 4
	Train_Loss: 0.904, Train_Accuracy: 11.38%
	Val_Loss: 1.226,  Val_Accuracy: 18.33%
Epoch: 5
	Train_Loss: 0.818, Train_Accuracy: 13.88%
	Val_Loss: 1.239,  Val_Accuracy: 21.97%
Epoch: 6
	Train_Loss: 0.760, Train_Accuracy: 16.05%
	Val_Loss: 1.180,  Val_Accuracy: 27.64%
Epoc

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
Val_Accuracy,▁▃▄▅▆▇▇▇██
Val_Loss,█▄▃▂▂▁▁▁▁▁
train_accuracy,▁▃▄▅▆▆▇███
training_loss,█▄▃▂▂▂▁▁▁▁

0,1
Val_Accuracy,28.14941
Val_Loss,1.16473
train_accuracy,20.57031
training_loss,0.61446


[34m[1mwandb[0m: Agent Starting Run: lcmecqvh with config:
[34m[1mwandb[0m: 	bidirectional: False
[34m[1mwandb[0m: 	cell_type: lstm
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	hid_layer_size: 256
[34m[1mwandb[0m: 	input_embed_size: 32
[34m[1mwandb[0m: 	num_dec_layers: 3
[34m[1mwandb[0m: 	num_enc_layers: 1


Using device: cuda
Seq_to_Seq(
  (encoder): Encoder(
    (embedding): Embedding(100, 32)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): LSTM(32, 256, batch_first=True)
  )
  (decoder): Decoder(
    (embedding): Embedding(100, 32)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): LSTM(32, 256, num_layers=3, batch_first=True, dropout=0.3)
    (fc): Linear(in_features=256, out_features=100, bias=True)
  )
)
Epoch: 1
	Train_Loss: 2.995, Train_Accuracy: 0.00%
	Val_Loss: 2.970,  Val_Accuracy: 0.00%
Epoch: 2
	Train_Loss: 2.589, Train_Accuracy: 0.01%
	Val_Loss: 2.487,  Val_Accuracy: 0.22%
Epoch: 3
	Train_Loss: 2.093, Train_Accuracy: 0.15%
	Val_Loss: 1.994,  Val_Accuracy: 2.66%
Epoch: 4
	Train_Loss: 1.692, Train_Accuracy: 0.98%
	Val_Loss: 1.689,  Val_Accuracy: 6.88%
Epoch: 5
	Train_Loss: 1.432, Train_Accuracy: 2.87%
	Val_Loss: 1.553,  Val_Accuracy: 10.72%
Epoch: 6
	Train_Loss: 1.253, Train_Accuracy: 4.76%
	Val_Loss: 1.441,  Val_Accuracy: 11.57%
Epoch: 7
	Train_Loss: 1.118, Train_

VBox(children=(Label(value='0.019 MB of 0.019 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
Val_Accuracy,▁▁▂▃▄▄▇▇▆█
Val_Loss,█▆▄▃▂▂▁▁▁▁
train_accuracy,▁▁▁▂▃▄▆▆██
training_loss,█▇▅▄▃▂▂▁▁▁

0,1
Val_Accuracy,24.41406
Val_Loss,1.24847
train_accuracy,10.31641
training_loss,0.90677


[34m[1mwandb[0m: Agent Starting Run: gfyu4l7s with config:
[34m[1mwandb[0m: 	bidirectional: False
[34m[1mwandb[0m: 	cell_type: lstm
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	hid_layer_size: 32
[34m[1mwandb[0m: 	input_embed_size: 64
[34m[1mwandb[0m: 	num_dec_layers: 1
[34m[1mwandb[0m: 	num_enc_layers: 3


Using device: cuda
Seq_to_Seq(
  (encoder): Encoder(
    (embedding): Embedding(100, 64)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): LSTM(64, 32, num_layers=3, batch_first=True, dropout=0.3)
  )
  (decoder): Decoder(
    (embedding): Embedding(100, 64)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): LSTM(64, 32, batch_first=True)
    (fc): Linear(in_features=32, out_features=100, bias=True)
  )
)
Epoch: 1
	Train_Loss: 3.165, Train_Accuracy: 0.00%
	Val_Loss: 3.141,  Val_Accuracy: 0.00%
Epoch: 2
	Train_Loss: 2.894, Train_Accuracy: 0.00%
	Val_Loss: 2.977,  Val_Accuracy: 0.00%
Epoch: 3
	Train_Loss: 2.813, Train_Accuracy: 0.00%
	Val_Loss: 2.877,  Val_Accuracy: 0.02%
Epoch: 4
	Train_Loss: 2.734, Train_Accuracy: 0.00%
	Val_Loss: 2.814,  Val_Accuracy: 0.02%
Epoch: 5
	Train_Loss: 2.664, Train_Accuracy: 0.00%
	Val_Loss: 2.734,  Val_Accuracy: 0.10%
Epoch: 6
	Train_Loss: 2.610, Train_Accuracy: 0.00%
	Val_Loss: 2.682,  Val_Accuracy: 0.22%
Epoch: 7
	Train_Loss: 2.571, Train_Accur

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
Val_Accuracy,▁▁▁▁▃▅▆▅▄█
Val_Loss,█▆▅▅▄▃▂▂▁▁
train_accuracy,▁▁█▁▁██▁█▁
training_loss,█▅▅▄▃▃▂▂▁▁

0,1
Val_Accuracy,0.39062
Val_Loss,2.47342
train_accuracy,0.0
training_loss,2.42189


[34m[1mwandb[0m: Agent Starting Run: fukh85jr with config:
[34m[1mwandb[0m: 	bidirectional: False
[34m[1mwandb[0m: 	cell_type: gru
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	hid_layer_size: 256
[34m[1mwandb[0m: 	input_embed_size: 32
[34m[1mwandb[0m: 	num_dec_layers: 2
[34m[1mwandb[0m: 	num_enc_layers: 2


Using device: cuda
Seq_to_Seq(
  (encoder): Encoder(
    (embedding): Embedding(100, 32)
    (dropout): Dropout(p=0.2, inplace=False)
    (rnn): GRU(32, 256, num_layers=2, batch_first=True, dropout=0.2)
  )
  (decoder): Decoder(
    (embedding): Embedding(100, 32)
    (dropout): Dropout(p=0.2, inplace=False)
    (rnn): GRU(32, 256, num_layers=2, batch_first=True, dropout=0.2)
    (fc): Linear(in_features=256, out_features=100, bias=True)
  )
)
Epoch: 1
	Train_Loss: 2.277, Train_Accuracy: 0.65%
	Val_Loss: 1.648,  Val_Accuracy: 5.22%
Epoch: 2
	Train_Loss: 1.274, Train_Accuracy: 5.35%
	Val_Loss: 1.366,  Val_Accuracy: 11.96%
Epoch: 3
	Train_Loss: 1.016, Train_Accuracy: 9.49%
	Val_Loss: 1.287,  Val_Accuracy: 14.09%
Epoch: 4
	Train_Loss: 0.891, Train_Accuracy: 11.21%
	Val_Loss: 1.265,  Val_Accuracy: 16.04%
Epoch: 5
	Train_Loss: 0.815, Train_Accuracy: 13.22%
	Val_Loss: 1.212,  Val_Accuracy: 17.82%
Epoch: 6
	Train_Loss: 0.747, Train_Accuracy: 14.54%
	Val_Loss: 1.207,  Val_Accuracy: 20.78%
Epoc

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
Val_Accuracy,▁▃▄▅▅▆▆▇▇█
Val_Loss,█▄▃▂▂▂▁▁▂▁
train_accuracy,▁▃▅▅▆▇██▇█
training_loss,█▄▃▂▂▂▁▁▁▁

0,1
Val_Accuracy,25.73242
Val_Loss,1.18394
train_accuracy,16.74414
training_loss,0.60607


[34m[1mwandb[0m: Agent Starting Run: s2qp38xz with config:
[34m[1mwandb[0m: 	bidirectional: False
[34m[1mwandb[0m: 	cell_type: gru
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	hid_layer_size: 16
[34m[1mwandb[0m: 	input_embed_size: 16
[34m[1mwandb[0m: 	num_dec_layers: 2
[34m[1mwandb[0m: 	num_enc_layers: 1


Using device: cuda
Seq_to_Seq(
  (encoder): Encoder(
    (embedding): Embedding(100, 16)
    (dropout): Dropout(p=0.2, inplace=False)
    (rnn): GRU(16, 16, batch_first=True)
  )
  (decoder): Decoder(
    (embedding): Embedding(100, 16)
    (dropout): Dropout(p=0.2, inplace=False)
    (rnn): GRU(16, 16, num_layers=2, batch_first=True, dropout=0.2)
    (fc): Linear(in_features=16, out_features=100, bias=True)
  )
)
Epoch: 1
	Train_Loss: 3.382, Train_Accuracy: 0.00%
	Val_Loss: 3.165,  Val_Accuracy: 0.00%
Epoch: 2
	Train_Loss: 3.121, Train_Accuracy: 0.00%
	Val_Loss: 3.067,  Val_Accuracy: 0.00%
Epoch: 3
	Train_Loss: 3.003, Train_Accuracy: 0.00%
	Val_Loss: 3.075,  Val_Accuracy: 0.00%
Epoch: 4
	Train_Loss: 2.958, Train_Accuracy: 0.00%
	Val_Loss: 3.040,  Val_Accuracy: 0.00%
Epoch: 5
	Train_Loss: 2.930, Train_Accuracy: 0.00%
	Val_Loss: 3.019,  Val_Accuracy: 0.00%
Epoch: 6
	Train_Loss: 2.913, Train_Accuracy: 0.00%
	Val_Loss: 2.986,  Val_Accuracy: 0.00%
Epoch: 7
	Train_Loss: 2.898, Train_Accurac

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
Val_Accuracy,▁▁▁▁▁▁▁▁▁▁
Val_Loss,█▅▅▄▃▂▂▂▂▁
train_accuracy,▁█▁▁▁▁▁▁██
training_loss,█▅▃▂▂▂▂▁▁▁

0,1
Val_Accuracy,0.0
Val_Loss,2.94621
train_accuracy,0.00195
training_loss,2.85218
