# **Importing all the necessary libraries**

In [73]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import pandas as pd
import numpy as np
import pandas as pd

## **Encoder class**

In [133]:
class Encoder(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size, num_layers, rnn_cell='lstm', dropout=0.5, bidirectional=True):
        super(Encoder, self).__init__()  # Initialize the parent class.
        self.embedding = nn.Embedding(num_embeddings=input_size, embedding_dim=embedding_size)  # Create an embedding layer.
        self.dropout = nn.Dropout(dropout)  # Create a dropout layer.
        self.hidden_size = hidden_size  # Store the hidden size.
        self.num_layers = num_layers  # Store the number of layers.
        self.bidirectional = bidirectional  # Store whether the RNN is bidirectional.
        
        rnn_hidden_size = hidden_size // 2 if bidirectional else hidden_size  # Adjust hidden size for bidirectional RNN.
        
        if rnn_cell.lower() == 'lstm':
            self.rnn = nn.LSTM(embedding_size, rnn_hidden_size, num_layers, batch_first=True, dropout=(0 if num_layers == 1 else dropout), bidirectional=bidirectional)  # Create an LSTM layer.
        elif rnn_cell.lower() == 'gru':
            self.rnn = nn.GRU(embedding_size, rnn_hidden_size, num_layers, batch_first=True, dropout=(0 if num_layers == 1 else dropout), bidirectional=bidirectional)  # Create a GRU layer.
        else:
            self.rnn = nn.RNN(embedding_size, rnn_hidden_size, num_layers, batch_first=True, dropout=(0 if num_layers == 1 else dropout), bidirectional=bidirectional)  # Create an RNN layer.
    
    def forward(self, x):
        embedded = self.embedding(x)  # Embed the input sequences.
        embedded = self.dropout(embedded)  # Apply dropout to the embeddings.
        outputs, hidden = self.rnn(embedded)  # Pass the embeddings through the RNN.

        if self.bidirectional:  # If the RNN is bidirectional.
            if isinstance(hidden, tuple):  # If the hidden state is a tuple (LSTM case).
                h_n, c_n = hidden  # Unpack the hidden states (hidden and cell states for LSTM).
#                 print('enc h bef dir',h_n.shape)  
#                 print('enc c bef dir',c_n.shape)  
                h_n = torch.cat((h_n[0::2], h_n[1::2]), dim=2)  # Concatenate the forward and backward hidden states.
                c_n = torch.cat((c_n[0::2], c_n[1::2]), dim=2)  # Concatenate the forward and backward cell states.
#                 print('enc h af dir',h_n.shape) 
#                 print('enc c af dir',c_n.shape)  
                hidden = (h_n, c_n)  # Pack the adjusted hidden states back into a tuple.
            else:  # If the hidden state is not a tuple (GRU/RNN case).
#                 print('enc hidd bef dir',hidden.shape) 
                hidden = torch.cat((hidden[0::2], hidden[1::2]), dim=2)  # Concatenate the forward and backward hidden states.
#                 print('after dir enc:',hidden.shape) 

        return hidden  # Return the RNN hidden states.


# **Decoder class**

In [134]:
class Decoder(nn.Module):
    def __init__(self, output_size, embedding_size, hidden_size, num_layers, encoder_num_layers, rnn_cell='lstm', dropout=0.5, bidirectional=True):
        super(Decoder, self).__init__()  # Initialize the parent class.
        self.embedding = nn.Embedding(num_embeddings=output_size, embedding_dim=embedding_size)  # Create an embedding layer.
        self.dropout = nn.Dropout(dropout)  # Create a dropout layer.
        self.output_size = output_size  # Store the output size.
        self.hidden_size = hidden_size * encoder_num_layers if bidirectional else hidden_size  # Adjust hidden size for bidirectional encoder.
        self.num_layers = num_layers  # Store the number of layers.
        
        if rnn_cell.lower() == 'lstm':
            self.rnn = nn.LSTM(embedding_size, self.hidden_size, num_layers, batch_first=True, dropout=(0 if num_layers == 1 else dropout))  # Create an LSTM layer.
        elif rnn_cell.lower() == 'gru':
            self.rnn = nn.GRU(embedding_size, self.hidden_size, num_layers, batch_first=True, dropout=(0 if num_layers == 1 else dropout))  # Create a GRU layer.
        else:
            self.rnn = nn.RNN(embedding_size, self.hidden_size, num_layers, batch_first=True, dropout=(0 if num_layers == 1 else dropout))  # Create an RNN layer.
        
        self.fc = nn.Linear(self.hidden_size, output_size)  # Create a fully connected layer for output.

    def forward(self, x, hidden):
        x = x.unsqueeze(1)  # Add a singleton dimension to the input tensor.
        embedded = self.dropout(self.embedding(x))  # Embed the input sequences and apply dropout.
        output, hidden = self.rnn(embedded, hidden)  # Pass the embedded input through the RNN.
        output = self.fc(self.dropout(output.squeeze(1)))  # Apply dropout and pass through the fully connected layer.
        return output, hidden  # Return the output and hidden states.


# **Sequence to Sequence model for the above encoder and decoder**

In [135]:
class Seq_to_Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super(Seq_to_Seq, self).__init__()  # Initialize the parent class.
        self.encoder = encoder  # Store the encoder module.
        self.decoder = decoder  # Store the decoder module.

    def forward(self, source, target, teaching_force_ratio=0.5):
        batch_size = source.size(0)  # Get the batch size.
        target_len = target.size(1)  # Get the target sequence length.
        target_vocab_size = self.decoder.output_size  # Get the output vocabulary size.
        outputs = torch.zeros(batch_size, target_len, target_vocab_size).to(source.device)  # Initialize output tensor.
        
        encoder_hidden = self.encoder(source)  # Get encoder hidden states.
        
        if isinstance(encoder_hidden, tuple):  # If encoder hidden states is a tuple (LSTM case).
            h_n, c_n = encoder_hidden  # Unpack hidden states.
            if self.encoder.bidirectional:  # If encoder is bidirectional.
                h_n = torch.cat([h_n[i:i+1] for i in range(0, h_n.shape[0], 2)] + [h_n[i:i+1] for i in range(1, h_n.shape[0], 2)], dim=2)  # Concatenate forward and backward hidden states.
                c_n = torch.cat([c_n[i:i+1] for i in range(0, c_n.shape[0], 2)] + [c_n[i:i+1] for i in range(1, c_n.shape[0], 2)], dim=2)  # Concatenate forward and backward cell states.
            
            if h_n.size(0) < self.decoder.num_layers:  # If decoder has more layers than encoder.
                zero_h = torch.zeros(self.decoder.num_layers - h_n.size(0), batch_size, self.encoder.num_layers * self.encoder.hidden_size, device=h_n.device)  # Create zero tensor for hidden states.
                zero_c = torch.zeros(self.decoder.num_layers - c_n.size(0), batch_size, self.encoder.num_layers * self.encoder.hidden_size, device=c_n.device)  # Create zero tensor for cell states.
                h_n = torch.cat([h_n, zero_h], dim=0)  # Concatenate zero tensor to adjust hidden states shape.
                c_n = torch.cat([c_n, zero_c], dim=0)  # Concatenate zero tensor to adjust cell states shape.
            encoder_hidden = (h_n[:self.decoder.num_layers], c_n[:self.decoder.num_layers])  # Update encoder hidden states.
        else:  # If encoder hidden states is not a tuple (GRU/RNN case).
            h_n = encoder_hidden  # Use hidden states directly.
            if self.encoder.bidirectional:  # If encoder is bidirectional.
                h_n = torch.cat([h_n[i:i+1] for i in range(0, h_n.shape[0], 2)] + [h_n[i:i+1] for i in range(1, h_n.shape[0], 2)], dim=2)  # Concatenate forward and backward hidden states.
            if h_n.size(0) < self.decoder.num_layers:  # If decoder has more layers than encoder.
                zero_h = torch.zeros(self.decoder.num_layers - h_n.size(0), batch_size, self.encoder.num_layers * self.encoder.hidden_size, device=encoder_hidden.device)  # Create zero tensor for hidden states.
                h_n = torch.cat([h_n, zero_h], dim=0)  # Concatenate zero tensor to adjust hidden states shape.
            encoder_hidden = h_n[:self.decoder.num_layers]  # Update encoder hidden states.
        
        decoder_input = target[:, 0]  # Get the decoder input for the first time step.
                    
        for t in range(1, target_len):  # Iterate over target sequence.
            decoder_output, encoder_hidden = self.decoder(decoder_input, encoder_hidden)  # Get decoder output and update hidden states.
            outputs[:, t] = decoder_output  # Store decoder output.
            teacher_force = torch.rand(1) < teaching_force_ratio  # Determine whether to use teacher forcing.
            top1 = decoder_output.argmax(1)  # Get the predicted token.
            decoder_input = target[:, t] if teacher_force else top1  # Update decoder input based on teacher forcing.

        return outputs  # Return the final output tensor.


# **Printing the model**

In [136]:
INPUT_DIM = 100  # Set the size of the input vocabulary.
OUTPUT_DIM = 100  # Set the size of the output vocabulary.
ENC_EMB_DIM = 256  # Set the dimension of the input embeddings.
DEC_EMB_DIM = 256  # Set the dimension of the output embeddings.
HID_DIM = 512  # Set the dimension of the hidden states.
ENC_LAYERS = 1  # Set the number of layers in the encoder.
DEC_LAYERS = 3  # Set the number of layers in the decoder.
ENC_RNN_CELL = 'gru'  # Specify the RNN cell type for the encoder.
DEC_RNN_CELL = 'gru'  # Specify the RNN cell type for the decoder.

encoder = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, ENC_LAYERS, ENC_RNN_CELL, dropout=0.3, bidirectional=True)  # Initialize the encoder module.
decoder = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, DEC_LAYERS, encoder.num_layers, DEC_RNN_CELL, dropout=0.3, bidirectional=True)  # Initialize the decoder module.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # Choose the appropriate device for computation.
print(f"Using device: {device}")  # Print the chosen device.
model = Seq_to_Seq(encoder, decoder).to(device)  # Initialize the sequence-to-sequence model and move it to the selected device.
print(model)  # Print the model summary.


Using device: cuda
Seq_to_Seq(
  (encoder): Encoder(
    (embedding): Embedding(100, 256)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): GRU(256, 256, batch_first=True, bidirectional=True)
  )
  (decoder): Decoder(
    (embedding): Embedding(100, 256)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): GRU(256, 512, num_layers=3, batch_first=True, dropout=0.3)
    (fc): Linear(in_features=512, out_features=100, bias=True)
  )
)


# **A function to create a vocabulary set from the given text**

In [137]:

# Define a function to create a vocabulary set from a given text
def create_vocab(text):
    # Create a set of unique characters found in the text
    # Each word in the text is processed to extract its characters
    vocab = set(char for word in text for char in word)
    # Add a padding token to the vocabulary
    vocab.add('<pad>')
    # Add a start-of-sequence token to the vocabulary
    vocab.add('<sos>')  # Start of sequence token
    # Add an end-of-sequence token to the vocabulary
    vocab.add('<eos>')  # End of sequence token
    # Return the complete set of vocabulary items
    return vocab

# **A function to load data from a CSV file**

In [138]:
# Define a function to load data from a CSV file
def load_data(path):
    # The file has no header and columns are named as 'latin' and 'bangla'
    df = pd.read_csv(path, header=None, names=['latin', 'bangla'])
#     df = df.head(10)
    # Return the columns as two separate Series objects
    return df['latin'], df['bangla']

# **Load Latin and bangla training data**

In [139]:
# Load Latin and bangla training data from specified path
latin_train, bangla_train = load_data('/kaggle/input/aksharantar-sampled/aksharantar_sampled/ben/ben_train.csv')

# **Print the loaded Latin and Bangla training data**

In [140]:
# Print the loaded Latin training data
print(latin_train)
print()
# Print the loaded bangla training data
print(bangla_train)

0        namdharirao
1        hindukusher
2        farajikandi
3           moubarak
4             chiung
            ...     
51195       silmadar
51196        jonnote
51197      handibage
51198         borpar
51199     bideshikei
Name: latin, Length: 51200, dtype: object

0            নামধারীরাও
1           হিন্দুকুশের
2           ফরাজীকান্দি
3                মুবারক
4                চিয়ুং
              ...      
51195          সিলমাদার
51196            জন্যতে
51197    হ্যান্ডিব্যাগে
51198             বরপার
51199         বিদেশীকেই
Name: bangla, Length: 51200, dtype: object


# **Create two vocabularies from the Latin and Bangla training data**

In [141]:
# Create a vocabulary from the Latin training data
latin_vocab = create_vocab(latin_train)
# Create a vocabulary from the bangla training data
bangla_vocab = create_vocab(bangla_train)

# **Print the created Latin and Bangla vocabularies**

In [142]:
# Print the created Latin vocabulary
print(latin_vocab)
print()
# Print the created bangla vocabulary
print(bangla_vocab)

{'o', '<eos>', 'j', 'g', 'l', 'e', 'd', 'm', 'v', '<sos>', 'p', 'z', 'r', 'y', 'w', 'q', '<pad>', 'k', 'n', 'c', 'i', 'h', 'x', 'a', 'u', 'f', 't', 'b', 's'}

{'ট', 'ঙ', 'চ', 'ি', 'ফ', 'ল', 'ম', '<eos>', 'শ', 'ড', 'ঁ', 'ও', 'খ', 'ঐ', 'স', 'ঋ', 'ঠ', 'ক', 'য', 'আ', 'উ', '্', 'হ', 'র', 'ছ', 'ং', 'দ', 'এ', 'ূ', 'ঘ', 'ৃ', 'ঊ', '<sos>', 'ঝ', 'ত', 'ষ', '়', 'ৌ', 'ণ', 'া', 'ে', '২', '<pad>', 'ো', 'গ', 'ৎ', 'ধ', 'ঈ', 'ৈ', 'ভ', 'ই', 'ঃ', 'ু', 'ন', 'ব', 'ী', 'অ', 'ঔ', 'ঞ', 'প', 'থ', 'ঢ', 'জ'}


# **Map each token in the Latin and Bangla vocabularies to a unique index and then Print the dictionaries mapping (Latin tokens to indices) and (Bangla tokens to indices)**


In [143]:
# Map each token in the Latin vocabulary to a unique index
latin_token_to_index = {token: index for index, token in enumerate(sorted(latin_vocab))}
# Map each token in the bangla vocabulary to a unique index
bangla_token_to_index = {token: index for index, token in enumerate(sorted(bangla_vocab))}

# Print the dictionary mapping Latin tokens to indices
print(latin_token_to_index)
print()

# Print the dictionary mapping bangla tokens to indices
print(bangla_token_to_index)

{'<eos>': 0, '<pad>': 1, '<sos>': 2, 'a': 3, 'b': 4, 'c': 5, 'd': 6, 'e': 7, 'f': 8, 'g': 9, 'h': 10, 'i': 11, 'j': 12, 'k': 13, 'l': 14, 'm': 15, 'n': 16, 'o': 17, 'p': 18, 'q': 19, 'r': 20, 's': 21, 't': 22, 'u': 23, 'v': 24, 'w': 25, 'x': 26, 'y': 27, 'z': 28}

{'<eos>': 0, '<pad>': 1, '<sos>': 2, 'ঁ': 3, 'ং': 4, 'ঃ': 5, 'অ': 6, 'আ': 7, 'ই': 8, 'ঈ': 9, 'উ': 10, 'ঊ': 11, 'ঋ': 12, 'এ': 13, 'ঐ': 14, 'ও': 15, 'ঔ': 16, 'ক': 17, 'খ': 18, 'গ': 19, 'ঘ': 20, 'ঙ': 21, 'চ': 22, 'ছ': 23, 'জ': 24, 'ঝ': 25, 'ঞ': 26, 'ট': 27, 'ঠ': 28, 'ড': 29, 'ঢ': 30, 'ণ': 31, 'ত': 32, 'থ': 33, 'দ': 34, 'ধ': 35, 'ন': 36, 'প': 37, 'ফ': 38, 'ব': 39, 'ভ': 40, 'ম': 41, 'য': 42, 'র': 43, 'ল': 44, 'শ': 45, 'ষ': 46, 'স': 47, 'হ': 48, '়': 49, 'া': 50, 'ি': 51, 'ী': 52, 'ু': 53, 'ূ': 54, 'ৃ': 55, 'ে': 56, 'ৈ': 57, 'ো': 58, 'ৌ': 59, '্': 60, 'ৎ': 61, '২': 62}


# **Defining a Dataset class for handling Latin and Bangla word pairs**

In [144]:
# Define a Dataset class for handling Latin and Bangla word pairs
class AksharantarDataset(Dataset):
    def __init__(self, latin_words, bangla_words, latin_token_to_index, bangla_token_to_index):
        # Store the lists of Latin and Bangla words
        self.latin_words = latin_words
        self.bangla_words = bangla_words
        # Store the dictionaries that map characters to indices for both languages
        self.latin_token_to_index = latin_token_to_index
        self.bangla_token_to_index = bangla_token_to_index

    def __len__(self):
        # Return the number of word pairs in the dataset
        return len(self.latin_words)

    def __getitem__(self, index):
        # Fetching the Latin and Bangla words at the specified index
        latin_word = self.latin_words.iloc[index]
#         print(latin_word)
        bangla_word = self.bangla_words.iloc[index]
#         print(bangla_word)
        # Convert the Latin word into indices using the latin_token_to_index mapping
        latin_indices = [latin_token_to_index[char] for char in latin_word]
#         print(latin_indices)
        # Convert the Bangla word into indices, adding <sos> and <eos> tokens
        bangla_indices = [bangla_token_to_index['<sos>']] + [bangla_token_to_index[char] for char in bangla_word] + [bangla_token_to_index['<eos>']]
#         print(bangla_indices)
        # Return the indices as tensor objects
        return torch.tensor(latin_indices, dtype=torch.long), torch.tensor(bangla_indices, dtype=torch.long)

# **Defining a function for padding sequences and packing batches**

In [145]:
# Define a function for padding sequences and packing batches
# packet_fn specifies a function to control how batches are created from the individual data items
def packet_fn(batch):
    # Unzip the batch to separate Latin and Bangla indices
    latin, bangla = zip(*batch)
#     print(latin, bangla)
    # Pad the sequences of Latin indices
    latin_padded = pad_sequence(latin, batch_first=True, padding_value=latin_token_to_index['<pad>'])
#     print(latin_padded)
    # Pad the sequences of Bangla indices
    bangla_padded = pad_sequence(bangla, batch_first=True, padding_value=bangla_token_to_index['<pad>'])
#     print(bangla_padded)
    # Return the padded batches
    return latin_padded, bangla_padded

# **Load training data into the AksharantarDataset and then creating the train_loader by Dataloader function**

In [146]:
# Load training data into the AksharantarDataset
train_dataset = AksharantarDataset(latin_train, bangla_train, latin_token_to_index, bangla_token_to_index)
# Create a DataLoader to batch and shuffle the dataset
# packet_fn specifies a function to control how batches are created from the individual data items
train_loader = DataLoader(train_dataset, batch_size = 64, collate_fn=packet_fn, shuffle=True)

# **Print an example from the dataset**

In [147]:
# Print an example from the dataset
print(train_dataset[4000])
# for i,j in train_loader:
#     print(i,'\n\n\n',j)

(tensor([19, 23,  7, 20,  5,  7, 22, 11, 16]), tensor([ 2, 17, 50, 43, 47, 56, 32, 51, 36,  0]))



# **A function for calculating word accuracy per batch, ignoring the padding token**

In [148]:
# Define a word accuracy function for word-level accuracy
def word_accuracy(outputs, targets, ignore_index):
    # Assuming outputs and targets are batched sequences of token indices
    # Ignoring <pad> tokens as specified by `ignore_index`
    correct = 0  # Initialize the count of correct predictions.
    total = 0  # Initialize the total number of sequences.
    for out, tar in zip(outputs, targets):  # Iterate over each output and target pair.
        # Ignoring padding in accuracy calculation
#         print('out bef pad:',out)  # Uncomment to print the output before removing padding.
#         print('tar:',tar)  # Uncomment to print the target.
        out = out[out != ignore_index]  # Remove padding tokens from the output.
        tar = tar[tar != ignore_index]  # Remove padding tokens from the target.
        ignore_index_eos = 0  # Define an ignore index for end of sequence.
        out = out[out != ignore_index_eos]  # Remove end of sequence tokens from the output.
        tar = tar[tar != ignore_index_eos]  # Remove end of sequence tokens from the target.
#         print('out aft pad:',out)  # Uncomment to print the output after removing padding.
#         print('tar:',tar)  # Uncomment to print the target after removing padding.
        if torch.equal(out, tar):  # Check if the processed output and target are identical.
            correct += 1  # Increment the correct count.
#             print('correct:',correct)  # Uncomment to print the correct count.
        total += 1  # Increment the total count.
#         print('total:',total)  # Uncomment to print the total count.
    return correct / total if total > 0 else 0  # Calculate and return the accuracy.




# **Defining the Training function**

In [149]:
def train(model, iterator, optimizer, criterion, clip, device, ignore_index):
    # Set the model to training mode
    model.train()
    # Initialize epoch loss and accuracy
    epoch_loss = 0
    epoch_acc = 0
    
    # Iterate through the data iterator
    for source, target in iterator:
        # Move source and target tensors to the specified device
        source = source.to(device)
        target = target.to(device)
        
        # Zero the gradients
        optimizer.zero_grad()
        # Forward pass: compute model predictions
        output = model(source, target)
        
        output_dim = output.shape[-1]
        # Slice the output and target tensors to remove <sos> token and maintain sequence structure
        output = output[:, 1:, :]
        target = target[:, 1:]
        
        # Flatten all dimensions except for the batch dimension for loss calculation
        output_flat = output.reshape(-1, output_dim)
        target_flat = target.reshape(-1)
        
        # Compute the loss
        loss = criterion(output_flat, target_flat)
        # Calculate word-by-word accuracy
        acc = word_accuracy(output.argmax(dim=2), target, ignore_index)
        
        # Backpropagation
        loss.backward()
        # Clip gradients to prevent exploding gradients
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        # Update model parameters
        optimizer.step()
        
        # Accumulate epoch loss and accuracy
        epoch_loss += loss.item()
        epoch_acc += acc
    
    # Return average epoch loss and accuracy
    return epoch_loss / len(iterator), epoch_acc / len(iterator)


# **Defining the Evaluation function**

In [150]:
def evaluate(model, iterator, criterion, device, ignore_index):
    # Set the model to evaluation mode
    model.eval()
    # Initialize epoch loss and accuracy
    epoch_loss = 0
    epoch_acc = 0
    
    # Iterate through the data iterator
    with torch.no_grad():
        for source, target in iterator:
            # Move source and target tensors to the specified device
            source = source.to(device)
            target = target.to(device)
            
            # Forward pass: compute model predictions without teacher forcing
            output = model(source, target, 0)
            output_dim = output.shape[-1]
            # Slice the output and target tensors to remove <sos> token and maintain sequence structure
            output = output[:, 1:, :]
            target = target[:, 1:]
            
            # Flatten all dimensions except for the batch dimension for loss calculation
            output_flat = output.reshape(-1, output_dim)
            target_flat = target.reshape(-1)
            
            # Compute the loss
            loss = criterion(output_flat, target_flat)
            # Calculate word-by-word accuracy
            acc = word_accuracy(output.argmax(dim=2), target, ignore_index)
            
            # Accumulate epoch loss and accuracy
            epoch_loss += loss.item()
            epoch_acc += acc
            
    # Return average epoch loss and accuracy
    return epoch_loss / len(iterator), epoch_acc / len(iterator)


# **Load validation data into the AksharantarDataset and then creating the valid_loader by Dataloader function**

In [117]:
# Load validation data by reading a CSV file
latin_valid, bangla_valid = load_data('/kaggle/input/aksharantar-sampled/aksharantar_sampled/ben/ben_valid.csv')

# Create a validation dataset using the AksharantarDataset class.
valid_dataset = AksharantarDataset(latin_valid, bangla_valid, latin_token_to_index, bangla_token_to_index)

# Create a DataLoader to batch and shuffle the dataset
# 'collate_fn=packet_fn' specifies a function to control how batches are created from the individual data items.
# 'shuffle=True' ensures that the data is shuffled at every epoch which helps to reduce model overfitting
valid_loader = DataLoader(valid_dataset, batch_size=64, collate_fn=packet_fn, shuffle=True)

# **The training process for specified number of epochs**

In [151]:
# -embed_size-64-layers_enc-3-layers_dec-3-hid_size-512-cell_type-lstm-bidirectional-True-dropout-0.2
# Define the dimensions and configurations for the encoder and decoder
INPUT_DIM = 100
OUTPUT_DIM = 100
ENC_EMB_DIM = 64
DEC_EMB_DIM = 64
HID_DIM = 512
ENC_LAYERS = 3
DEC_LAYERS = 3
ENC_RNN_CELL = 'lstm'
DEC_RNN_CELL = 'lstm'

# Initialize the encoder with the specified parameters
encoder = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, ENC_LAYERS, ENC_RNN_CELL, dropout=0.2, bidirectional=True)
# Initialize the decoder with the specified parameters, using the number of encoder layers
decoder = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, DEC_LAYERS, encoder.num_layers, DEC_RNN_CELL, dropout=0.2, bidirectional=True)
# Determine the device for model training (CPU or GPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
# Initialize the sequence-to-sequence model with the encoder and decoder
model = Seq_to_Seq(encoder, decoder).to(device)
print(model)


Using device: cuda
Seq_to_Seq(
  (encoder): Encoder(
    (embedding): Embedding(100, 64)
    (dropout): Dropout(p=0.2, inplace=False)
    (rnn): LSTM(64, 256, num_layers=3, batch_first=True, dropout=0.2, bidirectional=True)
  )
  (decoder): Decoder(
    (embedding): Embedding(100, 64)
    (dropout): Dropout(p=0.2, inplace=False)
    (rnn): LSTM(64, 1536, num_layers=3, batch_first=True, dropout=0.2)
    (fc): Linear(in_features=1536, out_features=100, bias=True)
  )
)


In [152]:
# Setting the number of epochs the training process should run
NUM_EPOCHS = 10
# Set the maximum norm of the gradients to 1 to prevent exploding gradients
CLIP = 1
# Initialize the optimizer, Adam
optimizer = torch.optim.Adam(model.parameters())
# Create Adam optimizer with default parameters
optimizer = torch.optim.Adam(model.parameters())


# Padding token index should be ignored in loss calculation
ignore_index = bangla_token_to_index['<pad>']
# Define the loss function with 'ignore_index' to avoid affecting loss calculation with padding tokens
criterion = nn.CrossEntropyLoss(ignore_index=ignore_index).to(device)

# Start the training process for the defined number of epochs
for epoch in range(NUM_EPOCHS):
    # Doing training on the train dataset and return average loss and accuracy
    train_loss, train_accuracy = train(model, train_loader, optimizer, criterion, CLIP, device, ignore_index)
    # Evaluating the model on the validation dataset and return average loss and accuracy
    val_loss, val_accuracy = evaluate(model, valid_loader, criterion, device, ignore_index)
    
    # Print the loss and accuracy for each epoch
    print(f'Epoch: {epoch+1}')
    print(f'\tTrain_Loss: {train_loss:.3f}, Train_Accuracy: {train_accuracy*100:.2f}%')
    print(f'\tVal_Loss: {val_loss:.3f},  Val_Accuracy: {val_accuracy*100:.2f}%')

KeyboardInterrupt: 

# **Load the Test data into the AksharantarDataset and then creating the test_loader by Dataloader function**

In [27]:
# Load the test data from the specified CSV file location
latin_test, bangla_test = load_data('/kaggle/input/aksharantar-sampled/aksharantar_sampled/ben/ben_test.csv')

# Create test_dataset using the AksharantarDataset class, initializing it with test data
# and corresponding token-to-index mappings for both Latin and Bangla scripts
test_dataset = AksharantarDataset(latin_test, bangla_test, latin_token_to_index, bangla_token_to_index)

# A DataLoader for the test dataset. Here, the batch size is set to 1, indicates
# that the model will process one item at a time. This is for testing to make
# detailed predictions per sample without batching effects.
test_loader = DataLoader(test_dataset, batch_size=32, collate_fn=packet_fn, shuffle=False)
# print(test_dataset[0])



# **A function to convert an array of indices back into a string, excluding any indices corresponding to special tokens like padding, start, or end of sequence tokens, which should not appear in the final output string**

In [29]:
def decode_indices(indices, index_to_token):
    # Filter out indices for padding, start-of-sequence, and end-of-sequence tokens to ensure only valid character indices are decoded
    valid_indices = [index for index in indices if index in index_to_token and index not in (bangla_token_to_index['<pad>'], bangla_token_to_index['<sos>'], bangla_token_to_index['<eos>'])]
    # Convert each index to its corresponding character and join them to form the decoded string
    return ''.join([index_to_token[index] for index in valid_indices])

# **Creating the prediction function to generate outputs for all samples in the test_loader**

In [30]:
def predict(model, iterator, device):
    # Set the model to evaluation mode to disable dropout or batch normalization effects during inference
    model.eval()
    predictions = []
    # Disables gradient calculations for performance improvement since they are not needed in inference
    with torch.no_grad():
        for source, target in iterator:
            # Ensure the source and target tensors are on the correct device (GPU or CPU)
            source = source.to(device)
            target = target.to(device)
            # Obtain model output without teacher forcing (i.e., the model relies entirely on its predictions)
            output = model(source, target, 0)
            # Get the index with the highest probability from output predictions
            output = output.argmax(2)
            # Convert tensors to CPU numpy arrays for easier manipulation and extraction
            source = source.cpu().numpy()
            output = output.cpu().numpy()
            target = target.cpu().numpy()
            # Store the tuple of source and decoded output predictions
            predictions.append((source, target, output))
    # Return all predictions made over the iterator
    return predictions

# **Creating dictionaries to map indices back to its corresponding characters**

In [31]:
# Create dictionaries to map indices back to characters, observing the interpretation of prediction outputs
latin_index_to_token = {index: char for char, index in latin_token_to_index.items()}
bangla_index_to_token = {index: char for char, index in bangla_token_to_index.items()}
# print(latin_index_to_token)
# print(bangla_index_to_token)

# **Displaying results: Each input text from the test dataset and its corresponding predicted output text are printed. This helps in visually assessing the accuracy and quality of the transliterations produced by the model**

In [60]:
# Taking the prediction function to generate outputs for all samples in the test_loader
test_predictions = predict(model, test_loader, device)
# print(len(test_predictions[0]))
# Loop through the list of tuples containing source and output indices from the test predictions
for source_indices, target_indices, output_indices in test_predictions:
    # Iterate through each example in the batch. This is necessary as batches may contain multiple examples
    for i in range(source_indices.shape[0]):
        # Decode the source indices to their corresponding text using the mapping dictionary for Latin script
        input_text = decode_indices(source_indices[i], latin_index_to_token)
        
        target_text = decode_indices(target_indices[i], bangla_index_to_token)

        # Decode the output indices to their corresponding text using the mapping dictionary for Bangla script
        predicted_text = decode_indices(output_indices[i], bangla_index_to_token)
        # Print the original input text and its corresponding predicted transliteration
        print(f'Input Text: {input_text} -> Actual Text: {target_text} -> Predicted Text: {predicted_text}')
        
        

3


# **CSV File creation**

In [32]:
import pandas as pd

# Create lists to store the data
input_texts = []
actual_texts = []
predicted_texts = []
test_predictions = predict(model, test_loader, device)

# Loop through the list of tuples containing source and output indices from the test predictions
for source_indices, target_indices, output_indices in test_predictions:
    # Iterate through each example in the batch. This is necessary as batches may contain multiple examples
    for i in range(source_indices.shape[0]):
        # Decode the source indices to their corresponding text using the mapping dictionary for Latin script
        input_text = decode_indices(source_indices[i], latin_index_to_token)
        target_text = decode_indices(target_indices[i], bangla_index_to_token)
        # Decode the output indices to their corresponding text using the mapping dictionary for Bangla script
        predicted_text = decode_indices(output_indices[i], bangla_index_to_token)
        # Append the texts to the lists
        input_texts.append(input_text)
        actual_texts.append(target_text)
        predicted_texts.append(predicted_text)

# Create a DataFrame from the lists
df = pd.DataFrame({
    'Input Text': input_texts,
    'Actual Text': actual_texts,
    'Predicted Text': predicted_texts
})

# Save the DataFrame to a CSV file
df.to_csv('predictions_without_attn.csv', index=False, encoding='utf-8')

# **Wandb Setup**

In [24]:
import wandb
import numpy as np
from types import SimpleNamespace
import random

# key = input('Enter your API:')
wandb.login(key='25c2257eaf6c22aa056893db14da4ee2bf0a531a')  #25c2257eaf6c22aa056893db14da4ee2bf0a531a

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

# **For training and evaluating model on the training and validation dataset wandb setup**

In [25]:
sweep_config = {
    'method': 'bayes',
    'name' : 'sweep all final new lr 5',
    'metric': {
        'name': 'Val_Accuracy',
        'goal': 'maximize'
    },
    'parameters': {
        'input_embed_size': {
            'values': [16,32,64,256,512]
        },
        'num_enc_layers':{
            'values': [1,2,3]
        },
        'num_dec_layers':{
            'values': [1,2,3]
        },
        'hid_layer_size': {
            'values': [16,32,64,256,512]
        },
        'cell_type': {
            'values': ['lstm']
        },
        'bidirectional':{
            'values': [True, False]
        },
        'dropout': {
            'values': [0.2, 0.3]
        },
        'new_learning_rate':{
            'values': [0.001,0.01,0.1]
        }
#       'beam search in decoder with different beam sizes': 
    }
}

sweep_id = wandb.sweep(sweep = sweep_config, project="Deep_Learning_A3")


Create sweep with ID: qza57kfp
Sweep URL: https://wandb.ai/parthasakhapaul/Deep_Learning_A3/sweeps/qza57kfp


In [None]:
import wandb

def main():
    # Initialize a new wandb run
    with wandb.init() as run:
        # Construct run name from configuration
        run_name = "-embed_size-"+str(wandb.config.input_embed_size)+"-layers_enc-"+str(wandb.config.num_enc_layers)+"-layers_dec-"+str(wandb.config.num_dec_layers)+"-hid_size-"+str(wandb.config.hid_layer_size)+"-cell_type-"+wandb.config.cell_type+"-bidirectional-"+str(wandb.config.bidirectional)+"-dropout-"+str(wandb.config.dropout)+"-lr-"+str(wandb.config.new_learning_rate)
        wandb.run.name = run_name

        # Constants defining the dimensions of the input and output character sets
        INPUT_DIM = 100  # size of the Latin character set
        OUTPUT_DIM = 100  # size of the Bangla character set

        # Constants defining the dimensions of the embeddings for encoder and decoder
        ENC_EMB_DIM = wandb.config.input_embed_size  # Encoder embedding dimension
        DEC_EMB_DIM = wandb.config.input_embed_size  # Decoder embedding dimension

        # Constants defining the dimension of the hidden layers for encoder and decoder
        HID_DIM = wandb.config.hid_layer_size  # Hidden dimension size

        # Constants defining the number of layers for encoder and decoder
        ENC_LAYERS = wandb.config.num_enc_layers  # Number of layers in the encoder
        DEC_LAYERS = wandb.config.num_dec_layers  # Number of layers in the decoder
        

        # Constants defining the type of RNN cell to use for encoder and decoder
        ENC_RNN_CELL = wandb.config.cell_type  # RNN cell type for the encoder
        DEC_RNN_CELL = wandb.config.cell_type  # RNN cell type for the decoder

        # Instantiate the encoder with specified configurations
        encoder = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, ENC_LAYERS, ENC_RNN_CELL, dropout = wandb.config.dropout, bidirectional = wandb.config.bidirectional)
        # Instantiate the decoder with specified configurations
        decoder = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, DEC_LAYERS, encoder.num_layers, DEC_RNN_CELL, dropout = wandb.config.dropout, bidirectional = wandb.config.bidirectional)

        # Determine the computing device (CUDA if available, otherwise CPU)
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        # Print the device will be used
        print(f"Using device: {device}")

        # Instantiate the Seq_to_Seq model and move it to the chosen computing device
        model = Seq_to_Seq(encoder, decoder).to(device)
        print(model)
        
        
        # Setting the number of epochs the training process should run
        NUM_EPOCHS = 7
        # Set the maximum norm of the gradients to 1 to prevent exploding gradients
        CLIP = 1
        # Initialize the optimizer, Adam
        optimizer = torch.optim.Adam(model.parameters(), lr=wandb.config.new_learning_rate)  # Set the learning rate to 0.001


        # Padding token index should be ignored in loss calculation
        ignore_index = bangla_token_to_index['<pad>']
        # Define the loss function with 'ignore_index' to avoid affecting loss calculation with padding tokens
        criterion = nn.CrossEntropyLoss(ignore_index=ignore_index).to(device)

        # Start the training process for the defined number of epochs
        for epoch in range(NUM_EPOCHS):
            # Doing training on the train dataset and return average loss and accuracy
            train_loss, train_accuracy = train(model, train_loader, optimizer, criterion, CLIP, device, ignore_index)
            # Evaluating the model on the validation dataset and return average loss and accuracy
            val_loss, val_accuracy = evaluate(model, valid_loader, criterion, device, ignore_index)

            # Print the loss and accuracy for each epoch
            print(f'Epoch: {epoch+1}')
            print(f'\tTrain_Loss: {train_loss:.3f}, Train_Accuracy: {train_accuracy*100:.2f}%')
            print(f'\tVal_Loss: {val_loss:.3f},  Val_Accuracy: {val_accuracy*100:.2f}%')
            wandb.log({"train_accuracy": train_accuracy * 100, "training_loss": train_loss})
            wandb.log({"Val_Accuracy": val_accuracy * 100, "Val_Loss": val_loss})


wandb.agent(sweep_id, function=main, count=50)
wandb.finish()

[34m[1mwandb[0m: Agent Starting Run: ucipipva with config:
[34m[1mwandb[0m: 	bidirectional: False
[34m[1mwandb[0m: 	cell_type: lstm
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	hid_layer_size: 256
[34m[1mwandb[0m: 	input_embed_size: 16
[34m[1mwandb[0m: 	new_learning_rate: 0.001
[34m[1mwandb[0m: 	num_dec_layers: 2
[34m[1mwandb[0m: 	num_enc_layers: 2


Using device: cuda
Seq_to_Seq(
  (encoder): Encoder(
    (embedding): Embedding(100, 16)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): LSTM(16, 256, num_layers=2, batch_first=True, dropout=0.3)
  )
  (decoder): Decoder(
    (embedding): Embedding(100, 16)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): LSTM(16, 256, num_layers=2, batch_first=True, dropout=0.3)
    (fc): Linear(in_features=256, out_features=100, bias=True)
  )
)
Epoch: 1
	Train_Loss: 2.972, Train_Accuracy: 0.00%
	Val_Loss: 2.867,  Val_Accuracy: 0.07%
Epoch: 2
	Train_Loss: 2.372, Train_Accuracy: 0.04%
	Val_Loss: 2.085,  Val_Accuracy: 1.12%
Epoch: 3
	Train_Loss: 1.790, Train_Accuracy: 1.00%
	Val_Loss: 1.655,  Val_Accuracy: 7.45%
Epoch: 4
	Train_Loss: 1.444, Train_Accuracy: 2.82%
	Val_Loss: 1.458,  Val_Accuracy: 12.21%
Epoch: 5
	Train_Loss: 1.227, Train_Accuracy: 5.19%
	Val_Loss: 1.373,  Val_Accuracy: 12.28%
Epoch: 6
	Train_Loss: 1.088, Train_Accuracy: 7.38%
	Val_Loss: 1.313,  Val_Accuracy: 16.55%
Epoch: 

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
Val_Accuracy,▁▁▄▅▅▇█
Val_Loss,█▅▃▂▁▁▁
train_accuracy,▁▁▂▃▅▆█
training_loss,█▆▄▃▂▁▁

0,1
Val_Accuracy,20.1416
Val_Loss,1.27364
train_accuracy,9.63086
training_loss,0.986


[34m[1mwandb[0m: Agent Starting Run: 6y2i95e4 with config:
[34m[1mwandb[0m: 	bidirectional: False
[34m[1mwandb[0m: 	cell_type: lstm
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	hid_layer_size: 512
[34m[1mwandb[0m: 	input_embed_size: 256
[34m[1mwandb[0m: 	new_learning_rate: 0.001
[34m[1mwandb[0m: 	num_dec_layers: 3
[34m[1mwandb[0m: 	num_enc_layers: 3


Using device: cuda
Seq_to_Seq(
  (encoder): Encoder(
    (embedding): Embedding(100, 256)
    (dropout): Dropout(p=0.2, inplace=False)
    (rnn): LSTM(256, 512, num_layers=3, batch_first=True, dropout=0.2)
  )
  (decoder): Decoder(
    (embedding): Embedding(100, 256)
    (dropout): Dropout(p=0.2, inplace=False)
    (rnn): LSTM(256, 512, num_layers=3, batch_first=True, dropout=0.2)
    (fc): Linear(in_features=512, out_features=100, bias=True)
  )
)
Epoch: 1
	Train_Loss: 2.440, Train_Accuracy: 0.43%
	Val_Loss: 1.823,  Val_Accuracy: 6.05%
Epoch: 2
	Train_Loss: 1.201, Train_Accuracy: 9.26%
	Val_Loss: 1.351,  Val_Accuracy: 17.65%
Epoch: 3
	Train_Loss: 0.866, Train_Accuracy: 15.92%
	Val_Loss: 1.263,  Val_Accuracy: 20.24%
Epoch: 4
	Train_Loss: 0.729, Train_Accuracy: 19.67%
	Val_Loss: 1.192,  Val_Accuracy: 23.41%
Epoch: 5
	Train_Loss: 0.626, Train_Accuracy: 23.88%
	Val_Loss: 1.170,  Val_Accuracy: 28.83%
Epoch: 6
	Train_Loss: 0.562, Train_Accuracy: 26.22%
	Val_Loss: 1.101,  Val_Accuracy: 31.8

VBox(children=(Label(value='0.001 MB of 0.018 MB uploaded\r'), FloatProgress(value=0.07226785899968806, max=1.…

0,1
Val_Accuracy,▁▄▅▆▇█▆
Val_Loss,█▃▃▂▂▁▁
train_accuracy,▁▃▅▆▇▇█
training_loss,█▄▂▂▁▁▁

0,1
Val_Accuracy,26.2207
Val_Loss,1.10827
train_accuracy,28.47266
training_loss,0.50294


[34m[1mwandb[0m: Agent Starting Run: zxpvi8o6 with config:
[34m[1mwandb[0m: 	bidirectional: True
[34m[1mwandb[0m: 	cell_type: lstm
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	hid_layer_size: 256
[34m[1mwandb[0m: 	input_embed_size: 16
[34m[1mwandb[0m: 	new_learning_rate: 0.01
[34m[1mwandb[0m: 	num_dec_layers: 2
[34m[1mwandb[0m: 	num_enc_layers: 3


Using device: cuda
Seq_to_Seq(
  (encoder): Encoder(
    (embedding): Embedding(100, 16)
    (dropout): Dropout(p=0.2, inplace=False)
    (rnn): LSTM(16, 128, num_layers=3, batch_first=True, dropout=0.2, bidirectional=True)
  )
  (decoder): Decoder(
    (embedding): Embedding(100, 16)
    (dropout): Dropout(p=0.2, inplace=False)
    (rnn): LSTM(16, 768, num_layers=2, batch_first=True, dropout=0.2)
    (fc): Linear(in_features=768, out_features=100, bias=True)
  )
)
Epoch: 1
	Train_Loss: 2.141, Train_Accuracy: 0.72%
	Val_Loss: 1.655,  Val_Accuracy: 5.66%
Epoch: 2
	Train_Loss: 1.379, Train_Accuracy: 3.89%
	Val_Loss: 1.529,  Val_Accuracy: 8.50%
Epoch: 3
	Train_Loss: 1.220, Train_Accuracy: 5.41%
	Val_Loss: 1.463,  Val_Accuracy: 12.50%
Epoch: 4
	Train_Loss: 1.131, Train_Accuracy: 6.84%
	Val_Loss: 1.387,  Val_Accuracy: 12.50%
Epoch: 5
	Train_Loss: 1.086, Train_Accuracy: 7.17%
	Val_Loss: 1.362,  Val_Accuracy: 15.21%
Epoch: 6
	Train_Loss: 1.059, Train_Accuracy: 8.14%
	Val_Loss: 1.363,  Val_Acc

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
Val_Accuracy,▁▃▅▅▇█▇
Val_Loss,█▅▃▂▁▁▁
train_accuracy,▁▄▅▇▇██
training_loss,█▃▂▂▁▁▁

0,1
Val_Accuracy,15.06348
Val_Loss,1.36958
train_accuracy,8.03516
training_loss,1.03156


[34m[1mwandb[0m: Agent Starting Run: hnz3vy39 with config:
[34m[1mwandb[0m: 	bidirectional: True
[34m[1mwandb[0m: 	cell_type: lstm
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	hid_layer_size: 512
[34m[1mwandb[0m: 	input_embed_size: 256
[34m[1mwandb[0m: 	new_learning_rate: 0.01
[34m[1mwandb[0m: 	num_dec_layers: 3
[34m[1mwandb[0m: 	num_enc_layers: 3


Using device: cuda
Seq_to_Seq(
  (encoder): Encoder(
    (embedding): Embedding(100, 256)
    (dropout): Dropout(p=0.2, inplace=False)
    (rnn): LSTM(256, 256, num_layers=3, batch_first=True, dropout=0.2, bidirectional=True)
  )
  (decoder): Decoder(
    (embedding): Embedding(100, 256)
    (dropout): Dropout(p=0.2, inplace=False)
    (rnn): LSTM(256, 1536, num_layers=3, batch_first=True, dropout=0.2)
    (fc): Linear(in_features=1536, out_features=100, bias=True)
  )
)
Epoch: 1
	Train_Loss: 3.284, Train_Accuracy: 0.00%
	Val_Loss: 3.158,  Val_Accuracy: 0.00%
Epoch: 2
	Train_Loss: 3.190, Train_Accuracy: 0.00%
	Val_Loss: 3.093,  Val_Accuracy: 0.00%
Epoch: 3
	Train_Loss: 3.142, Train_Accuracy: 0.00%
	Val_Loss: 3.231,  Val_Accuracy: 0.00%
Epoch: 4
	Train_Loss: 3.141, Train_Accuracy: 0.00%
	Val_Loss: 3.020,  Val_Accuracy: 0.00%
Epoch: 5
	Train_Loss: 3.133, Train_Accuracy: 0.00%
	Val_Loss: 3.077,  Val_Accuracy: 0.00%
Epoch: 6
	Train_Loss: 3.137, Train_Accuracy: 0.00%
	Val_Loss: 3.063,  Val_

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
Val_Accuracy,▁▁▁▁▁▁▁
Val_Loss,▆▄█▂▃▃▁
train_accuracy,▁▁▁▁▁▁▁
training_loss,█▄▁▁▁▁▂

0,1
Val_Accuracy,0.0
Val_Loss,3.00119
train_accuracy,0.0
training_loss,3.14736


[34m[1mwandb[0m: Agent Starting Run: pvlqb6a6 with config:
[34m[1mwandb[0m: 	bidirectional: True
[34m[1mwandb[0m: 	cell_type: lstm
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	hid_layer_size: 64
[34m[1mwandb[0m: 	input_embed_size: 32
[34m[1mwandb[0m: 	new_learning_rate: 0.001
[34m[1mwandb[0m: 	num_dec_layers: 3
[34m[1mwandb[0m: 	num_enc_layers: 3


Using device: cuda
Seq_to_Seq(
  (encoder): Encoder(
    (embedding): Embedding(100, 32)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): LSTM(32, 32, num_layers=3, batch_first=True, dropout=0.3, bidirectional=True)
  )
  (decoder): Decoder(
    (embedding): Embedding(100, 32)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): LSTM(32, 192, num_layers=3, batch_first=True, dropout=0.3)
    (fc): Linear(in_features=192, out_features=100, bias=True)
  )
)
Epoch: 1
	Train_Loss: 3.024, Train_Accuracy: 0.00%
	Val_Loss: 2.931,  Val_Accuracy: 0.02%
Epoch: 2
	Train_Loss: 2.628, Train_Accuracy: 0.00%
	Val_Loss: 2.436,  Val_Accuracy: 0.15%
Epoch: 3
	Train_Loss: 2.152, Train_Accuracy: 0.14%
	Val_Loss: 1.988,  Val_Accuracy: 2.12%
Epoch: 4
	Train_Loss: 1.791, Train_Accuracy: 0.85%
	Val_Loss: 1.766,  Val_Accuracy: 5.71%
Epoch: 5
	Train_Loss: 1.567, Train_Accuracy: 1.95%
	Val_Loss: 1.628,  Val_Accuracy: 9.18%
Epoch: 6
	Train_Loss: 1.421, Train_Accuracy: 3.43%
	Val_Loss: 1.470,  Val_Accurac

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
Val_Accuracy,▁▁▂▄▆▇█
Val_Loss,█▆▃▂▂▁▁
train_accuracy,▁▁▁▂▄▇█
training_loss,█▆▄▃▂▁▁

0,1
Val_Accuracy,13.06152
Val_Loss,1.46997
train_accuracy,4.18164
training_loss,1.30768


[34m[1mwandb[0m: Agent Starting Run: cn29cvbs with config:
[34m[1mwandb[0m: 	bidirectional: True
[34m[1mwandb[0m: 	cell_type: lstm
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	hid_layer_size: 16
[34m[1mwandb[0m: 	input_embed_size: 32
[34m[1mwandb[0m: 	new_learning_rate: 0.001
[34m[1mwandb[0m: 	num_dec_layers: 2
[34m[1mwandb[0m: 	num_enc_layers: 2


Using device: cuda
Seq_to_Seq(
  (encoder): Encoder(
    (embedding): Embedding(100, 32)
    (dropout): Dropout(p=0.2, inplace=False)
    (rnn): LSTM(32, 8, num_layers=2, batch_first=True, dropout=0.2, bidirectional=True)
  )
  (decoder): Decoder(
    (embedding): Embedding(100, 32)
    (dropout): Dropout(p=0.2, inplace=False)
    (rnn): LSTM(32, 32, num_layers=2, batch_first=True, dropout=0.2)
    (fc): Linear(in_features=32, out_features=100, bias=True)
  )
)
Epoch: 1
	Train_Loss: 3.233, Train_Accuracy: 0.00%
	Val_Loss: 2.986,  Val_Accuracy: 0.00%
Epoch: 2
	Train_Loss: 2.869, Train_Accuracy: 0.00%
	Val_Loss: 2.874,  Val_Accuracy: 0.02%
Epoch: 3
	Train_Loss: 2.718, Train_Accuracy: 0.01%
	Val_Loss: 2.675,  Val_Accuracy: 0.12%
Epoch: 4
	Train_Loss: 2.577, Train_Accuracy: 0.01%
	Val_Loss: 2.561,  Val_Accuracy: 0.24%
Epoch: 5
	Train_Loss: 2.473, Train_Accuracy: 0.03%
	Val_Loss: 2.456,  Val_Accuracy: 0.27%
Epoch: 6
	Train_Loss: 2.387, Train_Accuracy: 0.05%
	Val_Loss: 2.384,  Val_Accuracy: 

VBox(children=(Label(value='0.001 MB of 0.019 MB uploaded\r'), FloatProgress(value=0.07139555190302532, max=1.…

0,1
Val_Accuracy,▁▁▂▄▄▅█
Val_Loss,█▇▅▄▃▂▁
train_accuracy,▁▁▂▂▃▆█
training_loss,█▅▄▃▂▁▁

0,1
Val_Accuracy,0.65918
Val_Loss,2.30618
train_accuracy,0.07227
training_loss,2.33035


[34m[1mwandb[0m: Agent Starting Run: fyl5yvh8 with config:
[34m[1mwandb[0m: 	bidirectional: False
[34m[1mwandb[0m: 	cell_type: lstm
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	hid_layer_size: 512
[34m[1mwandb[0m: 	input_embed_size: 256
[34m[1mwandb[0m: 	new_learning_rate: 0.1
[34m[1mwandb[0m: 	num_dec_layers: 1
[34m[1mwandb[0m: 	num_enc_layers: 2


Using device: cuda
Seq_to_Seq(
  (encoder): Encoder(
    (embedding): Embedding(100, 256)
    (dropout): Dropout(p=0.2, inplace=False)
    (rnn): LSTM(256, 512, num_layers=2, batch_first=True, dropout=0.2)
  )
  (decoder): Decoder(
    (embedding): Embedding(100, 256)
    (dropout): Dropout(p=0.2, inplace=False)
    (rnn): LSTM(256, 512, batch_first=True)
    (fc): Linear(in_features=512, out_features=100, bias=True)
  )
)
Epoch: 1
	Train_Loss: 3.261, Train_Accuracy: 0.00%
	Val_Loss: 3.154,  Val_Accuracy: 0.00%
Epoch: 2
	Train_Loss: 3.153, Train_Accuracy: 0.00%
	Val_Loss: 3.274,  Val_Accuracy: 0.00%
Epoch: 3
	Train_Loss: 3.193, Train_Accuracy: 0.00%
	Val_Loss: 3.340,  Val_Accuracy: 0.00%
Epoch: 4
	Train_Loss: 3.190, Train_Accuracy: 0.00%
	Val_Loss: 3.194,  Val_Accuracy: 0.00%
Epoch: 5
	Train_Loss: 3.259, Train_Accuracy: 0.00%
	Val_Loss: 3.361,  Val_Accuracy: 0.00%
Epoch: 6
	Train_Loss: 3.240, Train_Accuracy: 0.00%
	Val_Loss: 3.230,  Val_Accuracy: 0.00%
Epoch: 7
	Train_Loss: 3.232, Trai

VBox(children=(Label(value='0.019 MB of 0.019 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
Val_Accuracy,▁▁▁▁▁▁▁
Val_Loss,▁▅▇▂█▄▂
train_accuracy,▁▁▁▁▁▁▁
training_loss,█▁▄▃█▇▆

0,1
Val_Accuracy,0.0
Val_Loss,3.19278
train_accuracy,0.0
training_loss,3.23167


[34m[1mwandb[0m: Agent Starting Run: 127nxe86 with config:
[34m[1mwandb[0m: 	bidirectional: True
[34m[1mwandb[0m: 	cell_type: lstm
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	hid_layer_size: 512
[34m[1mwandb[0m: 	input_embed_size: 16
[34m[1mwandb[0m: 	new_learning_rate: 0.001
[34m[1mwandb[0m: 	num_dec_layers: 3
[34m[1mwandb[0m: 	num_enc_layers: 1


Using device: cuda
Seq_to_Seq(
  (encoder): Encoder(
    (embedding): Embedding(100, 16)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): LSTM(16, 256, batch_first=True, bidirectional=True)
  )
  (decoder): Decoder(
    (embedding): Embedding(100, 16)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): LSTM(16, 512, num_layers=3, batch_first=True, dropout=0.3)
    (fc): Linear(in_features=512, out_features=100, bias=True)
  )
)
Epoch: 1
	Train_Loss: 2.957, Train_Accuracy: 0.00%
	Val_Loss: 2.786,  Val_Accuracy: 0.02%
Epoch: 2
	Train_Loss: 2.173, Train_Accuracy: 0.26%
	Val_Loss: 1.891,  Val_Accuracy: 3.20%
Epoch: 3
	Train_Loss: 1.501, Train_Accuracy: 3.86%
	Val_Loss: 1.524,  Val_Accuracy: 10.84%
Epoch: 4
	Train_Loss: 1.154, Train_Accuracy: 8.47%
	Val_Loss: 1.393,  Val_Accuracy: 16.14%
Epoch: 5
	Train_Loss: 0.971, Train_Accuracy: 10.87%
	Val_Loss: 1.274,  Val_Accuracy: 16.94%
Epoch: 6
	Train_Loss: 0.842, Train_Accuracy: 13.10%
	Val_Loss: 1.236,  Val_Accuracy: 18.87%
Epoch: 7
	T

VBox(children=(Label(value='0.019 MB of 0.019 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
Val_Accuracy,▁▂▄▆▆▇█
Val_Loss,█▄▂▂▁▁▁
train_accuracy,▁▁▃▅▆▇█
training_loss,█▆▃▂▂▁▁

0,1
Val_Accuracy,22.36328
Val_Loss,1.22428
train_accuracy,15.86719
training_loss,0.74159


[34m[1mwandb[0m: Agent Starting Run: 2rgssalt with config:
[34m[1mwandb[0m: 	bidirectional: False
[34m[1mwandb[0m: 	cell_type: lstm
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	hid_layer_size: 512
[34m[1mwandb[0m: 	input_embed_size: 512
[34m[1mwandb[0m: 	new_learning_rate: 0.01
[34m[1mwandb[0m: 	num_dec_layers: 3
[34m[1mwandb[0m: 	num_enc_layers: 1


Using device: cuda
Seq_to_Seq(
  (encoder): Encoder(
    (embedding): Embedding(100, 512)
    (dropout): Dropout(p=0.2, inplace=False)
    (rnn): LSTM(512, 512, batch_first=True)
  )
  (decoder): Decoder(
    (embedding): Embedding(100, 512)
    (dropout): Dropout(p=0.2, inplace=False)
    (rnn): LSTM(512, 512, num_layers=3, batch_first=True, dropout=0.2)
    (fc): Linear(in_features=512, out_features=100, bias=True)
  )
)
Epoch: 3
	Train_Loss: 2.821, Train_Accuracy: 0.00%
	Val_Loss: 2.944,  Val_Accuracy: 0.00%
Epoch: 4
	Train_Loss: 2.766, Train_Accuracy: 0.00%
	Val_Loss: 2.916,  Val_Accuracy: 0.00%
Epoch: 5
	Train_Loss: 2.721, Train_Accuracy: 0.00%
	Val_Loss: 2.876,  Val_Accuracy: 0.00%
Epoch: 6
	Train_Loss: 2.681, Train_Accuracy: 0.00%
	Val_Loss: 2.870,  Val_Accuracy: 0.00%
Epoch: 7
	Train_Loss: 2.653, Train_Accuracy: 0.00%
	Val_Loss: 2.808,  Val_Accuracy: 0.00%


VBox(children=(Label(value='0.019 MB of 0.019 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
Val_Accuracy,▁▁▁▁▁▁▁
Val_Loss,██▆▅▃▃▁
train_accuracy,▁▁▁▁▁▁▁
training_loss,█▆▅▄▃▂▁

0,1
Val_Accuracy,0.0
Val_Loss,2.80793
train_accuracy,0.0
training_loss,2.65335


[34m[1mwandb[0m: Agent Starting Run: jlrn2y4u with config:
[34m[1mwandb[0m: 	bidirectional: False
[34m[1mwandb[0m: 	cell_type: lstm
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	hid_layer_size: 512
[34m[1mwandb[0m: 	input_embed_size: 512
[34m[1mwandb[0m: 	new_learning_rate: 0.01
[34m[1mwandb[0m: 	num_dec_layers: 2
[34m[1mwandb[0m: 	num_enc_layers: 2


Using device: cuda
Seq_to_Seq(
  (encoder): Encoder(
    (embedding): Embedding(100, 512)
    (dropout): Dropout(p=0.2, inplace=False)
    (rnn): LSTM(512, 512, num_layers=2, batch_first=True, dropout=0.2)
  )
  (decoder): Decoder(
    (embedding): Embedding(100, 512)
    (dropout): Dropout(p=0.2, inplace=False)
    (rnn): LSTM(512, 512, num_layers=2, batch_first=True, dropout=0.2)
    (fc): Linear(in_features=512, out_features=100, bias=True)
  )
)
Epoch: 1
	Train_Loss: 2.635, Train_Accuracy: 0.03%
	Val_Loss: 2.322,  Val_Accuracy: 0.51%
Epoch: 2
	Train_Loss: 2.128, Train_Accuracy: 0.24%
	Val_Loss: 2.151,  Val_Accuracy: 1.37%
Epoch: 3
	Train_Loss: 1.984, Train_Accuracy: 0.50%
	Val_Loss: 1.998,  Val_Accuracy: 2.00%
Epoch: 4
	Train_Loss: 1.877, Train_Accuracy: 0.83%
	Val_Loss: 1.951,  Val_Accuracy: 3.39%
Epoch: 5
	Train_Loss: 1.820, Train_Accuracy: 1.06%
	Val_Loss: 1.894,  Val_Accuracy: 3.20%
Epoch: 6
	Train_Loss: 1.774, Train_Accuracy: 1.42%
	Val_Loss: 1.864,  Val_Accuracy: 3.61%
Epoch:

VBox(children=(Label(value='0.019 MB of 0.019 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
Val_Accuracy,▁▃▄▆▆▇█
Val_Loss,█▆▃▃▂▂▁
train_accuracy,▁▂▃▅▆██
training_loss,█▄▃▂▂▁▁

0,1
Val_Accuracy,4.19922
Val_Loss,1.82083
train_accuracy,1.48047
training_loss,1.75254


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 9fsl9gx1 with config:
[34m[1mwandb[0m: 	bidirectional: False
[34m[1mwandb[0m: 	cell_type: lstm
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	hid_layer_size: 64
[34m[1mwandb[0m: 	input_embed_size: 256
[34m[1mwandb[0m: 	new_learning_rate: 0.01
[34m[1mwandb[0m: 	num_dec_layers: 1
[34m[1mwandb[0m: 	num_enc_layers: 3


Using device: cuda
Seq_to_Seq(
  (encoder): Encoder(
    (embedding): Embedding(100, 256)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): LSTM(256, 64, num_layers=3, batch_first=True, dropout=0.3)
  )
  (decoder): Decoder(
    (embedding): Embedding(100, 256)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): LSTM(256, 64, batch_first=True)
    (fc): Linear(in_features=64, out_features=100, bias=True)
  )
)
Epoch: 1
	Train_Loss: 2.966, Train_Accuracy: 0.00%
	Val_Loss: 3.084,  Val_Accuracy: 0.00%
Epoch: 2
	Train_Loss: 2.897, Train_Accuracy: 0.00%
	Val_Loss: 3.063,  Val_Accuracy: 0.00%
Epoch: 3
	Train_Loss: 2.859, Train_Accuracy: 0.00%
	Val_Loss: 3.028,  Val_Accuracy: 0.00%
Epoch: 4
	Train_Loss: 2.848, Train_Accuracy: 0.00%
	Val_Loss: 3.059,  Val_Accuracy: 0.00%
Epoch: 5
	Train_Loss: 2.836, Train_Accuracy: 0.00%
	Val_Loss: 3.033,  Val_Accuracy: 0.00%
Epoch: 6
	Train_Loss: 2.826, Train_Accuracy: 0.00%
	Val_Loss: 3.076,  Val_Accuracy: 0.00%
Epoch: 7
	Train_Loss: 2.820, Train_A

VBox(children=(Label(value='0.019 MB of 0.019 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
Val_Accuracy,▁▁▁▁▁▁▁
Val_Loss,█▆▃▆▃▇▁
train_accuracy,▁▁▁▁▁█▁
training_loss,█▅▃▂▂▁▁

0,1
Val_Accuracy,0.0
Val_Loss,3.00976
train_accuracy,0.0
training_loss,2.81953


[34m[1mwandb[0m: Agent Starting Run: xwgj6yyk with config:
[34m[1mwandb[0m: 	bidirectional: False
[34m[1mwandb[0m: 	cell_type: lstm
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	hid_layer_size: 16
[34m[1mwandb[0m: 	input_embed_size: 16
[34m[1mwandb[0m: 	new_learning_rate: 0.001
[34m[1mwandb[0m: 	num_dec_layers: 1
[34m[1mwandb[0m: 	num_enc_layers: 3


Using device: cuda
Seq_to_Seq(
  (encoder): Encoder(
    (embedding): Embedding(100, 16)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): LSTM(16, 16, num_layers=3, batch_first=True, dropout=0.3)
  )
  (decoder): Decoder(
    (embedding): Embedding(100, 16)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): LSTM(16, 16, batch_first=True)
    (fc): Linear(in_features=16, out_features=100, bias=True)
  )
)
Epoch: 1
	Train_Loss: 3.366, Train_Accuracy: 0.00%
	Val_Loss: 3.124,  Val_Accuracy: 0.00%
Epoch: 2
	Train_Loss: 3.074, Train_Accuracy: 0.00%
	Val_Loss: 3.064,  Val_Accuracy: 0.00%
Epoch: 3
	Train_Loss: 2.987, Train_Accuracy: 0.00%
	Val_Loss: 3.017,  Val_Accuracy: 0.00%
Epoch: 4
	Train_Loss: 2.953, Train_Accuracy: 0.00%
	Val_Loss: 3.053,  Val_Accuracy: 0.00%
Epoch: 5
	Train_Loss: 2.931, Train_Accuracy: 0.00%
	Val_Loss: 3.012,  Val_Accuracy: 0.00%
Epoch: 6
	Train_Loss: 2.916, Train_Accuracy: 0.00%
	Val_Loss: 3.023,  Val_Accuracy: 0.00%
Epoch: 7
	Train_Loss: 2.903, Train_Accur

VBox(children=(Label(value='0.001 MB of 0.018 MB uploaded\r'), FloatProgress(value=0.0724448845572523, max=1.0…

0,1
Val_Accuracy,▁▁▁▁▁▁▁
Val_Loss,█▄▁▄▁▂▁
train_accuracy,▁▁▁▁▁▁▁
training_loss,█▄▂▂▁▁▁

0,1
Val_Accuracy,0.0
Val_Loss,3.01378
train_accuracy,0.0
training_loss,2.9029


[34m[1mwandb[0m: Agent Starting Run: 5z4zpd7y with config:
[34m[1mwandb[0m: 	bidirectional: True
[34m[1mwandb[0m: 	cell_type: lstm
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	hid_layer_size: 16
[34m[1mwandb[0m: 	input_embed_size: 64
[34m[1mwandb[0m: 	new_learning_rate: 0.01
[34m[1mwandb[0m: 	num_dec_layers: 3
[34m[1mwandb[0m: 	num_enc_layers: 3


Using device: cuda
Seq_to_Seq(
  (encoder): Encoder(
    (embedding): Embedding(100, 64)
    (dropout): Dropout(p=0.2, inplace=False)
    (rnn): LSTM(64, 8, num_layers=3, batch_first=True, dropout=0.2, bidirectional=True)
  )
  (decoder): Decoder(
    (embedding): Embedding(100, 64)
    (dropout): Dropout(p=0.2, inplace=False)
    (rnn): LSTM(64, 48, num_layers=3, batch_first=True, dropout=0.2)
    (fc): Linear(in_features=48, out_features=100, bias=True)
  )
)
Epoch: 1
	Train_Loss: 2.953, Train_Accuracy: 0.00%
	Val_Loss: 2.776,  Val_Accuracy: 0.05%
Epoch: 2
	Train_Loss: 2.584, Train_Accuracy: 0.01%
	Val_Loss: 2.538,  Val_Accuracy: 0.02%
Epoch: 3
	Train_Loss: 2.413, Train_Accuracy: 0.02%
	Val_Loss: 2.393,  Val_Accuracy: 0.15%
Epoch: 4
	Train_Loss: 2.328, Train_Accuracy: 0.03%
	Val_Loss: 2.332,  Val_Accuracy: 0.27%
Epoch: 5
	Train_Loss: 2.278, Train_Accuracy: 0.02%
	Val_Loss: 2.308,  Val_Accuracy: 0.42%
Epoch: 6
	Train_Loss: 2.244, Train_Accuracy: 0.04%
	Val_Loss: 2.297,  Val_Accuracy: 

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
Val_Accuracy,▁▁▃▅▇█▆
Val_Loss,█▅▃▂▂▂▁
train_accuracy,▁▂▅▅▅▇█
training_loss,█▄▃▂▂▁▁

0,1
Val_Accuracy,0.36621
Val_Loss,2.24628
train_accuracy,0.04688
training_loss,2.22128


[34m[1mwandb[0m: Agent Starting Run: um3lh8ax with config:
[34m[1mwandb[0m: 	bidirectional: False
[34m[1mwandb[0m: 	cell_type: lstm
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	hid_layer_size: 32
[34m[1mwandb[0m: 	input_embed_size: 16
[34m[1mwandb[0m: 	new_learning_rate: 0.001
[34m[1mwandb[0m: 	num_dec_layers: 2
[34m[1mwandb[0m: 	num_enc_layers: 1


Using device: cuda
Seq_to_Seq(
  (encoder): Encoder(
    (embedding): Embedding(100, 16)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): LSTM(16, 32, batch_first=True)
  )
  (decoder): Decoder(
    (embedding): Embedding(100, 16)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): LSTM(16, 32, num_layers=2, batch_first=True, dropout=0.3)
    (fc): Linear(in_features=32, out_features=100, bias=True)
  )
)
Epoch: 1
	Train_Loss: 3.275, Train_Accuracy: 0.00%
	Val_Loss: 3.069,  Val_Accuracy: 0.00%
Epoch: 2
	Train_Loss: 3.015, Train_Accuracy: 0.00%
	Val_Loss: 3.056,  Val_Accuracy: 0.00%
Epoch: 3
	Train_Loss: 2.936, Train_Accuracy: 0.00%
	Val_Loss: 3.015,  Val_Accuracy: 0.00%
Epoch: 4
	Train_Loss: 2.901, Train_Accuracy: 0.00%
	Val_Loss: 2.968,  Val_Accuracy: 0.00%
Epoch: 5
	Train_Loss: 2.870, Train_Accuracy: 0.00%
	Val_Loss: 2.968,  Val_Accuracy: 0.00%
Epoch: 6
	Train_Loss: 2.836, Train_Accuracy: 0.00%
	Val_Loss: 2.895,  Val_Accuracy: 0.00%
Epoch: 7
	Train_Loss: 2.798, Train_Accur

VBox(children=(Label(value='0.001 MB of 0.019 MB uploaded\r'), FloatProgress(value=0.07163471449185735, max=1.…

0,1
Val_Accuracy,▁▁▁▁▁▁▁
Val_Loss,█▇▆▄▄▁▁
train_accuracy,▁▁▁▁▁▁▁
training_loss,█▄▃▃▂▂▁

0,1
Val_Accuracy,0.0
Val_Loss,2.88362
train_accuracy,0.0
training_loss,2.79808


[34m[1mwandb[0m: Agent Starting Run: 5nx1gqia with config:
[34m[1mwandb[0m: 	bidirectional: True
[34m[1mwandb[0m: 	cell_type: lstm
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	hid_layer_size: 512
[34m[1mwandb[0m: 	input_embed_size: 64
[34m[1mwandb[0m: 	new_learning_rate: 0.01
[34m[1mwandb[0m: 	num_dec_layers: 1
[34m[1mwandb[0m: 	num_enc_layers: 2


Using device: cuda
Seq_to_Seq(
  (encoder): Encoder(
    (embedding): Embedding(100, 64)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): LSTM(64, 256, num_layers=2, batch_first=True, dropout=0.3, bidirectional=True)
  )
  (decoder): Decoder(
    (embedding): Embedding(100, 64)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): LSTM(64, 1024, batch_first=True)
    (fc): Linear(in_features=1024, out_features=100, bias=True)
  )
)
Epoch: 1
	Train_Loss: 2.501, Train_Accuracy: 0.00%
	Val_Loss: 2.262,  Val_Accuracy: 0.59%
Epoch: 2
	Train_Loss: 2.031, Train_Accuracy: 0.24%
	Val_Loss: 2.041,  Val_Accuracy: 1.64%
Epoch: 3
	Train_Loss: 1.810, Train_Accuracy: 0.67%
	Val_Loss: 1.917,  Val_Accuracy: 2.66%
Epoch: 4
	Train_Loss: 1.643, Train_Accuracy: 1.18%
	Val_Loss: 1.765,  Val_Accuracy: 4.00%
Epoch: 5
	Train_Loss: 1.531, Train_Accuracy: 1.47%
	Val_Loss: 1.745,  Val_Accuracy: 6.27%
Epoch: 6
	Train_Loss: 1.477, Train_Accuracy: 1.27%
	Val_Loss: 1.641,  Val_Accuracy: 6.54%
Epoch: 7
	Train

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
Val_Accuracy,▁▂▃▄▇▇█
Val_Loss,█▆▄▂▂▁▁
train_accuracy,▁▂▄▇█▇█
training_loss,█▅▄▂▂▁▁

0,1
Val_Accuracy,7.8125
Val_Loss,1.64825
train_accuracy,1.44336
training_loss,1.42079


[34m[1mwandb[0m: Agent Starting Run: ihwqu15r with config:
[34m[1mwandb[0m: 	bidirectional: True
[34m[1mwandb[0m: 	cell_type: lstm
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	hid_layer_size: 32
[34m[1mwandb[0m: 	input_embed_size: 32
[34m[1mwandb[0m: 	new_learning_rate: 0.001
[34m[1mwandb[0m: 	num_dec_layers: 1
[34m[1mwandb[0m: 	num_enc_layers: 1


Using device: cuda
Seq_to_Seq(
  (encoder): Encoder(
    (embedding): Embedding(100, 32)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): LSTM(32, 16, batch_first=True, bidirectional=True)
  )
  (decoder): Decoder(
    (embedding): Embedding(100, 32)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): LSTM(32, 32, batch_first=True)
    (fc): Linear(in_features=32, out_features=100, bias=True)
  )
)
Epoch: 1
	Train_Loss: 3.113, Train_Accuracy: 0.00%
	Val_Loss: 2.750,  Val_Accuracy: 0.00%
Epoch: 2
	Train_Loss: 2.661, Train_Accuracy: 0.01%
	Val_Loss: 2.536,  Val_Accuracy: 0.29%
Epoch: 3
	Train_Loss: 2.520, Train_Accuracy: 0.03%
	Val_Loss: 2.410,  Val_Accuracy: 0.51%
Epoch: 4
	Train_Loss: 2.420, Train_Accuracy: 0.07%
	Val_Loss: 2.313,  Val_Accuracy: 0.78%
Epoch: 5
	Train_Loss: 2.350, Train_Accuracy: 0.08%
	Val_Loss: 2.263,  Val_Accuracy: 1.03%
Epoch: 6
	Train_Loss: 2.298, Train_Accuracy: 0.14%
	Val_Loss: 2.213,  Val_Accuracy: 1.29%
Epoch: 7
	Train_Loss: 2.254, Train_Accuracy: 0.

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
Val_Accuracy,▁▂▃▅▆▇█
Val_Loss,█▅▄▃▂▁▁
train_accuracy,▁▁▂▄▅▇█
training_loss,█▄▃▂▂▁▁

0,1
Val_Accuracy,1.53809
Val_Loss,2.17742
train_accuracy,0.15625
training_loss,2.25369


[34m[1mwandb[0m: Agent Starting Run: wnr59c0h with config:
[34m[1mwandb[0m: 	bidirectional: True
[34m[1mwandb[0m: 	cell_type: lstm
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	hid_layer_size: 16
[34m[1mwandb[0m: 	input_embed_size: 64
[34m[1mwandb[0m: 	new_learning_rate: 0.001
[34m[1mwandb[0m: 	num_dec_layers: 1
[34m[1mwandb[0m: 	num_enc_layers: 3


Using device: cuda
Seq_to_Seq(
  (encoder): Encoder(
    (embedding): Embedding(100, 64)
    (dropout): Dropout(p=0.2, inplace=False)
    (rnn): LSTM(64, 8, num_layers=3, batch_first=True, dropout=0.2, bidirectional=True)
  )
  (decoder): Decoder(
    (embedding): Embedding(100, 64)
    (dropout): Dropout(p=0.2, inplace=False)
    (rnn): LSTM(64, 48, batch_first=True)
    (fc): Linear(in_features=48, out_features=100, bias=True)
  )
)
Epoch: 1
	Train_Loss: 2.988, Train_Accuracy: 0.00%
	Val_Loss: 2.785,  Val_Accuracy: 0.02%
Epoch: 2
	Train_Loss: 2.536, Train_Accuracy: 0.01%
	Val_Loss: 2.492,  Val_Accuracy: 0.15%
Epoch: 3
	Train_Loss: 2.352, Train_Accuracy: 0.04%
	Val_Loss: 2.324,  Val_Accuracy: 0.42%
Epoch: 4
	Train_Loss: 2.251, Train_Accuracy: 0.11%
	Val_Loss: 2.233,  Val_Accuracy: 0.78%
Epoch: 5
	Train_Loss: 2.170, Train_Accuracy: 0.17%
	Val_Loss: 2.163,  Val_Accuracy: 1.20%
Epoch: 6
	Train_Loss: 2.097, Train_Accuracy: 0.28%
	Val_Loss: 2.086,  Val_Accuracy: 1.81%
Epoch: 7
	Train_Loss:

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
Val_Accuracy,▁▁▂▄▅▇█
Val_Loss,█▅▄▃▂▂▁
train_accuracy,▁▁▂▃▅▇█
training_loss,█▅▃▃▂▁▁

0,1
Val_Accuracy,2.05078
Val_Loss,2.0282
train_accuracy,0.3457
training_loss,2.0317


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: qdzrspqo with config:
[34m[1mwandb[0m: 	bidirectional: False
[34m[1mwandb[0m: 	cell_type: lstm
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	hid_layer_size: 512
[34m[1mwandb[0m: 	input_embed_size: 64
[34m[1mwandb[0m: 	new_learning_rate: 0.1
[34m[1mwandb[0m: 	num_dec_layers: 1
[34m[1mwandb[0m: 	num_enc_layers: 3


Using device: cuda
Seq_to_Seq(
  (encoder): Encoder(
    (embedding): Embedding(100, 64)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): LSTM(64, 512, num_layers=3, batch_first=True, dropout=0.3)
  )
  (decoder): Decoder(
    (embedding): Embedding(100, 64)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): LSTM(64, 512, batch_first=True)
    (fc): Linear(in_features=512, out_features=100, bias=True)
  )
)
Epoch: 1
	Train_Loss: 3.182, Train_Accuracy: 0.00%
	Val_Loss: 3.274,  Val_Accuracy: 0.00%
Epoch: 2
	Train_Loss: 3.115, Train_Accuracy: 0.00%
	Val_Loss: 3.154,  Val_Accuracy: 0.00%
Epoch: 3
	Train_Loss: 3.091, Train_Accuracy: 0.00%
	Val_Loss: 3.194,  Val_Accuracy: 0.00%
Epoch: 4
	Train_Loss: 3.084, Train_Accuracy: 0.00%
	Val_Loss: 3.149,  Val_Accuracy: 0.00%
Epoch: 5
	Train_Loss: 3.085, Train_Accuracy: 0.00%
	Val_Loss: 3.141,  Val_Accuracy: 0.00%
Epoch: 6
	Train_Loss: 3.084, Train_Accuracy: 0.00%
	Val_Loss: 3.172,  Val_Accuracy: 0.00%
Epoch: 7
	Train_Loss: 3.090, Train_Ac

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
Val_Accuracy,▁▁▁▁▁▁▁
Val_Loss,█▂▄▁▁▃▁
train_accuracy,▁▁▁▁▁▁▁
training_loss,█▃▂▁▁▁▁

0,1
Val_Accuracy,0.0
Val_Loss,3.14683
train_accuracy,0.0
training_loss,3.09036


[34m[1mwandb[0m: Agent Starting Run: 1ofys77n with config:
[34m[1mwandb[0m: 	bidirectional: True
[34m[1mwandb[0m: 	cell_type: lstm
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	hid_layer_size: 16
[34m[1mwandb[0m: 	input_embed_size: 512
[34m[1mwandb[0m: 	new_learning_rate: 0.001
[34m[1mwandb[0m: 	num_dec_layers: 3
[34m[1mwandb[0m: 	num_enc_layers: 3


Using device: cuda
Seq_to_Seq(
  (encoder): Encoder(
    (embedding): Embedding(100, 512)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): LSTM(512, 8, num_layers=3, batch_first=True, dropout=0.3, bidirectional=True)
  )
  (decoder): Decoder(
    (embedding): Embedding(100, 512)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): LSTM(512, 48, num_layers=3, batch_first=True, dropout=0.3)
    (fc): Linear(in_features=48, out_features=100, bias=True)
  )
)
Epoch: 1
	Train_Loss: 3.233, Train_Accuracy: 0.00%
	Val_Loss: 3.036,  Val_Accuracy: 0.00%
Epoch: 2
	Train_Loss: 2.960, Train_Accuracy: 0.00%
	Val_Loss: 3.035,  Val_Accuracy: 0.00%
Epoch: 3
	Train_Loss: 2.892, Train_Accuracy: 0.00%
	Val_Loss: 2.947,  Val_Accuracy: 0.00%
Epoch: 4
	Train_Loss: 2.828, Train_Accuracy: 0.00%
	Val_Loss: 2.894,  Val_Accuracy: 0.00%
Epoch: 5
	Train_Loss: 2.782, Train_Accuracy: 0.00%
	Val_Loss: 2.837,  Val_Accuracy: 0.02%
Epoch: 6
	Train_Loss: 2.735, Train_Accuracy: 0.00%
	Val_Loss: 2.797,  Val_Accura

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
Val_Accuracy,▁▁▁▁▅█▅
Val_Loss,██▆▅▃▂▁
train_accuracy,▁▁▁█▁█▁
training_loss,█▄▄▃▂▁▁

0,1
Val_Accuracy,0.02441
Val_Loss,2.74133
train_accuracy,0.0
training_loss,2.69841


[34m[1mwandb[0m: Agent Starting Run: x359tq53 with config:
[34m[1mwandb[0m: 	bidirectional: False
[34m[1mwandb[0m: 	cell_type: lstm
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	hid_layer_size: 64
[34m[1mwandb[0m: 	input_embed_size: 512
[34m[1mwandb[0m: 	new_learning_rate: 0.01
[34m[1mwandb[0m: 	num_dec_layers: 1
[34m[1mwandb[0m: 	num_enc_layers: 2


Using device: cuda
Seq_to_Seq(
  (encoder): Encoder(
    (embedding): Embedding(100, 512)
    (dropout): Dropout(p=0.2, inplace=False)
    (rnn): LSTM(512, 64, num_layers=2, batch_first=True, dropout=0.2)
  )
  (decoder): Decoder(
    (embedding): Embedding(100, 512)
    (dropout): Dropout(p=0.2, inplace=False)
    (rnn): LSTM(512, 64, batch_first=True)
    (fc): Linear(in_features=64, out_features=100, bias=True)
  )
)
Epoch: 1
	Train_Loss: 2.998, Train_Accuracy: 0.00%
	Val_Loss: 3.217,  Val_Accuracy: 0.00%
Epoch: 2
	Train_Loss: 2.955, Train_Accuracy: 0.00%
	Val_Loss: 3.115,  Val_Accuracy: 0.00%
Epoch: 3
	Train_Loss: 2.941, Train_Accuracy: 0.00%
	Val_Loss: 3.082,  Val_Accuracy: 0.00%
Epoch: 4
	Train_Loss: 2.935, Train_Accuracy: 0.00%
	Val_Loss: 3.094,  Val_Accuracy: 0.00%
Epoch: 5
	Train_Loss: 2.933, Train_Accuracy: 0.00%
	Val_Loss: 3.108,  Val_Accuracy: 0.00%
Epoch: 6
	Train_Loss: 2.934, Train_Accuracy: 0.00%
	Val_Loss: 3.137,  Val_Accuracy: 0.00%
Epoch: 7
	Train_Loss: 2.931, Train_A

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
Val_Accuracy,▁▁▁▁▁▁▁
Val_Loss,█▃▁▂▂▄▄
train_accuracy,▁▁▁▁▁▁▁
training_loss,█▄▂▁▁▁▁

0,1
Val_Accuracy,0.0
Val_Loss,3.13184
train_accuracy,0.0
training_loss,2.93131


[34m[1mwandb[0m: Agent Starting Run: 4gumauyv with config:
[34m[1mwandb[0m: 	bidirectional: True
[34m[1mwandb[0m: 	cell_type: lstm
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	hid_layer_size: 512
[34m[1mwandb[0m: 	input_embed_size: 16
[34m[1mwandb[0m: 	new_learning_rate: 0.1
[34m[1mwandb[0m: 	num_dec_layers: 3
[34m[1mwandb[0m: 	num_enc_layers: 2


Using device: cuda
Seq_to_Seq(
  (encoder): Encoder(
    (embedding): Embedding(100, 16)
    (dropout): Dropout(p=0.2, inplace=False)
    (rnn): LSTM(16, 256, num_layers=2, batch_first=True, dropout=0.2, bidirectional=True)
  )
  (decoder): Decoder(
    (embedding): Embedding(100, 16)
    (dropout): Dropout(p=0.2, inplace=False)
    (rnn): LSTM(16, 1024, num_layers=3, batch_first=True, dropout=0.2)
    (fc): Linear(in_features=1024, out_features=100, bias=True)
  )
)
Epoch: 1
	Train_Loss: 55.365, Train_Accuracy: 0.00%
	Val_Loss: 44.881,  Val_Accuracy: 0.00%
Epoch: 2
	Train_Loss: 50.499, Train_Accuracy: 0.00%
	Val_Loss: 52.492,  Val_Accuracy: 0.00%
Epoch: 3
	Train_Loss: 51.907, Train_Accuracy: 0.00%
	Val_Loss: 54.862,  Val_Accuracy: 0.00%
Epoch: 4
	Train_Loss: 51.634, Train_Accuracy: 0.00%
	Val_Loss: 56.787,  Val_Accuracy: 0.00%
Epoch: 5
	Train_Loss: 51.413, Train_Accuracy: 0.00%
	Val_Loss: 52.221,  Val_Accuracy: 0.00%
Epoch: 6
	Train_Loss: 51.046, Train_Accuracy: 0.00%
	Val_Loss: 44.00

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
Val_Accuracy,▁▁▁▁▁▁▁
Val_Loss,▁▆▇█▆▁▄
train_accuracy,▁▁▁▁▁▁▁
training_loss,█▁▃▃▂▂▁

0,1
Val_Accuracy,0.0
Val_Loss,49.21648
train_accuracy,0.0
training_loss,50.84049


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: rnnc5kc5 with config:
[34m[1mwandb[0m: 	bidirectional: True
[34m[1mwandb[0m: 	cell_type: lstm
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	hid_layer_size: 256
[34m[1mwandb[0m: 	input_embed_size: 256
[34m[1mwandb[0m: 	new_learning_rate: 0.001
[34m[1mwandb[0m: 	num_dec_layers: 1
[34m[1mwandb[0m: 	num_enc_layers: 3


Using device: cuda
Seq_to_Seq(
  (encoder): Encoder(
    (embedding): Embedding(100, 256)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): LSTM(256, 128, num_layers=3, batch_first=True, dropout=0.3, bidirectional=True)
  )
  (decoder): Decoder(
    (embedding): Embedding(100, 256)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): LSTM(256, 768, batch_first=True)
    (fc): Linear(in_features=768, out_features=100, bias=True)
  )
)
Epoch: 1
	Train_Loss: 1.570, Train_Accuracy: 5.31%
	Val_Loss: 1.405,  Val_Accuracy: 12.99%
Epoch: 2
	Train_Loss: 0.923, Train_Accuracy: 14.22%
	Val_Loss: 1.281,  Val_Accuracy: 19.53%
Epoch: 3
	Train_Loss: 0.761, Train_Accuracy: 18.40%
	Val_Loss: 1.266,  Val_Accuracy: 22.19%
Epoch: 4
	Train_Loss: 0.665, Train_Accuracy: 20.86%
	Val_Loss: 1.217,  Val_Accuracy: 27.54%
Epoch: 5
	Train_Loss: 0.607, Train_Accuracy: 22.15%
	Val_Loss: 1.211,  Val_Accuracy: 25.29%
Epoch: 6
	Train_Loss: 0.552, Train_Accuracy: 20.25%
	Val_Loss: 1.130,  Val_Accuracy: 29.59%
Ep

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
Val_Accuracy,▁▄▅▇▆█▇
Val_Loss,█▅▄▃▃▁▁
train_accuracy,▁▅▆▇█▇▇
training_loss,█▄▃▂▂▁▁

0,1
Val_Accuracy,26.85547
Val_Loss,1.14699
train_accuracy,20.91016
training_loss,0.51052


[34m[1mwandb[0m: Agent Starting Run: 5f9v61sa with config:
[34m[1mwandb[0m: 	bidirectional: False
[34m[1mwandb[0m: 	cell_type: lstm
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	hid_layer_size: 512
[34m[1mwandb[0m: 	input_embed_size: 32
[34m[1mwandb[0m: 	new_learning_rate: 0.01
[34m[1mwandb[0m: 	num_dec_layers: 1
[34m[1mwandb[0m: 	num_enc_layers: 1


Using device: cuda
Seq_to_Seq(
  (encoder): Encoder(
    (embedding): Embedding(100, 32)
    (dropout): Dropout(p=0.2, inplace=False)
    (rnn): LSTM(32, 512, batch_first=True)
  )
  (decoder): Decoder(
    (embedding): Embedding(100, 32)
    (dropout): Dropout(p=0.2, inplace=False)
    (rnn): LSTM(32, 512, batch_first=True)
    (fc): Linear(in_features=512, out_features=100, bias=True)
  )
)
Epoch: 1
	Train_Loss: 2.825, Train_Accuracy: 0.00%
	Val_Loss: 2.823,  Val_Accuracy: 0.00%
Epoch: 2
	Train_Loss: 2.458, Train_Accuracy: 0.00%
	Val_Loss: 2.550,  Val_Accuracy: 0.07%
Epoch: 3
	Train_Loss: 2.260, Train_Accuracy: 0.01%
	Val_Loss: 2.331,  Val_Accuracy: 0.15%
Epoch: 4
	Train_Loss: 2.105, Train_Accuracy: 0.05%
	Val_Loss: 2.226,  Val_Accuracy: 0.37%
Epoch: 5
	Train_Loss: 1.996, Train_Accuracy: 0.09%
	Val_Loss: 2.135,  Val_Accuracy: 0.85%
Epoch: 6
	Train_Loss: 1.890, Train_Accuracy: 0.16%
	Val_Loss: 2.024,  Val_Accuracy: 1.44%
Epoch: 7
	Train_Loss: 1.813, Train_Accuracy: 0.16%
	Val_Loss: 2.

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
Val_Accuracy,▁▁▁▂▄▅█
Val_Loss,█▆▄▃▂▁▁
train_accuracy,▁▁▁▃▅██
training_loss,█▅▄▃▂▂▁

0,1
Val_Accuracy,2.31934
Val_Loss,2.01066
train_accuracy,0.16016
training_loss,1.8131


[34m[1mwandb[0m: Agent Starting Run: 1eeiuddk with config:
[34m[1mwandb[0m: 	bidirectional: False
[34m[1mwandb[0m: 	cell_type: lstm
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	hid_layer_size: 16
[34m[1mwandb[0m: 	input_embed_size: 16
[34m[1mwandb[0m: 	new_learning_rate: 0.01
[34m[1mwandb[0m: 	num_dec_layers: 1
[34m[1mwandb[0m: 	num_enc_layers: 2


Using device: cuda
Seq_to_Seq(
  (encoder): Encoder(
    (embedding): Embedding(100, 16)
    (dropout): Dropout(p=0.2, inplace=False)
    (rnn): LSTM(16, 16, num_layers=2, batch_first=True, dropout=0.2)
  )
  (decoder): Decoder(
    (embedding): Embedding(100, 16)
    (dropout): Dropout(p=0.2, inplace=False)
    (rnn): LSTM(16, 16, batch_first=True)
    (fc): Linear(in_features=16, out_features=100, bias=True)
  )
)
Epoch: 1
	Train_Loss: 3.001, Train_Accuracy: 0.00%
	Val_Loss: 3.039,  Val_Accuracy: 0.00%
Epoch: 2
	Train_Loss: 2.841, Train_Accuracy: 0.00%
	Val_Loss: 2.880,  Val_Accuracy: 0.00%
Epoch: 3
	Train_Loss: 2.721, Train_Accuracy: 0.00%
	Val_Loss: 2.767,  Val_Accuracy: 0.00%
Epoch: 4
	Train_Loss: 2.645, Train_Accuracy: 0.00%
	Val_Loss: 2.646,  Val_Accuracy: 0.05%
Epoch: 5
	Train_Loss: 2.592, Train_Accuracy: 0.00%
	Val_Loss: 2.635,  Val_Accuracy: 0.10%
Epoch: 6
	Train_Loss: 2.549, Train_Accuracy: 0.00%
	Val_Loss: 2.622,  Val_Accuracy: 0.07%
Epoch: 7
	Train_Loss: 2.526, Train_Accur

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
Val_Accuracy,▁▁▁▅█▆▆
Val_Loss,█▆▄▂▂▂▁
train_accuracy,▃▁▁▃▆▆█
training_loss,█▆▄▃▂▁▁

0,1
Val_Accuracy,0.07324
Val_Loss,2.56157
train_accuracy,0.00586
training_loss,2.52565


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: oqzkyvdp with config:
[34m[1mwandb[0m: 	bidirectional: True
[34m[1mwandb[0m: 	cell_type: lstm
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	hid_layer_size: 64
[34m[1mwandb[0m: 	input_embed_size: 256
[34m[1mwandb[0m: 	new_learning_rate: 0.1
[34m[1mwandb[0m: 	num_dec_layers: 2
[34m[1mwandb[0m: 	num_enc_layers: 2


Using device: cuda
Seq_to_Seq(
  (encoder): Encoder(
    (embedding): Embedding(100, 256)
    (dropout): Dropout(p=0.2, inplace=False)
    (rnn): LSTM(256, 32, num_layers=2, batch_first=True, dropout=0.2, bidirectional=True)
  )
  (decoder): Decoder(
    (embedding): Embedding(100, 256)
    (dropout): Dropout(p=0.2, inplace=False)
    (rnn): LSTM(256, 128, num_layers=2, batch_first=True, dropout=0.2)
    (fc): Linear(in_features=128, out_features=100, bias=True)
  )
)
Epoch: 1
	Train_Loss: 2.930, Train_Accuracy: 0.00%
	Val_Loss: 2.851,  Val_Accuracy: 0.00%
Epoch: 2
	Train_Loss: 2.843, Train_Accuracy: 0.00%
	Val_Loss: 2.820,  Val_Accuracy: 0.00%
Epoch: 3
	Train_Loss: 2.843, Train_Accuracy: 0.00%
	Val_Loss: 2.817,  Val_Accuracy: 0.00%
Epoch: 4
	Train_Loss: 2.826, Train_Accuracy: 0.00%
	Val_Loss: 2.823,  Val_Accuracy: 0.00%
Epoch: 5
	Train_Loss: 2.840, Train_Accuracy: 0.00%
	Val_Loss: 2.891,  Val_Accuracy: 0.00%
Epoch: 6
	Train_Loss: 2.832, Train_Accuracy: 0.00%
	Val_Loss: 2.813,  Val_Acc

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
Val_Accuracy,▁▁▁▁▁▁▁
Val_Loss,▆▄▄▄█▃▁
train_accuracy,▁▁▁▁▁▁▁
training_loss,█▂▂▁▂▂▁

0,1
Val_Accuracy,0.0
Val_Loss,2.77022
train_accuracy,0.0
training_loss,2.82084


[34m[1mwandb[0m: Agent Starting Run: k6ca69jh with config:
[34m[1mwandb[0m: 	bidirectional: False
[34m[1mwandb[0m: 	cell_type: lstm
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	hid_layer_size: 16
[34m[1mwandb[0m: 	input_embed_size: 32
[34m[1mwandb[0m: 	new_learning_rate: 0.001
[34m[1mwandb[0m: 	num_dec_layers: 2
[34m[1mwandb[0m: 	num_enc_layers: 1


Using device: cuda
Seq_to_Seq(
  (encoder): Encoder(
    (embedding): Embedding(100, 32)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): LSTM(32, 16, batch_first=True)
  )
  (decoder): Decoder(
    (embedding): Embedding(100, 32)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): LSTM(32, 16, num_layers=2, batch_first=True, dropout=0.3)
    (fc): Linear(in_features=16, out_features=100, bias=True)
  )
)
Epoch: 1
	Train_Loss: 3.431, Train_Accuracy: 0.00%
	Val_Loss: 3.139,  Val_Accuracy: 0.00%
Epoch: 2
	Train_Loss: 3.149, Train_Accuracy: 0.00%
	Val_Loss: 3.101,  Val_Accuracy: 0.00%
Epoch: 3
	Train_Loss: 3.033, Train_Accuracy: 0.00%
	Val_Loss: 3.064,  Val_Accuracy: 0.00%
Epoch: 4
	Train_Loss: 2.989, Train_Accuracy: 0.00%
	Val_Loss: 3.053,  Val_Accuracy: 0.00%
Epoch: 5
	Train_Loss: 2.973, Train_Accuracy: 0.00%
	Val_Loss: 3.028,  Val_Accuracy: 0.00%
Epoch: 6
	Train_Loss: 2.952, Train_Accuracy: 0.00%
	Val_Loss: 3.036,  Val_Accuracy: 0.00%
Epoch: 7
	Train_Loss: 2.938, Train_Accur

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
Val_Accuracy,▁▁▁▁▁▁▁
Val_Loss,█▆▄▃▂▂▁
train_accuracy,▁▁▁▁▁▁▁
training_loss,█▄▂▂▁▁▁

0,1
Val_Accuracy,0.0
Val_Loss,3.01775
train_accuracy,0.0
training_loss,2.93771


[34m[1mwandb[0m: Agent Starting Run: z5kmij4f with config:
[34m[1mwandb[0m: 	bidirectional: False
[34m[1mwandb[0m: 	cell_type: lstm
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	hid_layer_size: 256
[34m[1mwandb[0m: 	input_embed_size: 32
[34m[1mwandb[0m: 	new_learning_rate: 0.01
[34m[1mwandb[0m: 	num_dec_layers: 2
[34m[1mwandb[0m: 	num_enc_layers: 3


Using device: cuda
Seq_to_Seq(
  (encoder): Encoder(
    (embedding): Embedding(100, 32)
    (dropout): Dropout(p=0.2, inplace=False)
    (rnn): LSTM(32, 256, num_layers=3, batch_first=True, dropout=0.2)
  )
  (decoder): Decoder(
    (embedding): Embedding(100, 32)
    (dropout): Dropout(p=0.2, inplace=False)
    (rnn): LSTM(32, 256, num_layers=2, batch_first=True, dropout=0.2)
    (fc): Linear(in_features=256, out_features=100, bias=True)
  )
)
Epoch: 1
	Train_Loss: 2.875, Train_Accuracy: 0.00%
	Val_Loss: 2.810,  Val_Accuracy: 0.00%
Epoch: 2
	Train_Loss: 2.424, Train_Accuracy: 0.02%
	Val_Loss: 2.395,  Val_Accuracy: 0.22%
Epoch: 3
	Train_Loss: 2.162, Train_Accuracy: 0.02%
	Val_Loss: 2.235,  Val_Accuracy: 0.34%
Epoch: 4
	Train_Loss: 2.026, Train_Accuracy: 0.10%
	Val_Loss: 2.163,  Val_Accuracy: 0.81%
Epoch: 5
	Train_Loss: 1.920, Train_Accuracy: 0.13%
	Val_Loss: 2.036,  Val_Accuracy: 1.42%
Epoch: 6
	Train_Loss: 1.817, Train_Accuracy: 0.15%
	Val_Loss: 1.938,  Val_Accuracy: 1.81%
Epoch: 7
	

VBox(children=(Label(value='0.019 MB of 0.019 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
Val_Accuracy,▁▂▂▄▅▇█
Val_Loss,█▅▃▃▂▁▁
train_accuracy,▁▂▂▆▇█▇
training_loss,█▅▄▃▂▂▁

0,1
Val_Accuracy,2.22168
Val_Loss,1.92575
train_accuracy,0.12695
training_loss,1.72715


[34m[1mwandb[0m: Agent Starting Run: zbeydp7n with config:
[34m[1mwandb[0m: 	bidirectional: False
[34m[1mwandb[0m: 	cell_type: lstm
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	hid_layer_size: 16
[34m[1mwandb[0m: 	input_embed_size: 512
[34m[1mwandb[0m: 	new_learning_rate: 0.1
[34m[1mwandb[0m: 	num_dec_layers: 1
[34m[1mwandb[0m: 	num_enc_layers: 3


Using device: cuda
Seq_to_Seq(
  (encoder): Encoder(
    (embedding): Embedding(100, 512)
    (dropout): Dropout(p=0.2, inplace=False)
    (rnn): LSTM(512, 16, num_layers=3, batch_first=True, dropout=0.2)
  )
  (decoder): Decoder(
    (embedding): Embedding(100, 512)
    (dropout): Dropout(p=0.2, inplace=False)
    (rnn): LSTM(512, 16, batch_first=True)
    (fc): Linear(in_features=16, out_features=100, bias=True)
  )
)
Epoch: 1
	Train_Loss: 3.178, Train_Accuracy: 0.00%
	Val_Loss: 3.160,  Val_Accuracy: 0.00%
Epoch: 2
	Train_Loss: 3.183, Train_Accuracy: 0.00%
	Val_Loss: 3.210,  Val_Accuracy: 0.00%
Epoch: 3
	Train_Loss: 3.186, Train_Accuracy: 0.00%
	Val_Loss: 3.264,  Val_Accuracy: 0.00%
Epoch: 4
	Train_Loss: 3.190, Train_Accuracy: 0.00%
	Val_Loss: 3.225,  Val_Accuracy: 0.00%
Epoch: 5
	Train_Loss: 3.178, Train_Accuracy: 0.00%
	Val_Loss: 3.212,  Val_Accuracy: 0.00%
Epoch: 6
	Train_Loss: 3.174, Train_Accuracy: 0.00%
	Val_Loss: 3.190,  Val_Accuracy: 0.00%
Epoch: 7
	Train_Loss: 3.170, Train_A

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
Val_Accuracy,▁▁▁▁▁▁▁
Val_Loss,▁▄█▅▅▃▇
train_accuracy,▁▁▁▁▁▁▁
training_loss,▄▅▆█▄▂▁

0,1
Val_Accuracy,0.0
Val_Loss,3.24677
train_accuracy,0.0
training_loss,3.17049


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: ti75t3pd with config:
[34m[1mwandb[0m: 	bidirectional: False
[34m[1mwandb[0m: 	cell_type: lstm
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	hid_layer_size: 32
[34m[1mwandb[0m: 	input_embed_size: 16
[34m[1mwandb[0m: 	new_learning_rate: 0.01
[34m[1mwandb[0m: 	num_dec_layers: 2
[34m[1mwandb[0m: 	num_enc_layers: 1


Using device: cuda
Seq_to_Seq(
  (encoder): Encoder(
    (embedding): Embedding(100, 16)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): LSTM(16, 32, batch_first=True)
  )
  (decoder): Decoder(
    (embedding): Embedding(100, 16)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): LSTM(16, 32, num_layers=2, batch_first=True, dropout=0.3)
    (fc): Linear(in_features=32, out_features=100, bias=True)
  )
)
Epoch: 1
	Train_Loss: 3.005, Train_Accuracy: 0.00%
	Val_Loss: 2.964,  Val_Accuracy: 0.00%
Epoch: 2
	Train_Loss: 2.852, Train_Accuracy: 0.00%
	Val_Loss: 2.948,  Val_Accuracy: 0.00%
Epoch: 3
	Train_Loss: 2.792, Train_Accuracy: 0.00%
	Val_Loss: 2.855,  Val_Accuracy: 0.00%
Epoch: 4
	Train_Loss: 2.742, Train_Accuracy: 0.00%
	Val_Loss: 2.815,  Val_Accuracy: 0.00%
Epoch: 5
	Train_Loss: 2.701, Train_Accuracy: 0.00%
	Val_Loss: 2.726,  Val_Accuracy: 0.05%
Epoch: 6
	Train_Loss: 2.636, Train_Accuracy: 0.00%
	Val_Loss: 2.664,  Val_Accuracy: 0.02%
Epoch: 7
	Train_Loss: 2.595, Train_Accur

VBox(children=(Label(value='0.001 MB of 0.019 MB uploaded\r'), FloatProgress(value=0.07160150414670581, max=1.…

0,1
Val_Accuracy,▁▁▁▁█▅▅
Val_Loss,██▆▅▃▂▁
train_accuracy,▁▁▅▅█▁▁
training_loss,█▅▄▄▃▂▁

0,1
Val_Accuracy,0.02441
Val_Loss,2.6059
train_accuracy,0.0
training_loss,2.59472


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: cpy9anqr with config:
[34m[1mwandb[0m: 	bidirectional: False
[34m[1mwandb[0m: 	cell_type: lstm
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	hid_layer_size: 32
[34m[1mwandb[0m: 	input_embed_size: 16
[34m[1mwandb[0m: 	new_learning_rate: 0.001
[34m[1mwandb[0m: 	num_dec_layers: 3
[34m[1mwandb[0m: 	num_enc_layers: 2


Using device: cuda
Seq_to_Seq(
  (encoder): Encoder(
    (embedding): Embedding(100, 16)
    (dropout): Dropout(p=0.2, inplace=False)
    (rnn): LSTM(16, 32, num_layers=2, batch_first=True, dropout=0.2)
  )
  (decoder): Decoder(
    (embedding): Embedding(100, 16)
    (dropout): Dropout(p=0.2, inplace=False)
    (rnn): LSTM(16, 32, num_layers=3, batch_first=True, dropout=0.2)
    (fc): Linear(in_features=32, out_features=100, bias=True)
  )
)


Traceback (most recent call last):
  File "/tmp/ipykernel_34/2511518392.py", line 61, in main
    train_loss, train_accuracy = train(model, train_loader, optimizer, criterion, CLIP, device, ignore_index)
  File "/tmp/ipykernel_34/2440559799.py", line 11, in train
    output = model(source, target)
  File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
    return forward_call(*args, **kwargs)
  File "/tmp/ipykernel_34/2483102920.py", line 89, in forward
    h_n = torch.cat([h_n, zero_h], dim=0)
RuntimeError: Sizes of tensors must match except in dimension 0. Expected size 32 but got size 64 for tensor number 1 in the list.


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

Run cpy9anqr errored:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/wandb/agents/pyagent.py", line 308, in _run_job
    self._function()
  File "/tmp/ipykernel_34/2511518392.py", line 61, in main
    train_loss, train_accuracy = train(model, train_loader, optimizer, criterion, CLIP, device, ignore_index)
  File "/tmp/ipykernel_34/2440559799.py", line 11, in train
    output = model(source, target)
  File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
    return forward_call(*args, **kwargs)
  File "/tmp/ipykernel_34/2483102920.py", line 89, in forward
    h_n = torch.cat([h_n, zero_h], dim=0)
RuntimeError: Sizes of tensors must match except in dimension 0. Expected size 32 but got size 64 for tensor number 1 in the list.

[34m[1mwandb[0m: [32m[41

Using device: cuda
Seq_to_Seq(
  (encoder): Encoder(
    (embedding): Embedding(100, 16)
    (dropout): Dropout(p=0.2, inplace=False)
    (rnn): LSTM(16, 8, num_layers=2, batch_first=True, dropout=0.2, bidirectional=True)
  )
  (decoder): Decoder(
    (embedding): Embedding(100, 16)
    (dropout): Dropout(p=0.2, inplace=False)
    (rnn): LSTM(16, 32, num_layers=3, batch_first=True, dropout=0.2)
    (fc): Linear(in_features=32, out_features=100, bias=True)
  )
)
Epoch: 1
	Train_Loss: 3.285, Train_Accuracy: 0.00%
	Val_Loss: 3.078,  Val_Accuracy: 0.00%
Epoch: 2
	Train_Loss: 3.049, Train_Accuracy: 0.00%
	Val_Loss: 3.021,  Val_Accuracy: 0.00%
Epoch: 3
	Train_Loss: 2.926, Train_Accuracy: 0.00%
	Val_Loss: 3.015,  Val_Accuracy: 0.00%
Epoch: 4
	Train_Loss: 2.867, Train_Accuracy: 0.00%
	Val_Loss: 2.965,  Val_Accuracy: 0.00%
Epoch: 5
	Train_Loss: 2.787, Train_Accuracy: 0.00%
	Val_Loss: 2.799,  Val_Accuracy: 0.00%
Epoch: 6
	Train_Loss: 2.679, Train_Accuracy: 0.00%
	Val_Loss: 2.711,  Val_Accuracy: 

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
Val_Accuracy,▁▁▁▁▁▆█
Val_Loss,█▇▇▆▄▂▁
train_accuracy,▁▁▅▅▁██
training_loss,█▆▄▄▃▂▁

0,1
Val_Accuracy,0.07324
Val_Loss,2.64182
train_accuracy,0.00391
training_loss,2.6032


[34m[1mwandb[0m: Agent Starting Run: hmnvppqg with config:
[34m[1mwandb[0m: 	bidirectional: True
[34m[1mwandb[0m: 	cell_type: lstm
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	hid_layer_size: 512
[34m[1mwandb[0m: 	input_embed_size: 16
[34m[1mwandb[0m: 	new_learning_rate: 0.01
[34m[1mwandb[0m: 	num_dec_layers: 3
[34m[1mwandb[0m: 	num_enc_layers: 3


Using device: cuda
Seq_to_Seq(
  (encoder): Encoder(
    (embedding): Embedding(100, 16)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): LSTM(16, 256, num_layers=3, batch_first=True, dropout=0.3, bidirectional=True)
  )
  (decoder): Decoder(
    (embedding): Embedding(100, 16)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): LSTM(16, 1536, num_layers=3, batch_first=True, dropout=0.3)
    (fc): Linear(in_features=1536, out_features=100, bias=True)
  )
)
Epoch: 1
	Train_Loss: 3.105, Train_Accuracy: 0.00%
	Val_Loss: 3.045,  Val_Accuracy: 0.00%
Epoch: 2
	Train_Loss: 3.068, Train_Accuracy: 0.00%
	Val_Loss: 2.993,  Val_Accuracy: 0.00%
Epoch: 3
	Train_Loss: 3.065, Train_Accuracy: 0.00%
	Val_Loss: 2.991,  Val_Accuracy: 0.00%
Epoch: 4
	Train_Loss: 3.073, Train_Accuracy: 0.00%
	Val_Loss: 3.013,  Val_Accuracy: 0.00%
Epoch: 5
	Train_Loss: 3.071, Train_Accuracy: 0.00%
	Val_Loss: 3.060,  Val_Accuracy: 0.00%
Epoch: 6
	Train_Loss: 3.068, Train_Accuracy: 0.00%
	Val_Loss: 3.036,  Val_Accu

VBox(children=(Label(value='0.001 MB of 0.018 MB uploaded\r'), FloatProgress(value=0.0722528329348165, max=1.0…

0,1
Val_Accuracy,▁▁▁▁▁▁▁
Val_Loss,▆▁▁▃█▆▁
train_accuracy,▁▁▁▁▁▁▁
training_loss,█▂▁▂▂▂▃

0,1
Val_Accuracy,0.0
Val_Loss,2.9948
train_accuracy,0.0
training_loss,3.07595


[34m[1mwandb[0m: Agent Starting Run: 480so5xl with config:
[34m[1mwandb[0m: 	bidirectional: False
[34m[1mwandb[0m: 	cell_type: lstm
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	hid_layer_size: 512
[34m[1mwandb[0m: 	input_embed_size: 256
[34m[1mwandb[0m: 	new_learning_rate: 0.001
[34m[1mwandb[0m: 	num_dec_layers: 2
[34m[1mwandb[0m: 	num_enc_layers: 3


Using device: cuda
Seq_to_Seq(
  (encoder): Encoder(
    (embedding): Embedding(100, 256)
    (dropout): Dropout(p=0.2, inplace=False)
    (rnn): LSTM(256, 512, num_layers=3, batch_first=True, dropout=0.2)
  )
  (decoder): Decoder(
    (embedding): Embedding(100, 256)
    (dropout): Dropout(p=0.2, inplace=False)
    (rnn): LSTM(256, 512, num_layers=2, batch_first=True, dropout=0.2)
    (fc): Linear(in_features=512, out_features=100, bias=True)
  )
)
Epoch: 1
	Train_Loss: 2.211, Train_Accuracy: 1.25%
	Val_Loss: 1.555,  Val_Accuracy: 9.81%
Epoch: 2
	Train_Loss: 1.076, Train_Accuracy: 8.88%
	Val_Loss: 1.331,  Val_Accuracy: 17.50%
Epoch: 3
	Train_Loss: 0.823, Train_Accuracy: 14.23%
	Val_Loss: 1.236,  Val_Accuracy: 23.36%
Epoch: 4
	Train_Loss: 0.685, Train_Accuracy: 18.55%
	Val_Loss: 1.219,  Val_Accuracy: 25.59%
Epoch: 5
	Train_Loss: 0.606, Train_Accuracy: 20.46%
	Val_Loss: 1.153,  Val_Accuracy: 23.19%
Epoch: 6
	Train_Loss: 0.537, Train_Accuracy: 22.59%
	Val_Loss: 1.170,  Val_Accuracy: 30.0

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
Val_Accuracy,▁▃▅▆▅▇█
Val_Loss,█▅▃▃▂▂▁
train_accuracy,▁▄▅▇▇██
training_loss,█▃▂▂▁▁▁

0,1
Val_Accuracy,33.91113
Val_Loss,1.10666
train_accuracy,21.92773
training_loss,0.49028


[34m[1mwandb[0m: Agent Starting Run: wd6w5wfr with config:
[34m[1mwandb[0m: 	bidirectional: False
[34m[1mwandb[0m: 	cell_type: lstm
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	hid_layer_size: 256
[34m[1mwandb[0m: 	input_embed_size: 256
[34m[1mwandb[0m: 	new_learning_rate: 0.001
[34m[1mwandb[0m: 	num_dec_layers: 3
[34m[1mwandb[0m: 	num_enc_layers: 3


Using device: cuda
Seq_to_Seq(
  (encoder): Encoder(
    (embedding): Embedding(100, 256)
    (dropout): Dropout(p=0.2, inplace=False)
    (rnn): LSTM(256, 256, num_layers=3, batch_first=True, dropout=0.2)
  )
  (decoder): Decoder(
    (embedding): Embedding(100, 256)
    (dropout): Dropout(p=0.2, inplace=False)
    (rnn): LSTM(256, 256, num_layers=3, batch_first=True, dropout=0.2)
    (fc): Linear(in_features=256, out_features=100, bias=True)
  )
)
Epoch: 1
	Train_Loss: 2.708, Train_Accuracy: 0.03%
	Val_Loss: 2.173,  Val_Accuracy: 0.51%
Epoch: 2
	Train_Loss: 1.565, Train_Accuracy: 2.94%
	Val_Loss: 1.439,  Val_Accuracy: 11.50%
Epoch: 3
	Train_Loss: 1.082, Train_Accuracy: 7.88%
	Val_Loss: 1.297,  Val_Accuracy: 15.36%
Epoch: 4
	Train_Loss: 0.894, Train_Accuracy: 10.95%
	Val_Loss: 1.248,  Val_Accuracy: 20.31%
Epoch: 5
	Train_Loss: 0.781, Train_Accuracy: 13.05%
	Val_Loss: 1.171,  Val_Accuracy: 27.49%
Epoch: 6
	Train_Loss: 0.702, Train_Accuracy: 14.94%
	Val_Loss: 1.149,  Val_Accuracy: 27.66

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
Val_Accuracy,▁▄▅▆███
Val_Loss,█▃▂▂▁▁▁
train_accuracy,▁▂▄▅▆▇█
training_loss,█▄▂▂▁▁▁

0,1
Val_Accuracy,26.36719
Val_Loss,1.1383
train_accuracy,17.23828
training_loss,0.64753


[34m[1mwandb[0m: Agent Starting Run: qtzmht1w with config:
[34m[1mwandb[0m: 	bidirectional: False
[34m[1mwandb[0m: 	cell_type: lstm
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	hid_layer_size: 512
[34m[1mwandb[0m: 	input_embed_size: 512
[34m[1mwandb[0m: 	new_learning_rate: 0.001
[34m[1mwandb[0m: 	num_dec_layers: 2
[34m[1mwandb[0m: 	num_enc_layers: 3


Using device: cuda
Seq_to_Seq(
  (encoder): Encoder(
    (embedding): Embedding(100, 512)
    (dropout): Dropout(p=0.2, inplace=False)
    (rnn): LSTM(512, 512, num_layers=3, batch_first=True, dropout=0.2)
  )
  (decoder): Decoder(
    (embedding): Embedding(100, 512)
    (dropout): Dropout(p=0.2, inplace=False)
    (rnn): LSTM(512, 512, num_layers=2, batch_first=True, dropout=0.2)
    (fc): Linear(in_features=512, out_features=100, bias=True)
  )
)
Epoch: 1
	Train_Loss: 2.156, Train_Accuracy: 1.85%
	Val_Loss: 1.551,  Val_Accuracy: 10.50%
Epoch: 2
	Train_Loss: 1.064, Train_Accuracy: 10.86%
	Val_Loss: 1.299,  Val_Accuracy: 20.85%
Epoch: 3
	Train_Loss: 0.829, Train_Accuracy: 16.97%
	Val_Loss: 1.254,  Val_Accuracy: 22.88%
Epoch: 4
	Train_Loss: 0.704, Train_Accuracy: 16.88%
	Val_Loss: 1.164,  Val_Accuracy: 27.56%
Epoch: 5
	Train_Loss: 0.622, Train_Accuracy: 17.04%
	Val_Loss: 1.188,  Val_Accuracy: 19.41%
Epoch: 6
	Train_Loss: 0.565, Train_Accuracy: 19.95%
	Val_Loss: 1.171,  Val_Accuracy: 29

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
Val_Accuracy,▁▅▆▇▄██
Val_Loss,█▄▃▁▂▁▁
train_accuracy,▁▄▆▆▆▇█
training_loss,█▃▂▂▁▁▁

0,1
Val_Accuracy,29.56543
Val_Loss,1.15352
train_accuracy,22.94531
training_loss,0.51416


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: b90zfijx with config:
[34m[1mwandb[0m: 	bidirectional: True
[34m[1mwandb[0m: 	cell_type: lstm
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	hid_layer_size: 512
[34m[1mwandb[0m: 	input_embed_size: 512
[34m[1mwandb[0m: 	new_learning_rate: 0.001
[34m[1mwandb[0m: 	num_dec_layers: 2
[34m[1mwandb[0m: 	num_enc_layers: 3


Using device: cuda
Seq_to_Seq(
  (encoder): Encoder(
    (embedding): Embedding(100, 512)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): LSTM(512, 256, num_layers=3, batch_first=True, dropout=0.3, bidirectional=True)
  )
  (decoder): Decoder(
    (embedding): Embedding(100, 512)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): LSTM(512, 1536, num_layers=2, batch_first=True, dropout=0.3)
    (fc): Linear(in_features=1536, out_features=100, bias=True)
  )
)
Epoch: 1
	Train_Loss: 1.487, Train_Accuracy: 5.37%
	Val_Loss: 1.461,  Val_Accuracy: 15.70%
Epoch: 2
	Train_Loss: 0.849, Train_Accuracy: 12.16%
	Val_Loss: 1.278,  Val_Accuracy: 20.92%
Epoch: 3
	Train_Loss: 0.682, Train_Accuracy: 16.16%
	Val_Loss: 1.203,  Val_Accuracy: 26.29%
Epoch: 4
	Train_Loss: 0.584, Train_Accuracy: 18.63%
	Val_Loss: 1.181,  Val_Accuracy: 27.83%
Epoch: 5
	Train_Loss: 0.524, Train_Accuracy: 15.05%
	Val_Loss: 1.145,  Val_Accuracy: 24.68%
Epoch: 6
	Train_Loss: 0.471, Train_Accuracy: 21.66%
	Val_Loss: 1.

# **For training and testing model on the training and test dataset wandb setup**

In [29]:
# -embed_size-64-layers_enc-3-layers_dec-3-hid_size-512-cell_type-lstm-bidirectional-True-dropout-0.2

sweep_config = {
    'method': 'bayes',
    'name' : 'sweep test 2',
    'metric': {
        'name': 'Test_Accuracy',
        'goal': 'maximize'
    },
    'parameters': {
        'input_embed_size': {
            'values': [64]
        },
        'num_enc_layers':{
            'values': [3]
        },
        'num_dec_layers':{
            'values': [3]
        },
        'hid_layer_size': {
            'values': [512]
        },
        'cell_type': {
            'values': ['lstm']
        },
        'bidirectional':{
            'values': [True]
        },
        'dropout': {
            'values': [0.2]
        },
        'new_learning_rate':{
            'values': [0.001]
        }
#       'beam search in decoder with different beam sizes': 
    }
}

sweep_id = wandb.sweep(sweep = sweep_config, project="Deep_Learning_A3")


Create sweep with ID: 34ygj7rs
Sweep URL: https://wandb.ai/parthasakhapaul/Deep_Learning_A3/sweeps/34ygj7rs


In [30]:
import wandb

def main():
    # Initialize a new wandb run
    with wandb.init() as run:
        # Construct run name from configuration
        run_name = "-embed_size-"+str(wandb.config.input_embed_size)+"-layers_enc-"+str(wandb.config.num_enc_layers)+"-layers_dec-"+str(wandb.config.num_dec_layers)+"-hid_size-"+str(wandb.config.hid_layer_size)+"-cell_type-"+wandb.config.cell_type+"-bidirectional-"+str(wandb.config.bidirectional)+"-dropout-"+str(wandb.config.dropout)+"-lr-"+str(wandb.config.new_learning_rate)
        wandb.run.name = run_name

        # Constants defining the dimensions of the input and output character sets
        INPUT_DIM = 100  # size of the Latin character set
        OUTPUT_DIM = 100  # size of the Bangla character set

        # Constants defining the dimensions of the embeddings for encoder and decoder
        ENC_EMB_DIM = wandb.config.input_embed_size  # Encoder embedding dimension
        DEC_EMB_DIM = wandb.config.input_embed_size  # Decoder embedding dimension

        # Constants defining the dimension of the hidden layers for encoder and decoder
        HID_DIM = wandb.config.hid_layer_size  # Hidden dimension size

        # Constants defining the number of layers for encoder and decoder
        ENC_LAYERS = wandb.config.num_enc_layers  # Number of layers in the encoder
        DEC_LAYERS = wandb.config.num_dec_layers  # Number of layers in the decoder
        

        # Constants defining the type of RNN cell to use for encoder and decoder
        ENC_RNN_CELL = wandb.config.cell_type  # RNN cell type for the encoder
        DEC_RNN_CELL = wandb.config.cell_type  # RNN cell type for the decoder

        # Instantiate the encoder with specified configurations
        encoder = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, ENC_LAYERS, ENC_RNN_CELL, dropout = wandb.config.dropout, bidirectional = wandb.config.bidirectional)
        # Instantiate the decoder with specified configurations
        decoder = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, DEC_LAYERS, encoder.num_layers, DEC_RNN_CELL, dropout = wandb.config.dropout, bidirectional = wandb.config.bidirectional)

        # Determine the computing device (CUDA if available, otherwise CPU)
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        # Print the device will be used
        print(f"Using device: {device}")

        # Instantiate the Seq_to_Seq model and move it to the chosen computing device
        model = Seq_to_Seq(encoder, decoder).to(device)
        print(model)
        
        
        # Setting the number of epochs the training process should run
        NUM_EPOCHS = 20
        # Set the maximum norm of the gradients to 1 to prevent exploding gradients
        CLIP = 1
        # Initialize the optimizer, Adam
        optimizer = torch.optim.Adam(model.parameters(), lr=wandb.config.new_learning_rate)  # Set the learning rate to 0.001


        # Padding token index should be ignored in loss calculation
        ignore_index = bangla_token_to_index['<pad>']
        # Define the loss function with 'ignore_index' to avoid affecting loss calculation with padding tokens
        criterion = nn.CrossEntropyLoss(ignore_index=ignore_index).to(device)

        # Start the training process for the defined number of epochs
        for epoch in range(NUM_EPOCHS):
            # Doing training on the train dataset and return average loss and accuracy
            train_loss, train_accuracy = train(model, train_loader, optimizer, criterion, CLIP, device, ignore_index)
            # Evaluating the model on the validation dataset and return average loss and accuracy
            val_loss, val_accuracy = evaluate(model, test_loader, criterion, device, ignore_index)

            # Print the loss and accuracy for each epoch
            print(f'Epoch: {epoch+1}')
            print(f'\tTrain_Loss: {train_loss:.3f}, Train_Accuracy: {train_accuracy*100:.2f}%')
            print(f'\tTest_Loss: {val_loss:.3f},  Test_Accuracy: {val_accuracy*100:.2f}%')
            wandb.log({"train_accuracy": train_accuracy * 100, "training_loss": train_loss})
            wandb.log({"Test_Accuracy": val_accuracy * 100, "Test_Loss": val_loss})


wandb.agent(sweep_id, function=main, count=1)
wandb.finish()

[34m[1mwandb[0m: Agent Starting Run: ldnzzju5 with config:
[34m[1mwandb[0m: 	bidirectional: True
[34m[1mwandb[0m: 	cell_type: lstm
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	hid_layer_size: 512
[34m[1mwandb[0m: 	input_embed_size: 64
[34m[1mwandb[0m: 	new_learning_rate: 0.001
[34m[1mwandb[0m: 	num_dec_layers: 3
[34m[1mwandb[0m: 	num_enc_layers: 3


Using device: cuda
Seq_to_Seq(
  (encoder): Encoder(
    (embedding): Embedding(100, 64)
    (dropout): Dropout(p=0.2, inplace=False)
    (rnn): LSTM(64, 256, num_layers=3, batch_first=True, dropout=0.2, bidirectional=True)
  )
  (decoder): Decoder(
    (embedding): Embedding(100, 64)
    (dropout): Dropout(p=0.2, inplace=False)
    (rnn): LSTM(64, 1536, num_layers=3, batch_first=True, dropout=0.2)
    (fc): Linear(in_features=1536, out_features=100, bias=True)
  )
)


Traceback (most recent call last):
  File "/tmp/ipykernel_34/970870673.py", line 61, in main
    train_loss, train_accuracy = train(model, train_loader, optimizer, criterion, CLIP, device, ignore_index)
  File "/tmp/ipykernel_34/4219852660.py", line 17, in train
    output = model(source, target)
  File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
    return forward_call(*args, **kwargs)
  File "/tmp/ipykernel_34/594173864.py", line 23, in forward
    c_n = torch.cat([c_n[i:i+1] for i in range(0, c_n.shape[0], 2)] + [c_n[i:i+1] for i in range(1, c_n.shape[0], 2)], dim=2)
AttributeError: 'tuple' object has no attribute 'shape'


VBox(children=(Label(value='0.019 MB of 0.019 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

Run ldnzzju5 errored:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/wandb/agents/pyagent.py", line 308, in _run_job
    self._function()
  File "/tmp/ipykernel_34/970870673.py", line 61, in main
    train_loss, train_accuracy = train(model, train_loader, optimizer, criterion, CLIP, device, ignore_index)
  File "/tmp/ipykernel_34/4219852660.py", line 17, in train
    output = model(source, target)
  File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
    return forward_call(*args, **kwargs)
  File "/tmp/ipykernel_34/594173864.py", line 23, in forward
    c_n = torch.cat([c_n[i:i+1] for i in range(0, c_n.shape[0], 2)] + [c_n[i:i+1] for i in range(1, c_n.shape[0], 2)], dim=2)
AttributeError: 'tuple' object has no attribute 'shape'

[34m[1mwandb[0m: [