In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
'''
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))'''

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [7]:
import torch
import torch.nn as nn
import random
import torch.nn.functional as F
import wandb
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence

#logging into wandb............................
wandb.login(key='b5d1fbca9d5170f54415e9c5a70ef09cee7a0aec')

# Encoder class....................................
class InputEncoder(nn.Module):
    #initializing.....................................................
    def __init__(self, vocab_size, embedding_size, hidden_size, layers, rnn_type='LSTM', dropout_rate=0.2, is_bidirectional=False):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_size, padding_idx=0)
        rnn_class = {'RNN': nn.RNN, 'LSTM': nn.LSTM, 'GRU': nn.GRU}[rnn_type]
        self.rnn = rnn_class(embedding_size, hidden_size, layers, dropout=dropout_rate, batch_first=True, bidirectional=is_bidirectional)
        self.is_bidirectional = is_bidirectional
        self.rnn_type = rnn_type
        self.hidden_size = hidden_size
        self.num_layers = layers
    # forward pass...............................
    def forward(self, x):
        embedded = self.embedding(x)
        output, hidden = self.rnn(embedded)  # output: [B, T, H*num_directions]
        return output, hidden  # Return all outputs for attention

# Attention class................................................................
class Attention(nn.Module):
    def __init__(self, hidden_size):
        super().__init__()
        self.attn = nn.Linear(hidden_size * 2, hidden_size)
        self.v = nn.Parameter(torch.rand(hidden_size))

    def forward(self, hidden, encoder_outputs):
        # hidden: [B, H], encoder_outputs: [B, T, H]
        timestep = encoder_outputs.size(1)
        hidden = hidden.unsqueeze(1).repeat(1, timestep, 1)
        energy = torch.tanh(self.attn(torch.cat((hidden, encoder_outputs), dim=2)))  # [B, T, H]
        energy = energy @ self.v  # [B, T]
        return F.softmax(energy, dim=1)

# Decoder class....................................................................................
class OutputDecoder(nn.Module):
    def __init__(self, vocab_size, embedding_size, hidden_size, layers, rnn_type='LSTM', dropout_rate=0.2, is_bidirectional=False):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_size, padding_idx=0)
        rnn_class = {'RNN': nn.RNN, 'LSTM': nn.LSTM, 'GRU': nn.GRU}[rnn_type]
        self.rnn = rnn_class(embedding_size + hidden_size, hidden_size, layers, dropout=dropout_rate, batch_first=True)
        self.output_layer = nn.Linear(hidden_size, vocab_size)
        self.attention = Attention(hidden_size)
        self.rnn_type = rnn_type
        self.hidden_size = hidden_size
        self.num_layers = layers

    def forward(self, token, hidden, encoder_outputs):
        token = token.unsqueeze(1)
        embedded = self.embedding(token)  # [B, 1, E]

        if self.rnn_type == 'LSTM':
            h = hidden[0][-1]  # last layer hidden
        else:
            h = hidden[-1]  # last layer hidden

        attn_weights = self.attention(h, encoder_outputs)  # [B, T]
        attn_applied = torch.bmm(attn_weights.unsqueeze(1), encoder_outputs)  # [B, 1, H]

        rnn_input = torch.cat((embedded, attn_applied), dim=2)  # [B, 1, E+H]
        output, hidden = self.rnn(rnn_input, hidden)
        output = self.output_layer(output.squeeze(1))
        #if return_attention:
          #  return output, hidden, attn_weights
        return output, hidden

# model for encoding-decdoing sequences................................................................
class TransliterationModelattention(nn.Module):
    def __init__(self, input_vocab_size, output_vocab_size, embedding_size, hidden_size, enc_layers, dec_layers,
                 rnn_type='LSTM', dropout_rate=0.2, is_bidirectional=False):
        super().__init__()
        self.encoder = InputEncoder(input_vocab_size, embedding_size, hidden_size, enc_layers, rnn_type, dropout_rate, is_bidirectional)
        self.decoder = OutputDecoder(output_vocab_size, embedding_size, hidden_size * (2 if is_bidirectional else 1),
                                     dec_layers, rnn_type, dropout_rate, is_bidirectional=False)  # decoder not bidirectional
        self.rnn_type = rnn_type
        self.hidden_size = hidden_size
        self.is_bidirectional = is_bidirectional

    def forward(self, source, target, teacher_forcing_prob=0.5):
        batch_size, target_len = target.size()
        output_vocab_size = self.decoder.output_layer.out_features
        predictions = torch.zeros(batch_size, target_len, output_vocab_size, device=source.device)

        encoder_outputs, encoder_hidden = self.encoder(source)

        def merge_bidirectional(state):
            return torch.cat([state[i::2] for i in range(2)], dim=2)

        def match_layers(state, required_layers):
            actual_layers = state.size(0)
            if actual_layers == required_layers:
                return state
            elif actual_layers > required_layers:
                return state[:required_layers]
            else:
                pad = torch.zeros(required_layers - actual_layers, *state.shape[1:], device=state.device)
                return torch.cat([state, pad], dim=0)

        if self.rnn_type == 'LSTM':
            h, c = encoder_hidden
            if self.encoder.is_bidirectional:
                h, c = merge_bidirectional(h), merge_bidirectional(c)
            h = match_layers(h, self.decoder.rnn.num_layers)
            c = match_layers(c, self.decoder.rnn.num_layers)
            decoder_hidden = (h, c)
        else:
            h = encoder_hidden
            if self.encoder.is_bidirectional:
                h = merge_bidirectional(h)
            h = match_layers(h, self.decoder.rnn.num_layers)
            decoder_hidden = h

        decoder_input = target[:, 0]
        for t in range(1, target_len):
            output, decoder_hidden = self.decoder(decoder_input, decoder_hidden, encoder_outputs)
            predictions[:, t] = output
            top1 = output.argmax(1)
            decoder_input = target[:, t] if random.random() < teacher_forcing_prob else top1

        return predictions


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [5]:
# Vocabulary and creating batch.......................................................
def build_vocab_and_prepare_batch(seqs, device):
    special_tokens = {'<pad>': 0, '<sos>': 1, '<eos>': 2, '<unk>': 3}
    
    # Extract unique characters from Latin and Devanagari sequences.........................
    unique_chars_latin = sorted(set(ch for seq in seqs for ch in seq[0]))
    unique_chars_dev = sorted(set(ch for seq in seqs for ch in seq[1]))

    # Build vocabularies and reverse mappings....................................................
    src_vocab = {ch: idx+len(special_tokens) for idx, ch in enumerate(unique_chars_latin)}
    src_vocab.update(special_tokens)
    tgt_vocab = {ch: idx+len(special_tokens) for idx, ch in enumerate(unique_chars_dev)}
    tgt_vocab.update(special_tokens)
    idx2src = {idx: ch for ch, idx in src_vocab.items()}
    idx2tgt = {idx: ch for ch, idx in tgt_vocab.items()}

    def encode_text(seq, vocab):
        return [vocab.get(ch, vocab['<unk>']) for ch in seq]
        
    # creating batches........................................................................
    def create_batch(pairs):
        src = [torch.tensor(encode_text(x, src_vocab) + [src_vocab['<eos>']]) for x, _ in pairs]
        tgt = [torch.tensor([tgt_vocab['<sos>']] + encode_text(y, tgt_vocab) + [tgt_vocab['<eos>']]) for _, y in pairs]
        src = pad_sequence(src, batch_first=True, padding_value=src_vocab['<pad>'])
        tgt = pad_sequence(tgt, batch_first=True, padding_value=tgt_vocab['<pad>'])
        return src.to(device), tgt.to(device)

    return src_vocab, idx2src, tgt_vocab, idx2tgt, create_batch, unique_chars_latin, unique_chars_dev
   
def read_pairs(file_path):
    with open(file_path, encoding='utf-8') as f:
        return [(line.split('\t')[1], line.split('\t')[0]) for line in f.read().strip().split('\n') if '\t' in line]

# function for computing word-level accuracy................................................
def compute_word_level_accuracy(preds, targets, vocab):
    sos, eos, pad = vocab['<sos>'], vocab['<eos>'], vocab['<pad>']
    preds = preds.tolist()
    targets = targets.tolist()
    correct = 0
    for p, t in zip(preds, targets):
        p = [x for x in p if x != pad and x != eos]
        t = [x for x in t if x != pad and x != eos]
        if p == t:
            correct += 1
    return correct / len(preds) * 100

# Training Function.....................................................................
def run_training():
    wandb.init(config={
        "embedding_size": 128,
        "hidden_size": 256,
        "enc_layers": 2,
        "dec_layers": 2,
        "rnn_type": "LSTM",
        "dropout_rate": 0.2,
        "epochs": 10,
        "batch_size": 64,
        "is_bidirectional": False,
        "learning_rate": 0.001,
        "optimizer": "adam",
        "teacher_forcing_prob": 0.5
    })
    cfg = wandb.config
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # takig the path.......................................................
    train_path = "/kaggle/input/dakshina-dataset/dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.train.tsv"
    dev_path = "/kaggle/input/dakshina-dataset/dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.dev.tsv"
    train_set = read_pairs(train_path)
    dev_set = read_pairs(dev_path)

    # src_vocab, idx2src, tgt_vocab, idx2tgt, create_batch = build_vocab_and_prepare_batch(train_set, device)
    src_vocab, idx2src, tgt_vocab, idx2tgt, create_batch,unique_chars_latin, unique_chars_dev = build_vocab_and_prepare_batch(train_set, device)
    
    # model creation .....................................................................................
    model = TransliterationModelattention(len(src_vocab), len(tgt_vocab), cfg.embedding_size, cfg.hidden_size,
                                 cfg.enc_layers, cfg.dec_layers, cfg.rnn_type, cfg.dropout_rate, cfg.is_bidirectional).to(device)

    optimizer = optim.Adam(model.parameters(), lr=cfg.learning_rate)
    criterion = nn.CrossEntropyLoss(ignore_index=tgt_vocab['<pad>'])
    
    epochs = cfg.epochs if isinstance(cfg.epochs, int) else cfg.epochs[0]
    for epoch in range(epochs):
        model.train()
        total_loss, total_acc = 0, 0
        random.shuffle(train_set)

        for i in range(0, len(train_set), cfg.batch_size):
            batch = train_set[i:i+cfg.batch_size]
            src, tgt = create_batch(batch)

            optimizer.zero_grad()
            outputs = model(src, tgt, cfg.teacher_forcing_prob)
            loss = criterion(outputs[:, 1:].reshape(-1, outputs.size(-1)), tgt[:, 1:].reshape(-1))

            preds = outputs.argmax(-1)
            acc = compute_word_level_accuracy(preds[:, 1:], tgt[:, 1:], tgt_vocab)

            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            total_acc += acc

        avg_train_loss = total_loss / (len(train_set) // cfg.batch_size)
        avg_train_acc = total_acc / (len(train_set) // cfg.batch_size)
        
        # evaluation module................................................................   
        model.eval()
        dev_loss, dev_acc = 0, 0
        printed = 0
        with torch.no_grad():
            for i in range(0, len(dev_set), cfg.batch_size):
                batch = dev_set[i:i+cfg.batch_size]
                src, tgt = create_batch(batch)
                outputs = model(src, tgt, 0)
                loss = criterion(outputs[:, 1:].reshape(-1, outputs.size(-1)), tgt[:, 1:].reshape(-1))

                preds = outputs.argmax(-1)
                acc = compute_word_level_accuracy(preds[:, 1:], tgt[:, 1:], tgt_vocab)

                dev_loss += loss.item()
                dev_acc += acc
        # validation loss and accuracy..............................................
        avg_dev_loss = dev_loss / (len(dev_set) // cfg.batch_size)
        avg_dev_acc = dev_acc / (len(dev_set) // cfg.batch_size)

        wandb.log({
            "Train Loss": avg_train_loss,
            "Train Accuracy": avg_train_acc,
            "Validation Loss": avg_dev_loss,
            "Validation Accuracy": avg_dev_acc,
            "Epoch": epoch + 1
        })

        print(f"Epoch {epoch + 1}/{cfg.epochs} | Train Loss: {avg_train_loss:.4f}, Train Acc: {avg_train_acc:.2f}% | Val Loss: {avg_dev_loss:.4f}, Val Acc: {avg_dev_acc:.2f}%")
    
    wandb.finish()
    return model


In [8]:
# sweep configuration...........................................
sweep_config = {
    'method': 'random',
    'metric': {'name': 'Validation Accuracy', 'goal': 'maximize'},
    'parameters': {
        'embed_dim': {
            'values': [32, 64, 128, 256]
        }, 
        'hidden_dim': {
            'values': [64, 128, 256]
        },
        'enc_layers': {
            'values': [1,2,3]
        }, 
        'dec_layers': {
            'values': [1,2,3]
        }, 
        'cell_type': {
            'values': ['GRU', 'LSTM','RNN']
        }, 
        'dropout': {
            'values': [0.2, 0.3]
        }, 
        'batch_size': {
            'values': [32, 64]
        }, 
        'epochs': {
            'values': [5,10]

        },
        'bidirectional': {
            'values': [False, True]
        }, 
        'learning_rate': {
            'values': [0.001, 0.002, 0.001]
        }, 
        'optimizer': {
            'values': ['adam', 'nadam']
        }, 
        'teacher_forcing_ratio': {
            'values': [0.2, 0.5, 0.7]
        }, 
        'beam_width': {
            'values': [1, 3, 5]
        }
    }
}

#import wandb
#wandb.login(key='b5d1fbca9d5170f54415e9c5a70ef09cee7a0aec')
sweep_id = wandb.sweep(sweep_config, project="MA23M021_A3_Attention")
wandb.agent(sweep_id, function=run_training, count = 50)

Create sweep with ID: vyfpw2ux
Sweep URL: https://wandb.ai/ma23m021-iit-madras/MA23M021_A3_Attention/sweeps/vyfpw2ux


[34m[1mwandb[0m: Agent Starting Run: 3fxj7trw with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beam_width: 5
[34m[1mwandb[0m: 	bidirectional: True
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dec_layers: 2
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	embed_dim: 256
[34m[1mwandb[0m: 	enc_layers: 2
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_dim: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0.2


Epoch 1/5 | Train Loss: 1.7953, Train Acc: 7.81% | Val Loss: 1.2708, Val Acc: 21.78%
Epoch 2/5 | Train Loss: 0.8374, Train Acc: 25.72% | Val Loss: 1.0848, Val Acc: 29.75%
Epoch 3/5 | Train Loss: 0.6460, Train Acc: 35.28% | Val Loss: 1.0887, Val Acc: 33.63%
Epoch 4/5 | Train Loss: 0.5411, Train Acc: 41.83% | Val Loss: 1.0211, Val Acc: 34.41%
Epoch 5/5 | Train Loss: 0.4568, Train Acc: 47.24% | Val Loss: 1.0255, Val Acc: 35.59%


0,1
Epoch,▁▃▅▆█
Train Accuracy,▁▄▆▇█
Train Loss,█▃▂▁▁
Validation Accuracy,▁▅▇▇█
Validation Loss,█▃▃▁▁

0,1
Epoch,5.0
Train Accuracy,47.23706
Train Loss,0.45679
Validation Accuracy,35.58517
Validation Loss,1.0255


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: qsytruls with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beam_width: 1
[34m[1mwandb[0m: 	bidirectional: False
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dec_layers: 1
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embed_dim: 32
[34m[1mwandb[0m: 	enc_layers: 2
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_dim: 256
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0.2


Epoch 1/5 | Train Loss: 1.4957, Train Acc: 13.90% | Val Loss: 1.1448, Val Acc: 27.36%
Epoch 2/5 | Train Loss: 0.7260, Train Acc: 31.34% | Val Loss: 1.0740, Val Acc: 31.24%
Epoch 3/5 | Train Loss: 0.5994, Train Acc: 37.45% | Val Loss: 1.0322, Val Acc: 34.83%
Epoch 4/5 | Train Loss: 0.5290, Train Acc: 41.89% | Val Loss: 1.0013, Val Acc: 35.21%
Epoch 5/5 | Train Loss: 0.4734, Train Acc: 45.53% | Val Loss: 1.0045, Val Acc: 38.01%


0,1
Epoch,▁▃▅▆█
Train Accuracy,▁▅▆▇█
Train Loss,█▃▂▁▁
Validation Accuracy,▁▄▆▆█
Validation Loss,█▅▃▁▁

0,1
Epoch,5.0
Train Accuracy,45.53087
Train Loss,0.47339
Validation Accuracy,38.01317
Validation Loss,1.00452


[34m[1mwandb[0m: Agent Starting Run: tmhrqsni with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beam_width: 1
[34m[1mwandb[0m: 	bidirectional: True
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dec_layers: 1
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embed_dim: 128
[34m[1mwandb[0m: 	enc_layers: 3
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_dim: 128
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0.2


Epoch 1/10 | Train Loss: 1.7084, Train Acc: 7.93% | Val Loss: 1.3704, Val Acc: 17.47%
Epoch 2/10 | Train Loss: 0.8661, Train Acc: 25.98% | Val Loss: 1.1729, Val Acc: 25.89%
Epoch 3/10 | Train Loss: 0.6872, Train Acc: 34.45% | Val Loss: 1.0696, Val Acc: 30.96%
Epoch 4/10 | Train Loss: 0.5719, Train Acc: 41.35% | Val Loss: 1.1099, Val Acc: 31.74%
Epoch 5/10 | Train Loss: 0.4946, Train Acc: 46.34% | Val Loss: 1.1066, Val Acc: 32.87%
Epoch 6/10 | Train Loss: 0.4380, Train Acc: 50.94% | Val Loss: 1.1171, Val Acc: 34.34%
Epoch 7/10 | Train Loss: 0.3861, Train Acc: 54.89% | Val Loss: 1.1638, Val Acc: 32.02%
Epoch 8/10 | Train Loss: 0.3468, Train Acc: 58.74% | Val Loss: 1.1971, Val Acc: 32.58%
Epoch 9/10 | Train Loss: 0.3047, Train Acc: 62.03% | Val Loss: 1.2281, Val Acc: 30.97%
Epoch 10/10 | Train Loss: 0.2794, Train Acc: 64.62% | Val Loss: 1.2208, Val Acc: 32.65%


0,1
Epoch,▁▂▃▃▄▅▆▆▇█
Train Accuracy,▁▃▄▅▆▆▇▇██
Train Loss,█▄▃▂▂▂▂▁▁▁
Validation Accuracy,▁▄▇▇▇█▇▇▇▇
Validation Loss,█▃▁▂▂▂▃▄▅▅

0,1
Epoch,10.0
Train Accuracy,64.61803
Train Loss,0.27936
Validation Accuracy,32.65165
Validation Loss,1.22083


[34m[1mwandb[0m: Agent Starting Run: 551gllja with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beam_width: 5
[34m[1mwandb[0m: 	bidirectional: True
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dec_layers: 3
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	embed_dim: 64
[34m[1mwandb[0m: 	enc_layers: 1
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_dim: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0.5


Epoch 1/5 | Train Loss: 2.5020, Train Acc: 0.57% | Val Loss: 1.8244, Val Acc: 4.50%
Epoch 2/5 | Train Loss: 1.2311, Train Acc: 11.92% | Val Loss: 1.2817, Val Acc: 20.93%
Epoch 3/5 | Train Loss: 0.8995, Train Acc: 21.89% | Val Loss: 1.1671, Val Acc: 24.43%
Epoch 4/5 | Train Loss: 0.7564, Train Acc: 27.86% | Val Loss: 1.1105, Val Acc: 30.33%
Epoch 5/5 | Train Loss: 0.6599, Train Acc: 32.32% | Val Loss: 1.1314, Val Acc: 31.53%


0,1
Epoch,▁▃▅▆█
Train Accuracy,▁▄▆▇█
Train Loss,█▃▂▁▁
Validation Accuracy,▁▅▆██
Validation Loss,█▃▂▁▁

0,1
Epoch,5.0
Train Accuracy,32.31656
Train Loss,0.65991
Validation Accuracy,31.52574
Validation Loss,1.13136


[34m[1mwandb[0m: Agent Starting Run: yn7kumii with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beam_width: 1
[34m[1mwandb[0m: 	bidirectional: False
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dec_layers: 3
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	embed_dim: 32
[34m[1mwandb[0m: 	enc_layers: 3
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_dim: 128
[34m[1mwandb[0m: 	learning_rate: 0.002
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0.7


Epoch 1/10 | Train Loss: 1.9161, Train Acc: 5.61% | Val Loss: 1.2875, Val Acc: 18.60%
Epoch 2/10 | Train Loss: 0.9112, Train Acc: 21.06% | Val Loss: 1.1558, Val Acc: 29.08%
Epoch 3/10 | Train Loss: 0.7208, Train Acc: 27.48% | Val Loss: 1.0971, Val Acc: 29.37%
Epoch 4/10 | Train Loss: 0.6244, Train Acc: 32.68% | Val Loss: 1.0877, Val Acc: 31.17%
Epoch 5/10 | Train Loss: 0.5615, Train Acc: 35.96% | Val Loss: 1.0634, Val Acc: 32.14%
Epoch 6/10 | Train Loss: 0.5119, Train Acc: 39.19% | Val Loss: 1.1036, Val Acc: 32.47%
Epoch 7/10 | Train Loss: 0.4726, Train Acc: 39.78% | Val Loss: 1.0993, Val Acc: 34.24%
Epoch 8/10 | Train Loss: 0.4467, Train Acc: 43.10% | Val Loss: 1.0893, Val Acc: 35.86%
Epoch 9/10 | Train Loss: 0.4232, Train Acc: 45.05% | Val Loss: 1.0925, Val Acc: 36.57%
Epoch 10/10 | Train Loss: 0.4046, Train Acc: 46.16% | Val Loss: 1.0973, Val Acc: 36.31%


0,1
Epoch,▁▂▃▃▄▅▆▆▇█
Train Accuracy,▁▄▅▆▆▇▇▇██
Train Loss,█▃▂▂▂▁▁▁▁▁
Validation Accuracy,▁▅▅▆▆▆▇███
Validation Loss,█▄▂▂▁▂▂▂▂▂

0,1
Epoch,10.0
Train Accuracy,46.15692
Train Loss,0.40457
Validation Accuracy,36.31281
Validation Loss,1.09735


[34m[1mwandb[0m: Agent Starting Run: j4qnvfq6 with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beam_width: 5
[34m[1mwandb[0m: 	bidirectional: False
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dec_layers: 1
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embed_dim: 32
[34m[1mwandb[0m: 	enc_layers: 1
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_dim: 256
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0.7


Epoch 1/10 | Train Loss: 1.7169, Train Acc: 8.77% | Val Loss: 1.2685, Val Acc: 23.73%
Epoch 2/10 | Train Loss: 0.7626, Train Acc: 29.53% | Val Loss: 1.0678, Val Acc: 31.71%
Epoch 3/10 | Train Loss: 0.6099, Train Acc: 36.58% | Val Loss: 1.0437, Val Acc: 36.34%
Epoch 4/10 | Train Loss: 0.5191, Train Acc: 41.71% | Val Loss: 1.0195, Val Acc: 36.24%
Epoch 5/10 | Train Loss: 0.4627, Train Acc: 46.12% | Val Loss: 1.0056, Val Acc: 36.77%
Epoch 6/10 | Train Loss: 0.4111, Train Acc: 49.08% | Val Loss: 1.0344, Val Acc: 37.34%
Epoch 7/10 | Train Loss: 0.3744, Train Acc: 51.32% | Val Loss: 1.0278, Val Acc: 38.77%
Epoch 8/10 | Train Loss: 0.3383, Train Acc: 54.11% | Val Loss: 1.0286, Val Acc: 39.36%
Epoch 9/10 | Train Loss: 0.3071, Train Acc: 56.25% | Val Loss: 1.0921, Val Acc: 38.91%
Epoch 10/10 | Train Loss: 0.2808, Train Acc: 58.96% | Val Loss: 1.1065, Val Acc: 37.05%


0,1
Epoch,▁▂▃▃▄▅▆▆▇█
Train Accuracy,▁▄▅▆▆▇▇▇██
Train Loss,█▃▃▂▂▂▁▁▁▁
Validation Accuracy,▁▅▇▇▇▇███▇
Validation Loss,█▃▂▁▁▂▂▂▃▄

0,1
Epoch,10.0
Train Accuracy,58.96369
Train Loss,0.28077
Validation Accuracy,37.0481
Validation Loss,1.10651


[34m[1mwandb[0m: Agent Starting Run: 05mv53z3 with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beam_width: 1
[34m[1mwandb[0m: 	bidirectional: True
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dec_layers: 2
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embed_dim: 128
[34m[1mwandb[0m: 	enc_layers: 1
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_dim: 64
[34m[1mwandb[0m: 	learning_rate: 0.002
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0.7


Epoch 1/10 | Train Loss: 1.7896, Train Acc: 9.75% | Val Loss: 1.2564, Val Acc: 26.00%
Epoch 2/10 | Train Loss: 0.7928, Train Acc: 24.66% | Val Loss: 1.0851, Val Acc: 31.40%
Epoch 3/10 | Train Loss: 0.6640, Train Acc: 29.05% | Val Loss: 1.0164, Val Acc: 34.88%
Epoch 4/10 | Train Loss: 0.5908, Train Acc: 32.35% | Val Loss: 0.9937, Val Acc: 37.29%
Epoch 5/10 | Train Loss: 0.5324, Train Acc: 33.95% | Val Loss: 0.9781, Val Acc: 36.56%
Epoch 6/10 | Train Loss: 0.5005, Train Acc: 33.74% | Val Loss: 0.9880, Val Acc: 38.19%
Epoch 7/10 | Train Loss: 0.4667, Train Acc: 38.80% | Val Loss: 0.9824, Val Acc: 38.28%
Epoch 8/10 | Train Loss: 0.4435, Train Acc: 39.60% | Val Loss: 1.0194, Val Acc: 38.13%
Epoch 9/10 | Train Loss: 0.4218, Train Acc: 41.39% | Val Loss: 1.0310, Val Acc: 39.32%
Epoch 10/10 | Train Loss: 0.4054, Train Acc: 40.99% | Val Loss: 1.0054, Val Acc: 38.47%


0,1
Epoch,▁▂▃▃▄▅▆▆▇█
Train Accuracy,▁▄▅▆▆▆▇███
Train Loss,█▃▂▂▂▁▁▁▁▁
Validation Accuracy,▁▄▆▇▇▇▇▇██
Validation Loss,█▄▂▁▁▁▁▂▂▂

0,1
Epoch,10.0
Train Accuracy,40.9865
Train Loss,0.40536
Validation Accuracy,38.46507
Validation Loss,1.00543


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: xqzwdc3j with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beam_width: 3
[34m[1mwandb[0m: 	bidirectional: False
[34m[1mwandb[0m: 	cell_type: RNN
[34m[1mwandb[0m: 	dec_layers: 3
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	embed_dim: 128
[34m[1mwandb[0m: 	enc_layers: 3
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_dim: 64
[34m[1mwandb[0m: 	learning_rate: 0.002
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0.7


Epoch 1/5 | Train Loss: 2.4086, Train Acc: 1.36% | Val Loss: 1.6242, Val Acc: 9.53%
Epoch 2/5 | Train Loss: 1.0647, Train Acc: 17.70% | Val Loss: 1.2007, Val Acc: 23.56%
Epoch 3/5 | Train Loss: 0.7723, Train Acc: 25.26% | Val Loss: 1.0923, Val Acc: 31.02%
Epoch 4/5 | Train Loss: 0.6484, Train Acc: 31.63% | Val Loss: 1.0722, Val Acc: 31.85%
Epoch 5/5 | Train Loss: 0.5689, Train Acc: 35.80% | Val Loss: 1.0672, Val Acc: 33.92%


0,1
Epoch,▁▃▅▆█
Train Accuracy,▁▄▆▇█
Train Loss,█▃▂▁▁
Validation Accuracy,▁▅▇▇█
Validation Loss,█▃▁▁▁

0,1
Epoch,5.0
Train Accuracy,35.79607
Train Loss,0.56893
Validation Accuracy,33.9231
Validation Loss,1.0672


[34m[1mwandb[0m: Agent Starting Run: ungsum58 with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beam_width: 3
[34m[1mwandb[0m: 	bidirectional: True
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dec_layers: 1
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	embed_dim: 32
[34m[1mwandb[0m: 	enc_layers: 3
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_dim: 256
[34m[1mwandb[0m: 	learning_rate: 0.002
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0.2


Epoch 1/5 | Train Loss: 1.5223, Train Acc: 11.50% | Val Loss: 1.2795, Val Acc: 22.40%
Epoch 2/5 | Train Loss: 0.7971, Train Acc: 28.53% | Val Loss: 1.1953, Val Acc: 28.78%
Epoch 3/5 | Train Loss: 0.6562, Train Acc: 36.05% | Val Loss: 1.1361, Val Acc: 30.36%
Epoch 4/5 | Train Loss: 0.5672, Train Acc: 40.81% | Val Loss: 1.1481, Val Acc: 30.74%
Epoch 5/5 | Train Loss: 0.5026, Train Acc: 45.78% | Val Loss: 1.1490, Val Acc: 31.69%


0,1
Epoch,▁▃▅▆█
Train Accuracy,▁▄▆▇█
Train Loss,█▃▂▁▁
Validation Accuracy,▁▆▇▇█
Validation Loss,█▄▁▂▂

0,1
Epoch,5.0
Train Accuracy,45.77525
Train Loss,0.50255
Validation Accuracy,31.69424
Validation Loss,1.14899


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: gye8ehq9 with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beam_width: 1
[34m[1mwandb[0m: 	bidirectional: False
[34m[1mwandb[0m: 	cell_type: RNN
[34m[1mwandb[0m: 	dec_layers: 1
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embed_dim: 128
[34m[1mwandb[0m: 	enc_layers: 2
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_dim: 128
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0.7


Epoch 1/10 | Train Loss: 1.4120, Train Acc: 14.91% | Val Loss: 1.2089, Val Acc: 24.29%
Epoch 2/10 | Train Loss: 0.7266, Train Acc: 29.66% | Val Loss: 1.0843, Val Acc: 30.33%
Epoch 3/10 | Train Loss: 0.6054, Train Acc: 34.55% | Val Loss: 1.0089, Val Acc: 35.16%
Epoch 4/10 | Train Loss: 0.5357, Train Acc: 37.84% | Val Loss: 1.0299, Val Acc: 36.04%
Epoch 5/10 | Train Loss: 0.4811, Train Acc: 41.06% | Val Loss: 1.0149, Val Acc: 37.05%
Epoch 6/10 | Train Loss: 0.4336, Train Acc: 42.37% | Val Loss: 0.9954, Val Acc: 38.43%
Epoch 7/10 | Train Loss: 0.3928, Train Acc: 46.24% | Val Loss: 1.0230, Val Acc: 37.95%
Epoch 8/10 | Train Loss: 0.3647, Train Acc: 46.92% | Val Loss: 1.0373, Val Acc: 37.32%
Epoch 9/10 | Train Loss: 0.3314, Train Acc: 51.65% | Val Loss: 1.0143, Val Acc: 38.08%
Epoch 10/10 | Train Loss: 0.3099, Train Acc: 52.63% | Val Loss: 1.0742, Val Acc: 37.52%


0,1
Epoch,▁▂▃▃▄▅▆▆▇█
Train Accuracy,▁▄▅▅▆▆▇▇██
Train Loss,█▄▃▂▂▂▂▁▁▁
Validation Accuracy,▁▄▆▇▇██▇██
Validation Loss,█▄▁▂▂▁▂▂▂▄

0,1
Epoch,10.0
Train Accuracy,52.63245
Train Loss,0.30994
Validation Accuracy,37.52298
Validation Loss,1.07419


[34m[1mwandb[0m: Agent Starting Run: xgjkxb5y with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beam_width: 3
[34m[1mwandb[0m: 	bidirectional: True
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dec_layers: 1
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	embed_dim: 32
[34m[1mwandb[0m: 	enc_layers: 2
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_dim: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0.7


Epoch 1/10 | Train Loss: 1.5446, Train Acc: 13.09% | Val Loss: 1.1931, Val Acc: 26.27%
Epoch 2/10 | Train Loss: 0.7349, Train Acc: 30.73% | Val Loss: 1.0668, Val Acc: 32.71%
Epoch 3/10 | Train Loss: 0.6098, Train Acc: 35.65% | Val Loss: 0.9942, Val Acc: 36.13%
Epoch 4/10 | Train Loss: 0.5316, Train Acc: 38.58% | Val Loss: 1.0283, Val Acc: 36.51%
Epoch 5/10 | Train Loss: 0.4786, Train Acc: 41.08% | Val Loss: 1.0032, Val Acc: 37.78%
Epoch 6/10 | Train Loss: 0.4329, Train Acc: 43.93% | Val Loss: 0.9966, Val Acc: 37.00%
Epoch 7/10 | Train Loss: 0.3979, Train Acc: 45.22% | Val Loss: 0.9927, Val Acc: 39.33%
Epoch 8/10 | Train Loss: 0.3623, Train Acc: 48.54% | Val Loss: 1.0470, Val Acc: 38.22%
Epoch 9/10 | Train Loss: 0.3312, Train Acc: 51.01% | Val Loss: 1.0303, Val Acc: 39.45%
Epoch 10/10 | Train Loss: 0.3098, Train Acc: 53.35% | Val Loss: 1.0637, Val Acc: 39.05%


0,1
Epoch,▁▂▃▃▄▅▆▆▇█
Train Accuracy,▁▄▅▅▆▆▇▇██
Train Loss,█▃▃▂▂▂▁▁▁▁
Validation Accuracy,▁▄▆▆▇▇█▇██
Validation Loss,█▄▁▂▁▁▁▃▂▃

0,1
Epoch,10.0
Train Accuracy,53.35279
Train Loss,0.30977
Validation Accuracy,39.05484
Validation Loss,1.06369


[34m[1mwandb[0m: Agent Starting Run: bsi79rbi with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beam_width: 5
[34m[1mwandb[0m: 	bidirectional: False
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dec_layers: 1
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embed_dim: 128
[34m[1mwandb[0m: 	enc_layers: 1
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_dim: 128
[34m[1mwandb[0m: 	learning_rate: 0.002
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0.5


Epoch 1/5 | Train Loss: 1.1432, Train Acc: 18.54% | Val Loss: 1.1252, Val Acc: 27.43%
Epoch 2/5 | Train Loss: 0.6860, Train Acc: 29.33% | Val Loss: 1.0390, Val Acc: 32.49%
Epoch 3/5 | Train Loss: 0.5950, Train Acc: 33.49% | Val Loss: 1.0507, Val Acc: 32.78%
Epoch 4/5 | Train Loss: 0.5381, Train Acc: 36.30% | Val Loss: 1.0434, Val Acc: 35.33%
Epoch 5/5 | Train Loss: 0.5037, Train Acc: 40.45% | Val Loss: 1.0155, Val Acc: 35.32%


0,1
Epoch,▁▃▅▆█
Train Accuracy,▁▄▆▇█
Train Loss,█▃▂▁▁
Validation Accuracy,▁▅▆██
Validation Loss,█▂▃▃▁

0,1
Epoch,5.0
Train Accuracy,40.4485
Train Loss,0.50372
Validation Accuracy,35.3171
Validation Loss,1.01551


[34m[1mwandb[0m: Agent Starting Run: yzpu1wv2 with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beam_width: 1
[34m[1mwandb[0m: 	bidirectional: False
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dec_layers: 1
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embed_dim: 64
[34m[1mwandb[0m: 	enc_layers: 2
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_dim: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0.2


Epoch 1/5 | Train Loss: 1.9016, Train Acc: 7.14% | Val Loss: 1.3096, Val Acc: 20.42%
Epoch 2/5 | Train Loss: 0.8410, Train Acc: 26.76% | Val Loss: 1.1383, Val Acc: 30.23%
Epoch 3/5 | Train Loss: 0.6741, Train Acc: 34.33% | Val Loss: 1.0216, Val Acc: 33.21%
Epoch 4/5 | Train Loss: 0.5712, Train Acc: 39.05% | Val Loss: 1.0409, Val Acc: 35.55%
Epoch 5/5 | Train Loss: 0.5167, Train Acc: 41.48% | Val Loss: 1.0333, Val Acc: 34.98%


0,1
Epoch,▁▃▅▆█
Train Accuracy,▁▅▇██
Train Loss,█▃▂▁▁
Validation Accuracy,▁▆▇██
Validation Loss,█▄▁▁▁

0,1
Epoch,5.0
Train Accuracy,41.48407
Train Loss,0.51669
Validation Accuracy,34.98009
Validation Loss,1.03335


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: dktjwtpv with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beam_width: 3
[34m[1mwandb[0m: 	bidirectional: False
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dec_layers: 2
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embed_dim: 256
[34m[1mwandb[0m: 	enc_layers: 1
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_dim: 128
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0.2


Epoch 1/5 | Train Loss: 2.1233, Train Acc: 5.17% | Val Loss: 1.3285, Val Acc: 21.79%
Epoch 2/5 | Train Loss: 0.8820, Train Acc: 24.00% | Val Loss: 1.1298, Val Acc: 30.72%
Epoch 3/5 | Train Loss: 0.7075, Train Acc: 30.29% | Val Loss: 1.0544, Val Acc: 33.02%
Epoch 4/5 | Train Loss: 0.6235, Train Acc: 34.78% | Val Loss: 1.0216, Val Acc: 35.25%
Epoch 5/5 | Train Loss: 0.5740, Train Acc: 37.66% | Val Loss: 0.9989, Val Acc: 36.49%


0,1
Epoch,▁▃▅▆█
Train Accuracy,▁▅▆▇█
Train Loss,█▂▂▁▁
Validation Accuracy,▁▅▆▇█
Validation Loss,█▄▂▁▁

0,1
Epoch,5.0
Train Accuracy,37.66181
Train Loss,0.57404
Validation Accuracy,36.48897
Validation Loss,0.99889


[34m[1mwandb[0m: Agent Starting Run: vwfvoyu6 with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beam_width: 3
[34m[1mwandb[0m: 	bidirectional: True
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dec_layers: 2
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embed_dim: 32
[34m[1mwandb[0m: 	enc_layers: 1
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_dim: 128
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0.7


Epoch 1/5 | Train Loss: 1.6787, Train Acc: 10.58% | Val Loss: 1.2343, Val Acc: 22.76%
Epoch 2/5 | Train Loss: 0.8043, Train Acc: 26.25% | Val Loss: 1.0804, Val Acc: 32.70%
Epoch 3/5 | Train Loss: 0.6774, Train Acc: 31.46% | Val Loss: 1.0351, Val Acc: 33.99%
Epoch 4/5 | Train Loss: 0.6021, Train Acc: 35.32% | Val Loss: 0.9783, Val Acc: 34.65%
Epoch 5/5 | Train Loss: 0.5471, Train Acc: 38.52% | Val Loss: 1.0088, Val Acc: 36.19%


0,1
Epoch,▁▃▅▆█
Train Accuracy,▁▅▆▇█
Train Loss,█▃▂▁▁
Validation Accuracy,▁▆▇▇█
Validation Loss,█▄▃▁▂

0,1
Epoch,5.0
Train Accuracy,38.52055
Train Loss,0.54713
Validation Accuracy,36.19026
Validation Loss,1.0088


[34m[1mwandb[0m: Agent Starting Run: rauyijpd with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beam_width: 5
[34m[1mwandb[0m: 	bidirectional: False
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dec_layers: 2
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embed_dim: 128
[34m[1mwandb[0m: 	enc_layers: 1
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_dim: 256
[34m[1mwandb[0m: 	learning_rate: 0.002
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0.5


Epoch 1/5 | Train Loss: 1.4935, Train Acc: 12.72% | Val Loss: 1.1817, Val Acc: 26.11%
Epoch 2/5 | Train Loss: 0.7804, Train Acc: 24.62% | Val Loss: 1.0605, Val Acc: 31.55%
Epoch 3/5 | Train Loss: 0.6703, Train Acc: 28.91% | Val Loss: 1.0240, Val Acc: 32.60%
Epoch 4/5 | Train Loss: 0.5997, Train Acc: 33.35% | Val Loss: 1.0145, Val Acc: 34.59%
Epoch 5/5 | Train Loss: 0.5666, Train Acc: 35.62% | Val Loss: 1.0026, Val Acc: 35.82%


0,1
Epoch,▁▃▅▆█
Train Accuracy,▁▅▆▇█
Train Loss,█▃▂▁▁
Validation Accuracy,▁▅▆▇█
Validation Loss,█▃▂▁▁

0,1
Epoch,5.0
Train Accuracy,35.62183
Train Loss,0.56657
Validation Accuracy,35.82261
Validation Loss,1.00256


[34m[1mwandb[0m: Agent Starting Run: 5rdyjyvo with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beam_width: 5
[34m[1mwandb[0m: 	bidirectional: True
[34m[1mwandb[0m: 	cell_type: RNN
[34m[1mwandb[0m: 	dec_layers: 2
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	embed_dim: 64
[34m[1mwandb[0m: 	enc_layers: 3
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_dim: 128
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0.5


Epoch 1/5 | Train Loss: 1.8203, Train Acc: 7.41% | Val Loss: 1.2606, Val Acc: 20.31%
Epoch 2/5 | Train Loss: 0.8257, Train Acc: 26.61% | Val Loss: 1.1224, Val Acc: 27.90%
Epoch 3/5 | Train Loss: 0.6438, Train Acc: 35.59% | Val Loss: 1.0835, Val Acc: 32.10%
Epoch 4/5 | Train Loss: 0.5432, Train Acc: 41.99% | Val Loss: 1.0452, Val Acc: 35.73%
Epoch 5/5 | Train Loss: 0.4636, Train Acc: 46.48% | Val Loss: 1.0435, Val Acc: 36.31%


0,1
Epoch,▁▃▅▆█
Train Accuracy,▁▄▆▇█
Train Loss,█▃▂▁▁
Validation Accuracy,▁▄▆██
Validation Loss,█▄▂▁▁

0,1
Epoch,5.0
Train Accuracy,46.47749
Train Loss,0.46361
Validation Accuracy,36.30515
Validation Loss,1.04345


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: j0jasbal with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beam_width: 3
[34m[1mwandb[0m: 	bidirectional: False
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dec_layers: 3
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embed_dim: 64
[34m[1mwandb[0m: 	enc_layers: 1
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_dim: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0.5


Epoch 1/10 | Train Loss: 2.2805, Train Acc: 3.05% | Val Loss: 1.4218, Val Acc: 16.82%
Epoch 2/10 | Train Loss: 0.9664, Train Acc: 20.80% | Val Loss: 1.1523, Val Acc: 27.80%
Epoch 3/10 | Train Loss: 0.7697, Train Acc: 27.22% | Val Loss: 1.0426, Val Acc: 32.58%
Epoch 4/10 | Train Loss: 0.6834, Train Acc: 30.15% | Val Loss: 1.0425, Val Acc: 34.86%
Epoch 5/10 | Train Loss: 0.6165, Train Acc: 32.20% | Val Loss: 1.0165, Val Acc: 34.67%
Epoch 6/10 | Train Loss: 0.5741, Train Acc: 36.74% | Val Loss: 0.9938, Val Acc: 37.48%
Epoch 7/10 | Train Loss: 0.5300, Train Acc: 38.20% | Val Loss: 0.9916, Val Acc: 37.33%
Epoch 8/10 | Train Loss: 0.5029, Train Acc: 40.04% | Val Loss: 0.9564, Val Acc: 39.35%
Epoch 9/10 | Train Loss: 0.4749, Train Acc: 41.39% | Val Loss: 1.0159, Val Acc: 38.65%
Epoch 10/10 | Train Loss: 0.4461, Train Acc: 43.42% | Val Loss: 0.9964, Val Acc: 39.25%


0,1
Epoch,▁▂▃▃▄▅▆▆▇█
Train Accuracy,▁▄▅▆▆▇▇▇██
Train Loss,█▃▂▂▂▁▁▁▁▁
Validation Accuracy,▁▄▆▇▇▇▇███
Validation Loss,█▄▂▂▂▂▂▁▂▂

0,1
Epoch,10.0
Train Accuracy,43.42264
Train Loss,0.44613
Validation Accuracy,39.24632
Validation Loss,0.99643


[34m[1mwandb[0m: Agent Starting Run: hubwp4ly with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beam_width: 1
[34m[1mwandb[0m: 	bidirectional: True
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dec_layers: 1
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	embed_dim: 256
[34m[1mwandb[0m: 	enc_layers: 2
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_dim: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0.2


Epoch 1/10 | Train Loss: 1.8753, Train Acc: 8.10% | Val Loss: 1.3213, Val Acc: 21.91%
Epoch 2/10 | Train Loss: 0.8089, Train Acc: 28.17% | Val Loss: 1.0906, Val Acc: 32.49%
Epoch 3/10 | Train Loss: 0.6673, Train Acc: 34.61% | Val Loss: 1.0331, Val Acc: 34.22%
Epoch 4/10 | Train Loss: 0.5814, Train Acc: 38.95% | Val Loss: 1.0202, Val Acc: 36.04%
Epoch 5/10 | Train Loss: 0.5146, Train Acc: 42.43% | Val Loss: 0.9969, Val Acc: 36.94%
Epoch 6/10 | Train Loss: 0.4672, Train Acc: 44.21% | Val Loss: 0.9941, Val Acc: 38.72%
Epoch 7/10 | Train Loss: 0.4313, Train Acc: 46.15% | Val Loss: 1.0119, Val Acc: 39.06%
Epoch 8/10 | Train Loss: 0.3921, Train Acc: 49.65% | Val Loss: 1.0277, Val Acc: 39.58%
Epoch 9/10 | Train Loss: 0.3599, Train Acc: 51.20% | Val Loss: 1.0039, Val Acc: 38.04%
Epoch 10/10 | Train Loss: 0.3346, Train Acc: 53.58% | Val Loss: 1.0504, Val Acc: 40.23%


0,1
Epoch,▁▂▃▃▄▅▆▆▇█
Train Accuracy,▁▄▅▆▆▇▇▇██
Train Loss,█▃▃▂▂▂▁▁▁▁
Validation Accuracy,▁▅▆▆▇▇██▇█
Validation Loss,█▃▂▂▁▁▁▂▁▂

0,1
Epoch,10.0
Train Accuracy,53.58181
Train Loss,0.33464
Validation Accuracy,40.23438
Validation Loss,1.05042


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: af9hs8it with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beam_width: 5
[34m[1mwandb[0m: 	bidirectional: True
[34m[1mwandb[0m: 	cell_type: RNN
[34m[1mwandb[0m: 	dec_layers: 1
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embed_dim: 128
[34m[1mwandb[0m: 	enc_layers: 1
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_dim: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0.2


Epoch 1/10 | Train Loss: 1.2167, Train Acc: 18.37% | Val Loss: 1.1243, Val Acc: 27.76%
Epoch 2/10 | Train Loss: 0.6897, Train Acc: 32.78% | Val Loss: 1.0463, Val Acc: 32.75%
Epoch 3/10 | Train Loss: 0.5880, Train Acc: 36.85% | Val Loss: 1.0474, Val Acc: 34.27%
Epoch 4/10 | Train Loss: 0.5189, Train Acc: 40.91% | Val Loss: 0.9922, Val Acc: 37.19%
Epoch 5/10 | Train Loss: 0.4717, Train Acc: 44.87% | Val Loss: 1.0062, Val Acc: 35.43%
Epoch 6/10 | Train Loss: 0.4316, Train Acc: 47.69% | Val Loss: 1.0046, Val Acc: 37.00%
Epoch 7/10 | Train Loss: 0.3954, Train Acc: 49.71% | Val Loss: 1.0349, Val Acc: 36.14%
Epoch 8/10 | Train Loss: 0.3659, Train Acc: 52.16% | Val Loss: 1.0166, Val Acc: 37.42%
Epoch 9/10 | Train Loss: 0.3394, Train Acc: 53.23% | Val Loss: 1.0325, Val Acc: 38.28%
Epoch 10/10 | Train Loss: 0.3184, Train Acc: 55.41% | Val Loss: 1.0566, Val Acc: 37.84%


0,1
Epoch,▁▂▃▃▄▅▆▆▇█
Train Accuracy,▁▄▄▅▆▇▇▇██
Train Loss,█▄▃▃▂▂▂▁▁▁
Validation Accuracy,▁▄▅▇▆▇▇▇██
Validation Loss,█▄▄▁▂▂▃▂▃▄

0,1
Epoch,10.0
Train Accuracy,55.4052
Train Loss,0.31836
Validation Accuracy,37.84467
Validation Loss,1.05661


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 3zuigw8x with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beam_width: 1
[34m[1mwandb[0m: 	bidirectional: False
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dec_layers: 3
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	embed_dim: 128
[34m[1mwandb[0m: 	enc_layers: 1
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_dim: 128
[34m[1mwandb[0m: 	learning_rate: 0.002
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0.7


Epoch 1/10 | Train Loss: 2.1058, Train Acc: 4.36% | Val Loss: 1.4147, Val Acc: 15.10%
Epoch 2/10 | Train Loss: 0.9502, Train Acc: 20.76% | Val Loss: 1.1323, Val Acc: 28.53%
Epoch 3/10 | Train Loss: 0.7840, Train Acc: 26.43% | Val Loss: 1.0881, Val Acc: 31.73%
Epoch 4/10 | Train Loss: 0.7060, Train Acc: 28.24% | Val Loss: 1.0675, Val Acc: 32.61%
Epoch 5/10 | Train Loss: 0.6499, Train Acc: 29.75% | Val Loss: 1.0418, Val Acc: 33.09%
Epoch 6/10 | Train Loss: 0.6133, Train Acc: 31.09% | Val Loss: 1.0020, Val Acc: 34.45%
Epoch 7/10 | Train Loss: 0.5823, Train Acc: 32.69% | Val Loss: 1.0384, Val Acc: 31.92%
Epoch 8/10 | Train Loss: 0.5617, Train Acc: 34.55% | Val Loss: 0.9980, Val Acc: 35.41%
Epoch 9/10 | Train Loss: 0.5385, Train Acc: 36.17% | Val Loss: 1.0125, Val Acc: 35.48%
Epoch 10/10 | Train Loss: 0.5228, Train Acc: 36.48% | Val Loss: 0.9863, Val Acc: 36.47%


0,1
Epoch,▁▂▃▃▄▅▆▆▇█
Train Accuracy,▁▅▆▆▇▇▇███
Train Loss,█▃▂▂▂▁▁▁▁▁
Validation Accuracy,▁▅▆▇▇▇▇███
Validation Loss,█▃▃▂▂▁▂▁▁▁

0,1
Epoch,10.0
Train Accuracy,36.48021
Train Loss,0.52282
Validation Accuracy,36.46599
Validation Loss,0.98628


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 4r4lvzn9 with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beam_width: 1
[34m[1mwandb[0m: 	bidirectional: False
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dec_layers: 1
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	embed_dim: 128
[34m[1mwandb[0m: 	enc_layers: 1
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_dim: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0.2


Epoch 1/10 | Train Loss: 1.4297, Train Acc: 14.64% | Val Loss: 1.1908, Val Acc: 26.76%
Epoch 2/10 | Train Loss: 0.7341, Train Acc: 30.64% | Val Loss: 1.0921, Val Acc: 32.72%
Epoch 3/10 | Train Loss: 0.6245, Train Acc: 35.70% | Val Loss: 1.0501, Val Acc: 33.12%
Epoch 4/10 | Train Loss: 0.5420, Train Acc: 37.34% | Val Loss: 1.0103, Val Acc: 37.91%
Epoch 5/10 | Train Loss: 0.4940, Train Acc: 39.96% | Val Loss: 0.9715, Val Acc: 38.92%
Epoch 6/10 | Train Loss: 0.4477, Train Acc: 42.00% | Val Loss: 1.0187, Val Acc: 38.20%
Epoch 7/10 | Train Loss: 0.4161, Train Acc: 44.30% | Val Loss: 1.0197, Val Acc: 37.45%
Epoch 8/10 | Train Loss: 0.3842, Train Acc: 46.94% | Val Loss: 1.0071, Val Acc: 37.06%
Epoch 9/10 | Train Loss: 0.3530, Train Acc: 46.04% | Val Loss: 1.0290, Val Acc: 39.09%
Epoch 10/10 | Train Loss: 0.3288, Train Acc: 48.32% | Val Loss: 1.0422, Val Acc: 39.00%


0,1
Epoch,▁▂▃▃▄▅▆▆▇█
Train Accuracy,▁▄▅▆▆▇▇███
Train Loss,█▄▃▂▂▂▂▁▁▁
Validation Accuracy,▁▄▅▇█▇▇▇██
Validation Loss,█▅▄▂▁▃▃▂▃▃

0,1
Epoch,10.0
Train Accuracy,48.31892
Train Loss,0.32884
Validation Accuracy,39.00123
Validation Loss,1.04223


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 3drtuzm4 with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beam_width: 1
[34m[1mwandb[0m: 	bidirectional: True
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dec_layers: 2
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embed_dim: 32
[34m[1mwandb[0m: 	enc_layers: 3
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_dim: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0.7


Epoch 1/5 | Train Loss: 2.1836, Train Acc: 3.38% | Val Loss: 1.4582, Val Acc: 13.53%
Epoch 2/5 | Train Loss: 0.9563, Train Acc: 21.25% | Val Loss: 1.1378, Val Acc: 27.49%
Epoch 3/5 | Train Loss: 0.7184, Train Acc: 31.60% | Val Loss: 1.0833, Val Acc: 31.92%
Epoch 4/5 | Train Loss: 0.5970, Train Acc: 39.30% | Val Loss: 1.0782, Val Acc: 34.26%
Epoch 5/5 | Train Loss: 0.5053, Train Acc: 44.71% | Val Loss: 1.0466, Val Acc: 33.75%


0,1
Epoch,▁▃▅▆█
Train Accuracy,▁▄▆▇█
Train Loss,█▃▂▁▁
Validation Accuracy,▁▆▇██
Validation Loss,█▃▂▂▁

0,1
Epoch,5.0
Train Accuracy,44.70685
Train Loss,0.50528
Validation Accuracy,33.74694
Validation Loss,1.04658


[34m[1mwandb[0m: Agent Starting Run: nxpvct4u with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beam_width: 1
[34m[1mwandb[0m: 	bidirectional: True
[34m[1mwandb[0m: 	cell_type: RNN
[34m[1mwandb[0m: 	dec_layers: 1
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	embed_dim: 32
[34m[1mwandb[0m: 	enc_layers: 3
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_dim: 64
[34m[1mwandb[0m: 	learning_rate: 0.002
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0.5


Epoch 1/5 | Train Loss: 1.5360, Train Acc: 11.30% | Val Loss: 1.2897, Val Acc: 19.24%
Epoch 2/5 | Train Loss: 0.8128, Train Acc: 27.88% | Val Loss: 1.2099, Val Acc: 24.76%
Epoch 3/5 | Train Loss: 0.6605, Train Acc: 35.89% | Val Loss: 1.1653, Val Acc: 27.82%
Epoch 4/5 | Train Loss: 0.5764, Train Acc: 41.27% | Val Loss: 1.1549, Val Acc: 30.83%
Epoch 5/5 | Train Loss: 0.5157, Train Acc: 44.76% | Val Loss: 1.1724, Val Acc: 29.70%


0,1
Epoch,▁▃▅▆█
Train Accuracy,▁▄▆▇█
Train Loss,█▃▂▁▁
Validation Accuracy,▁▄▆█▇
Validation Loss,█▄▂▁▂

0,1
Epoch,5.0
Train Accuracy,44.75546
Train Loss,0.5157
Validation Accuracy,29.69516
Validation Loss,1.17235


[34m[1mwandb[0m: Agent Starting Run: agov8jhk with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beam_width: 3
[34m[1mwandb[0m: 	bidirectional: False
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dec_layers: 1
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embed_dim: 128
[34m[1mwandb[0m: 	enc_layers: 2
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_dim: 256
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0.2


Epoch 1/5 | Train Loss: 1.5276, Train Acc: 13.17% | Val Loss: 1.1949, Val Acc: 26.18%
Epoch 2/5 | Train Loss: 0.7413, Train Acc: 30.28% | Val Loss: 1.0658, Val Acc: 30.46%
Epoch 3/5 | Train Loss: 0.6128, Train Acc: 36.06% | Val Loss: 1.0319, Val Acc: 34.11%
Epoch 4/5 | Train Loss: 0.5354, Train Acc: 40.94% | Val Loss: 1.0113, Val Acc: 35.52%
Epoch 5/5 | Train Loss: 0.4793, Train Acc: 44.71% | Val Loss: 1.0051, Val Acc: 35.13%


0,1
Epoch,▁▃▅▆█
Train Accuracy,▁▅▆▇█
Train Loss,█▃▂▁▁
Validation Accuracy,▁▄▇██
Validation Loss,█▃▂▁▁

0,1
Epoch,5.0
Train Accuracy,44.71322
Train Loss,0.47928
Validation Accuracy,35.13327
Validation Loss,1.00509


[34m[1mwandb[0m: Agent Starting Run: dz9imkv6 with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beam_width: 3
[34m[1mwandb[0m: 	bidirectional: True
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dec_layers: 3
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	embed_dim: 32
[34m[1mwandb[0m: 	enc_layers: 3
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_dim: 128
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0.2


Epoch 1/5 | Train Loss: 2.6602, Train Acc: 0.34% | Val Loss: 1.9427, Val Acc: 2.41%
Epoch 2/5 | Train Loss: 1.2129, Train Acc: 14.10% | Val Loss: 1.2467, Val Acc: 22.89%
Epoch 3/5 | Train Loss: 0.8189, Train Acc: 26.58% | Val Loss: 1.1124, Val Acc: 31.84%
Epoch 4/5 | Train Loss: 0.6662, Train Acc: 31.20% | Val Loss: 1.0549, Val Acc: 33.83%
Epoch 5/5 | Train Loss: 0.5659, Train Acc: 37.77% | Val Loss: 1.0316, Val Acc: 36.15%


0,1
Epoch,▁▃▅▆█
Train Accuracy,▁▄▆▇█
Train Loss,█▃▂▁▁
Validation Accuracy,▁▅▇██
Validation Loss,█▃▂▁▁

0,1
Epoch,5.0
Train Accuracy,37.77318
Train Loss,0.5659
Validation Accuracy,36.15196
Validation Loss,1.03159


[34m[1mwandb[0m: Agent Starting Run: bimrx98a with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beam_width: 1
[34m[1mwandb[0m: 	bidirectional: False
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dec_layers: 3
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embed_dim: 32
[34m[1mwandb[0m: 	enc_layers: 2
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_dim: 256
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0.2


Epoch 1/10 | Train Loss: 2.6693, Train Acc: 0.23% | Val Loss: 2.0011, Val Acc: 2.87%
Epoch 2/10 | Train Loss: 1.2665, Train Acc: 11.96% | Val Loss: 1.2609, Val Acc: 22.61%
Epoch 3/10 | Train Loss: 0.8606, Train Acc: 24.66% | Val Loss: 1.1270, Val Acc: 28.95%
Epoch 4/10 | Train Loss: 0.7090, Train Acc: 30.80% | Val Loss: 1.0752, Val Acc: 32.48%
Epoch 5/10 | Train Loss: 0.6241, Train Acc: 35.62% | Val Loss: 1.0040, Val Acc: 34.13%
Epoch 6/10 | Train Loss: 0.5602, Train Acc: 38.53% | Val Loss: 1.0512, Val Acc: 35.29%
Epoch 7/10 | Train Loss: 0.5123, Train Acc: 42.10% | Val Loss: 1.0205, Val Acc: 37.55%
Epoch 8/10 | Train Loss: 0.4738, Train Acc: 44.99% | Val Loss: 0.9977, Val Acc: 36.80%
Epoch 9/10 | Train Loss: 0.4345, Train Acc: 48.39% | Val Loss: 1.0131, Val Acc: 39.38%
Epoch 10/10 | Train Loss: 0.4058, Train Acc: 50.11% | Val Loss: 0.9901, Val Acc: 40.03%


0,1
Epoch,▁▂▃▃▄▅▆▆▇█
Train Accuracy,▁▃▄▅▆▆▇▇██
Train Loss,█▄▂▂▂▁▁▁▁▁
Validation Accuracy,▁▅▆▇▇▇█▇██
Validation Loss,█▃▂▂▁▁▁▁▁▁

0,1
Epoch,10.0
Train Accuracy,50.10787
Train Loss,0.40584
Validation Accuracy,40.02757
Validation Loss,0.99012


[34m[1mwandb[0m: Agent Starting Run: lgzgki1x with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beam_width: 3
[34m[1mwandb[0m: 	bidirectional: False
[34m[1mwandb[0m: 	cell_type: RNN
[34m[1mwandb[0m: 	dec_layers: 3
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embed_dim: 256
[34m[1mwandb[0m: 	enc_layers: 3
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_dim: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0.2


Epoch 1/10 | Train Loss: 2.6182, Train Acc: 0.33% | Val Loss: 1.9181, Val Acc: 2.32%
Epoch 2/10 | Train Loss: 1.2255, Train Acc: 13.05% | Val Loss: 1.2962, Val Acc: 23.94%
Epoch 3/10 | Train Loss: 0.8260, Train Acc: 26.73% | Val Loss: 1.1189, Val Acc: 30.27%
Epoch 4/10 | Train Loss: 0.6707, Train Acc: 34.00% | Val Loss: 1.0499, Val Acc: 33.66%
Epoch 5/10 | Train Loss: 0.5741, Train Acc: 39.51% | Val Loss: 1.0665, Val Acc: 34.26%
Epoch 6/10 | Train Loss: 0.5049, Train Acc: 43.61% | Val Loss: 1.0575, Val Acc: 36.35%
Epoch 7/10 | Train Loss: 0.4482, Train Acc: 45.88% | Val Loss: 1.0460, Val Acc: 37.78%
Epoch 8/10 | Train Loss: 0.3940, Train Acc: 50.93% | Val Loss: 1.1023, Val Acc: 38.30%
Epoch 9/10 | Train Loss: 0.3602, Train Acc: 54.84% | Val Loss: 1.0857, Val Acc: 38.37%
Epoch 10/10 | Train Loss: 0.3251, Train Acc: 58.46% | Val Loss: 1.1073, Val Acc: 37.73%


0,1
Epoch,▁▂▃▃▄▅▆▆▇█
Train Accuracy,▁▃▄▅▆▆▆▇██
Train Loss,█▄▃▂▂▂▁▁▁▁
Validation Accuracy,▁▅▆▇▇█████
Validation Loss,█▃▂▁▁▁▁▁▁▁

0,1
Epoch,10.0
Train Accuracy,58.46467
Train Loss,0.32512
Validation Accuracy,37.72978
Validation Loss,1.1073


[34m[1mwandb[0m: Agent Starting Run: 6pbag76b with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beam_width: 3
[34m[1mwandb[0m: 	bidirectional: False
[34m[1mwandb[0m: 	cell_type: RNN
[34m[1mwandb[0m: 	dec_layers: 2
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	embed_dim: 32
[34m[1mwandb[0m: 	enc_layers: 1
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_dim: 128
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0.5


Epoch 1/10 | Train Loss: 2.1513, Train Acc: 4.99% | Val Loss: 1.3515, Val Acc: 20.07%
Epoch 2/10 | Train Loss: 0.8887, Train Acc: 23.16% | Val Loss: 1.1132, Val Acc: 30.73%
Epoch 3/10 | Train Loss: 0.7164, Train Acc: 27.76% | Val Loss: 1.0580, Val Acc: 34.42%
Epoch 4/10 | Train Loss: 0.6276, Train Acc: 31.76% | Val Loss: 1.0585, Val Acc: 36.42%
Epoch 5/10 | Train Loss: 0.5761, Train Acc: 32.89% | Val Loss: 1.0062, Val Acc: 36.20%
Epoch 6/10 | Train Loss: 0.5205, Train Acc: 36.10% | Val Loss: 1.0160, Val Acc: 38.58%
Epoch 7/10 | Train Loss: 0.4803, Train Acc: 39.12% | Val Loss: 0.9849, Val Acc: 39.96%
Epoch 8/10 | Train Loss: 0.4517, Train Acc: 42.13% | Val Loss: 0.9952, Val Acc: 38.67%
Epoch 9/10 | Train Loss: 0.4230, Train Acc: 44.47% | Val Loss: 0.9668, Val Acc: 40.20%
Epoch 10/10 | Train Loss: 0.3922, Train Acc: 46.90% | Val Loss: 1.0118, Val Acc: 39.60%


0,1
Epoch,▁▂▃▃▄▅▆▆▇█
Train Accuracy,▁▄▅▅▆▆▇▇██
Train Loss,█▃▂▂▂▂▁▁▁▁
Validation Accuracy,▁▅▆▇▇▇█▇██
Validation Loss,█▄▃▃▂▂▁▂▁▂

0,1
Epoch,10.0
Train Accuracy,46.90444
Train Loss,0.39218
Validation Accuracy,39.59865
Validation Loss,1.01177


[34m[1mwandb[0m: Agent Starting Run: 6i0mlwtv with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beam_width: 1
[34m[1mwandb[0m: 	bidirectional: False
[34m[1mwandb[0m: 	cell_type: RNN
[34m[1mwandb[0m: 	dec_layers: 1
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embed_dim: 64
[34m[1mwandb[0m: 	enc_layers: 1
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_dim: 128
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0.5


Epoch 1/5 | Train Loss: 1.4515, Train Acc: 14.39% | Val Loss: 1.2162, Val Acc: 28.08%
Epoch 2/5 | Train Loss: 0.7388, Train Acc: 29.01% | Val Loss: 1.0957, Val Acc: 31.49%
Epoch 3/5 | Train Loss: 0.6134, Train Acc: 33.45% | Val Loss: 1.0457, Val Acc: 35.45%
Epoch 4/5 | Train Loss: 0.5445, Train Acc: 35.93% | Val Loss: 1.0268, Val Acc: 36.74%
Epoch 5/5 | Train Loss: 0.4893, Train Acc: 40.11% | Val Loss: 1.0152, Val Acc: 36.49%


0,1
Epoch,▁▃▅▆█
Train Accuracy,▁▅▆▇█
Train Loss,█▃▂▁▁
Validation Accuracy,▁▄▇██
Validation Loss,█▄▂▁▁

0,1
Epoch,5.0
Train Accuracy,40.10623
Train Loss,0.48925
Validation Accuracy,36.48897
Validation Loss,1.01521


[34m[1mwandb[0m: Agent Starting Run: g2dlcvds with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beam_width: 3
[34m[1mwandb[0m: 	bidirectional: True
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dec_layers: 3
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embed_dim: 256
[34m[1mwandb[0m: 	enc_layers: 2
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_dim: 128
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0.7


Epoch 1/10 | Train Loss: 2.6676, Train Acc: 0.34% | Val Loss: 1.9545, Val Acc: 3.03%
Epoch 2/10 | Train Loss: 1.1933, Train Acc: 14.44% | Val Loss: 1.2369, Val Acc: 23.51%
Epoch 3/10 | Train Loss: 0.8095, Train Acc: 26.78% | Val Loss: 1.0747, Val Acc: 31.23%
Epoch 4/10 | Train Loss: 0.6847, Train Acc: 33.21% | Val Loss: 1.0595, Val Acc: 33.42%
Epoch 5/10 | Train Loss: 0.5922, Train Acc: 37.52% | Val Loss: 1.0201, Val Acc: 37.37%
Epoch 6/10 | Train Loss: 0.5336, Train Acc: 40.99% | Val Loss: 1.0036, Val Acc: 37.58%
Epoch 7/10 | Train Loss: 0.4853, Train Acc: 43.96% | Val Loss: 1.0190, Val Acc: 38.89%
Epoch 8/10 | Train Loss: 0.4446, Train Acc: 47.37% | Val Loss: 1.0135, Val Acc: 38.92%
Epoch 9/10 | Train Loss: 0.4204, Train Acc: 48.97% | Val Loss: 0.9959, Val Acc: 38.56%
Epoch 10/10 | Train Loss: 0.3794, Train Acc: 52.70% | Val Loss: 1.0406, Val Acc: 40.38%


0,1
Epoch,▁▂▃▃▄▅▆▆▇█
Train Accuracy,▁▃▅▅▆▆▇▇██
Train Loss,█▃▂▂▂▁▁▁▁▁
Validation Accuracy,▁▅▆▇▇▇████
Validation Loss,█▃▂▁▁▁▁▁▁▁

0,1
Epoch,10.0
Train Accuracy,52.69845
Train Loss,0.37945
Validation Accuracy,40.3799
Validation Loss,1.04063


[34m[1mwandb[0m: Agent Starting Run: 868ncgoq with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beam_width: 1
[34m[1mwandb[0m: 	bidirectional: True
[34m[1mwandb[0m: 	cell_type: RNN
[34m[1mwandb[0m: 	dec_layers: 3
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	embed_dim: 128
[34m[1mwandb[0m: 	enc_layers: 3
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_dim: 64
[34m[1mwandb[0m: 	learning_rate: 0.002
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0.2


Epoch 1/10 | Train Loss: 2.0753, Train Acc: 4.11% | Val Loss: 1.4317, Val Acc: 16.55%
Epoch 2/10 | Train Loss: 0.9952, Train Acc: 20.04% | Val Loss: 1.1717, Val Acc: 25.99%
Epoch 3/10 | Train Loss: 0.7807, Train Acc: 27.12% | Val Loss: 1.1234, Val Acc: 30.22%
Epoch 4/10 | Train Loss: 0.6722, Train Acc: 31.42% | Val Loss: 1.0917, Val Acc: 33.99%
Epoch 5/10 | Train Loss: 0.6012, Train Acc: 34.64% | Val Loss: 1.0930, Val Acc: 32.80%
Epoch 6/10 | Train Loss: 0.5506, Train Acc: 39.39% | Val Loss: 1.0846, Val Acc: 35.29%
Epoch 7/10 | Train Loss: 0.5105, Train Acc: 41.57% | Val Loss: 1.0803, Val Acc: 35.36%
Epoch 8/10 | Train Loss: 0.4795, Train Acc: 42.93% | Val Loss: 1.0831, Val Acc: 35.42%
Epoch 9/10 | Train Loss: 0.4543, Train Acc: 44.97% | Val Loss: 1.1367, Val Acc: 36.60%
Epoch 10/10 | Train Loss: 0.4342, Train Acc: 46.02% | Val Loss: 1.1124, Val Acc: 35.21%


0,1
Epoch,▁▂▃▃▄▅▆▆▇█
Train Accuracy,▁▄▅▆▆▇▇▇██
Train Loss,█▃▂▂▂▁▁▁▁▁
Validation Accuracy,▁▄▆▇▇█████
Validation Loss,█▃▂▁▁▁▁▁▂▂

0,1
Epoch,10.0
Train Accuracy,46.01738
Train Loss,0.43421
Validation Accuracy,35.20987
Validation Loss,1.11242


[34m[1mwandb[0m: Agent Starting Run: 3m1j1gbu with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beam_width: 1
[34m[1mwandb[0m: 	bidirectional: True
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dec_layers: 3
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	embed_dim: 128
[34m[1mwandb[0m: 	enc_layers: 3
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_dim: 128
[34m[1mwandb[0m: 	learning_rate: 0.002
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0.7


Epoch 1/10 | Train Loss: 2.0439, Train Acc: 4.35% | Val Loss: 1.4113, Val Acc: 16.21%
Epoch 2/10 | Train Loss: 0.9896, Train Acc: 19.89% | Val Loss: 1.1602, Val Acc: 24.47%
Epoch 3/10 | Train Loss: 0.7791, Train Acc: 28.14% | Val Loss: 1.0871, Val Acc: 28.72%
Epoch 4/10 | Train Loss: 0.6739, Train Acc: 33.35% | Val Loss: 1.0963, Val Acc: 31.20%
Epoch 5/10 | Train Loss: 0.6011, Train Acc: 35.57% | Val Loss: 1.0835, Val Acc: 32.08%
Epoch 6/10 | Train Loss: 0.5514, Train Acc: 39.50% | Val Loss: 1.0631, Val Acc: 34.38%
Epoch 7/10 | Train Loss: 0.5060, Train Acc: 41.86% | Val Loss: 1.0816, Val Acc: 35.94%
Epoch 8/10 | Train Loss: 0.4792, Train Acc: 42.92% | Val Loss: 1.0943, Val Acc: 34.83%
Epoch 9/10 | Train Loss: 0.4583, Train Acc: 43.16% | Val Loss: 1.0890, Val Acc: 34.19%
Epoch 10/10 | Train Loss: 0.4364, Train Acc: 46.33% | Val Loss: 1.0794, Val Acc: 35.59%


0,1
Epoch,▁▂▃▃▄▅▆▆▇█
Train Accuracy,▁▄▅▆▆▇▇▇▇█
Train Loss,█▃▂▂▂▂▁▁▁▁
Validation Accuracy,▁▄▅▆▇▇██▇█
Validation Loss,█▃▁▂▁▁▁▂▂▁

0,1
Epoch,10.0
Train Accuracy,46.33041
Train Loss,0.43636
Validation Accuracy,35.59283
Validation Loss,1.07935


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 6qnn452k with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beam_width: 1
[34m[1mwandb[0m: 	bidirectional: False
[34m[1mwandb[0m: 	cell_type: RNN
[34m[1mwandb[0m: 	dec_layers: 3
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	embed_dim: 128
[34m[1mwandb[0m: 	enc_layers: 1
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_dim: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0.2


Epoch 1/10 | Train Loss: 2.7784, Train Acc: 0.09% | Val Loss: 2.1891, Val Acc: 1.10%
Epoch 2/10 | Train Loss: 1.2533, Train Acc: 13.55% | Val Loss: 1.2886, Val Acc: 25.31%
Epoch 3/10 | Train Loss: 0.8449, Train Acc: 24.43% | Val Loss: 1.1300, Val Acc: 32.03%
Epoch 4/10 | Train Loss: 0.7206, Train Acc: 27.83% | Val Loss: 1.1024, Val Acc: 35.71%
Epoch 5/10 | Train Loss: 0.6339, Train Acc: 30.31% | Val Loss: 1.0404, Val Acc: 34.63%
Epoch 6/10 | Train Loss: 0.5922, Train Acc: 33.22% | Val Loss: 0.9952, Val Acc: 36.98%
Epoch 7/10 | Train Loss: 0.5415, Train Acc: 36.00% | Val Loss: 1.0615, Val Acc: 37.83%
Epoch 8/10 | Train Loss: 0.5107, Train Acc: 39.02% | Val Loss: 1.0058, Val Acc: 38.61%
Epoch 9/10 | Train Loss: 0.4847, Train Acc: 39.56% | Val Loss: 0.9659, Val Acc: 39.43%
Epoch 10/10 | Train Loss: 0.4523, Train Acc: 42.12% | Val Loss: 0.9865, Val Acc: 39.51%


0,1
Epoch,▁▂▃▃▄▅▆▆▇█
Train Accuracy,▁▃▅▆▆▇▇▇██
Train Loss,█▃▂▂▂▁▁▁▁▁
Validation Accuracy,▁▅▇▇▇█████
Validation Loss,█▃▂▂▁▁▂▁▁▁

0,1
Epoch,10.0
Train Accuracy,42.1208
Train Loss,0.45234
Validation Accuracy,39.50674
Validation Loss,0.98652


[34m[1mwandb[0m: Agent Starting Run: n3vvm9tb with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beam_width: 3
[34m[1mwandb[0m: 	bidirectional: False
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dec_layers: 3
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embed_dim: 64
[34m[1mwandb[0m: 	enc_layers: 3
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_dim: 64
[34m[1mwandb[0m: 	learning_rate: 0.002
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0.5


Epoch 1/10 | Train Loss: 2.0699, Train Acc: 3.59% | Val Loss: 1.4374, Val Acc: 14.71%
Epoch 2/10 | Train Loss: 0.9933, Train Acc: 20.02% | Val Loss: 1.1598, Val Acc: 25.48%
Epoch 3/10 | Train Loss: 0.7712, Train Acc: 26.85% | Val Loss: 1.1122, Val Acc: 29.34%
Epoch 4/10 | Train Loss: 0.6660, Train Acc: 32.11% | Val Loss: 1.0751, Val Acc: 34.53%
Epoch 5/10 | Train Loss: 0.5994, Train Acc: 33.24% | Val Loss: 1.1123, Val Acc: 33.09%
Epoch 6/10 | Train Loss: 0.5479, Train Acc: 36.73% | Val Loss: 1.0847, Val Acc: 34.97%
Epoch 7/10 | Train Loss: 0.5035, Train Acc: 38.84% | Val Loss: 1.0857, Val Acc: 34.03%
Epoch 8/10 | Train Loss: 0.4740, Train Acc: 42.89% | Val Loss: 1.0839, Val Acc: 35.11%
Epoch 9/10 | Train Loss: 0.4532, Train Acc: 44.90% | Val Loss: 1.0973, Val Acc: 35.94%
Epoch 10/10 | Train Loss: 0.4315, Train Acc: 47.34% | Val Loss: 1.0871, Val Acc: 35.56%


0,1
Epoch,▁▂▃▃▄▅▆▆▇█
Train Accuracy,▁▄▅▆▆▆▇▇██
Train Loss,█▃▂▂▂▁▁▁▁▁
Validation Accuracy,▁▅▆█▇█▇███
Validation Loss,█▃▂▁▂▁▁▁▁▁

0,1
Epoch,10.0
Train Accuracy,47.34416
Train Loss,0.43147
Validation Accuracy,35.56219
Validation Loss,1.08707


[34m[1mwandb[0m: Agent Starting Run: x5nhia56 with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beam_width: 1
[34m[1mwandb[0m: 	bidirectional: False
[34m[1mwandb[0m: 	cell_type: RNN
[34m[1mwandb[0m: 	dec_layers: 3
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	embed_dim: 64
[34m[1mwandb[0m: 	enc_layers: 1
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_dim: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0.7


Epoch 1/5 | Train Loss: 2.4754, Train Acc: 0.83% | Val Loss: 1.7868, Val Acc: 6.69%
Epoch 2/5 | Train Loss: 1.1958, Train Acc: 13.31% | Val Loss: 1.2600, Val Acc: 22.29%
Epoch 3/5 | Train Loss: 0.8795, Train Acc: 23.08% | Val Loss: 1.1553, Val Acc: 28.88%
Epoch 4/5 | Train Loss: 0.7354, Train Acc: 28.88% | Val Loss: 1.0877, Val Acc: 30.66%
Epoch 5/5 | Train Loss: 0.6538, Train Acc: 32.04% | Val Loss: 1.0574, Val Acc: 32.44%


0,1
Epoch,▁▃▅▆█
Train Accuracy,▁▄▆▇█
Train Loss,█▃▂▁▁
Validation Accuracy,▁▅▇██
Validation Loss,█▃▂▁▁

0,1
Epoch,5.0
Train Accuracy,32.04426
Train Loss,0.65379
Validation Accuracy,32.43719
Validation Loss,1.05736


[34m[1mwandb[0m: Agent Starting Run: 8jx553qj with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beam_width: 3
[34m[1mwandb[0m: 	bidirectional: False
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dec_layers: 3
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embed_dim: 32
[34m[1mwandb[0m: 	enc_layers: 2
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_dim: 128
[34m[1mwandb[0m: 	learning_rate: 0.002
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0.7


Epoch 1/5 | Train Loss: 2.5238, Train Acc: 0.78% | Val Loss: 1.7590, Val Acc: 4.66%
Epoch 2/5 | Train Loss: 1.1441, Train Acc: 15.09% | Val Loss: 1.2295, Val Acc: 21.40%
Epoch 3/5 | Train Loss: 0.8311, Train Acc: 24.86% | Val Loss: 1.1229, Val Acc: 28.95%
Epoch 4/5 | Train Loss: 0.7009, Train Acc: 31.85% | Val Loss: 1.0919, Val Acc: 31.25%
Epoch 5/5 | Train Loss: 0.6129, Train Acc: 36.42% | Val Loss: 1.0441, Val Acc: 32.96%


0,1
Epoch,▁▃▅▆█
Train Accuracy,▁▄▆▇█
Train Loss,█▃▂▁▁
Validation Accuracy,▁▅▇██
Validation Loss,█▃▂▁▁

0,1
Epoch,5.0
Train Accuracy,36.41716
Train Loss,0.61286
Validation Accuracy,32.95803
Validation Loss,1.04405


[34m[1mwandb[0m: Agent Starting Run: mnr9yd3a with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beam_width: 1
[34m[1mwandb[0m: 	bidirectional: False
[34m[1mwandb[0m: 	cell_type: RNN
[34m[1mwandb[0m: 	dec_layers: 2
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	embed_dim: 256
[34m[1mwandb[0m: 	enc_layers: 1
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_dim: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0.7


Epoch 1/5 | Train Loss: 1.9298, Train Acc: 5.46% | Val Loss: 1.4077, Val Acc: 16.99%
Epoch 2/5 | Train Loss: 0.9426, Train Acc: 20.90% | Val Loss: 1.1700, Val Acc: 25.77%
Epoch 3/5 | Train Loss: 0.7395, Train Acc: 29.41% | Val Loss: 1.1092, Val Acc: 30.19%
Epoch 4/5 | Train Loss: 0.6348, Train Acc: 34.74% | Val Loss: 1.0754, Val Acc: 31.21%
Epoch 5/5 | Train Loss: 0.5547, Train Acc: 38.60% | Val Loss: 1.0668, Val Acc: 32.65%


0,1
Epoch,▁▃▅▆█
Train Accuracy,▁▄▆▇█
Train Loss,█▃▂▁▁
Validation Accuracy,▁▅▇▇█
Validation Loss,█▃▂▁▁

0,1
Epoch,5.0
Train Accuracy,38.59748
Train Loss,0.55466
Validation Accuracy,32.65165
Validation Loss,1.06678


[34m[1mwandb[0m: Agent Starting Run: 1qhiutrg with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beam_width: 5
[34m[1mwandb[0m: 	bidirectional: False
[34m[1mwandb[0m: 	cell_type: RNN
[34m[1mwandb[0m: 	dec_layers: 2
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embed_dim: 256
[34m[1mwandb[0m: 	enc_layers: 3
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_dim: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0.5


Epoch 1/5 | Train Loss: 2.1272, Train Acc: 4.15% | Val Loss: 1.4219, Val Acc: 14.59%
Epoch 2/5 | Train Loss: 0.9428, Train Acc: 21.92% | Val Loss: 1.1751, Val Acc: 26.50%
Epoch 3/5 | Train Loss: 0.7115, Train Acc: 31.39% | Val Loss: 1.1061, Val Acc: 31.62%
Epoch 4/5 | Train Loss: 0.5932, Train Acc: 37.48% | Val Loss: 1.0477, Val Acc: 34.83%
Epoch 5/5 | Train Loss: 0.5030, Train Acc: 42.48% | Val Loss: 1.0613, Val Acc: 36.01%


0,1
Epoch,▁▃▅▆█
Train Accuracy,▁▄▆▇█
Train Loss,█▃▂▁▁
Validation Accuracy,▁▅▇██
Validation Loss,█▃▂▁▁

0,1
Epoch,5.0
Train Accuracy,42.48312
Train Loss,0.503
Validation Accuracy,36.01409
Validation Loss,1.06131


[34m[1mwandb[0m: Agent Starting Run: 6fcuqt60 with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beam_width: 3
[34m[1mwandb[0m: 	bidirectional: True
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dec_layers: 3
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	embed_dim: 32
[34m[1mwandb[0m: 	enc_layers: 3
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_dim: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0.5


Epoch 1/10 | Train Loss: 2.1783, Train Acc: 3.44% | Val Loss: 1.4020, Val Acc: 16.37%
Epoch 2/10 | Train Loss: 0.9317, Train Acc: 20.79% | Val Loss: 1.1137, Val Acc: 26.15%
Epoch 3/10 | Train Loss: 0.7045, Train Acc: 30.57% | Val Loss: 1.0775, Val Acc: 32.11%
Epoch 4/10 | Train Loss: 0.5931, Train Acc: 36.93% | Val Loss: 1.0596, Val Acc: 35.48%
Epoch 5/10 | Train Loss: 0.5099, Train Acc: 42.06% | Val Loss: 1.0065, Val Acc: 35.67%
Epoch 6/10 | Train Loss: 0.4478, Train Acc: 46.66% | Val Loss: 1.0296, Val Acc: 35.77%
Epoch 7/10 | Train Loss: 0.4062, Train Acc: 50.13% | Val Loss: 1.0753, Val Acc: 35.88%
Epoch 8/10 | Train Loss: 0.3678, Train Acc: 52.42% | Val Loss: 1.0966, Val Acc: 35.29%
Epoch 9/10 | Train Loss: 0.3362, Train Acc: 54.57% | Val Loss: 1.0903, Val Acc: 37.33%
Epoch 10/10 | Train Loss: 0.3097, Train Acc: 57.67% | Val Loss: 1.1164, Val Acc: 36.70%


0,1
Epoch,▁▂▃▃▄▅▆▆▇█
Train Accuracy,▁▃▅▅▆▇▇▇██
Train Loss,█▃▂▂▂▂▁▁▁▁
Validation Accuracy,▁▄▆▇▇▇█▇██
Validation Loss,█▃▂▂▁▁▂▃▂▃

0,1
Epoch,10.0
Train Accuracy,57.67409
Train Loss,0.3097
Validation Accuracy,36.69577
Validation Loss,1.11636


[34m[1mwandb[0m: Agent Starting Run: w2w2qbxw with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beam_width: 3
[34m[1mwandb[0m: 	bidirectional: True
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dec_layers: 1
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embed_dim: 128
[34m[1mwandb[0m: 	enc_layers: 3
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_dim: 64
[34m[1mwandb[0m: 	learning_rate: 0.002
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0.2


Epoch 1/5 | Train Loss: 1.7897, Train Acc: 7.66% | Val Loss: 1.3316, Val Acc: 19.35%
Epoch 2/5 | Train Loss: 0.8376, Train Acc: 27.01% | Val Loss: 1.1527, Val Acc: 28.32%
Epoch 3/5 | Train Loss: 0.6451, Train Acc: 35.90% | Val Loss: 1.0941, Val Acc: 32.12%
Epoch 4/5 | Train Loss: 0.5490, Train Acc: 41.05% | Val Loss: 1.0753, Val Acc: 35.03%
Epoch 5/5 | Train Loss: 0.4794, Train Acc: 45.96% | Val Loss: 1.0897, Val Acc: 35.47%


0,1
Epoch,▁▃▅▆█
Train Accuracy,▁▅▆▇█
Train Loss,█▃▂▁▁
Validation Accuracy,▁▅▇██
Validation Loss,█▃▂▁▁

0,1
Epoch,5.0
Train Accuracy,45.96138
Train Loss,0.47937
Validation Accuracy,35.47028
Validation Loss,1.0897


[34m[1mwandb[0m: Agent Starting Run: 3cqizvav with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beam_width: 1
[34m[1mwandb[0m: 	bidirectional: False
[34m[1mwandb[0m: 	cell_type: RNN
[34m[1mwandb[0m: 	dec_layers: 2
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	embed_dim: 64
[34m[1mwandb[0m: 	enc_layers: 1
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_dim: 64
[34m[1mwandb[0m: 	learning_rate: 0.002
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0.7


Epoch 1/5 | Train Loss: 1.8595, Train Acc: 8.08% | Val Loss: 1.2841, Val Acc: 24.79%
Epoch 2/5 | Train Loss: 0.8257, Train Acc: 23.24% | Val Loss: 1.1309, Val Acc: 29.28%
Epoch 3/5 | Train Loss: 0.6872, Train Acc: 29.61% | Val Loss: 1.0597, Val Acc: 34.13%
Epoch 4/5 | Train Loss: 0.6056, Train Acc: 31.57% | Val Loss: 1.0009, Val Acc: 34.97%
Epoch 5/5 | Train Loss: 0.5559, Train Acc: 33.97% | Val Loss: 1.0440, Val Acc: 37.74%


0,1
Epoch,▁▃▅▆█
Train Accuracy,▁▅▇▇█
Train Loss,█▂▂▁▁
Validation Accuracy,▁▃▆▇█
Validation Loss,█▄▂▁▂

0,1
Epoch,5.0
Train Accuracy,33.9678
Train Loss,0.55593
Validation Accuracy,37.73744
Validation Loss,1.04396


[34m[1mwandb[0m: Agent Starting Run: q07egam9 with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beam_width: 1
[34m[1mwandb[0m: 	bidirectional: True
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dec_layers: 1
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	embed_dim: 32
[34m[1mwandb[0m: 	enc_layers: 2
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_dim: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0.7


Epoch 1/10 | Train Loss: 1.8746, Train Acc: 7.25% | Val Loss: 1.2918, Val Acc: 20.78%
Epoch 2/10 | Train Loss: 0.8196, Train Acc: 27.42% | Val Loss: 1.1246, Val Acc: 28.61%
Epoch 3/10 | Train Loss: 0.6593, Train Acc: 34.87% | Val Loss: 1.0380, Val Acc: 36.01%
Epoch 4/10 | Train Loss: 0.5711, Train Acc: 39.14% | Val Loss: 1.0277, Val Acc: 36.38%
Epoch 5/10 | Train Loss: 0.5077, Train Acc: 43.01% | Val Loss: 1.0386, Val Acc: 37.41%
Epoch 6/10 | Train Loss: 0.4586, Train Acc: 46.89% | Val Loss: 1.0244, Val Acc: 36.83%
Epoch 7/10 | Train Loss: 0.4213, Train Acc: 49.27% | Val Loss: 1.0007, Val Acc: 39.26%
Epoch 8/10 | Train Loss: 0.3855, Train Acc: 50.92% | Val Loss: 1.0200, Val Acc: 39.52%
Epoch 9/10 | Train Loss: 0.3476, Train Acc: 53.28% | Val Loss: 1.0164, Val Acc: 39.71%
Epoch 10/10 | Train Loss: 0.3265, Train Acc: 55.26% | Val Loss: 1.0358, Val Acc: 40.26%


0,1
Epoch,▁▂▃▃▄▅▆▆▇█
Train Accuracy,▁▄▅▆▆▇▇▇██
Train Loss,█▃▃▂▂▂▁▁▁▁
Validation Accuracy,▁▄▆▇▇▇████
Validation Loss,█▄▂▂▂▂▁▁▁▂

0,1
Epoch,10.0
Train Accuracy,55.26062
Train Loss,0.32651
Validation Accuracy,40.25735
Validation Loss,1.0358


[34m[1mwandb[0m: Agent Starting Run: dtdg88qk with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beam_width: 1
[34m[1mwandb[0m: 	bidirectional: False
[34m[1mwandb[0m: 	cell_type: RNN
[34m[1mwandb[0m: 	dec_layers: 2
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embed_dim: 128
[34m[1mwandb[0m: 	enc_layers: 1
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_dim: 128
[34m[1mwandb[0m: 	learning_rate: 0.002
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0.2


Epoch 1/10 | Train Loss: 1.6402, Train Acc: 11.08% | Val Loss: 1.2156, Val Acc: 24.07%
Epoch 2/10 | Train Loss: 0.8008, Train Acc: 25.62% | Val Loss: 1.1017, Val Acc: 26.33%
Epoch 3/10 | Train Loss: 0.6868, Train Acc: 30.75% | Val Loss: 1.0964, Val Acc: 30.93%
Epoch 4/10 | Train Loss: 0.6188, Train Acc: 33.41% | Val Loss: 1.0666, Val Acc: 33.94%
Epoch 5/10 | Train Loss: 0.5744, Train Acc: 35.46% | Val Loss: 1.0048, Val Acc: 34.18%
Epoch 6/10 | Train Loss: 0.5393, Train Acc: 37.25% | Val Loss: 1.0089, Val Acc: 36.29%
Epoch 7/10 | Train Loss: 0.5107, Train Acc: 37.95% | Val Loss: 1.0042, Val Acc: 35.89%
Epoch 8/10 | Train Loss: 0.4875, Train Acc: 39.00% | Val Loss: 1.0228, Val Acc: 37.04%
Epoch 9/10 | Train Loss: 0.4663, Train Acc: 41.40% | Val Loss: 1.0105, Val Acc: 36.99%
Epoch 10/10 | Train Loss: 0.4485, Train Acc: 42.45% | Val Loss: 1.0298, Val Acc: 36.24%


0,1
Epoch,▁▂▃▃▄▅▆▆▇█
Train Accuracy,▁▄▅▆▆▇▇▇██
Train Loss,█▃▂▂▂▂▁▁▁▁
Validation Accuracy,▁▂▅▆▆█▇███
Validation Loss,█▄▄▃▁▁▁▂▁▂

0,1
Epoch,10.0
Train Accuracy,42.44584
Train Loss,0.4485
Validation Accuracy,36.23621
Validation Loss,1.02982


[34m[1mwandb[0m: Agent Starting Run: vee7w0lx with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beam_width: 5
[34m[1mwandb[0m: 	bidirectional: False
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dec_layers: 3
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	embed_dim: 64
[34m[1mwandb[0m: 	enc_layers: 2
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_dim: 128
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0.7


Epoch 1/5 | Train Loss: 2.6481, Train Acc: 0.33% | Val Loss: 1.9047, Val Acc: 4.39%
Epoch 2/5 | Train Loss: 1.2001, Train Acc: 14.08% | Val Loss: 1.2800, Val Acc: 22.53%
Epoch 3/5 | Train Loss: 0.8282, Train Acc: 26.16% | Val Loss: 1.1290, Val Acc: 28.84%
Epoch 4/5 | Train Loss: 0.6993, Train Acc: 31.78% | Val Loss: 1.0713, Val Acc: 35.09%
Epoch 5/5 | Train Loss: 0.6195, Train Acc: 36.09% | Val Loss: 1.0057, Val Acc: 34.41%


0,1
Epoch,▁▃▅▆█
Train Accuracy,▁▄▆▇█
Train Loss,█▃▂▁▁
Validation Accuracy,▁▅▇██
Validation Loss,█▃▂▂▁

0,1
Epoch,5.0
Train Accuracy,36.08716
Train Loss,0.61947
Validation Accuracy,34.40564
Validation Loss,1.00571


[34m[1mwandb[0m: Agent Starting Run: ke7lmppq with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beam_width: 3
[34m[1mwandb[0m: 	bidirectional: True
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dec_layers: 1
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	embed_dim: 32
[34m[1mwandb[0m: 	enc_layers: 1
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_dim: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0.7


Epoch 1/5 | Train Loss: 1.2660, Train Acc: 17.72% | Val Loss: 1.1171, Val Acc: 28.11%
Epoch 2/5 | Train Loss: 0.6835, Train Acc: 32.24% | Val Loss: 1.0256, Val Acc: 32.00%
Epoch 3/5 | Train Loss: 0.5727, Train Acc: 37.27% | Val Loss: 1.0223, Val Acc: 35.79%
Epoch 4/5 | Train Loss: 0.5006, Train Acc: 42.30% | Val Loss: 0.9923, Val Acc: 36.83%
Epoch 5/5 | Train Loss: 0.4498, Train Acc: 45.66% | Val Loss: 1.0104, Val Acc: 38.66%


0,1
Epoch,▁▃▅▆█
Train Accuracy,▁▅▆▇█
Train Loss,█▃▂▁▁
Validation Accuracy,▁▄▆▇█
Validation Loss,█▃▃▁▂

0,1
Epoch,5.0
Train Accuracy,45.65834
Train Loss,0.44978
Validation Accuracy,38.66422
Validation Loss,1.01039


[34m[1mwandb[0m: Agent Starting Run: mju5pfjf with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beam_width: 3
[34m[1mwandb[0m: 	bidirectional: True
[34m[1mwandb[0m: 	cell_type: RNN
[34m[1mwandb[0m: 	dec_layers: 2
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	embed_dim: 64
[34m[1mwandb[0m: 	enc_layers: 3
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_dim: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0.7


Epoch 1/5 | Train Loss: 1.8375, Train Acc: 7.11% | Val Loss: 1.2885, Val Acc: 19.95%
Epoch 2/5 | Train Loss: 0.8354, Train Acc: 25.84% | Val Loss: 1.1152, Val Acc: 29.30%
Epoch 3/5 | Train Loss: 0.6492, Train Acc: 35.69% | Val Loss: 1.0744, Val Acc: 29.61%
Epoch 4/5 | Train Loss: 0.5468, Train Acc: 42.33% | Val Loss: 1.0219, Val Acc: 35.48%
Epoch 5/5 | Train Loss: 0.4667, Train Acc: 48.01% | Val Loss: 1.0446, Val Acc: 34.57%


0,1
Epoch,▁▃▅▆█
Train Accuracy,▁▄▆▇█
Train Loss,█▃▂▁▁
Validation Accuracy,▁▅▅██
Validation Loss,█▃▂▁▂

0,1
Epoch,5.0
Train Accuracy,48.00718
Train Loss,0.46671
Validation Accuracy,34.57414
Validation Loss,1.04464


[34m[1mwandb[0m: Agent Starting Run: c1em72jh with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beam_width: 5
[34m[1mwandb[0m: 	bidirectional: False
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dec_layers: 1
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embed_dim: 128
[34m[1mwandb[0m: 	enc_layers: 1
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_dim: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0.2


Epoch 1/5 | Train Loss: 1.2586, Train Acc: 17.57% | Val Loss: 1.1479, Val Acc: 26.56%
Epoch 2/5 | Train Loss: 0.6804, Train Acc: 32.28% | Val Loss: 1.0880, Val Acc: 31.96%
Epoch 3/5 | Train Loss: 0.5753, Train Acc: 35.73% | Val Loss: 1.0008, Val Acc: 36.09%
Epoch 4/5 | Train Loss: 0.5098, Train Acc: 39.93% | Val Loss: 1.0035, Val Acc: 37.71%
Epoch 5/5 | Train Loss: 0.4591, Train Acc: 43.62% | Val Loss: 0.9715, Val Acc: 37.98%


0,1
Epoch,▁▃▅▆█
Train Accuracy,▁▅▆▇█
Train Loss,█▃▂▁▁
Validation Accuracy,▁▄▇██
Validation Loss,█▆▂▂▁

0,1
Epoch,5.0
Train Accuracy,43.61875
Train Loss,0.45914
Validation Accuracy,37.98254
Validation Loss,0.97153


[34m[1mwandb[0m: Agent Starting Run: vwxun292 with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beam_width: 1
[34m[1mwandb[0m: 	bidirectional: False
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dec_layers: 3
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embed_dim: 64
[34m[1mwandb[0m: 	enc_layers: 3
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_dim: 256
[34m[1mwandb[0m: 	learning_rate: 0.002
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0.5


Epoch 1/5 | Train Loss: 2.0255, Train Acc: 4.33% | Val Loss: 1.3897, Val Acc: 17.55%
Epoch 2/5 | Train Loss: 0.9705, Train Acc: 20.21% | Val Loss: 1.1792, Val Acc: 25.99%
Epoch 3/5 | Train Loss: 0.7580, Train Acc: 27.45% | Val Loss: 1.0891, Val Acc: 32.11%
Epoch 4/5 | Train Loss: 0.6562, Train Acc: 33.22% | Val Loss: 1.1030, Val Acc: 32.25%
Epoch 5/5 | Train Loss: 0.5960, Train Acc: 36.32% | Val Loss: 1.0648, Val Acc: 32.95%


0,1
Epoch,▁▃▅▆█
Train Accuracy,▁▄▆▇█
Train Loss,█▃▂▁▁
Validation Accuracy,▁▅███
Validation Loss,█▃▂▂▁

0,1
Epoch,5.0
Train Accuracy,36.31728
Train Loss,0.59603
Validation Accuracy,32.95037
Validation Loss,1.06479


[34m[1mwandb[0m: Agent Starting Run: 2veuibym with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beam_width: 5
[34m[1mwandb[0m: 	bidirectional: False
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dec_layers: 3
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	embed_dim: 32
[34m[1mwandb[0m: 	enc_layers: 1
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_dim: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0.7


Epoch 1/10 | Train Loss: 2.8677, Train Acc: 0.02% | Val Loss: 2.5372, Val Acc: 0.21%
Epoch 2/10 | Train Loss: 1.6705, Train Acc: 5.19% | Val Loss: 1.4857, Val Acc: 14.65%
Epoch 3/10 | Train Loss: 1.0470, Train Acc: 17.54% | Val Loss: 1.2484, Val Acc: 24.09%
Epoch 4/10 | Train Loss: 0.8425, Train Acc: 24.74% | Val Loss: 1.1601, Val Acc: 30.24%
Epoch 5/10 | Train Loss: 0.7250, Train Acc: 29.41% | Val Loss: 1.1045, Val Acc: 31.40%
Epoch 6/10 | Train Loss: 0.6362, Train Acc: 32.99% | Val Loss: 1.1010, Val Acc: 33.49%
Epoch 7/10 | Train Loss: 0.5721, Train Acc: 36.71% | Val Loss: 1.1144, Val Acc: 35.42%
Epoch 8/10 | Train Loss: 0.5289, Train Acc: 39.60% | Val Loss: 1.0881, Val Acc: 34.10%
Epoch 9/10 | Train Loss: 0.4836, Train Acc: 42.21% | Val Loss: 1.0840, Val Acc: 34.34%
Epoch 10/10 | Train Loss: 0.4479, Train Acc: 44.56% | Val Loss: 1.0900, Val Acc: 34.79%


0,1
Epoch,▁▂▃▃▄▅▆▆▇█
Train Accuracy,▁▂▄▅▆▆▇▇██
Train Loss,█▅▃▂▂▂▁▁▁▁
Validation Accuracy,▁▄▆▇▇█████
Validation Loss,█▃▂▁▁▁▁▁▁▁

0,1
Epoch,10.0
Train Accuracy,44.56439
Train Loss,0.44793
Validation Accuracy,34.7886
Validation Loss,1.08995


In [8]:
import torch
import torch.nn as nn
import random
import wandb
import torch.nn.functional as F
import torch.optim as optim

import torch._dynamo
torch._dynamo.config.suppress_errors = True
torch._dynamo.disable()


best_config = {
    "embed_dim": 128,
    "hidden_dim": 256,
    "enc_layers": 3,
    "dec_layers": 3,
    "cell_type": "LSTM",
    "dropout": 0.2,
    "batch_size": 64,
    "bidirectional": False,
    "learning_rate": 0.001,
    "epochs": 10,
    "beam_width":3,
    "teacher_forcing_ratio": 0.5
}

def training_test(best_config):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


    train_path = "/kaggle/input/dakshina-dataset/dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.train.tsv"
    test_path = "/kaggle/input/dakshina-dataset/dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.test.tsv"
    train_set = read_pairs(train_path)
    test_set = read_pairs(test_path)

    # src_vocab, idx2src, tgt_vocab, idx2tgt, create_batch = build_vocab_and_prepare_batch(train_set, device)
    src_vocab, idx2src, tgt_vocab, idx2tgt, create_batch,unique_chars_latin, unique_chars_dev = build_vocab_and_prepare_batch(train_set, device)
    model = TransliterationModelattention(len(src_vocab), len(tgt_vocab), best_config["embed_dim"],best_config["hidden_dim"],
                                 best_config["enc_layers"], best_config["dec_layers"], best_config["cell_type"], best_config["dropout"], best_config["bidirectional"]).to(device)

    optimizer = optim.Adam(model.parameters(), lr=best_config["learning_rate"])
    criterion = nn.CrossEntropyLoss(ignore_index=tgt_vocab['<pad>'])

    for epoch in range(best_config["epochs"]):
        model.train()
        total_loss, total_acc = 0, 0
        random.shuffle(train_set)

        for i in range(0, len(train_set), best_config["batch_size"]):
            batch = train_set[i:i+best_config["batch_size"]]
            src, tgt = create_batch(batch)


            optimizer.zero_grad()
            outputs = model(src, tgt, best_config["teacher_forcing_ratio"])
            loss = criterion(outputs[:, 1:].reshape(-1, outputs.size(-1)), tgt[:, 1:].reshape(-1))

            preds = outputs.argmax(-1)
            acc = compute_word_level_accuracy(preds[:, 1:], tgt[:, 1:], tgt_vocab)

            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            total_acc += acc

        avg_train_loss = total_loss / (len(train_set) // best_config["batch_size"])
        avg_train_acc = total_acc / (len(train_set) // best_config["batch_size"])

        model.eval()
        test_loss, test_acc = 0, 0
        printed = 0
        with torch.no_grad():
            for i in range(0, len(test_set), best_config["batch_size"]):
                batch = test_set[i:i+ best_config["batch_size"]]
                src, tgt = create_batch(batch)
                outputs = model(src, tgt, 0)
                loss = criterion(outputs[:, 1:].reshape(-1, outputs.size(-1)), tgt[:, 1:].reshape(-1))


                preds = outputs.argmax(-1)
                acc = compute_word_level_accuracy(preds[:, 1:], tgt[:, 1:], tgt_vocab)

                test_loss += loss.item()
                test_acc += acc

                if printed < 5:
                    for j in range(min(3, src.size(0))):
                        input_seq = ''.join([idx2src.get(idx.item(), '<unk>') for idx in src[j] if idx.item() not in [src_vocab['<pad>'], src_vocab['<eos>']]])
                        target_seq = ''.join([idx2tgt.get(idx.item(), '<unk>') for idx in tgt[j][1:] if idx.item() not in [tgt_vocab['<pad>'], tgt_vocab['<eos>']]])
                        pred_seq = ''.join([idx2tgt.get(idx.item(), '<unk>') for idx in preds[j][1:] if idx.item() not in [tgt_vocab['<pad>'], tgt_vocab['<eos>']]])
                        print(f"\n Input:{input_seq} | Target:{target_seq} | Predicted:{pred_seq}")
                        print("-" * 40)
                        printed += 1

        avg_test_loss = test_loss / (len(test_set) // best_config["batch_size"])
        avg_test_acc = test_acc / (len(test_set) // best_config["batch_size"])


    print(f" test Loss: {avg_test_loss:.4f}, test Acc: {avg_test_acc:.2f}%")


training_test(best_config)


 Input:ank | Target:अंक | Predicted:अन्क
----------------------------------------

 Input:anka | Target:अंक | Predicted:अन्मा
----------------------------------------

 Input:ankit | Target:अंकित | Predicted:अं्त्त
----------------------------------------

 Input:atthas | Target:अट्टहास | Predicted:अत्त्त
----------------------------------------

 Input:addon | Target:अड्डों | Predicted:अक्ंों
----------------------------------------

 Input:athak | Target:अथक | Predicted:अध्त
----------------------------------------

 Input:ank | Target:अंक | Predicted:अंक
----------------------------------------

 Input:anka | Target:अंक | Predicted:अंका
----------------------------------------

 Input:ankit | Target:अंकित | Predicted:अंकित
----------------------------------------

 Input:atthas | Target:अट्टहास | Predicted:अत्थ
----------------------------------------

 Input:addon | Target:अड्डों | Predicted:एडडों
----------------------------------------

 Input:athak | Target:अथक | Predicted:आठक


In [9]:
best_config = {
    "embed_dim": 128,
    "hidden_dim": 256,
    "enc_layers": 3,
    "dec_layers": 3,
    "cell_type": "LSTM",
    "dropout": 0.2,
    "batch_size": 64,
    "bidirectional": False,
    "learning_rate": 0.001,
    "epochs": 10,
    "beam_width":3,
    "teacher_forcing_ratio": 0.5
}


import torch
import torch.nn as nn
import torch.optim as optim
import os
import csv
import random
import matplotlib.pyplot as plt
import pandas as pd

def training_test(best_config):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    train_path = "/kaggle/input/dakshina-dataset/dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.train.tsv"
    test_path = "/kaggle/input/dakshina-dataset/dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.test.tsv"
    train_set = read_pairs(train_path)
    test_set = read_pairs(test_path)

    src_vocab, idx2src, tgt_vocab, idx2tgt, create_batch, unique_chars_latin, unique_chars_dev = build_vocab_and_prepare_batch(train_set, device)

    model = TransliterationModelattention(
        len(src_vocab), len(tgt_vocab),
        best_config["embed_dim"], best_config["hidden_dim"],
        best_config["enc_layers"], best_config["dec_layers"],
        best_config["cell_type"], best_config["dropout"],
        best_config["bidirectional"]
    ).to(device)

    optimizer = optim.Adam(model.parameters(), lr=best_config["learning_rate"])
    criterion = nn.CrossEntropyLoss(ignore_index=tgt_vocab['<pad>'])

    os.makedirs("predictions_attention", exist_ok=True)

    for epoch in range(best_config["epochs"]):
        model.train()
        total_loss, total_acc = 0, 0
        random.shuffle(train_set)

        for i in range(0, len(train_set), best_config["batch_size"]):
            batch = train_set[i:i + best_config["batch_size"]]
            src, tgt = create_batch(batch)

            optimizer.zero_grad()
            outputs = model(src, tgt, best_config["teacher_forcing_ratio"])
            loss = criterion(outputs[:, 1:].reshape(-1, outputs.size(-1)), tgt[:, 1:].reshape(-1))

            preds = outputs.argmax(-1)
            acc = compute_word_level_accuracy(preds[:, 1:], tgt[:, 1:], tgt_vocab)

            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            total_acc += acc

        avg_train_loss = total_loss / (len(train_set) // best_config["batch_size"])
        avg_train_acc = total_acc / (len(train_set) // best_config["batch_size"])

        model.eval()
        test_loss, test_acc = 0, 0
        predictions = []
        printed = 0
        

        with torch.no_grad():
            for i in range(0, len(test_set), best_config["batch_size"]):
                batch = test_set[i:i + best_config["batch_size"]]
                src, tgt = create_batch(batch)
                outputs = model(src, tgt, 0)

                loss = criterion(outputs[:, 1:].reshape(-1, outputs.size(-1)), tgt[:, 1:].reshape(-1))
                preds = outputs.argmax(-1)
                acc = compute_word_level_accuracy(preds[:, 1:], tgt[:, 1:], tgt_vocab)

                test_loss += loss.item()
                test_acc += acc

                for j in range(src.size(0)):
                    input_seq = ''.join([idx2src.get(idx.item(), '') for idx in src[j] if idx.item() not in [src_vocab['<pad>'], src_vocab['<eos>']]])
                    target_seq = ''.join([idx2tgt.get(idx.item(), '') for idx in tgt[j][1:] if idx.item() not in [tgt_vocab['<pad>'], tgt_vocab['<eos>']]])
                    pred_seq = ''.join([idx2tgt.get(idx.item(), '') for idx in preds[j][1:] if idx.item() not in [tgt_vocab['<pad>'], tgt_vocab['<eos>']]])
                    predictions.append({'Input': input_seq, 'Target': target_seq, 'Predicted': pred_seq})


                    

        avg_test_loss = test_loss / (len(test_set) // best_config["batch_size"])
        avg_test_acc = test_acc / (len(test_set) // best_config["batch_size"])

    # Save all predictions to CSV................................
    os.makedirs("predictions_vanilla", exist_ok=True)
    with open("predictions_vanilla/test_predictions.csv", "w", newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=['Input', 'Target', 'Predicted'])
        writer.writeheader()
        writer.writerows(predictions)
        
   # Print 3x3 sample grid....................
    sample_df = pd.DataFrame(predictions[:9])
    print("\nSample Prediction Grid:\n")
    for i, row in sample_df.iterrows():
        print(f"{i+1}. Input: {row['Input']} | Target: {row['Target']} | Predicted: {row['Predicted']}")
        

training_test(best_config)


Sample Prediction Grid:

1. Input: ank | Target: अंक | Predicted: आंक
2. Input: anka | Target: अंक | Predicted: अंका
3. Input: ankit | Target: अंकित | Predicted: अंकीत
4. Input: anakon | Target: अंकों | Predicted: अनाकों
5. Input: ankhon | Target: अंकों | Predicted: अंखों
6. Input: ankon | Target: अंकों | Predicted: अंकों
7. Input: angkor | Target: अंकोर | Predicted: अंगकोर
8. Input: ankor | Target: अंकोर | Predicted: एंकोर
9. Input: angaarak | Target: अंगारक | Predicted: अंगारक


Heatmap................................................

In [1]:
import torch
import torch.nn as nn
import random
import torch.nn.functional as F
import wandb
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence

wandb.login(key='b5d1fbca9d5170f54415e9c5a70ef09cee7a0aec')
class InputEncoder(nn.Module):
    def __init__(self, vocab_size, embedding_size, hidden_size, layers, rnn_type='LSTM', dropout_rate=0.2, is_bidirectional=False):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_size, padding_idx=0)
        rnn_class = {'RNN': nn.RNN, 'LSTM': nn.LSTM, 'GRU': nn.GRU}[rnn_type]
        self.rnn = rnn_class(embedding_size, hidden_size, layers, dropout=dropout_rate, batch_first=True, bidirectional=is_bidirectional)
        self.is_bidirectional = is_bidirectional
        self.rnn_type = rnn_type
        self.hidden_size = hidden_size
        self.num_layers = layers

    def forward(self, x):
        embedded = self.embedding(x)
        output, hidden = self.rnn(embedded)  # output: [B, T, H*num_directions]
        return output, hidden  # Return all outputs for attention


class Attention(nn.Module):
    def __init__(self, hidden_size):
        super().__init__()
        self.attn = nn.Linear(hidden_size * 2, hidden_size)
        self.v = nn.Parameter(torch.rand(hidden_size))

    def forward(self, hidden, encoder_outputs, mask=None):
        # hidden: [B, H], encoder_outputs: [B, T, H]
        B, T, H = encoder_outputs.shape
        hidden = hidden.unsqueeze(1).repeat(1, T, 1)  # [B, T, H]

        energy = torch.tanh(self.attn(torch.cat((hidden, encoder_outputs), dim=2)))  # [B, T, H]
        energy = energy @ self.v  # [B, T]

        if mask is not None:
            energy = energy.masked_fill(mask == 0, float('-inf')) 

        return F.softmax(energy, dim=1)  # [B, T]


class OutputDecoder(nn.Module):
    def __init__(self, vocab_size, embedding_size, hidden_size, layers, rnn_type='LSTM', dropout_rate=0.2, is_bidirectional=False, return_attention=True):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_size, padding_idx=0)
        
        rnn_class = {'RNN': nn.RNN, 'LSTM': nn.LSTM, 'GRU': nn.GRU}[rnn_type]
        self.rnn = rnn_class(embedding_size + hidden_size, hidden_size, layers, dropout=dropout_rate, batch_first=True)
        self.output_layer = nn.Linear(hidden_size, vocab_size)
        self.attention = Attention(hidden_size)  # Using the updated attention mechanism
        self.rnn_type = rnn_type
        self.hidden_size = hidden_size
        self.num_layers = layers
        self.return_attention = return_attention

    def forward(self, token, hidden, encoder_outputs, mask=None):
        token = token.unsqueeze(1)  # [B, 1]
        embedded = self.embedding(token)  # [B, 1, E]
        
        # Getting the last hidden state from the encoder
        if self.rnn_type == 'LSTM':
            h = hidden[0][-1]  # last layer hidden state
        else:
            h = hidden[-1]  # last layer hidden state

        # Applying attention mechanism with optional mask
        attn_weights = self.attention(h, encoder_outputs, mask)  # [B, T]
        attn_applied = torch.bmm(attn_weights.unsqueeze(1), encoder_outputs)  # [B, 1, H]

        # Concatenating the attention-applied vector with the embedded token
        rnn_input = torch.cat((embedded, attn_applied), dim=2)  # [B, 1, E+H]
        
        # Passing the concatenated input through the RNN
        output, hidden = self.rnn(rnn_input, hidden)
        
        # Passing the output through the output layer
        output = self.output_layer(output.squeeze(1))  # [B, vocab_size]

        # Returning the output, hidden state, and attention weights (if needed)
        if self.return_attention:
            return output, hidden, attn_weights
        return output, hidden


class TransliterationModelattention(nn.Module):
    def __init__(self, input_vocab_size, output_vocab_size, embedding_size, hidden_size, enc_layers, dec_layers,
                 rnn_type='LSTM', dropout_rate=0.2, is_bidirectional=False):
        super().__init__()
        self.encoder = InputEncoder(input_vocab_size, embedding_size, hidden_size, enc_layers, rnn_type, dropout_rate, is_bidirectional)
        self.decoder = OutputDecoder(output_vocab_size, embedding_size, hidden_size * (2 if is_bidirectional else 1),
                                     dec_layers, rnn_type, dropout_rate, is_bidirectional=False, return_attention= True)  # decoder not bidirectional
        self.rnn_type = rnn_type
        self.hidden_size = hidden_size
        self.is_bidirectional = is_bidirectional


    def forward(self, source, target, teacher_forcing_prob=0.5,mask = None):
        batch_size, target_len = target.size()
        output_vocab_size = self.decoder.output_layer.out_features
        predictions = torch.zeros(batch_size, target_len, output_vocab_size, device=source.device)
        all_attentions = torch.zeros(batch_size, target_len, source.size(1), device=source.device)

        encoder_outputs, encoder_hidden = self.encoder(source)

        def merge_bidirectional(state):
            return torch.cat([state[i::2] for i in range(2)], dim=2)

        def match_layers(state, required_layers):
            actual_layers = state.size(0)
            if actual_layers == required_layers:
                return state
            elif actual_layers > required_layers:
                return state[:required_layers]
            else:
                pad = torch.zeros(required_layers - actual_layers, *state.shape[1:], device=state.device)
                return torch.cat([state, pad], dim=0)

        if self.rnn_type == 'LSTM':
            h, c = encoder_hidden
            if self.encoder.is_bidirectional:
                h, c = merge_bidirectional(h), merge_bidirectional(c)
            h = match_layers(h, self.decoder.rnn.num_layers)
            c = match_layers(c, self.decoder.rnn.num_layers)
            decoder_hidden = (h, c)
        else:
            h = encoder_hidden
            if self.encoder.is_bidirectional:
                h = merge_bidirectional(h)
            h = match_layers(h, self.decoder.rnn.num_layers)
            decoder_hidden = h

        decoder_input = target[:, 0]
        for t in range(1, target_len):
            output, decoder_hidden, attn_weights = self.decoder(decoder_input, decoder_hidden, encoder_outputs,mask)
            predictions[:, t] = output
            all_attentions[:, t] = attn_weights  # store attention weights for step t
            top1 = output.argmax(1)
            decoder_input = target[:, t] if random.random() < teacher_forcing_prob else top1

        return predictions, all_attentions
        
def build_vocab_and_prepare_batch(seqs, device):
    special_tokens = {'<pad>': 0, '<sos>': 1, '<eos>': 2, '<unk>': 3}
    unique_chars_latin = sorted(set(ch for seq in seqs for ch in seq[0]))
    unique_chars_dev = sorted(set(ch for seq in seqs for ch in seq[1]))

    src_vocab = {ch: idx+len(special_tokens) for idx, ch in enumerate(unique_chars_latin)}
    src_vocab.update(special_tokens)
    tgt_vocab = {ch: idx+len(special_tokens) for idx, ch in enumerate(unique_chars_dev)}
    tgt_vocab.update(special_tokens)
    idx2src = {idx: ch for ch, idx in src_vocab.items()}
    idx2tgt = {idx: ch for ch, idx in tgt_vocab.items()}

    def encode_text(seq, vocab):
        return [vocab.get(ch, vocab['<unk>']) for ch in seq]

    def create_batch(pairs):
        src = [torch.tensor(encode_text(x, src_vocab) + [src_vocab['<eos>']]) for x, _ in pairs]
        tgt = [torch.tensor([tgt_vocab['<sos>']] + encode_text(y, tgt_vocab) + [tgt_vocab['<eos>']]) for _, y in pairs]

        src = pad_sequence(src, batch_first=True, padding_value=src_vocab['<pad>'])
        tgt = pad_sequence(tgt, batch_first=True, padding_value=tgt_vocab['<pad>'])


def plot_attention_grid(attentions, src_tokens, tgt_tokens, idx2src, idx2tgt, src_vocab, tgt_vocab):
    font_path = "/kaggle/input/noto-sans/static/NotoSansDevanagari-Regular.ttf"
    devanagari_font = fm.FontProperties(fname=font_path)

    fig, axes = plt.subplots(3, 3, figsize=(15, 12))
    axes = axes.flatten()
    for i in range(9):
        ax = axes[i]
        attn = attentions[i].cpu().numpy()

        # Mask out invalid values (e.g., -inf)
        mask = attn == -float('inf')
        attn[mask] = 0  # Replace invalid values with zeros

        input_chars = [idx2src[idx.item()] for idx in src_tokens[i] 
                       if idx.item() not in [src_vocab['<pad>'], src_vocab['<eos>'], src_vocab['<sos>']]]
        target_chars = [idx2tgt[idx.item()] for idx in tgt_tokens[i] 
                        if idx.item() not in [tgt_vocab['<pad>'], tgt_vocab['<eos>'], tgt_vocab['<sos>']]]

        # sns.heatmap(
        #     attn[1:len(target_chars)+1, :len(input_chars)],
        #     xticklabels=input_chars,
        #     yticklabels=target_chars,

        #     cmap="viridis",
        #     ax=ax,
        #     cbar=False
        # )

        ax.set_xlabel("Source (Latin)", fontproperties=devanagari_font)
        ax.set_ylabel("Target (Devanagari)", fontproperties=devanagari_font)
        ax.set_title(f"Sample {i+1}", fontproperties=devanagari_font)

        for label in ax.get_xticklabels():
            label.set_fontproperties(devanagari_font)
        for label in ax.get_yticklabels():
            label.set_fontproperties(devanagari_font)

        ax.tick_params(axis='x', labelrotation=90)

    plt.tight_layout()
    os.makedirs("plots", exist_ok=True)
    # plt.savefig("plots/attention_heatmap_grid.png", bbox_inches="tight")
    # plt.show()

import os
import csv
import random
import torch
import torch.optim as optim
import torch.nn as nn

# Define the best_config
best_config = {
    "embed_dim": 128,
    "hidden_dim": 256,
    "enc_layers": 3,
    "dec_layers": 3,
    "cell_type": "LSTM",
    "dropout": 0.2,
    "batch_size": 64,
    "bidirectional": False,
    "learning_rate": 0.001,
    "epochs": 10,
    "teacher_forcing_ratio": 0.5
}

# Create the folder if it doesn't exist
os.makedirs("predictions_attention", exist_ok=True)

def training_test(best_config):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    train_path = "/kaggle/input/dakshina-dataset/dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.train.tsv"
    test_path = "/kaggle/input/dakshina-dataset/dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.test.tsv"
    train_set = read_pairs(train_path)
    test_set = read_pairs(test_path)

    # Prepare vocabulary and batch creation
    src_vocab, idx2src, tgt_vocab, idx2tgt, create_batch, unique_chars_latin, unique_chars_dev = build_vocab_and_prepare_batch(train_set, device)
    
    # Initialize the model
    model = TransliterationModelattention(len(src_vocab), len(tgt_vocab), best_config["embed_dim"], best_config["hidden_dim"],
                                         best_config["enc_layers"], best_config["dec_layers"], best_config["cell_type"], best_config["dropout"], best_config["bidirectional"]).to(device)

    optimizer = optim.Adam(model.parameters(), lr=best_config["learning_rate"])
    criterion = nn.CrossEntropyLoss(ignore_index=tgt_vocab['<pad>'])

    # Open CSV file to save predictions
    with open("predictions_attention/predictions.csv", "w", newline='', encoding="utf-8") as csvfile:
        fieldnames = ['Input', 'Target', 'Predicted']  # CSV headers
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

        writer.writeheader()

        for epoch in range(best_config["epochs"]):
            model.train()
            total_loss, total_acc = 0, 0
            random.shuffle(train_set)
        

            for i in range(0, len(train_set), best_config["batch_size"]):
                batch = train_set[i:i + best_config["batch_size"]]
                src, tgt = create_batch(batch)
                mask = (src != 0).float() 
                optimizer.zero_grad()
                outputs,_ = model(src, tgt, best_config["teacher_forcing_ratio"],mask)
                loss = criterion(outputs[:, 1:].reshape(-1, outputs.size(-1)), tgt[:, 1:].reshape(-1))

                preds = outputs.argmax(-1)
                acc = compute_word_level_accuracy(preds[:, 1:], tgt[:, 1:], tgt_vocab)

                loss.backward()
                optimizer.step()
                total_loss += loss.item()
                total_acc += acc

            avg_train_loss = total_loss / (len(train_set) // best_config["batch_size"])
            avg_train_acc = total_acc / (len(train_set) // best_config["batch_size"])

            model.eval()
            test_loss, test_acc = 0, 0
            printed = 0
            with torch.no_grad():
                for i in range(0, len(test_set), best_config["batch_size"]):
                    batch = test_set[i:i + best_config["batch_size"]]
                    src, tgt = create_batch(batch)
                    outputs,attn_weights = model(src, tgt, 0)

                    # if printed == 0:
                    #     # Save first 9 samples for heatmap
                    #     plot_attention_grid(attn_weights[:9], src[:9], tgt[:9], idx2src, idx2tgt)
                    #     printed += 9

                    loss = criterion(outputs[:, 1:].reshape(-1, outputs.size(-1)), tgt[:, 1:].reshape(-1))

                    preds = outputs.argmax(-1)
                    acc = compute_word_level_accuracy(preds[:, 1:], tgt[:, 1:], tgt_vocab)

                    test_loss += loss.item()
                    test_acc += acc

            # # Save predictions to

            avg_test_loss = test_loss / (len(test_set) // best_config["batch_size"])
            avg_test_acc = test_acc / (len(test_set) // best_config["batch_size"])
            # Save predictions to CSV file
            for j in range(min(4, src.size(0))):  # Print the first 3 samples
                input_seq = ''.join([idx2src.get(idx.item(), '<unk>') for idx in src[j] if idx.item() not in [src_vocab['<pad>'], src_vocab['<eos>'], src_vocab['<sos>']]])
                target_seq = ''.join([idx2tgt.get(idx.item(), '<unk>') for idx in tgt[j][1:] if idx.item() not in [tgt_vocab['<pad>'], tgt_vocab['<eos>'],src_vocab['<sos>']]])
                pred_seq = ''.join([idx2tgt.get(idx.item(), '<unk>') for idx in preds[j][1:] if idx.item() not in [tgt_vocab['<pad>'], tgt_vocab['<eos>'],src_vocab['<sos>']]])
    
                # Write each row to the CSV file
                writer.writerow({'Input': input_seq, 'Target': target_seq, 'Predicted': pred_seq})
                print({'Input': input_seq, 'Target': target_seq, 'Predicted': pred_seq})
        plot_attention_grid(attn_weights[:9], src[:9], tgt[:9], idx2src, idx2tgt,src_vocab,tgt_vocab)
        

    print(f"Test Loss: {avg_test_loss:.4f}, Test Accuracy: {avg_test_acc:.2f}%")

# Call the training_test function
training_test(best_config)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mma23m021[0m ([33mma23m021-iit-madras[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
