In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import numpy as np
import pandas as pd
import random
import wandb
from tqdm import tqdm
import os
import math

import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap
import matplotlib.font_manager as fm

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [4]:
wandb.login(key= '1b5f670bdb4b8ed39a9bc34744dd738c9b33dede')

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mcs24m022[0m ([33mcs24m022-iit-madras-foundation[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

## Get Data

In [None]:
# # My PC :- Used for initial experiments
# TRAIN_FilePath = 'dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.train.tsv'
# DEV_FilePath = 'dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.dev.tsv'
# TEST_FilePath = 'dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.test.tsv'

In [3]:
# # COLAB :- Used to check if code if gpu supporting and run experiments
# from google.colab import drive
# drive.mount('/content/drive')
# TRAIN_FilePath = '/content/drive/My Drive/hi/lexicons/hi.translit.sampled.train.tsv'
# DEV_FilePath = '/content/drive/My Drive/hi/lexicons/hi.translit.sampled.dev.tsv'
# TEST_FilePath = '/content/drive/My Drive/hi/lexicons/hi.translit.sampled.test.tsv'

Mounted at /content/drive


In [5]:
# KAGGLE :- Used for hyperparameter tuning
TRAIN_FilePath = "/kaggle/input/dakshina22/hi/lexicons/hi.translit.sampled.train.tsv"
DEV_FilePath = "/kaggle/input/dakshina22/hi/lexicons/hi.translit.sampled.dev.tsv"
TEST_FilePath = "/kaggle/input/dakshina22/hi/lexicons/hi.translit.sampled.test.tsv"

## Data Loading and Preprocessing

In [6]:
# Load train, dev and test datasets
train_df = pd.read_csv(TRAIN_FilePath, sep='\t', header=None)
dev_df = pd.read_csv(DEV_FilePath, sep='\t', header=None)
test_df = pd.read_csv(TEST_FilePath, sep='\t', header=None)


# Renaming the columns
train_df.columns = ['devanagari', 'latin', 'frequency']
dev_df.columns = ['devanagari', 'latin', 'frequency']
test_df.columns = ['devanagari', 'latin', 'frequency']

# Dataset Sizes
print(f"Train Dataset Size : {train_df.shape[0]}\nDev Dataset Size   : {dev_df.shape[0]}\nTest Dataset Size  : {test_df.shape[0]}")

Train Dataset Size : 44204
Dev Dataset Size   : 4358
Test Dataset Size  : 4502


In [7]:
train_df.head()

Unnamed: 0,devanagari,latin,frequency
0,अं,an,3
1,अंकगणित,ankganit,3
2,अंकल,uncle,4
3,अंकुर,ankur,4
4,अंकुरण,ankuran,3


In [8]:
class Vocabulary:
    def __init__(self):
        self.pad_token = "<pad>"
        self.sos_token = "<sos>"
        self.eos_token = "<eos>"
        self.unk_token = "<unk>"

        # Initialize mappings
        self.char2idx = {self.pad_token: 0, self.sos_token: 1, self.eos_token: 2, self.unk_token: 3}
        self.idx2char = {0: self.pad_token, 1: self.sos_token, 2: self.eos_token, 3: self.unk_token}
        self.vocab_size = 4

    def build_vocabulary(self, text_data):
        for text in text_data:
            text = str(text)
            for char in text:
                if char not in self.char2idx:
                    self.char2idx[char] = self.vocab_size
                    self.idx2char[self.vocab_size] = char
                    self.vocab_size += 1

    def encode(self, text, add_special_tokens=True):
        indices = []
        text = str(text)
        for char in text:
            indices.append(self.char2idx.get(char, self.char2idx[self.unk_token]))

        if add_special_tokens:
            indices = [self.char2idx[self.sos_token]] + indices + [self.char2idx[self.eos_token]]

        return indices

    def decode(self, indices, remove_special_tokens=True):
        chars = []
        keys = list(self.idx2char.keys())
        for idx in indices:
            if isinstance(idx, torch.Tensor):
                idx = idx.item()
            if idx in keys:
                char = self.idx2char[idx]
                if remove_special_tokens and char in [self.pad_token, self.sos_token, self.eos_token, self.unk_token]:
                    continue
                chars.append(char)

        return "".join(chars)


In [9]:
class TransliterationDataset(Dataset):
    def __init__(self, data_path, src_vocab, tgt_vocab):
        df = pd.read_csv(data_path, sep='\t', header=None)

        # Create Dataset
        self.source_sequences = []
        self.target_sequences = []

        for idx, row in df.iterrows():
            x_seq = src_vocab.encode(row[1])
            y_seq = tgt_vocab.encode(row[0])
            self.source_sequences.append(x_seq)
            self.target_sequences.append(y_seq)

    def __len__(self):
        return len(self.source_sequences)

    def __getitem__(self, idx):
        return torch.tensor(self.source_sequences[idx], dtype=torch.long), torch.tensor(self.target_sequences[idx], dtype=torch.long)


In [10]:
def collate_fn(batch):
    src_batch = [item[0] for item in batch]
    tgt_batch = [item[1] for item in batch]

    # Pad sequences
    src_batch_padded = pad_sequence(src_batch, batch_first=True, padding_value=0)
    tgt_batch_padded = pad_sequence(tgt_batch, batch_first=True, padding_value=0)

    return src_batch_padded, tgt_batch_padded

In [11]:
# Build Source and Target Vocabularies
src_vocab = Vocabulary()
tgt_vocab = Vocabulary()

train_df = pd.read_csv(TRAIN_FilePath, sep='\t', header=None)
src_text = []
tgt_text = []
for idx, row in train_df.iterrows():
    src_text.append(row[1])
    tgt_text.append(row[0])

src_vocab.build_vocabulary(src_text)
tgt_vocab.build_vocabulary(tgt_text)

In [12]:
# Train, Test and Dev (Validation) Dataset and Dataloaders
train_dataset = TransliterationDataset(TRAIN_FilePath, src_vocab, tgt_vocab)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)

dev_dataset = TransliterationDataset(DEV_FilePath, src_vocab, tgt_vocab)
dev_loader = DataLoader(dev_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)

test_dataset = TransliterationDataset(TEST_FilePath, src_vocab, tgt_vocab)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)


In [13]:
# Inspect if dataloader is created as desired
for batch in train_loader:
    inputs, targets = batch

    print("Inputs shape:", inputs.shape)
    print("Targets shape:", targets.shape)

    # Check one sample
    print("Sample input:", inputs[0])
    print("Sample target:", targets[0])
    print(src_vocab.decode(inputs[0]))
    print(tgt_vocab.decode(targets[0]))
    break

Inputs shape: torch.Size([32, 14])
Targets shape: torch.Size([32, 13])
Sample input: tensor([ 1,  5,  8, 17,  4,  4,  5,  2,  0,  0,  0,  0,  0,  0])
Sample target: tensor([ 1, 16,  9, 15, 21, 16,  2,  0,  0,  0,  0,  0,  0])
nidaan
निदान


## Vanilla Model

In [15]:
class Encoder(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size, num_layers=1, cell_type="RNN", dropout=0.0):
        super().__init__()

        if num_layers == 1:
            dropout = 0

        self.cell_type = cell_type

        # Embedding Layer
        self.embedding = nn.Embedding(input_size, embedding_size)

        # Recurrent Layer
        if cell_type == "LSTM":
            self.recurrent_layer = nn.LSTM(embedding_size, hidden_size, num_layers, dropout=dropout, batch_first=True)
        elif cell_type == "GRU":
            self.recurrent_layer = nn.GRU(embedding_size, hidden_size, num_layers, dropout=dropout, batch_first=True)
        else: # Default (RNN)
            self.recurrent_layer = nn.RNN(embedding_size, hidden_size, num_layers, dropout=dropout, batch_first=True)

        # Dropout layer
        self.dropout = nn.Dropout(dropout)

    def forward(self, input):
        # input: batch_size x seq_len

        embeddings = self.embedding(input) # (batch, seq_len, embedding_size)
        embeddings = self.dropout(embeddings)

        if self.cell_type == 'LSTM':
            outputs, (hidden, cell) = self.recurrent_layer(embeddings)
            return outputs, (hidden, cell)
        else:
            outputs, hidden = self.recurrent_layer(embeddings)
            return outputs, hidden


In [16]:
class Decoder(nn.Module):
    def __init__(self, output_size, embedding_size, hidden_size, num_layers=1, cell_type="RNN", dropout=0.0):
        super().__init__()

        if num_layers == 1:
            dropout = 0

        self.output_size = output_size
        self.cell_type = cell_type

        # Embedding Layer
        self.embedding = nn.Embedding(output_size, embedding_size)

        # Dropout layer
        self.dropout = nn.Dropout(dropout)

        # Recurrent Layer
        if cell_type == "LSTM":
            self.recurrent_layer = nn.LSTM(embedding_size, hidden_size, num_layers, dropout=dropout, batch_first=True)
        elif cell_type == "GRU":
            self.recurrent_layer = nn.GRU(embedding_size, hidden_size, num_layers, dropout=dropout, batch_first=True)
        else: # Default (RNN)
            self.recurrent_layer = nn.RNN(embedding_size, hidden_size, num_layers, dropout=dropout, batch_first=True)

        # Output layer
        self.fc_out = nn.Linear(hidden_size, output_size)

    def forward(self, input, hidden):

        input = input.unsqueeze(1)
        embeddings = self.embedding(input)
        embeddings = self.dropout(embeddings)

        if self.cell_type == "LSTM":
            hidden, cell = hidden
            outputs, (hidden, cell) = self.recurrent_layer(embeddings, (hidden, cell))
            hidden = (hidden, cell)
        else:
            outputs, hidden = self.recurrent_layer(embeddings, hidden)

        outputs = outputs.squeeze(1)
        prediction = self.fc_out(outputs)
        return prediction, hidden


In [17]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, tgt, teacher_forcing_ratio=0.5):

        batch_size = src.shape[0]
        tgt_len = tgt.shape[1]
        tgt_vocab_size = self.decoder.output_size

        outputs = torch.zeros(batch_size, tgt_len, tgt_vocab_size).to(self.device)

        if self.encoder.cell_type == 'LSTM':
            encoder_outputs, (hidden, cell) = self.encoder(src)
            decoder_hidden = (hidden, cell)
        else:
            encoder_outputs, hidden = self.encoder(src)
            decoder_hidden = hidden

        decoder_input = tgt[:, 0]
        for t in range(1, tgt_len):
            decoder_output, decoder_hidden = self.decoder(decoder_input, decoder_hidden)
            #print(decoder_output.shape)
            #break
            outputs[:, t] = decoder_output
            top = decoder_output.argmax(1)
            decoder_input = tgt[:, t] if random.random() < teacher_forcing_ratio else top
        return outputs


    def inference(self, src, max_len, sos_idx=1, eos_idx=2):

        batch_size = src.shape[0]
        tgt_len = max_len
        tgt_vocab_size = self.decoder.output_size
        outputs = torch.zeros(batch_size, tgt_len, tgt_vocab_size).to(self.device)

        if self.encoder.cell_type == 'LSTM':
            encoder_outputs, (hidden, cell) = self.encoder(src)
            decoder_hidden = (hidden, cell)
        else:
            encoder_outputs, hidden = self.encoder(src)
            decoder_hidden = hidden

        decoder_input = torch.tensor([sos_idx] * batch_size, device=self.device)
        for t in range(1, max_len):
            decoder_output, decoder_hidden = self.decoder(decoder_input, decoder_hidden)
            outputs[:, t] = decoder_output
            top = decoder_output.argmax(1)
            decoder_input = top

            # Check if all sequences have reached <eos>
            if (outputs == eos_idx).any(dim=1).all():
                break
        return outputs

## Training and Evaluation of Vanilla Model

In [18]:
def train(model, train_loader, optimizer, criterion, clip=1.0, teacher_forcing_ratio=0.5):
    model.train()
    epoch_loss = 0

    for i, (src, tgt) in enumerate(train_loader):
        src = src.to(device)
        tgt = tgt.to(device)
        #print("tgt", tgt)
        optimizer.zero_grad()

        output = model(src, tgt, teacher_forcing_ratio)
        #print("output", output)
        output_dim = output.shape[-1]
        output = output[:, 1:].reshape(-1, output_dim)
        tgt = tgt[:, 1:].reshape(-1)
        #print("tgt2", tgt)
        #print("output2", output)
        #break

        # Calculate loss
        loss = criterion(output, tgt)

        # Backpropagation
        loss.backward()

        # Gradient clipping
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

        # Update parameters
        optimizer.step()

        epoch_loss += loss.item()

    return epoch_loss / len(train_loader)

In [19]:
def evaluate(model, val_loader, criterion):
    model.eval()
    epoch_loss = 0

    with torch.no_grad():
        for i, (src, tgt) in enumerate(val_loader):
            src = src.to(device)
            tgt = tgt.to(device)

            output = model.inference(src, tgt.shape[1])

            # Reshape output and target for loss calculation
            output_dim = output.shape[-1]
            output = output[:, 1:].reshape(-1, output_dim)
            tgt = tgt[:, 1:].reshape(-1)

            # Calculate loss
            loss = criterion(output, tgt)
            epoch_loss += loss.item()

    return epoch_loss / len(val_loader)

In [20]:
def transliterate(model, src_text, src_vocab, tgt_vocab, device, max_length=100):
    model.eval()

    # Convert source text to tensor
    src_indices = src_vocab.encode(src_text)
    src_tensor = torch.tensor(src_indices, dtype=torch.long).unsqueeze(0).to(device)

    # Get encoder outputs
    with torch.no_grad():
        if model.encoder.cell_type == 'LSTM':
            encoder_outputs, (hidden, cell) = model.encoder(src_tensor)
            decoder_hidden = (hidden, cell)
        else:
            encoder_outputs, hidden = model.encoder(src_tensor)
            decoder_hidden = hidden

    # Start with SOS token
    decoder_input = torch.tensor([tgt_vocab.char2idx[tgt_vocab.sos_token]], device=device)

    result_indices = [tgt_vocab.char2idx[tgt_vocab.sos_token]]

    for _ in range(max_length):
        with torch.no_grad():
            decoder_output, decoder_hidden = model.decoder(decoder_input, decoder_hidden)

        # Get the most likely next character
        top_token = decoder_output.argmax(1).item()
        result_indices.append(top_token)

        # Stop if EOS token
        if top_token == tgt_vocab.char2idx[tgt_vocab.eos_token]:
            break

        # Use predicted token as next input
        decoder_input = torch.tensor([top_token], device=device)

    # Convert indices to text
    result_text = tgt_vocab.decode(result_indices, remove_special_tokens=True)

    return result_text

In [21]:
# Accuracy calculation function
def calculate_accuracy(model, data_loader, src_vocab, tgt_vocab, device):
    model.eval()
    correct = 0
    total = 0

    with torch.no_grad():
        for src, tgt in data_loader:
            src = src.to(device)
            tgt = tgt.to(device)

            batch_size = src.shape[0]

            for i in range(batch_size):
                # Get source text and actual target text
                src_indices = src[i].tolist()
                src_text = src_vocab.decode(src_indices)
                actual_tgt_text = tgt_vocab.decode(tgt[i].tolist())

                # Get predicted transliteration
                predicted_tgt_text = transliterate(model, src_text, src_vocab, tgt_vocab, device)

                # Check if prediction matches
                if predicted_tgt_text == actual_tgt_text:
                    correct += 1
                total += 1

    return correct / total

## Sample Run of Vanilla Model

In [None]:
# INPUT_SIZE = src_vocab.vocab_size
# OUTPUT_SIZE = tgt_vocab.vocab_size
# EMBEDDING_SIZE = 256
# HIDDEN_SIZE = 512
# NUM_LAYERS = 2
# CELL_TYPE = "LSTM"
# DROPOUT = 0.2
# LEARNING_RATE = 0.001
# NUM_EPOCHS = 10

# # Initialize encoder, decoder, and seq2seq model
# encoder = Encoder(
#     input_size=INPUT_SIZE,
#     embedding_size=EMBEDDING_SIZE,
#     hidden_size=HIDDEN_SIZE,
#     num_layers=NUM_LAYERS,
#     cell_type=CELL_TYPE,
#     dropout=DROPOUT
# )

# decoder = Decoder(
#     output_size=OUTPUT_SIZE,
#     embedding_size=EMBEDDING_SIZE,
#     hidden_size=HIDDEN_SIZE,
#     num_layers=NUM_LAYERS,
#     cell_type=CELL_TYPE,
#     dropout=DROPOUT
# )

# model = Seq2Seq(encoder, decoder, device).to(device)
# criterion = nn.CrossEntropyLoss(ignore_index=0)
# optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [None]:
# # Training loop
# print(f"Starting training for {NUM_EPOCHS} epochs...")

# for epoch in range(NUM_EPOCHS):
#     print(f"Epoch {epoch+1}/{NUM_EPOCHS}")

#     # Train model
#     train_loss = train(model, train_loader, optimizer, criterion)

#     # Evaluate model
#     valid_loss = evaluate(model, dev_loader, criterion)

#     print(f"Train Loss: {train_loss:.4f} | Valid Loss: {valid_loss:.4f}")
#     print("="*20)

Starting training for 10 epochs...
Epoch 1/10
Train Loss: 1.6911 | Valid Loss: 1.2774
Epoch 2/10
Train Loss: 0.7447 | Valid Loss: 1.0841
Epoch 3/10
Train Loss: 0.5753 | Valid Loss: 1.0521
Epoch 4/10
Train Loss: 0.4678 | Valid Loss: 1.0716
Epoch 5/10
Train Loss: 0.3957 | Valid Loss: 1.0537
Epoch 6/10
Train Loss: 0.3445 | Valid Loss: 1.0727
Epoch 7/10
Train Loss: 0.3004 | Valid Loss: 1.1386
Epoch 8/10
Train Loss: 0.2758 | Valid Loss: 1.1567
Epoch 9/10
Train Loss: 0.2444 | Valid Loss: 1.2164
Epoch 10/10
Train Loss: 0.2251 | Valid Loss: 1.2348


In [None]:
# val_accuracy = calculate_accuracy(model, dev_loader, src_vocab, tgt_vocab, device)
# print(f"\nValidation Accuracy: {val_accuracy:.4f}")


Validation Accuracy: 0.3502


## Hyperparameter Tuning of Vanilla Model

In [24]:
def sweep_hyperparameters(config=None):
    with wandb.init(config=config):
        config = wandb.config
        wandb.run.name = f"embedding_size_{str(config.embedding_size)}_num_layers_{str(config.num_layers)}_hidden_size_{config.hidden_size} \
                          _cell_type_{config.cell_type}_dp_{config.dropout}_lr_{config.learning_rate}_batch_{config.batch_size}"

        # Log in my details
        wandb.config.update({"NAME": "KILAPARTHI VISHNU VARDHAN", "ROLL NO.": "CS24M022"})

        train_dataset = TransliterationDataset(TRAIN_FilePath, src_vocab, tgt_vocab)
        train_loader = DataLoader(train_dataset, batch_size=config.batch_size, shuffle=True, collate_fn=collate_fn)

        dev_dataset = TransliterationDataset(DEV_FilePath, src_vocab, tgt_vocab)
        dev_loader = DataLoader(dev_dataset, batch_size=config.batch_size, shuffle=False, collate_fn=collate_fn)

        INPUT_SIZE = src_vocab.vocab_size
        OUTPUT_SIZE = tgt_vocab.vocab_size

        EMBEDDING_SIZE = config.embedding_size
        HIDDEN_SIZE = config.hidden_size
        NUM_LAYERS = config.num_layers
        CELL_TYPE = config.cell_type
        DROPOUT = config.dropout
        LEARNING_RATE = config.learning_rate
        NUM_EPOCHS = 10

        # Initialize encoder, decoder, and seq2seq model
        encoder = Encoder(
            input_size=INPUT_SIZE,
            embedding_size=EMBEDDING_SIZE,
            hidden_size=HIDDEN_SIZE,
            num_layers=NUM_LAYERS,
            cell_type=CELL_TYPE,
            dropout=DROPOUT
        )

        decoder = Decoder(
            output_size=OUTPUT_SIZE,
            embedding_size=EMBEDDING_SIZE,
            hidden_size=HIDDEN_SIZE,
            num_layers=NUM_LAYERS,
            cell_type=CELL_TYPE,
            dropout=DROPOUT
        )

        model = Seq2Seq(encoder, decoder, device).to(device)
        criterion = nn.CrossEntropyLoss(ignore_index=0)
        optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

        criterion = nn.CrossEntropyLoss()
        optimizer = optim.Adam(model.parameters(), lr=config.learning_rate)

        for epoch in tqdm(range(NUM_EPOCHS)):
            # Train model
            train_loss = train(model, train_loader, optimizer, criterion)
            # Evaluate model
            valid_loss = evaluate(model, dev_loader, criterion)

            print(f"Train Loss: {train_loss:.4f} | Valid Loss: {valid_loss:.4f}")
            # Log the evaluation metrics
            wandb.log({
                "epoch": epoch,
                "train_loss": train_loss,
                "validation_loss": valid_loss,
                })

        # Calculate accuracy on validation set
        val_accuracy = calculate_accuracy(model, dev_loader, src_vocab, tgt_vocab, device)
        print(f"\nValidation Accuracy: {val_accuracy:.4f}")
        wandb.log({
            "val_accuracy": val_accuracy
        })

In [25]:
sweep_config = {
    "method" : "bayes",
    "metric" : {"name": "val_accuracy", "goal": "maximize"},
    "parameters" : {
        "embedding_size" : {"values" : [16, 32, 64, 256]},
        "num_layers" : {"values" : [2, 3]},
        "hidden_size" : {"values" : [32, 128, 512]},
        "cell_type" : {"values" : ["RNN", "LSTM", "GRU"]},
        "dropout" : {"values" : [ 0.2, 0.3]},
        "learning_rate" : {"values" : [0.001, 0.0005]},
        "batch_size": {"values": [32, 64, 128]}
    }
}

sweep_id = wandb.sweep(sweep_config, project = "cs24m022_DA6401_Assignment3")

Create sweep with ID: rxo139kw
Sweep URL: https://wandb.ai/cs24m022-iit-madras-foundation/cs24m022_DA6401_Assignment3/sweeps/rxo139kw


In [26]:
wandb.agent('rxo139kw', function = sweep_hyperparameters, count = 30)

[34m[1mwandb[0m: Agent Starting Run: ge8sbo9x with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embedding_size: 16
[34m[1mwandb[0m: 	hidden_size: 512
[34m[1mwandb[0m: 	learning_rate: 0.0005
[34m[1mwandb[0m: 	num_layers: 2


 10%|█         | 1/10 [00:20<03:06, 20.71s/it]

Train Loss: 1.5995 | Valid Loss: 1.3383


 20%|██        | 2/10 [00:40<02:41, 20.17s/it]

Train Loss: 0.8974 | Valid Loss: 0.9699


 30%|███       | 3/10 [01:00<02:20, 20.11s/it]

Train Loss: 0.6576 | Valid Loss: 0.8802


 40%|████      | 4/10 [01:20<02:00, 20.06s/it]

Train Loss: 0.5355 | Valid Loss: 0.8428


 50%|█████     | 5/10 [01:40<01:40, 20.13s/it]

Train Loss: 0.4676 | Valid Loss: 0.7875


 60%|██████    | 6/10 [02:00<01:20, 20.13s/it]

Train Loss: 0.4095 | Valid Loss: 0.8286


 70%|███████   | 7/10 [02:20<01:00, 20.11s/it]

Train Loss: 0.3650 | Valid Loss: 0.7641


 80%|████████  | 8/10 [02:40<00:40, 20.08s/it]

Train Loss: 0.3355 | Valid Loss: 0.7909


 90%|█████████ | 9/10 [03:01<00:20, 20.19s/it]

Train Loss: 0.2967 | Valid Loss: 0.7835


100%|██████████| 10/10 [03:21<00:00, 20.14s/it]

Train Loss: 0.2723 | Valid Loss: 0.7832






Validation Accuracy: 0.2770


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_loss,█▄▃▂▂▂▁▁▁▁
val_accuracy,▁
validation_loss,█▄▂▂▁▂▁▁▁▁

0,1
epoch,9.0
train_loss,0.27232
val_accuracy,0.27696
validation_loss,0.78321


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: mmcrgn8q with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	embedding_size: 16
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	num_layers: 2


 10%|█         | 1/10 [00:30<04:36, 30.68s/it]

Train Loss: 1.9813 | Valid Loss: 2.0590


 20%|██        | 2/10 [01:01<04:05, 30.73s/it]

Train Loss: 1.5976 | Valid Loss: 1.8324


 30%|███       | 3/10 [01:32<03:36, 30.97s/it]

Train Loss: 1.3809 | Valid Loss: 1.5859


 40%|████      | 4/10 [02:04<03:07, 31.17s/it]

Train Loss: 1.2196 | Valid Loss: 1.4483


 50%|█████     | 5/10 [02:35<02:35, 31.16s/it]

Train Loss: 1.1296 | Valid Loss: 1.3504


 60%|██████    | 6/10 [03:06<02:04, 31.20s/it]

Train Loss: 1.0619 | Valid Loss: 1.2973


 70%|███████   | 7/10 [03:37<01:33, 31.21s/it]

Train Loss: 1.0132 | Valid Loss: 1.2516


 80%|████████  | 8/10 [04:09<01:02, 31.23s/it]

Train Loss: 0.9666 | Valid Loss: 1.2214


 90%|█████████ | 9/10 [04:40<00:31, 31.28s/it]

Train Loss: 0.9317 | Valid Loss: 1.1904


100%|██████████| 10/10 [05:11<00:00, 31.20s/it]

Train Loss: 0.9057 | Valid Loss: 1.1955






Validation Accuracy: 0.0771


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_loss,█▆▄▃▂▂▂▁▁▁
val_accuracy,▁
validation_loss,█▆▄▃▂▂▁▁▁▁

0,1
epoch,9.0
train_loss,0.90571
val_accuracy,0.0771
validation_loss,1.19547


[34m[1mwandb[0m: Agent Starting Run: 0rywvzki with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	cell_type: RNN
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embedding_size: 256
[34m[1mwandb[0m: 	hidden_size: 512
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	num_layers: 3


 10%|█         | 1/10 [00:35<05:19, 35.47s/it]

Train Loss: 1.8968 | Valid Loss: 2.8362


 20%|██        | 2/10 [01:10<04:43, 35.39s/it]

Train Loss: 1.8231 | Valid Loss: 2.5492


 30%|███       | 3/10 [01:46<04:08, 35.47s/it]

Train Loss: 1.8155 | Valid Loss: 2.4687


 40%|████      | 4/10 [02:22<03:33, 35.55s/it]

Train Loss: 1.8137 | Valid Loss: 2.8436


 50%|█████     | 5/10 [02:57<02:57, 35.58s/it]

Train Loss: 1.7999 | Valid Loss: 2.5776


 60%|██████    | 6/10 [03:32<02:21, 35.43s/it]

Train Loss: 1.8037 | Valid Loss: 2.5721


 70%|███████   | 7/10 [04:08<01:46, 35.50s/it]

Train Loss: 1.8141 | Valid Loss: 2.3972


 80%|████████  | 8/10 [04:44<01:11, 35.61s/it]

Train Loss: 1.8111 | Valid Loss: 2.4683


 90%|█████████ | 9/10 [05:20<00:35, 35.72s/it]

Train Loss: 1.8178 | Valid Loss: 2.4626


100%|██████████| 10/10 [05:55<00:00, 35.59s/it]

Train Loss: 1.8246 | Valid Loss: 2.5224






Validation Accuracy: 0.0000


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_loss,█▃▂▂▁▁▂▂▂▃
val_accuracy,▁
validation_loss,█▃▂█▄▄▁▂▂▃

0,1
epoch,9.0
train_loss,1.82459
val_accuracy,0.0
validation_loss,2.52239


[34m[1mwandb[0m: Agent Starting Run: u70por9t with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	cell_type: RNN
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	embedding_size: 64
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	num_layers: 3


 10%|█         | 1/10 [00:17<02:34, 17.13s/it]

Train Loss: 1.9685 | Valid Loss: 2.2409


 20%|██        | 2/10 [00:33<02:15, 16.93s/it]

Train Loss: 1.7320 | Valid Loss: 2.1729


 30%|███       | 3/10 [00:51<02:00, 17.15s/it]

Train Loss: 1.6667 | Valid Loss: 2.0876


 40%|████      | 4/10 [01:08<01:42, 17.08s/it]

Train Loss: 1.6370 | Valid Loss: 2.0597


 50%|█████     | 5/10 [01:25<01:25, 17.00s/it]

Train Loss: 1.6267 | Valid Loss: 2.0491


 60%|██████    | 6/10 [01:42<01:07, 16.96s/it]

Train Loss: 1.6032 | Valid Loss: 2.0329


 70%|███████   | 7/10 [01:58<00:50, 16.95s/it]

Train Loss: 1.5989 | Valid Loss: 2.0235


 80%|████████  | 8/10 [02:16<00:34, 17.02s/it]

Train Loss: 1.5884 | Valid Loss: 2.0246


 90%|█████████ | 9/10 [02:33<00:17, 17.18s/it]

Train Loss: 1.5809 | Valid Loss: 2.0294


100%|██████████| 10/10 [02:50<00:00, 17.08s/it]

Train Loss: 1.5834 | Valid Loss: 2.0059






Validation Accuracy: 0.0002


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_loss,█▄▃▂▂▁▁▁▁▁
val_accuracy,▁
validation_loss,█▆▃▃▂▂▂▂▂▁

0,1
epoch,9.0
train_loss,1.58342
val_accuracy,0.00023
validation_loss,2.00593


[34m[1mwandb[0m: Agent Starting Run: k6d2hdjd with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embedding_size: 16
[34m[1mwandb[0m: 	hidden_size: 512
[34m[1mwandb[0m: 	learning_rate: 0.0005
[34m[1mwandb[0m: 	num_layers: 2


 10%|█         | 1/10 [00:20<03:00, 20.03s/it]

Train Loss: 1.5775 | Valid Loss: 1.3135


 20%|██        | 2/10 [00:40<02:40, 20.10s/it]

Train Loss: 0.8876 | Valid Loss: 0.9529


 30%|███       | 3/10 [01:00<02:20, 20.05s/it]

Train Loss: 0.6511 | Valid Loss: 0.8366


 40%|████      | 4/10 [01:21<02:03, 20.60s/it]

Train Loss: 0.5321 | Valid Loss: 0.8127


 50%|█████     | 5/10 [01:42<01:44, 20.84s/it]

Train Loss: 0.4591 | Valid Loss: 0.8320


 60%|██████    | 6/10 [02:03<01:22, 20.73s/it]

Train Loss: 0.4073 | Valid Loss: 0.7720


 70%|███████   | 7/10 [02:24<01:03, 21.00s/it]

Train Loss: 0.3590 | Valid Loss: 0.8127


 80%|████████  | 8/10 [02:46<00:42, 21.02s/it]

Train Loss: 0.3246 | Valid Loss: 0.7867


 90%|█████████ | 9/10 [03:07<00:21, 21.04s/it]

Train Loss: 0.2997 | Valid Loss: 0.7891


100%|██████████| 10/10 [03:28<00:00, 20.83s/it]

Train Loss: 0.2666 | Valid Loss: 0.7891






Validation Accuracy: 0.2978


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_loss,█▄▃▂▂▂▁▁▁▁
val_accuracy,▁
validation_loss,█▃▂▂▂▁▂▁▁▁

0,1
epoch,9.0
train_loss,0.26656
val_accuracy,0.29784
validation_loss,0.78914


[34m[1mwandb[0m: Agent Starting Run: tngjv42q with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embedding_size: 16
[34m[1mwandb[0m: 	hidden_size: 512
[34m[1mwandb[0m: 	learning_rate: 0.0005
[34m[1mwandb[0m: 	num_layers: 3


 10%|█         | 1/10 [00:24<03:42, 24.77s/it]

Train Loss: 1.5215 | Valid Loss: 1.1388


 20%|██        | 2/10 [00:49<03:18, 24.84s/it]

Train Loss: 0.7751 | Valid Loss: 0.8652


 30%|███       | 3/10 [01:14<02:54, 25.00s/it]

Train Loss: 0.5674 | Valid Loss: 0.8277


 40%|████      | 4/10 [01:39<02:29, 24.86s/it]

Train Loss: 0.4665 | Valid Loss: 0.7782


 50%|█████     | 5/10 [02:04<02:04, 24.87s/it]

Train Loss: 0.3970 | Valid Loss: 0.7647


 60%|██████    | 6/10 [02:29<01:39, 24.80s/it]

Train Loss: 0.3521 | Valid Loss: 0.7528


 70%|███████   | 7/10 [02:53<01:14, 24.83s/it]

Train Loss: 0.3157 | Valid Loss: 0.7513


 80%|████████  | 8/10 [03:18<00:49, 24.85s/it]

Train Loss: 0.2797 | Valid Loss: 0.7776


 90%|█████████ | 9/10 [03:43<00:24, 24.82s/it]

Train Loss: 0.2537 | Valid Loss: 0.8011


100%|██████████| 10/10 [04:08<00:00, 24.87s/it]

Train Loss: 0.2304 | Valid Loss: 0.7823






Validation Accuracy: 0.3486


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_loss,█▄▃▂▂▂▁▁▁▁
val_accuracy,▁
validation_loss,█▃▂▁▁▁▁▁▂▂

0,1
epoch,9.0
train_loss,0.23036
val_accuracy,0.34855
validation_loss,0.7823


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 9q6vfnv1 with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embedding_size: 16
[34m[1mwandb[0m: 	hidden_size: 512
[34m[1mwandb[0m: 	learning_rate: 0.0005
[34m[1mwandb[0m: 	num_layers: 3


 10%|█         | 1/10 [00:14<02:13, 14.84s/it]

Train Loss: 1.6872 | Valid Loss: 1.5923


 20%|██        | 2/10 [00:29<01:58, 14.75s/it]

Train Loss: 1.0598 | Valid Loss: 1.0315


 30%|███       | 3/10 [00:44<01:43, 14.82s/it]

Train Loss: 0.7224 | Valid Loss: 0.8638


 40%|████      | 4/10 [00:59<01:28, 14.79s/it]

Train Loss: 0.5650 | Valid Loss: 0.7956


 50%|█████     | 5/10 [01:14<01:14, 14.85s/it]

Train Loss: 0.4681 | Valid Loss: 0.7509


 60%|██████    | 6/10 [01:28<00:59, 14.80s/it]

Train Loss: 0.4063 | Valid Loss: 0.7268


 70%|███████   | 7/10 [01:43<00:44, 14.77s/it]

Train Loss: 0.3633 | Valid Loss: 0.7083


 80%|████████  | 8/10 [01:58<00:29, 14.85s/it]

Train Loss: 0.3273 | Valid Loss: 0.7345


 90%|█████████ | 9/10 [02:13<00:14, 14.86s/it]

Train Loss: 0.2912 | Valid Loss: 0.7314


100%|██████████| 10/10 [02:28<00:00, 14.83s/it]

Train Loss: 0.2706 | Valid Loss: 0.7347






Validation Accuracy: 0.3284


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_loss,█▅▃▂▂▂▁▁▁▁
val_accuracy,▁
validation_loss,█▄▂▂▁▁▁▁▁▁

0,1
epoch,9.0
train_loss,0.27061
val_accuracy,0.32836
validation_loss,0.73468


[34m[1mwandb[0m: Agent Starting Run: 1c1tpku2 with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embedding_size: 16
[34m[1mwandb[0m: 	hidden_size: 512
[34m[1mwandb[0m: 	learning_rate: 0.0005
[34m[1mwandb[0m: 	num_layers: 3


 10%|█         | 1/10 [00:14<02:12, 14.67s/it]

Train Loss: 1.7159 | Valid Loss: 1.5507


 20%|██        | 2/10 [00:29<01:57, 14.75s/it]

Train Loss: 1.0654 | Valid Loss: 1.0047


 30%|███       | 3/10 [00:44<01:44, 14.88s/it]

Train Loss: 0.7296 | Valid Loss: 0.8598


 40%|████      | 4/10 [00:59<01:29, 14.90s/it]

Train Loss: 0.5783 | Valid Loss: 0.7757


 50%|█████     | 5/10 [01:14<01:14, 14.89s/it]

Train Loss: 0.4923 | Valid Loss: 0.7336


 60%|██████    | 6/10 [01:29<00:59, 14.82s/it]

Train Loss: 0.4195 | Valid Loss: 0.7560


 70%|███████   | 7/10 [01:43<00:44, 14.87s/it]

Train Loss: 0.3827 | Valid Loss: 0.7235


 80%|████████  | 8/10 [01:58<00:29, 14.86s/it]

Train Loss: 0.3432 | Valid Loss: 0.7349


 90%|█████████ | 9/10 [02:13<00:14, 14.92s/it]

Train Loss: 0.3062 | Valid Loss: 0.7241


100%|██████████| 10/10 [02:28<00:00, 14.87s/it]

Train Loss: 0.2805 | Valid Loss: 0.7145






Validation Accuracy: 0.3038


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_loss,█▅▃▂▂▂▁▁▁▁
val_accuracy,▁
validation_loss,█▃▂▂▁▁▁▁▁▁

0,1
epoch,9.0
train_loss,0.28049
val_accuracy,0.30381
validation_loss,0.71454


[34m[1mwandb[0m: Agent Starting Run: 8eamdjj0 with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	embedding_size: 16
[34m[1mwandb[0m: 	hidden_size: 512
[34m[1mwandb[0m: 	learning_rate: 0.0005
[34m[1mwandb[0m: 	num_layers: 3


 10%|█         | 1/10 [00:15<02:22, 15.87s/it]

Train Loss: 1.6951 | Valid Loss: 1.5405


 20%|██        | 2/10 [00:30<02:03, 15.41s/it]

Train Loss: 0.9948 | Valid Loss: 0.9612


 30%|███       | 3/10 [00:46<01:47, 15.31s/it]

Train Loss: 0.6466 | Valid Loss: 0.8375


 40%|████      | 4/10 [01:01<01:31, 15.23s/it]

Train Loss: 0.5036 | Valid Loss: 0.7657


 50%|█████     | 5/10 [01:16<01:16, 15.20s/it]

Train Loss: 0.4216 | Valid Loss: 0.7839


 60%|██████    | 6/10 [01:31<01:00, 15.09s/it]

Train Loss: 0.3550 | Valid Loss: 0.7545


 70%|███████   | 7/10 [01:46<00:45, 15.07s/it]

Train Loss: 0.3056 | Valid Loss: 0.7353


 80%|████████  | 8/10 [02:01<00:30, 15.03s/it]

Train Loss: 0.2773 | Valid Loss: 0.7382


 90%|█████████ | 9/10 [02:16<00:15, 15.10s/it]

Train Loss: 0.2462 | Valid Loss: 0.7668


100%|██████████| 10/10 [02:31<00:00, 15.15s/it]

Train Loss: 0.2170 | Valid Loss: 0.7692






Validation Accuracy: 0.3061


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_loss,█▅▃▂▂▂▁▁▁▁
val_accuracy,▁
validation_loss,█▃▂▁▁▁▁▁▁▁

0,1
epoch,9.0
train_loss,0.21701
val_accuracy,0.3061
validation_loss,0.76919


[34m[1mwandb[0m: Agent Starting Run: basw97ok with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	cell_type: RNN
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embedding_size: 16
[34m[1mwandb[0m: 	hidden_size: 512
[34m[1mwandb[0m: 	learning_rate: 0.0005
[34m[1mwandb[0m: 	num_layers: 3


 10%|█         | 1/10 [00:11<01:45, 11.67s/it]

Train Loss: 1.7635 | Valid Loss: 2.1641


 20%|██        | 2/10 [00:23<01:32, 11.61s/it]

Train Loss: 1.6393 | Valid Loss: 2.0442


 30%|███       | 3/10 [00:34<01:21, 11.65s/it]

Train Loss: 1.5889 | Valid Loss: 2.0030


 40%|████      | 4/10 [00:46<01:09, 11.66s/it]

Train Loss: 1.5482 | Valid Loss: 1.9944


 50%|█████     | 5/10 [00:58<00:58, 11.67s/it]

Train Loss: 1.5095 | Valid Loss: 1.9458


 60%|██████    | 6/10 [01:09<00:46, 11.64s/it]

Train Loss: 1.4539 | Valid Loss: 1.8899


 70%|███████   | 7/10 [01:21<00:34, 11.60s/it]

Train Loss: 1.4121 | Valid Loss: 1.8473


 80%|████████  | 8/10 [01:33<00:23, 11.60s/it]

Train Loss: 1.3885 | Valid Loss: 1.9331


 90%|█████████ | 9/10 [01:44<00:11, 11.62s/it]

Train Loss: 1.3626 | Valid Loss: 1.7928


100%|██████████| 10/10 [01:56<00:00, 11.64s/it]

Train Loss: 1.3232 | Valid Loss: 1.7618






Validation Accuracy: 0.0000


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_loss,█▆▅▅▄▃▂▂▂▁
val_accuracy,▁
validation_loss,█▆▅▅▄▃▂▄▂▁

0,1
epoch,9.0
train_loss,1.32321
val_accuracy,0.0
validation_loss,1.76178


[34m[1mwandb[0m: Agent Starting Run: 3orejouy with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	embedding_size: 16
[34m[1mwandb[0m: 	hidden_size: 512
[34m[1mwandb[0m: 	learning_rate: 0.0005
[34m[1mwandb[0m: 	num_layers: 2


 10%|█         | 1/10 [00:12<01:48, 12.01s/it]

Train Loss: 1.6869 | Valid Loss: 1.6194


 20%|██        | 2/10 [00:24<01:36, 12.07s/it]

Train Loss: 1.0947 | Valid Loss: 1.1048


 30%|███       | 3/10 [00:36<01:24, 12.11s/it]

Train Loss: 0.7590 | Valid Loss: 0.9350


 40%|████      | 4/10 [00:48<01:12, 12.10s/it]

Train Loss: 0.5972 | Valid Loss: 0.8272


 50%|█████     | 5/10 [01:00<01:01, 12.20s/it]

Train Loss: 0.5101 | Valid Loss: 0.7754


 60%|██████    | 6/10 [01:12<00:48, 12.17s/it]

Train Loss: 0.4331 | Valid Loss: 0.8379


 70%|███████   | 7/10 [01:24<00:36, 12.14s/it]

Train Loss: 0.3888 | Valid Loss: 0.8059


 80%|████████  | 8/10 [01:37<00:24, 12.22s/it]

Train Loss: 0.3425 | Valid Loss: 0.7552


 90%|█████████ | 9/10 [01:49<00:12, 12.22s/it]

Train Loss: 0.3084 | Valid Loss: 0.7459


100%|██████████| 10/10 [02:02<00:00, 12.21s/it]

Train Loss: 0.2768 | Valid Loss: 0.7720






Validation Accuracy: 0.2811


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_loss,█▅▃▃▂▂▂▁▁▁
val_accuracy,▁
validation_loss,█▄▃▂▁▂▁▁▁▁

0,1
epoch,9.0
train_loss,0.27677
val_accuracy,0.28109
validation_loss,0.77204


[34m[1mwandb[0m: Agent Starting Run: mkz3rtjc with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embedding_size: 16
[34m[1mwandb[0m: 	hidden_size: 512
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	num_layers: 2


 10%|█         | 1/10 [00:12<01:50, 12.28s/it]

Train Loss: 1.4975 | Valid Loss: 1.2376


 20%|██        | 2/10 [00:24<01:38, 12.27s/it]

Train Loss: 0.7973 | Valid Loss: 0.8659


 30%|███       | 3/10 [00:37<01:26, 12.38s/it]

Train Loss: 0.5894 | Valid Loss: 0.8319


 40%|████      | 4/10 [00:49<01:13, 12.32s/it]

Train Loss: 0.4848 | Valid Loss: 0.7899


 50%|█████     | 5/10 [01:01<01:01, 12.34s/it]

Train Loss: 0.4199 | Valid Loss: 0.8085


 60%|██████    | 6/10 [01:13<00:49, 12.27s/it]

Train Loss: 0.3725 | Valid Loss: 0.7794


 70%|███████   | 7/10 [01:26<00:36, 12.29s/it]

Train Loss: 0.3350 | Valid Loss: 0.7353


 80%|████████  | 8/10 [01:38<00:24, 12.33s/it]

Train Loss: 0.3000 | Valid Loss: 0.7702


 90%|█████████ | 9/10 [01:50<00:12, 12.30s/it]

Train Loss: 0.2778 | Valid Loss: 0.7785


100%|██████████| 10/10 [02:03<00:00, 12.32s/it]

Train Loss: 0.2452 | Valid Loss: 0.7766






Validation Accuracy: 0.2625


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_loss,█▄▃▂▂▂▂▁▁▁
val_accuracy,▁
validation_loss,█▃▂▂▂▂▁▁▂▂

0,1
epoch,9.0
train_loss,0.24515
val_accuracy,0.26251
validation_loss,0.77655


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: fzwc0ke9 with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	embedding_size: 64
[34m[1mwandb[0m: 	hidden_size: 512
[34m[1mwandb[0m: 	learning_rate: 0.0005
[34m[1mwandb[0m: 	num_layers: 2


 10%|█         | 1/10 [00:12<01:49, 12.21s/it]

Train Loss: 1.4822 | Valid Loss: 1.2347


 20%|██        | 2/10 [00:24<01:38, 12.29s/it]

Train Loss: 0.7175 | Valid Loss: 0.8983


 30%|███       | 3/10 [00:36<01:26, 12.29s/it]

Train Loss: 0.5022 | Valid Loss: 0.8062


 40%|████      | 4/10 [00:49<01:14, 12.35s/it]

Train Loss: 0.3996 | Valid Loss: 0.7713


 50%|█████     | 5/10 [01:01<01:01, 12.27s/it]

Train Loss: 0.3308 | Valid Loss: 0.7548


 60%|██████    | 6/10 [01:13<00:48, 12.21s/it]

Train Loss: 0.2881 | Valid Loss: 0.7501


 70%|███████   | 7/10 [01:25<00:36, 12.26s/it]

Train Loss: 0.2514 | Valid Loss: 0.7466


 80%|████████  | 8/10 [01:38<00:24, 12.27s/it]

Train Loss: 0.2205 | Valid Loss: 0.7121


 90%|█████████ | 9/10 [01:50<00:12, 12.21s/it]

Train Loss: 0.1934 | Valid Loss: 0.7411


100%|██████████| 10/10 [02:02<00:00, 12.24s/it]

Train Loss: 0.1666 | Valid Loss: 0.8168






Validation Accuracy: 0.2607


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_loss,█▄▃▂▂▂▁▁▁▁
val_accuracy,▁
validation_loss,█▃▂▂▂▂▁▁▁▂

0,1
epoch,9.0
train_loss,0.16661
val_accuracy,0.26067
validation_loss,0.81678


[34m[1mwandb[0m: Agent Starting Run: lj1a4ddm with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	embedding_size: 16
[34m[1mwandb[0m: 	hidden_size: 512
[34m[1mwandb[0m: 	learning_rate: 0.0005
[34m[1mwandb[0m: 	num_layers: 3


 10%|█         | 1/10 [00:42<06:26, 42.96s/it]

Train Loss: 1.2976 | Valid Loss: 1.0248


 20%|██        | 2/10 [01:26<05:47, 43.38s/it]

Train Loss: 0.6060 | Valid Loss: 0.9024


 30%|███       | 3/10 [02:09<05:01, 43.09s/it]

Train Loss: 0.4626 | Valid Loss: 0.8356


 40%|████      | 4/10 [02:53<04:20, 43.38s/it]

Train Loss: 0.3750 | Valid Loss: 0.8121


 50%|█████     | 5/10 [03:36<03:37, 43.44s/it]

Train Loss: 0.3226 | Valid Loss: 0.8378


 60%|██████    | 6/10 [04:19<02:53, 43.34s/it]

Train Loss: 0.2775 | Valid Loss: 0.8665


 70%|███████   | 7/10 [05:03<02:10, 43.49s/it]

Train Loss: 0.2467 | Valid Loss: 0.8586


 80%|████████  | 8/10 [05:46<01:26, 43.36s/it]

Train Loss: 0.2245 | Valid Loss: 0.8858


 90%|█████████ | 9/10 [06:29<00:43, 43.13s/it]

Train Loss: 0.2005 | Valid Loss: 0.8882


100%|██████████| 10/10 [07:12<00:00, 43.24s/it]

Train Loss: 0.1829 | Valid Loss: 0.9638






Validation Accuracy: 0.3788


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_loss,█▄▃▂▂▂▁▁▁▁
val_accuracy,▁
validation_loss,█▄▂▁▂▃▃▃▄▆

0,1
epoch,9.0
train_loss,0.1829
val_accuracy,0.37884
validation_loss,0.96376


[34m[1mwandb[0m: Agent Starting Run: tgranh7p with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embedding_size: 16
[34m[1mwandb[0m: 	hidden_size: 512
[34m[1mwandb[0m: 	learning_rate: 0.0005
[34m[1mwandb[0m: 	num_layers: 3


 10%|█         | 1/10 [00:24<03:39, 24.34s/it]

Train Loss: 1.5244 | Valid Loss: 1.2002


 20%|██        | 2/10 [00:48<03:14, 24.30s/it]

Train Loss: 0.7713 | Valid Loss: 0.9271


 30%|███       | 3/10 [01:13<02:51, 24.45s/it]

Train Loss: 0.5583 | Valid Loss: 0.8164


 40%|████      | 4/10 [01:37<02:27, 24.54s/it]

Train Loss: 0.4640 | Valid Loss: 0.7881


 50%|█████     | 5/10 [02:02<02:02, 24.48s/it]

Train Loss: 0.3945 | Valid Loss: 0.7686


 60%|██████    | 6/10 [02:26<01:37, 24.43s/it]

Train Loss: 0.3477 | Valid Loss: 0.7696


 70%|███████   | 7/10 [02:51<01:13, 24.53s/it]

Train Loss: 0.3153 | Valid Loss: 0.7955


 80%|████████  | 8/10 [03:15<00:49, 24.51s/it]

Train Loss: 0.2800 | Valid Loss: 0.7588


 90%|█████████ | 9/10 [03:40<00:24, 24.52s/it]

Train Loss: 0.2575 | Valid Loss: 0.7581


100%|██████████| 10/10 [04:04<00:00, 24.48s/it]

Train Loss: 0.2336 | Valid Loss: 0.7958






Validation Accuracy: 0.3566


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_loss,█▄▃▂▂▂▁▁▁▁
val_accuracy,▁
validation_loss,█▄▂▁▁▁▂▁▁▂

0,1
epoch,9.0
train_loss,0.23362
val_accuracy,0.35659
validation_loss,0.79582


[34m[1mwandb[0m: Agent Starting Run: fmwwbqjg with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	embedding_size: 32
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.0005
[34m[1mwandb[0m: 	num_layers: 3


 10%|█         | 1/10 [00:37<05:34, 37.18s/it]

Train Loss: 1.7212 | Valid Loss: 1.6471


 20%|██        | 2/10 [01:14<04:57, 37.15s/it]

Train Loss: 1.0221 | Valid Loss: 1.1893


 30%|███       | 3/10 [01:51<04:20, 37.18s/it]

Train Loss: 0.7630 | Valid Loss: 0.9980


 40%|████      | 4/10 [02:28<03:42, 37.14s/it]

Train Loss: 0.6477 | Valid Loss: 0.9659


 50%|█████     | 5/10 [03:05<03:05, 37.14s/it]

Train Loss: 0.5739 | Valid Loss: 0.9394


 60%|██████    | 6/10 [03:42<02:28, 37.08s/it]

Train Loss: 0.5258 | Valid Loss: 0.9155


 70%|███████   | 7/10 [04:19<01:51, 37.10s/it]

Train Loss: 0.4911 | Valid Loss: 0.8705


 80%|████████  | 8/10 [04:57<01:14, 37.23s/it]

Train Loss: 0.4628 | Valid Loss: 0.8820


 90%|█████████ | 9/10 [05:34<00:37, 37.28s/it]

Train Loss: 0.4408 | Valid Loss: 0.8599


100%|██████████| 10/10 [06:12<00:00, 37.21s/it]

Train Loss: 0.4171 | Valid Loss: 0.8565






Validation Accuracy: 0.3077


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_loss,█▄▃▂▂▂▁▁▁▁
val_accuracy,▁
validation_loss,█▄▂▂▂▂▁▁▁▁

0,1
epoch,9.0
train_loss,0.41709
val_accuracy,0.30771
validation_loss,0.85646


[34m[1mwandb[0m: Agent Starting Run: 07i1221b with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	embedding_size: 16
[34m[1mwandb[0m: 	hidden_size: 512
[34m[1mwandb[0m: 	learning_rate: 0.0005
[34m[1mwandb[0m: 	num_layers: 3


 10%|█         | 1/10 [00:42<06:22, 42.49s/it]

Train Loss: 1.2520 | Valid Loss: 0.9785


 20%|██        | 2/10 [01:24<05:39, 42.44s/it]

Train Loss: 0.5860 | Valid Loss: 0.8520


 30%|███       | 3/10 [02:07<04:57, 42.52s/it]

Train Loss: 0.4438 | Valid Loss: 0.8401


 40%|████      | 4/10 [02:50<04:15, 42.60s/it]

Train Loss: 0.3678 | Valid Loss: 0.8528


 50%|█████     | 5/10 [03:32<03:32, 42.49s/it]

Train Loss: 0.3115 | Valid Loss: 0.8638


 60%|██████    | 6/10 [04:15<02:50, 42.54s/it]

Train Loss: 0.2705 | Valid Loss: 0.8510


 70%|███████   | 7/10 [04:57<02:07, 42.55s/it]

Train Loss: 0.2403 | Valid Loss: 0.8652


 80%|████████  | 8/10 [05:40<01:25, 42.54s/it]

Train Loss: 0.2133 | Valid Loss: 0.8968


 90%|█████████ | 9/10 [06:22<00:42, 42.55s/it]

Train Loss: 0.1943 | Valid Loss: 0.9083


100%|██████████| 10/10 [07:05<00:00, 42.53s/it]

Train Loss: 0.1722 | Valid Loss: 0.9781






Validation Accuracy: 0.3772


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_loss,█▄▃▂▂▂▁▁▁▁
val_accuracy,▁
validation_loss,█▂▁▂▂▂▂▄▄█

0,1
epoch,9.0
train_loss,0.17224
val_accuracy,0.37724
validation_loss,0.97811


[34m[1mwandb[0m: Agent Starting Run: ufqz9jj1 with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	embedding_size: 32
[34m[1mwandb[0m: 	hidden_size: 512
[34m[1mwandb[0m: 	learning_rate: 0.0005
[34m[1mwandb[0m: 	num_layers: 3


 10%|█         | 1/10 [00:24<03:44, 24.95s/it]

Train Loss: 1.3209 | Valid Loss: 1.0268


 20%|██        | 2/10 [00:50<03:20, 25.03s/it]

Train Loss: 0.5880 | Valid Loss: 0.8508


 30%|███       | 3/10 [01:14<02:53, 24.74s/it]

Train Loss: 0.4295 | Valid Loss: 0.7877


 40%|████      | 4/10 [01:38<02:27, 24.66s/it]

Train Loss: 0.3478 | Valid Loss: 0.7615


 50%|█████     | 5/10 [02:03<02:02, 24.52s/it]

Train Loss: 0.2887 | Valid Loss: 0.7587


 60%|██████    | 6/10 [02:27<01:37, 24.47s/it]

Train Loss: 0.2498 | Valid Loss: 0.7582


 70%|███████   | 7/10 [02:52<01:13, 24.47s/it]

Train Loss: 0.2134 | Valid Loss: 0.8162


 80%|████████  | 8/10 [03:16<00:49, 24.53s/it]

Train Loss: 0.1870 | Valid Loss: 0.8207


 90%|█████████ | 9/10 [03:41<00:24, 24.53s/it]

Train Loss: 0.1647 | Valid Loss: 0.8262


100%|██████████| 10/10 [04:05<00:00, 24.55s/it]

Train Loss: 0.1484 | Valid Loss: 0.8597






Validation Accuracy: 0.3784


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_loss,█▄▃▂▂▂▁▁▁▁
val_accuracy,▁
validation_loss,█▃▂▁▁▁▃▃▃▄

0,1
epoch,9.0
train_loss,0.14843
val_accuracy,0.37838
validation_loss,0.85974


[34m[1mwandb[0m: Agent Starting Run: g115nymo with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	embedding_size: 16
[34m[1mwandb[0m: 	hidden_size: 512
[34m[1mwandb[0m: 	learning_rate: 0.0005
[34m[1mwandb[0m: 	num_layers: 3


 10%|█         | 1/10 [00:43<06:27, 43.03s/it]

Train Loss: 1.2979 | Valid Loss: 1.0188


 20%|██        | 2/10 [01:31<06:08, 46.10s/it]

Train Loss: 0.6049 | Valid Loss: 0.9151


 30%|███       | 3/10 [02:18<05:26, 46.62s/it]

Train Loss: 0.4562 | Valid Loss: 0.8513


 40%|████      | 4/10 [03:01<04:30, 45.02s/it]

Train Loss: 0.3754 | Valid Loss: 0.8472


 50%|█████     | 5/10 [03:43<03:39, 43.97s/it]

Train Loss: 0.3212 | Valid Loss: 0.8852


 60%|██████    | 6/10 [04:25<02:53, 43.45s/it]

Train Loss: 0.2770 | Valid Loss: 0.8479


 70%|███████   | 7/10 [05:07<02:09, 43.02s/it]

Train Loss: 0.2466 | Valid Loss: 0.8676


 80%|████████  | 8/10 [05:51<01:26, 43.29s/it]

Train Loss: 0.2202 | Valid Loss: 0.8726


 90%|█████████ | 9/10 [06:34<00:43, 43.26s/it]

Train Loss: 0.1994 | Valid Loss: 0.8844


100%|██████████| 10/10 [07:17<00:00, 43.79s/it]

Train Loss: 0.1822 | Valid Loss: 0.9551






Validation Accuracy: 0.3598


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_loss,█▄▃▂▂▂▁▁▁▁
val_accuracy,▁
validation_loss,█▄▁▁▃▁▂▂▃▅

0,1
epoch,9.0
train_loss,0.18217
val_accuracy,0.3598
validation_loss,0.95506


[34m[1mwandb[0m: Agent Starting Run: wjund09f with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	embedding_size: 256
[34m[1mwandb[0m: 	hidden_size: 512
[34m[1mwandb[0m: 	learning_rate: 0.0005
[34m[1mwandb[0m: 	num_layers: 3


 10%|█         | 1/10 [00:25<03:50, 25.59s/it]

Train Loss: 1.0082 | Valid Loss: 0.9056


 20%|██        | 2/10 [00:50<03:21, 25.21s/it]

Train Loss: 0.4439 | Valid Loss: 0.8065


 30%|███       | 3/10 [01:15<02:55, 25.03s/it]

Train Loss: 0.3376 | Valid Loss: 0.7715


 40%|████      | 4/10 [01:40<02:29, 24.95s/it]

Train Loss: 0.2729 | Valid Loss: 0.8232


 50%|█████     | 5/10 [02:04<02:03, 24.73s/it]

Train Loss: 0.2333 | Valid Loss: 0.8228


 60%|██████    | 6/10 [02:29<01:38, 24.71s/it]

Train Loss: 0.1969 | Valid Loss: 0.8273


 70%|███████   | 7/10 [02:54<01:14, 24.82s/it]

Train Loss: 0.1634 | Valid Loss: 0.8363


 80%|████████  | 8/10 [03:19<00:49, 24.98s/it]

Train Loss: 0.1445 | Valid Loss: 0.8665


 90%|█████████ | 9/10 [03:44<00:24, 24.92s/it]

Train Loss: 0.1317 | Valid Loss: 0.8910


100%|██████████| 10/10 [04:09<00:00, 24.92s/it]

Train Loss: 0.1207 | Valid Loss: 0.9490






Validation Accuracy: 0.3844


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_loss,█▄▃▂▂▂▁▁▁▁
val_accuracy,▁
validation_loss,▆▂▁▃▃▃▄▅▆█

0,1
epoch,9.0
train_loss,0.12071
val_accuracy,0.38435
validation_loss,0.94898


[34m[1mwandb[0m: Agent Starting Run: v65s0bex with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	embedding_size: 256
[34m[1mwandb[0m: 	hidden_size: 512
[34m[1mwandb[0m: 	learning_rate: 0.0005
[34m[1mwandb[0m: 	num_layers: 3


 10%|█         | 1/10 [00:44<06:40, 44.46s/it]

Train Loss: 0.8795 | Valid Loss: 0.9973


 20%|██        | 2/10 [01:31<06:09, 46.20s/it]

Train Loss: 0.4263 | Valid Loss: 0.8765


 30%|███       | 3/10 [02:19<05:27, 46.79s/it]

Train Loss: 0.3306 | Valid Loss: 0.8354


 40%|████      | 4/10 [03:04<04:37, 46.25s/it]

Train Loss: 0.2776 | Valid Loss: 0.8782


 50%|█████     | 5/10 [03:49<03:48, 45.67s/it]

Train Loss: 0.2314 | Valid Loss: 0.9117


 60%|██████    | 6/10 [04:32<02:59, 44.87s/it]

Train Loss: 0.2018 | Valid Loss: 0.8867


 70%|███████   | 7/10 [05:20<02:17, 45.83s/it]

Train Loss: 0.1841 | Valid Loss: 0.8902


 80%|████████  | 8/10 [06:07<01:32, 46.23s/it]

Train Loss: 0.1617 | Valid Loss: 0.9462


 90%|█████████ | 9/10 [06:52<00:45, 45.69s/it]

Train Loss: 0.1532 | Valid Loss: 0.9470


100%|██████████| 10/10 [07:35<00:00, 45.53s/it]

Train Loss: 0.1451 | Valid Loss: 0.9655






Validation Accuracy: 0.3731


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_loss,█▄▃▂▂▂▁▁▁▁
val_accuracy,▁
validation_loss,█▃▁▃▄▃▃▆▆▇

0,1
epoch,9.0
train_loss,0.14509
val_accuracy,0.37311
validation_loss,0.9655


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 0naz0isf with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	embedding_size: 16
[34m[1mwandb[0m: 	hidden_size: 512
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	num_layers: 3


 10%|█         | 1/10 [00:44<06:43, 44.82s/it]

Train Loss: 1.0535 | Valid Loss: 0.9489


 20%|██        | 2/10 [01:28<05:54, 44.28s/it]

Train Loss: 0.5356 | Valid Loss: 0.8993


 30%|███       | 3/10 [02:11<05:04, 43.45s/it]

Train Loss: 0.4356 | Valid Loss: 0.8797


 40%|████      | 4/10 [02:53<04:18, 43.05s/it]

Train Loss: 0.3802 | Valid Loss: 0.9320


 50%|█████     | 5/10 [03:38<03:38, 43.62s/it]

Train Loss: 0.3496 | Valid Loss: 0.8923


 60%|██████    | 6/10 [04:27<03:01, 45.37s/it]

Train Loss: 0.3317 | Valid Loss: 0.9043


 70%|███████   | 7/10 [05:15<02:19, 46.52s/it]

Train Loss: 0.3123 | Valid Loss: 0.9354


 80%|████████  | 8/10 [06:05<01:34, 47.35s/it]

Train Loss: 0.3088 | Valid Loss: 0.9187


 90%|█████████ | 9/10 [06:51<00:47, 47.04s/it]

Train Loss: 0.2961 | Valid Loss: 0.9439


100%|██████████| 10/10 [07:34<00:00, 45.48s/it]

Train Loss: 0.2931 | Valid Loss: 0.9334






Validation Accuracy: 0.3217


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_loss,█▃▂▂▂▁▁▁▁▁
val_accuracy,▁
validation_loss,█▃▁▆▂▃▇▅▇▆

0,1
epoch,9.0
train_loss,0.29311
val_accuracy,0.32171
validation_loss,0.93342


[34m[1mwandb[0m: Agent Starting Run: mfq73b00 with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	embedding_size: 64
[34m[1mwandb[0m: 	hidden_size: 512
[34m[1mwandb[0m: 	learning_rate: 0.0005
[34m[1mwandb[0m: 	num_layers: 3


 10%|█         | 1/10 [00:14<02:09, 14.44s/it]

Train Loss: 1.4527 | Valid Loss: 1.1886


 20%|██        | 2/10 [00:29<01:57, 14.63s/it]

Train Loss: 0.6462 | Valid Loss: 0.8413


 30%|███       | 3/10 [00:43<01:41, 14.56s/it]

Train Loss: 0.4554 | Valid Loss: 0.7905


 40%|████      | 4/10 [00:58<01:27, 14.54s/it]

Train Loss: 0.3655 | Valid Loss: 0.7242


 50%|█████     | 5/10 [01:12<01:12, 14.54s/it]

Train Loss: 0.3051 | Valid Loss: 0.7738


 60%|██████    | 6/10 [01:27<00:58, 14.62s/it]

Train Loss: 0.2582 | Valid Loss: 0.7828


 70%|███████   | 7/10 [01:42<00:43, 14.64s/it]

Train Loss: 0.2280 | Valid Loss: 0.7488


 80%|████████  | 8/10 [01:56<00:29, 14.68s/it]

Train Loss: 0.1939 | Valid Loss: 0.8173


 90%|█████████ | 9/10 [02:11<00:14, 14.64s/it]

Train Loss: 0.1691 | Valid Loss: 0.7868


100%|██████████| 10/10 [02:26<00:00, 14.61s/it]

Train Loss: 0.1504 | Valid Loss: 0.8324






Validation Accuracy: 0.3449


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_loss,█▄▃▂▂▂▁▁▁▁
val_accuracy,▁
validation_loss,█▃▂▁▂▂▁▂▂▃

0,1
epoch,9.0
train_loss,0.15043
val_accuracy,0.34488
validation_loss,0.83245


[34m[1mwandb[0m: Agent Starting Run: vhxk54sq with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	embedding_size: 16
[34m[1mwandb[0m: 	hidden_size: 512
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	num_layers: 3


 10%|█         | 1/10 [00:14<02:10, 14.46s/it]

Train Loss: 1.4312 | Valid Loss: 1.1411


 20%|██        | 2/10 [00:28<01:55, 14.47s/it]

Train Loss: 0.6589 | Valid Loss: 0.7723


 30%|███       | 3/10 [00:43<01:41, 14.49s/it]

Train Loss: 0.4542 | Valid Loss: 0.7657


 40%|████      | 4/10 [00:58<01:27, 14.58s/it]

Train Loss: 0.3596 | Valid Loss: 0.7425


 50%|█████     | 5/10 [01:12<01:12, 14.54s/it]

Train Loss: 0.3118 | Valid Loss: 0.7032


 60%|██████    | 6/10 [01:27<00:58, 14.54s/it]

Train Loss: 0.2625 | Valid Loss: 0.7717


 70%|███████   | 7/10 [01:41<00:43, 14.51s/it]

Train Loss: 0.2325 | Valid Loss: 0.7590


 80%|████████  | 8/10 [01:56<00:29, 14.53s/it]

Train Loss: 0.2054 | Valid Loss: 0.7779


 90%|█████████ | 9/10 [02:11<00:14, 14.72s/it]

Train Loss: 0.1879 | Valid Loss: 0.7568


100%|██████████| 10/10 [02:27<00:00, 14.74s/it]

Train Loss: 0.1726 | Valid Loss: 0.7811






Validation Accuracy: 0.3254


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_loss,█▄▃▂▂▂▁▁▁▁
val_accuracy,▁
validation_loss,█▂▂▂▁▂▂▂▂▂

0,1
epoch,9.0
train_loss,0.17255
val_accuracy,0.32538
validation_loss,0.78111


[34m[1mwandb[0m: Agent Starting Run: m7ro3ivk with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	embedding_size: 32
[34m[1mwandb[0m: 	hidden_size: 512
[34m[1mwandb[0m: 	learning_rate: 0.0005
[34m[1mwandb[0m: 	num_layers: 3


 10%|█         | 1/10 [00:43<06:27, 43.09s/it]

Train Loss: 1.1055 | Valid Loss: 0.9708


 20%|██        | 2/10 [01:26<05:45, 43.23s/it]

Train Loss: 0.5072 | Valid Loss: 0.8415


 30%|███       | 3/10 [02:09<05:01, 43.11s/it]

Train Loss: 0.3856 | Valid Loss: 0.8504


 40%|████      | 4/10 [02:52<04:17, 42.99s/it]

Train Loss: 0.3190 | Valid Loss: 0.8383


 50%|█████     | 5/10 [03:35<03:35, 43.01s/it]

Train Loss: 0.2705 | Valid Loss: 0.8516


 60%|██████    | 6/10 [04:18<02:52, 43.03s/it]

Train Loss: 0.2360 | Valid Loss: 0.8901


 70%|███████   | 7/10 [05:01<02:09, 43.11s/it]

Train Loss: 0.2067 | Valid Loss: 0.9052


 80%|████████  | 8/10 [05:48<01:28, 44.27s/it]

Train Loss: 0.1844 | Valid Loss: 0.9143


 90%|█████████ | 9/10 [06:37<00:45, 45.91s/it]

Train Loss: 0.1635 | Valid Loss: 0.9460


100%|██████████| 10/10 [07:27<00:00, 44.77s/it]

Train Loss: 0.1493 | Valid Loss: 1.0046






Validation Accuracy: 0.3763


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_loss,█▄▃▂▂▂▁▁▁▁
val_accuracy,▁
validation_loss,▇▁▂▁▂▃▄▄▆█

0,1
epoch,9.0
train_loss,0.14927
val_accuracy,0.37632
validation_loss,1.00457


[34m[1mwandb[0m: Agent Starting Run: 3a6fmxw9 with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	embedding_size: 256
[34m[1mwandb[0m: 	hidden_size: 512
[34m[1mwandb[0m: 	learning_rate: 0.0005
[34m[1mwandb[0m: 	num_layers: 3


 10%|█         | 1/10 [00:47<07:11, 47.99s/it]

Train Loss: 0.8708 | Valid Loss: 0.9096


 20%|██        | 2/10 [01:34<06:15, 46.98s/it]

Train Loss: 0.4254 | Valid Loss: 0.8486


 30%|███       | 3/10 [02:20<05:25, 46.54s/it]

Train Loss: 0.3271 | Valid Loss: 0.8257


 40%|████      | 4/10 [03:05<04:36, 46.08s/it]

Train Loss: 0.2741 | Valid Loss: 0.8254


 50%|█████     | 5/10 [03:51<03:50, 46.03s/it]

Train Loss: 0.2290 | Valid Loss: 0.8693


 60%|██████    | 6/10 [04:35<03:01, 45.30s/it]

Train Loss: 0.2049 | Valid Loss: 0.8879


 70%|███████   | 7/10 [05:18<02:13, 44.54s/it]

Train Loss: 0.1789 | Valid Loss: 0.9051


 80%|████████  | 8/10 [06:01<01:28, 44.07s/it]

Train Loss: 0.1685 | Valid Loss: 0.9272


 90%|█████████ | 9/10 [06:44<00:43, 43.68s/it]

Train Loss: 0.1502 | Valid Loss: 0.9688


100%|██████████| 10/10 [07:27<00:00, 44.72s/it]

Train Loss: 0.1414 | Valid Loss: 0.9686






Validation Accuracy: 0.3765


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_loss,█▄▃▂▂▂▁▁▁▁
val_accuracy,▁
validation_loss,▅▂▁▁▃▄▅▆██

0,1
epoch,9.0
train_loss,0.14141
val_accuracy,0.37655
validation_loss,0.96859


[34m[1mwandb[0m: Agent Starting Run: mjoqz2eu with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	embedding_size: 64
[34m[1mwandb[0m: 	hidden_size: 512
[34m[1mwandb[0m: 	learning_rate: 0.0005
[34m[1mwandb[0m: 	num_layers: 3


 10%|█         | 1/10 [00:15<02:17, 15.32s/it]

Train Loss: 1.4876 | Valid Loss: 1.1724


 20%|██        | 2/10 [00:31<02:08, 16.00s/it]

Train Loss: 0.6705 | Valid Loss: 0.8510


 30%|███       | 3/10 [00:47<01:51, 15.93s/it]

Train Loss: 0.4595 | Valid Loss: 0.7901


 40%|████      | 4/10 [01:02<01:34, 15.68s/it]

Train Loss: 0.3668 | Valid Loss: 0.7471


 50%|█████     | 5/10 [01:18<01:17, 15.52s/it]

Train Loss: 0.3059 | Valid Loss: 0.7261


 60%|██████    | 6/10 [01:33<01:01, 15.48s/it]

Train Loss: 0.2646 | Valid Loss: 0.7244


 70%|███████   | 7/10 [01:48<00:46, 15.46s/it]

Train Loss: 0.2305 | Valid Loss: 0.7507


 80%|████████  | 8/10 [02:03<00:30, 15.25s/it]

Train Loss: 0.1985 | Valid Loss: 0.7713


 90%|█████████ | 9/10 [02:18<00:15, 15.09s/it]

Train Loss: 0.1759 | Valid Loss: 0.7740


100%|██████████| 10/10 [02:33<00:00, 15.30s/it]

Train Loss: 0.1556 | Valid Loss: 0.7803






Validation Accuracy: 0.3258


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_loss,█▄▃▂▂▂▁▁▁▁
val_accuracy,▁
validation_loss,█▃▂▁▁▁▁▂▂▂

0,1
epoch,9.0
train_loss,0.15563
val_accuracy,0.32584
validation_loss,0.78035


[34m[1mwandb[0m: Agent Starting Run: h66z2l64 with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	embedding_size: 16
[34m[1mwandb[0m: 	hidden_size: 512
[34m[1mwandb[0m: 	learning_rate: 0.0005
[34m[1mwandb[0m: 	num_layers: 2


 10%|█         | 1/10 [00:35<05:18, 35.41s/it]

Train Loss: 1.3433 | Valid Loss: 1.1000


 20%|██        | 2/10 [01:10<04:42, 35.29s/it]

Train Loss: 0.6753 | Valid Loss: 0.9311


 30%|███       | 3/10 [01:45<04:07, 35.31s/it]

Train Loss: 0.5068 | Valid Loss: 0.8754


 40%|████      | 4/10 [02:21<03:32, 35.33s/it]

Train Loss: 0.4142 | Valid Loss: 0.8225


 50%|█████     | 5/10 [02:56<02:56, 35.26s/it]

Train Loss: 0.3501 | Valid Loss: 0.8534


 60%|██████    | 6/10 [03:31<02:20, 35.15s/it]

Train Loss: 0.3096 | Valid Loss: 0.8537


 70%|███████   | 7/10 [04:06<01:45, 35.19s/it]

Train Loss: 0.2688 | Valid Loss: 0.8543


 80%|████████  | 8/10 [04:41<01:10, 35.23s/it]

Train Loss: 0.2368 | Valid Loss: 0.8830


 90%|█████████ | 9/10 [05:16<00:35, 35.13s/it]

Train Loss: 0.2127 | Valid Loss: 0.9033


100%|██████████| 10/10 [05:52<00:00, 35.22s/it]

Train Loss: 0.1932 | Valid Loss: 0.9179






Validation Accuracy: 0.3084


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_loss,█▄▃▂▂▂▁▁▁▁
val_accuracy,▁
validation_loss,█▄▂▁▂▂▂▃▃▃

0,1
epoch,9.0
train_loss,0.19318
val_accuracy,0.3084
validation_loss,0.91793


[34m[1mwandb[0m: Agent Starting Run: 8t5q5aj9 with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	embedding_size: 256
[34m[1mwandb[0m: 	hidden_size: 512
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	num_layers: 2


 10%|█         | 1/10 [00:12<01:48, 12.09s/it]

Train Loss: 0.9609 | Valid Loss: 0.9013


 20%|██        | 2/10 [00:24<01:37, 12.18s/it]

Train Loss: 0.4356 | Valid Loss: 0.8227


 30%|███       | 3/10 [00:36<01:24, 12.09s/it]

Train Loss: 0.3295 | Valid Loss: 0.7580


 40%|████      | 4/10 [00:48<01:12, 12.04s/it]

Train Loss: 0.2675 | Valid Loss: 0.7532


 50%|█████     | 5/10 [01:00<01:00, 12.11s/it]

Train Loss: 0.2216 | Valid Loss: 0.7720


 60%|██████    | 6/10 [01:12<00:48, 12.01s/it]

Train Loss: 0.1940 | Valid Loss: 0.7986


 70%|███████   | 7/10 [01:24<00:36, 12.00s/it]

Train Loss: 0.1737 | Valid Loss: 0.8267


 80%|████████  | 8/10 [01:36<00:24, 12.04s/it]

Train Loss: 0.1540 | Valid Loss: 0.8422


 90%|█████████ | 9/10 [01:48<00:11, 11.99s/it]

Train Loss: 0.1393 | Valid Loss: 0.8706


100%|██████████| 10/10 [02:00<00:00, 12.05s/it]

Train Loss: 0.1304 | Valid Loss: 0.8933






Validation Accuracy: 0.3343


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_loss,█▄▃▂▂▂▁▁▁▁
val_accuracy,▁
validation_loss,█▄▁▁▂▃▄▅▇█

0,1
epoch,9.0
train_loss,0.1304
val_accuracy,0.33433
validation_loss,0.89333


[34m[1mwandb[0m: Agent Starting Run: tvzw6uan with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	embedding_size: 256
[34m[1mwandb[0m: 	hidden_size: 512
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	num_layers: 3


 10%|█         | 1/10 [00:14<02:13, 14.78s/it]

Train Loss: 0.9803 | Valid Loss: 0.8979


 20%|██        | 2/10 [00:29<01:59, 14.95s/it]

Train Loss: 0.4158 | Valid Loss: 0.8394


 30%|███       | 3/10 [00:44<01:44, 14.98s/it]

Train Loss: 0.3273 | Valid Loss: 0.7979


 40%|████      | 4/10 [01:00<01:30, 15.14s/it]

Train Loss: 0.2647 | Valid Loss: 0.7766


 50%|█████     | 5/10 [01:15<01:15, 15.14s/it]

Train Loss: 0.2291 | Valid Loss: 0.7849


 60%|██████    | 6/10 [01:30<01:00, 15.07s/it]

Train Loss: 0.1941 | Valid Loss: 0.8000


 70%|███████   | 7/10 [01:45<00:45, 15.10s/it]

Train Loss: 0.1740 | Valid Loss: 0.7871


 80%|████████  | 8/10 [02:00<00:30, 15.22s/it]

Train Loss: 0.1613 | Valid Loss: 0.8034


 90%|█████████ | 9/10 [02:15<00:15, 15.14s/it]

Train Loss: 0.1461 | Valid Loss: 0.8616


100%|██████████| 10/10 [02:30<00:00, 15.07s/it]

Train Loss: 0.1365 | Valid Loss: 0.8908






Validation Accuracy: 0.3646


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_loss,█▃▃▂▂▁▁▁▁▁
val_accuracy,▁
validation_loss,█▅▂▁▁▂▂▃▆█

0,1
epoch,9.0
train_loss,0.13648
val_accuracy,0.36462
validation_loss,0.89084


## Evaluate Best Vanilla Model on Test Dataset

In [27]:
INPUT_SIZE = src_vocab.vocab_size
OUTPUT_SIZE = tgt_vocab.vocab_size
EMBEDDING_SIZE = 256
HIDDEN_SIZE = 512
NUM_LAYERS = 3
CELL_TYPE = "GRU"
DROPOUT = 0.2
LEARNING_RATE = 0.0005
BATCH_SIZE = 64
NUM_EPOCHS = 10


train_dataset = TransliterationDataset(TRAIN_FilePath, src_vocab, tgt_vocab)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)

dev_dataset = TransliterationDataset(DEV_FilePath, src_vocab, tgt_vocab)
dev_loader = DataLoader(dev_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)

test_dataset = TransliterationDataset(TEST_FilePath, src_vocab, tgt_vocab)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)

# Initialize encoder, decoder, and seq2seq model
encoder = Encoder(
    input_size=INPUT_SIZE,
    embedding_size=EMBEDDING_SIZE,
    hidden_size=HIDDEN_SIZE,
    num_layers=NUM_LAYERS,
    cell_type=CELL_TYPE,
    dropout=DROPOUT
)

decoder = Decoder(
    output_size=OUTPUT_SIZE,
    embedding_size=EMBEDDING_SIZE,
    hidden_size=HIDDEN_SIZE,
    num_layers=NUM_LAYERS,
    cell_type=CELL_TYPE,
    dropout=DROPOUT
)

model = Seq2Seq(encoder, decoder, device).to(device)
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

for epoch in tqdm(range(NUM_EPOCHS)):
    # Train model
    train_loss = train(model, train_loader, optimizer, criterion)
    # Evaluate model
    valid_loss = evaluate(model, dev_loader, criterion)

    print(f"Train Loss: {train_loss:.4f} | Valid Loss: {valid_loss:.4f}")

 10%|█         | 1/10 [00:24<03:38, 24.26s/it]

Train Loss: 1.5922 | Valid Loss: 1.1444


 20%|██        | 2/10 [00:48<03:15, 24.42s/it]

Train Loss: 0.7158 | Valid Loss: 1.0068


 30%|███       | 3/10 [01:13<02:51, 24.43s/it]

Train Loss: 0.5420 | Valid Loss: 1.0317


 40%|████      | 4/10 [01:38<02:27, 24.57s/it]

Train Loss: 0.4400 | Valid Loss: 1.0014


 50%|█████     | 5/10 [02:02<02:02, 24.60s/it]

Train Loss: 0.3637 | Valid Loss: 1.0284


 60%|██████    | 6/10 [02:27<01:38, 24.70s/it]

Train Loss: 0.3066 | Valid Loss: 1.0625


 70%|███████   | 7/10 [02:53<01:15, 25.13s/it]

Train Loss: 0.2666 | Valid Loss: 1.0582


 80%|████████  | 8/10 [03:19<00:50, 25.36s/it]

Train Loss: 0.2321 | Valid Loss: 1.1479


 90%|█████████ | 9/10 [03:45<00:25, 25.60s/it]

Train Loss: 0.2074 | Valid Loss: 1.1636


100%|██████████| 10/10 [04:11<00:00, 25.14s/it]

Train Loss: 0.1855 | Valid Loss: 1.2250





In [28]:
train_accuracy = calculate_accuracy(model, train_loader, src_vocab, tgt_vocab, device)
val_accuracy = calculate_accuracy(model, dev_loader, src_vocab, tgt_vocab, device)
test_accuracy = calculate_accuracy(model, test_loader, src_vocab, tgt_vocab, device)

print(f"Train Accuracy : {train_accuracy*100:6.2f}%")
print(f"Val Accuracy   : {val_accuracy*100:6.2f}%")
print(f"Test Accuracy  : {test_accuracy*100:6.2f}%")

Train Accuracy :  83.37%
Val Accuracy   :  38.96%
Test Accuracy  :  39.27%


In [15]:
model.eval()
latin = []
correct_native = []
predicted_native = []

with torch.no_grad():
    for src, tgt in test_loader:
        src = src.to(device)
        tgt = tgt.to(device)

        batch_size = src.shape[0]

        for i in range(batch_size):
            # Get source text and actual target text
            src_indices = src[i].tolist()
            src_text = src_vocab.decode(src_indices)
            actual_tgt_text = tgt_vocab.decode(tgt[i].tolist())

            # Get predicted transliteration
            predicted_tgt_text = transliterate(model, src_text, src_vocab, tgt_vocab, device)

            latin.append(src_text)
            correct_native.append(actual_tgt_text)
            predicted_native.append(predicted_tgt_text)

os.makedirs("predictions_vanilla", exist_ok=True)
df = pd.DataFrame({
    "Latin": latin,
    "Correct Native": correct_native,
    "Predicted Native": predicted_native
})
df["Correct"] = df["Correct Native"] == df["Predicted Native"]

excel_path = "predictions_vanilla/test_predictions.xlsx"
df.to_excel(excel_path, index=False)

wandb.init(project="cs24m022_da6401_assignment3", name="vanilla_seq2seq_run")

samples = df.sample(n=10)
wandb_table = wandb.Table(columns=["Latin", "Correct Native", "Predicted Native", "Correct"])

for _, row in samples.iterrows():
    wandb_table.add_data(row["Latin"], row["Correct Native"], row["Predicted Native"], str(row["Correct"]))

wandb.log({"Random Test Predictions Sample": wandb_table})


NameError: name 'model' is not defined

## SEQ2SEQ Model with Attention

In [14]:
class Encoder(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size, num_layers=1, cell_type="RNN", dropout=0.0):
        super().__init__()

        if num_layers == 1:
            dropout = 0

        self.cell_type = cell_type

        # Embedding Layer
        self.embedding = nn.Embedding(input_size, embedding_size)

        # Recurrent Layer
        if cell_type == "LSTM":
            self.recurrent_layer = nn.LSTM(embedding_size, hidden_size, num_layers, dropout=dropout, batch_first=True)
        elif cell_type == "GRU":
            self.recurrent_layer = nn.GRU(embedding_size, hidden_size, num_layers, dropout=dropout, batch_first=True)
        else: # Default (RNN)
            self.recurrent_layer = nn.RNN(embedding_size, hidden_size, num_layers, dropout=dropout, batch_first=True)

        # Dropout layer
        self.dropout = nn.Dropout(dropout)

    def forward(self, input):
        # input: batch_size x seq_len

        embeddings = self.embedding(input)
        embeddings = self.dropout(embeddings)

        if self.cell_type == 'LSTM':
            outputs, (hidden, cell) = self.recurrent_layer(embeddings)
            return outputs, (hidden, cell)
        else:
            outputs, hidden = self.recurrent_layer(embeddings)
            return outputs, hidden

In [15]:
class Attention(nn.Module):
    def __init__(self, hidden_size):
        super().__init__()

        # Attention parameters for: e_jt = Vatt^T tanh(Uatt * s_t-1 + Watt * c_j)
        self.Uatt = nn.Linear(hidden_size, hidden_size)
        self.Watt = nn.Linear(hidden_size, hidden_size)
        self.Vatt = nn.Linear(hidden_size, 1)

    def forward(self, hidden, encoder_outputs):

        batch_size = encoder_outputs.shape[0]
        src_len = encoder_outputs.shape[1]

        if isinstance(hidden, tuple):  # LSTM
            hidden_state = hidden[0]
            hidden_state = hidden_state[-1]
        else:
            hidden_state = hidden[-1]

        hidden_expanded = hidden_state.unsqueeze(1)

        # Uatt * s_t-1
        uatt_term = self.Uatt(hidden_expanded)

        # Watt * c_j
        watt_term = self.Watt(encoder_outputs)


        combined = torch.tanh(uatt_term + watt_term)

        # Apply Vatt^T
        energy = self.Vatt(combined)
        energy = energy.squeeze(2)

        attention_weights = F.softmax(energy, dim=1)

        # Create context vector
        context = torch.bmm(attention_weights.unsqueeze(1), encoder_outputs)
        context = context.squeeze(1)

        return context, attention_weights

In [16]:
class AttentionDecoder(nn.Module):
    def __init__(self, output_size, embedding_size, hidden_size, attention, num_layers=1, cell_type="RNN", dropout=0.0):
        super().__init__()

        if num_layers == 1:
            dropout = 0

        self.output_size = output_size
        self.cell_type = cell_type
        self.attention = attention
        self.hidden_size = hidden_size

        # Embedding Layer
        self.embedding = nn.Embedding(output_size, embedding_size)

        # Dropout layer
        self.dropout = nn.Dropout(dropout)

        if cell_type == "LSTM":
            self.recurrent_layer = nn.LSTM(embedding_size + hidden_size, hidden_size, num_layers,
                                         dropout=dropout, batch_first=True)
        elif cell_type == "GRU":
            self.recurrent_layer = nn.GRU(embedding_size + hidden_size, hidden_size, num_layers,
                                        dropout=dropout, batch_first=True)
        else: # Default (RNN)
            self.recurrent_layer = nn.RNN(embedding_size + hidden_size, hidden_size, num_layers,
                                        dropout=dropout, batch_first=True)

        # Output layer
        self.fc_out = nn.Linear(hidden_size * 2, output_size)

    def forward(self, input, hidden, encoder_outputs):

        input = input.unsqueeze(1)
        embeddings = self.embedding(input)
        embeddings = self.dropout(embeddings)

        # Calculate attention context vector
        context, attention_weights = self.attention(hidden, encoder_outputs)

        # Concatenate embeddings and context vector
        context = context.unsqueeze(1)
        rnn_input = torch.cat((embeddings, context), dim=2)

        if self.cell_type == "LSTM":
            outputs, (hidden, cell) = self.recurrent_layer(rnn_input, hidden)
            hidden_state = hidden
            hidden_tuple = (hidden, cell)
        else:
            outputs, hidden = self.recurrent_layer(rnn_input, hidden)
            hidden_state = hidden
            hidden_tuple = hidden

        if isinstance(hidden_state, tuple):  # LSTM
            last_hidden = hidden_state[0][-1]
        else:
            last_hidden = hidden_state[-1]

        last_hidden = last_hidden.squeeze(0) if last_hidden.dim() > 2 else last_hidden

        outputs = outputs.squeeze(1)
        context = context.squeeze(1)

        output_vector = torch.cat((outputs, context), dim=1)
        prediction = self.fc_out(output_vector)

        return prediction, hidden_tuple, attention_weights

In [17]:
class Seq2SeqWithAttention(nn.Module):
    def __init__(self, encoder, decoder, attention, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.attention = attention
        self.device = device

    def forward(self, src, tgt, teacher_forcing_ratio=0.5):
        # src: [batch_size, src_len]
        # tgt: [batch_size, tgt_len]

        batch_size = src.shape[0]
        tgt_len = tgt.shape[1]
        tgt_vocab_size = self.decoder.output_size

        # Tensor to store decoder outputs
        outputs = torch.zeros(batch_size, tgt_len, tgt_vocab_size).to(self.device)

        # Store attention weights for visualization
        attentions = torch.zeros(batch_size, tgt_len, src.shape[1]).to(self.device)

        # Encode the source sequence
        if self.encoder.cell_type == 'LSTM':
            encoder_outputs, (hidden, cell) = self.encoder(src)
            decoder_hidden = (hidden, cell)
        else:
            encoder_outputs, hidden = self.encoder(src)
            decoder_hidden = hidden

        # First input to the decoder is the <sos> token
        decoder_input = tgt[:, 0]

        # Start decoding
        for t in range(1, tgt_len):
            # Decode
            decoder_output, decoder_hidden, attention_weights = self.decoder(
                decoder_input, decoder_hidden, encoder_outputs
            )

            # Store decoder output and attention
            outputs[:, t] = decoder_output
            attentions[:, t] = attention_weights

            # Teacher forcing: use ground truth or predicted token as next input
            teacher_force = random.random() < teacher_forcing_ratio
            top = decoder_output.argmax(1)
            decoder_input = tgt[:, t] if teacher_force else top

        return outputs, attentions

    def inference(self, src, max_len, sos_idx=1, eos_idx=2):
        # src: [batch_size, src_len]
        batch_size = src.shape[0]

        # Tensor to store decoder outputs
        outputs = torch.zeros(batch_size, max_len, self.decoder.output_size).to(self.device)

        # Store attention weights for visualization
        attentions = torch.zeros(batch_size, max_len, src.shape[1]).to(self.device)

        # Encode the source sequence
        if self.encoder.cell_type == 'LSTM':
            encoder_outputs, (hidden, cell) = self.encoder(src)
            decoder_hidden = (hidden, cell)
        else:
            encoder_outputs, hidden = self.encoder(src)
            decoder_hidden = hidden

        # First input to the decoder is the <sos> token
        decoder_input = torch.tensor([sos_idx] * batch_size, device=self.device)

        # Start decoding
        for t in range(1, max_len):
            # Decode
            decoder_output, decoder_hidden, attention_weights = self.decoder(
                decoder_input, decoder_hidden, encoder_outputs
            )

            # Store decoder output and attention
            outputs[:, t] = decoder_output
            attentions[:, t] = attention_weights

            # Use predicted token as next input
            top = decoder_output.argmax(1)
            decoder_input = top

            # Check if all sequences have reached <eos>
            if (top == eos_idx).all():
                break

        return outputs, attentions


## Training and Evaluation of Attention Seq2Seq Model

In [18]:
def train_attention_model(model, train_loader, optimizer, criterion, clip=1.0, teacher_forcing_ratio=0.5):
    model.train()
    epoch_loss = 0

    for i, (src, tgt) in enumerate(train_loader):
        src = src.to(device)
        tgt = tgt.to(device)
        #print("tgt", tgt)
        optimizer.zero_grad()

        output, _ = model(src, tgt, teacher_forcing_ratio)
        #print("output", output)

        # Reshape output and target for loss calculation
        # output: [batch_size, tgt_len, output_dim]
        # tgt: [batch_size, tgt_len]
        output_dim = output.shape[-1]
        output = output[:, 1:].reshape(-1, output_dim)  # Remove first token (SOS)
        tgt = tgt[:, 1:].reshape(-1)  # Remove first token (SOS)
        #print("tgt2", tgt)
        #print("output2", output)
        #break
        # Calculate loss
        loss = criterion(output, tgt)

        # Backpropagation
        loss.backward()

        # Gradient clipping
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

        # Update parameters
        optimizer.step()

        epoch_loss += loss.item()

    return epoch_loss / len(train_loader)


In [19]:
def evaluate_attention_model(model, val_loader, criterion):
    model.eval()
    epoch_loss = 0

    with torch.no_grad():
        for i, (src, tgt) in enumerate(val_loader):
            src = src.to(device)
            tgt = tgt.to(device)

            output, _ = model.inference(src, tgt.shape[1])

            # Reshape output and target for loss calculation
            output_dim = output.shape[-1]
            output = output[:, 1:].reshape(-1, output_dim)
            tgt = tgt[:, 1:].reshape(-1)

            # Calculate loss
            loss = criterion(output, tgt)
            epoch_loss += loss.item()

    return epoch_loss / len(val_loader)

In [20]:
def transliterate_attention_model(model, src_text, src_vocab, tgt_vocab, device, max_length=100):
    model.eval()

    # Convert source text to tensor
    src_indices = src_vocab.encode(src_text)
    src_tensor = torch.tensor(src_indices, dtype=torch.long).unsqueeze(0).to(device)

    # Get encoder outputs
    with torch.no_grad():
        if model.encoder.cell_type == 'LSTM':
            encoder_outputs, (hidden, cell) = model.encoder(src_tensor)
            decoder_hidden = (hidden, cell)
        else:
            encoder_outputs, hidden = model.encoder(src_tensor)
            decoder_hidden = hidden

    # Start with SOS token

    decoder_input = torch.tensor([tgt_vocab.char2idx[tgt_vocab.sos_token]], device=device)

    result_indices = [tgt_vocab.char2idx[tgt_vocab.sos_token]]

    for _ in range(max_length):
        with torch.no_grad():
            decoder_output, decoder_hidden, attention_weights = model.decoder(decoder_input, decoder_hidden, encoder_outputs)

        # Get the most likely next character
        top_token = decoder_output.argmax(1).item()
        result_indices.append(top_token)

        # Stop if EOS token
        if top_token == tgt_vocab.char2idx[tgt_vocab.eos_token]:
            break

        # Use predicted token as next input
        decoder_input = torch.tensor([top_token], device=device)

    # Convert indices to text
    result_text = tgt_vocab.decode(result_indices, remove_special_tokens=True)

    return result_text

In [21]:
# Accuracy calculation function
def calculate_accuracy_attention_model(model, data_loader, src_vocab, tgt_vocab, device):
    model.eval()
    correct = 0
    total = 0

    with torch.no_grad():
        for src, tgt in data_loader:
            src = src.to(device)
            tgt = tgt.to(device)

            batch_size = src.shape[0]

            for i in range(batch_size):
                # Get source text and actual target text
                src_indices = src[i].tolist()
                src_text = src_vocab.decode(src_indices)
                actual_tgt_text = tgt_vocab.decode(tgt[i].tolist())

                # Get predicted transliteration
                predicted_tgt_text = transliterate_attention_model(model, src_text, src_vocab, tgt_vocab, device)

                # Check if prediction matches
                if predicted_tgt_text == actual_tgt_text:
                    correct += 1
                total += 1

    return correct / total

## Sample Run of Attention Seq2Seq Model

In [20]:
# INPUT_SIZE = src_vocab.vocab_size
# OUTPUT_SIZE = tgt_vocab.vocab_size
# EMBEDDING_SIZE = 256
# HIDDEN_SIZE = 512
# NUM_LAYERS = 2
# CELL_TYPE = "LSTM"
# DROPOUT = 0.2
# LEARNING_RATE = 0.001
# NUM_EPOCHS = 10

# # Initialize encoder, attention, decoder, and seq2seqwithattention model
# encoder = Encoder(
#     input_size=INPUT_SIZE,
#     embedding_size=EMBEDDING_SIZE,
#     hidden_size=HIDDEN_SIZE,
#     num_layers=NUM_LAYERS,
#     cell_type=CELL_TYPE,
#     dropout=DROPOUT
# )

# attention = Attention(HIDDEN_SIZE)

# decoder = AttentionDecoder(
#     output_size=OUTPUT_SIZE,
#     embedding_size=EMBEDDING_SIZE,
#     hidden_size=HIDDEN_SIZE,
#     attention=attention,
#     num_layers=NUM_LAYERS,
#     cell_type=CELL_TYPE,
#     dropout=DROPOUT
# )

# model = Seq2SeqWithAttention(encoder, decoder, attention, device).to(device)
# criterion = nn.CrossEntropyLoss(ignore_index=0)
# optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [21]:
# # Training loop
# print(f"Starting training for {NUM_EPOCHS} epochs...")

# for epoch in range(NUM_EPOCHS):
#     print(f"Epoch {epoch+1}/{NUM_EPOCHS}")

#     # Train model
#     train_loss = train_attention_model(model, train_loader, optimizer, criterion)

#     # Evaluate model
#     valid_loss = evaluate_attention_model(model, dev_loader, criterion)

#     print(f"Train Loss: {train_loss:.4f} | Valid Loss: {valid_loss:.4f}")
#     print("="*20)

Starting training for 10 epochs...
Epoch 1/10
Train Loss: 1.0431 | Valid Loss: 1.0556
Epoch 2/10
Train Loss: 0.6093 | Valid Loss: 0.9648
Epoch 3/10
Train Loss: 0.5000 | Valid Loss: 0.9998
Epoch 4/10
Train Loss: 0.4313 | Valid Loss: 0.9416
Epoch 5/10
Train Loss: 0.3761 | Valid Loss: 1.0126
Epoch 6/10
Train Loss: 0.3315 | Valid Loss: 0.9924
Epoch 7/10
Train Loss: 0.2904 | Valid Loss: 1.0180
Epoch 8/10
Train Loss: 0.2647 | Valid Loss: 1.0341
Epoch 9/10
Train Loss: 0.2411 | Valid Loss: 1.0818
Epoch 10/10
Train Loss: 0.2176 | Valid Loss: 1.1194


In [22]:
# val_accuracy = calculate_accuracy_attention_model(model, dev_loader, src_vocab, tgt_vocab, device)
# print(f"\nValidation Accuracy: {val_accuracy:.4f}")


Validation Accuracy: 0.3791


## Hyperparameter Sweep of Attention Seq2Seq Model

In [22]:
def sweep_hyperparameters(config=None):
    with wandb.init(config=config):
        config = wandb.config
        wandb.run.name = f"embedding_size_{str(config.embedding_size)}_num_layers_{str(config.num_layers)}_hidden_size_{config.hidden_size} \
                          _cell_type_{config.cell_type}_dp_{config.dropout}_lr_{config.learning_rate}_batch_{config.batch_size}"

        # Log in my details
        wandb.config.update({"NAME": "KILAPARTHI VISHNU VARDHAN", "ROLL NO.": "CS24M022"})

        train_dataset = TransliterationDataset(TRAIN_FilePath, src_vocab, tgt_vocab)
        train_loader = DataLoader(train_dataset, batch_size=config.batch_size, shuffle=True, collate_fn=collate_fn)

        dev_dataset = TransliterationDataset(DEV_FilePath, src_vocab, tgt_vocab)
        dev_loader = DataLoader(dev_dataset, batch_size=config.batch_size, shuffle=False, collate_fn=collate_fn)

        INPUT_SIZE = src_vocab.vocab_size
        OUTPUT_SIZE = tgt_vocab.vocab_size

        EMBEDDING_SIZE = config.embedding_size
        HIDDEN_SIZE = config.hidden_size
        NUM_LAYERS = config.num_layers
        CELL_TYPE = config.cell_type
        DROPOUT = config.dropout
        LEARNING_RATE = config.learning_rate
        NUM_EPOCHS = 10

        # Initialize encoder, decoder, and seq2seq model
        encoder = Encoder(
            input_size=INPUT_SIZE,
            embedding_size=EMBEDDING_SIZE,
            hidden_size=HIDDEN_SIZE,
            num_layers=NUM_LAYERS,
            cell_type=CELL_TYPE,
            dropout=DROPOUT
        )

        attention = Attention(HIDDEN_SIZE)

        decoder = AttentionDecoder(
            output_size=OUTPUT_SIZE,
            embedding_size=EMBEDDING_SIZE,
            hidden_size=HIDDEN_SIZE,
            attention=attention,
            num_layers=NUM_LAYERS,
            cell_type=CELL_TYPE,
            dropout=DROPOUT
        )

        model = Seq2SeqWithAttention(encoder, decoder, attention, device).to(device)
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

        for epoch in tqdm(range(NUM_EPOCHS)):
            # Train model
            train_loss = train_attention_model(model, train_loader, optimizer, criterion)
            # Evaluate model
            valid_loss = evaluate_attention_model(model, dev_loader, criterion)

            print(f"Train Loss: {train_loss:.4f} | Valid Loss: {valid_loss:.4f}")
            # Log the evaluation metrics
            wandb.log({
                "epoch": epoch,
                "train_loss": train_loss,
                "validation_loss": valid_loss,
                })

        # Calculate accuracy on validation set
        val_accuracy = calculate_accuracy_attention_model(model, dev_loader, src_vocab, tgt_vocab, device)
        print(f"\nValidation Accuracy: {val_accuracy:.4f}")
        wandb.log({
            "val_accuracy": val_accuracy
        })

In [23]:
sweep_config = {
    "method" : "bayes",
    "metric" : {"name": "val_accuracy", "goal": "maximize"},
    "parameters" : {
        "embedding_size" : {"values" : [16, 32, 64, 256]},
        "num_layers" : {"values" : [2, 3]},
        "hidden_size" : {"values" : [32, 128, 512]},
        "cell_type" : {"values" : ["RNN", "LSTM", "GRU"]},
        "dropout" : {"values" : [ 0.2, 0.3]},
        "learning_rate" : {"values" : [0.001, 0.0005]},
        "batch_size": {"values": [32, 64, 128]}
    }
}

# sweep_config = {
#     "method" : "bayes",
#     "metric" : {"name": "val_accuracy", "goal": "maximize"},
#     "parameters" : {
#         "embedding_size" : {"values" : [32]},
#         "num_layers" : {"values" : [1]},
#         "hidden_size" : {"values" : [512, 2048]},
#         "cell_type" : {"values" : [ "LSTM", "GRU"]},
#         "dropout" : {"values" : [ 0.4]},
#         "learning_rate" : {"values" : [ 0.0005]},
#         "batch_size": {"values": [32, 64]}
#     }
# }


sweep_id = wandb.sweep(sweep_config, project = "cs24m022_DA6401_Assignment3")

Create sweep with ID: tk2okbfr
Sweep URL: https://wandb.ai/cs24m022-iit-madras-foundation/cs24m022_DA6401_Assignment3/sweeps/tk2okbfr


In [24]:
sweep_id = 'ukpef1o2'
wandb.agent('ukpef1o2', function = sweep_hyperparameters, count = 40)

[34m[1mwandb[0m: Agent Starting Run: nc08qgtb with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	cell_type: RNN
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	embedding_size: 256
[34m[1mwandb[0m: 	hidden_size: 512
[34m[1mwandb[0m: 	learning_rate: 0.0005
[34m[1mwandb[0m: 	num_layers: 2
[34m[1mwandb[0m: Ctrl + C detected. Stopping sweep.


## NOTE:- Sweep output for attention model is not presented as it was done in separate file on kaggle

## Evaluate Best Attention Seq2Seq Model on Test Dataset

In [25]:
# Best Hyperparameters
INPUT_SIZE = src_vocab.vocab_size
OUTPUT_SIZE = tgt_vocab.vocab_size
EMBEDDING_SIZE = 64
HIDDEN_SIZE = 512
NUM_LAYERS = 3
CELL_TYPE = "GRU"
DROPOUT = 0.4
LEARNING_RATE = 0.0005
BATCH_SIZE = 64
NUM_EPOCHS = 10

# Load train, dev and test dataloaders
train_dataset = TransliterationDataset(TRAIN_FilePath, src_vocab, tgt_vocab)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)

dev_dataset = TransliterationDataset(DEV_FilePath, src_vocab, tgt_vocab)
dev_loader = DataLoader(dev_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)

test_dataset = TransliterationDataset(TEST_FilePath, src_vocab, tgt_vocab)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)


# Initialize encoder, decoder, and seq2seq model
encoder = Encoder(
    input_size=INPUT_SIZE,
    embedding_size=EMBEDDING_SIZE,
    hidden_size=HIDDEN_SIZE,
    num_layers=NUM_LAYERS,
    cell_type=CELL_TYPE,
    dropout=DROPOUT
)

attention = Attention(HIDDEN_SIZE)

decoder = AttentionDecoder(
    output_size=OUTPUT_SIZE,
    embedding_size=EMBEDDING_SIZE,
    hidden_size=HIDDEN_SIZE,
    attention=attention,
    num_layers=NUM_LAYERS,
    cell_type=CELL_TYPE,
    dropout=DROPOUT
)

model = Seq2SeqWithAttention(encoder, decoder, attention, device).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

for epoch in tqdm(range(NUM_EPOCHS)):
    # Train model
    train_loss = train_attention_model(model, train_loader, optimizer, criterion)
    # Evaluate model
    valid_loss = evaluate_attention_model(model, dev_loader, criterion)

    print(f"Train Loss: {train_loss:.4f} | Valid Loss: {valid_loss:.4f}")


  0%|          | 0/10 [00:00<?, ?it/s][A
 10%|█         | 1/10 [00:42<06:25, 42.81s/it][A

Train Loss: 0.7711 | Valid Loss: 0.8770


 10%|█         | 1/10 [01:23<12:27, 83.07s/it]

Train Loss: 1.0691 | Valid Loss: 0.8941



 20%|██        | 2/10 [01:24<05:37, 42.24s/it][A

Train Loss: 0.4395 | Valid Loss: 0.7966



 30%|███       | 3/10 [02:06<04:53, 41.94s/it][A

Train Loss: 0.3745 | Valid Loss: 0.7759


 20%|██        | 2/10 [02:45<11:00, 82.58s/it]

Train Loss: 0.5171 | Valid Loss: 0.7727



 40%|████      | 4/10 [02:49<04:14, 42.43s/it][A

Train Loss: 0.3398 | Valid Loss: 0.7597



 50%|█████     | 5/10 [03:32<03:32, 42.50s/it][A

Train Loss: 0.3128 | Valid Loss: 0.7437


 30%|███       | 3/10 [04:08<09:41, 83.03s/it]

Train Loss: 0.4156 | Valid Loss: 0.7493



 60%|██████    | 6/10 [04:15<02:50, 42.69s/it][A

Train Loss: 0.2900 | Valid Loss: 0.7429



 70%|███████   | 7/10 [04:57<02:07, 42.60s/it][A

Train Loss: 0.2768 | Valid Loss: 0.7405


 40%|████      | 4/10 [05:32<08:18, 83.11s/it]

Train Loss: 0.3643 | Valid Loss: 0.7196



 80%|████████  | 8/10 [05:40<01:25, 42.83s/it][A

Train Loss: 0.2624 | Valid Loss: 0.7820



 90%|█████████ | 9/10 [06:23<00:42, 42.71s/it][A

Train Loss: 0.2422 | Valid Loss: 0.7418


 50%|█████     | 5/10 [06:55<06:55, 83.17s/it]

Train Loss: 0.3221 | Valid Loss: 0.7507



100%|██████████| 10/10 [07:05<00:00, 42.56s/it][A

Train Loss: 0.2379 | Valid Loss: 0.7403



 60%|██████    | 6/10 [07:55<05:00, 75.24s/it]

Train Loss: 0.2940 | Valid Loss: 0.7195

Validation Accuracy: 0.3387


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_loss,█▄▃▂▂▂▂▁▁▁
val_accuracy,▁
validation_loss,█▄▃▂▁▁▁▃▁▁

0,1
epoch,9.0
train_loss,0.23786
val_accuracy,0.33869
validation_loss,0.74025


 70%|███████   | 7/10 [08:35<03:11, 63.74s/it]

Train Loss: 0.2664 | Valid Loss: 0.7180


 80%|████████  | 8/10 [09:14<01:51, 55.87s/it]

Train Loss: 0.2420 | Valid Loss: 0.7353


 90%|█████████ | 9/10 [09:53<00:50, 50.66s/it]

Train Loss: 0.2267 | Valid Loss: 0.7429


100%|██████████| 10/10 [10:32<00:00, 63.22s/it]

Train Loss: 0.2143 | Valid Loss: 0.7443





In [26]:
train_accuracy = calculate_accuracy_attention_model(model, train_loader, src_vocab, tgt_vocab, device)
val_accuracy = calculate_accuracy_attention_model(model, dev_loader, src_vocab, tgt_vocab, device)
test_accuracy = calculate_accuracy_attention_model(model, test_loader, src_vocab, tgt_vocab, device)

print(f"Train Accuracy : {train_accuracy*100:6.2f}%")
print(f"Val Accuracy   : {val_accuracy*100:6.2f}%")
print(f"Test Accuracy  : {test_accuracy*100:6.2f}%")

KeyboardInterrupt: 

In [35]:
model.eval()
latin = []
correct_native = []
predicted_native = []

with torch.no_grad():
    for src, tgt in test_loader:
        src = src.to(device)
        tgt = tgt.to(device)

        batch_size = src.shape[0]

        for i in range(batch_size):
            # Get source text and actual target text
            src_indices = src[i].tolist()
            src_text = src_vocab.decode(src_indices)
            actual_tgt_text = tgt_vocab.decode(tgt[i].tolist())

            # Get predicted transliteration
            predicted_tgt_text = transliterate_attention_model(model, src_text, src_vocab, tgt_vocab, device)

            latin.append(src_text)
            correct_native.append(actual_tgt_text)
            predicted_native.append(predicted_tgt_text)

os.makedirs("predictions_attention", exist_ok=True)
df = pd.DataFrame({
    "Latin": latin,
    "Correct Native": correct_native,
    "Predicted Native": predicted_native
})
df["Correct"] = df["Correct Native"] == df["Predicted Native"]

excel_path = "predictions_attention/test_predictions.xlsx"
df.to_excel(excel_path, index=False)

wandb.init(project="cs24m022_da6401_assignment3", name="attention_seq2seq_run")

samples = df.sample(n=10)
wandb_table = wandb.Table(columns=["Latin", "Correct Native", "Predicted Native", "Correct"])

for _, row in samples.iterrows():
    wandb_table.add_data(row["Latin"], row["Correct Native"], row["Predicted Native"], str(row["Correct"]))

wandb.log({"Random Test Predictions Sample": wandb_table})


In [27]:
from google.colab import files
import requests

In [28]:
!mkdir -p ~/.fonts
font_url = "https://github.com/google/fonts/raw/main/ofl/notosansdevanagari/NotoSansDevanagari%5Bwdth%2Cwght%5D.ttf"
font_path = os.path.expanduser("~/.fonts/NotoSansDevanagari.ttf")

r = requests.get(font_url)
with open(font_path, 'wb') as f:
    f.write(r.content)

!rm -rf ~/.cache/matplotlib
fm.fontManager.addfont(font_path)
hindi_font = fm.FontProperties(fname=font_path)

In [29]:
def get_attention_weights(model, src_text, src_vocab, tgt_vocab, device, max_length=100):

    model.eval()
    src_indices = src_vocab.encode(src_text)
    src_tensor = torch.tensor(src_indices, dtype=torch.long).unsqueeze(0).to(device)

    # Get encoder outputs
    with torch.no_grad():
        if model.encoder.cell_type == 'LSTM':
            encoder_outputs, (hidden, cell) = model.encoder(src_tensor)
            decoder_hidden = (hidden, cell)
        else:
            encoder_outputs, hidden = model.encoder(src_tensor)
            decoder_hidden = hidden

    # Start with SOS token
    decoder_input = torch.tensor([tgt_vocab.char2idx[tgt_vocab.sos_token]], device=device)
    predicted = []
    attention_weights_list = []

    for _ in range(max_length):
        with torch.no_grad():
            decoder_output, decoder_hidden, attention_weights = model.decoder(decoder_input, decoder_hidden, encoder_outputs)

        attention_weights = attention_weights.squeeze(0).cpu().numpy()
        #print(len(src_indices), len(attention_weights))

        attention_weights_list.append(attention_weights)
        top_token = decoder_output.argmax(1).item()
        predicted.append(top_token)

        if top_token == tgt_vocab.char2idx[tgt_vocab.eos_token]:
            break

        decoder_input = torch.tensor([top_token], device=device)
    return attention_weights_list, predicted

In [30]:
def plot_attention_heatmap(model, dataloader, src_vocab, tgt_vocab, device, num_samples=10):

    model.eval()

    # Define plot
    num_rows = math.ceil(num_samples / 3)
    num_cols = min(num_samples, 3)

    plt.figure(figsize=(30, 10 * num_rows))

    # Get sample
    data = []
    for batch in dataloader:
        src, tgt = batch
        for i in range(len(src)):
            data.append((src[i], tgt[i]))
    samples = random.sample(data, min(len(data), num_samples))

    # Colormap
    colors = [(1, 1, 1), (0, 0, 1)]
    cmap = LinearSegmentedColormap.from_list("GBlue", colors, N=100)

    for idx, (src_tensor, tgt_tensor) in enumerate(samples):
        src_text = src_vocab.decode(src_tensor.tolist(), remove_special_tokens=True)
        tgt_text = tgt_vocab.decode(tgt_tensor.tolist(), remove_special_tokens=True)

        ax = plt.subplot(num_rows, num_cols, idx + 1)

        src_indices = src_vocab.encode(src_text)
        attention_weights_list, tgt_predictions_tokens = get_attention_weights(model, src_text, src_vocab, tgt_vocab, device)
        attention_mtx = np.array(attention_weights_list)

        src_chars = [src_vocab.idx2char[i] for i in src_indices]
        tgt_pred_chars = [tgt_vocab.idx2char[i] for i in tgt_predictions_tokens]

        im = ax.imshow(attention_mtx, cmap=cmap)

        # Set tick labels
        ax.set_xticks(np.arange(len(src_chars)))
        ax.set_yticks(np.arange(len(tgt_pred_chars)))
        ax.set_xticklabels(src_chars, fontproperties=hindi_font, fontsize=20)
        ax.set_yticklabels(tgt_pred_chars, fontproperties=hindi_font, fontsize=20)
        ax.set_title(f"Sample {idx+1}: '{src_text}' → '{tgt_text}' (pred: '{''.join(tgt_pred_chars)}')",fontproperties=hindi_font, fontsize=20)

    plt.tight_layout()
    plt.colorbar(im, ax=plt.gcf().get_axes())
    wandb.init(project="cs24m022_da6401_assignment3", name="Attention_Heatmap")
    wandb.log({"Attention Heat Maps": plt})
    plt.show()


In [None]:
plot_attention_heatmap(model, test_loader, src_vocab, tgt_vocab, device, num_samples=10)

  plt.tight_layout()


  util.ensure_matplotlib_figure(data).savefig(buf, format=self.format)
  fig.canvas.print_figure(bytes_io, **kw)


In [41]:
# ───────────────────────────────────────────────────────────────
# Final Cell: Sample Words → Attention Viz → Log 3‑Column Table
# ───────────────────────────────────────────────────────────────
import os, json, random, torch, wandb
from IPython.display import HTML

# 1) Manual list of Latin words
sampled = ["Damn", "world", "need", "model", "example"]
print("Sampled words:", sampled)

# 2) Inline HTML generator
def create_interactive_connectivity(attn_matrix, input_seq, output_seq, filename="attention.html"):
    html_template = """<!DOCTYPE html><html><head><meta charset="UTF-8"><title>Attention</title><style>
 body{font-family:'Arial Unicode MS','Noto Sans Devanagari',sans-serif;margin:20px;text-align:center;}
 .container{display:inline-block;}
 .controls{margin:12px;}
 .slider{width:180px;}
 .output-chars,.input-chars{display:flex;justify-content:center;margin:10px;}
 .char{padding:6px 10px;margin:3px;font-size:18px;position:relative;cursor:pointer;min-width:24px;transition:all .2s;}
 .output-char{background:#f0f0f0;border-radius:4px;}
 .input-char{background:#e0e0e0;border-radius:4px;}
 .selected{background:#4caf50;color:#fff;font-weight:bold;box-shadow:0 0 6px rgba(76,175,80,.5);}
 .highlighted{background:rgba(76,175,80,.3);transform:scale(1.1);}
 .connection-line{position:absolute;background:rgba(0,200,0,.5);height:3px;transform-origin:left center;z-index:-1;pointer-events:none;}
</style></head><body>
 <div class="container">
  <div class="controls">
    Threshold:
    <input id="thr" type="range" min="0" max="100" value="30" class="slider">
    <span id="tv">0.30</span>
  </div>
  <div class="output-chars" id="outs"></div>
  <div class="input-chars" id="ins"></div>
 </div>
 <script>
  const A={attn_matrix},IN={input_seq},OUT={output_seq};
  let cur=0,thr=0.3;
  function init(){renderOut();renderIn();draw();
    document.getElementById('thr').oninput=e=>{thr=e.target.value/100;document.getElementById('tv').textContent=thr.toFixed(2);draw();};
    window.onresize=draw;
  }
  function renderOut(){let c=document.getElementById('outs');c.innerHTML='';
    OUT.forEach((ch,i)=>{let d=document.createElement('div');
      d.className=`char output-char ${i===cur?'selected':''}`;d.textContent=ch;d.dataset.i=i;
      d.onmouseover=d.onclick=()=>{cur=i;renderOut();draw();};c.appendChild(d);
    });
  }
  function renderIn(){let c=document.getElementById('ins');c.innerHTML='';
    IN.forEach((ch,i)=>{let d=document.createElement('div');
      d.className='char input-char';d.textContent=ch;d.dataset.i=i;c.appendChild(d);
    });
  }
  function draw(){
    document.querySelectorAll('.connection-line').forEach(e=>e.remove());
    document.querySelectorAll('.input-char').forEach(e=>e.classList.remove('highlighted'));
    let o=document.querySelector(`.output-char[data-i="${cur}"]`);if(!o)return;
    let R=o.getBoundingClientRect(),W=A[cur],M=Math.max(...W);
    W.forEach((w,i)=>{let n=w/M; if(n<thr) return;
      let inp=document.querySelector(`.input-char[data-i="${i}"]`);
      inp.classList.add('highlighted');let S=inp.getBoundingClientRect();
      let x1=R.left+R.width/2-window.scrollX,y1=R.top+R.height-window.scrollY,
          x2=S.left+S.width/2-window.scrollX,y2=S.top-window.scrollY;
      let L=Math.hypot(x2-x1,y2-y1),ang=Math.atan2(y2-y1,x2-x1)*180/Math.PI;
      let line=document.createElement('div');line.className='connection-line';
      Object.assign(line.style,{width:`${L}px`,left:`${x1}px`,top:`${y1}px`,transform:`rotate(${ang}deg)`,opacity:n});
      document.body.appendChild(line);
    });
  }
  document.addEventListener('DOMContentLoaded',init);
</script></body></html>"""

    # sanitize attention matrix
    clean = []
    for row in attn_matrix:
        if hasattr(row, "cpu"):
            row = row.cpu().numpy()
        clean.append([float(x) for x in row])
    attn_json = json.dumps(clean)
    inp_json  = json.dumps(input_seq)
    out_json  = json.dumps(output_seq)

    html = html_template.replace("{attn_matrix}", attn_json)\
                        .replace("{input_seq}", inp_json)\
                        .replace("{output_seq}", out_json)
    with open(filename, "w", encoding="utf-8") as f:
        f.write(html)
    return filename

# 3) Build & log the W&B table
wandb.init(project="cs24m022_da6401_assignment3", resume="allow")
table = wandb.Table(columns=["Input (Latin)", "Prediction (Devanagari)", "Attention (HTML)"])

model.eval()
for i, latin in enumerate(sampled):
    attn_weights, pred_ids = get_attention_weights(model, latin, src_vocab, tgt_vocab, device)
    pred_str = tgt_vocab.decode(pred_ids, remove_special_tokens=True)

    html_file = create_interactive_connectivity(
        attn_matrix=attn_weights,
        input_seq=list(latin),
        output_seq=list(pred_str),
        filename=f"attn_{i}.html"
    )
    html_txt = open(html_file, "r", encoding="utf-8").read()
    table.add_data(latin, pred_str, wandb.Html(html_txt))
    os.remove(html_file)

# Log & summary
wandb.log({"Interactive_Attention_Table": table})
wandb.run.summary["Interactive_Attention_Examples"] = table

# Optional display
display(HTML(html_file))


Sampled words: ['Damn', 'world', 'need', 'model', 'example']


0,1
epoch,9.0
train_loss,0.23786
val_accuracy,0.33869
validation_loss,0.74025
