<a href="https://colab.research.google.com/github/SaiRajesh228/DA6401_Assignment3/blob/main/withoutAttention.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence, pad_sequence
from torch.utils.data import Dataset, DataLoader
import random
import os
import pickle
import json
import pandas as pd
from tqdm.auto import tqdm
import csv
import wandb

# Log in to Weights & Biases.
# The API key is a secret and must not live in source control: it is read from
# the WANDB_API_KEY environment variable. When the variable is unset, key=None
# lets wandb fall back to its own credential lookup / interactive prompt.
# NOTE(review): any key that was ever committed to this file should be revoked.
wandb.login(key=os.environ.get('WANDB_API_KEY'))

# For reproducibility
def set_random_seeds(seed=42):
    """Seed Python's and PyTorch's RNGs and request deterministic cuDNN behaviour."""
    # Fixed cuDNN algorithms and no auto-tuner, so repeated runs pick the same kernels.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

    # Same seed for the Python RNG, the CPU generator and every CUDA generator.
    seeders = (
        random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

# Character vocabulary class
class CharacterVocabulary:
    """Character-level vocabulary for transliteration tasks.

    Indices are assigned to the special tokens first (in the order given) and
    then to the regular characters, so with the default specials ``<pad>`` is 0,
    ``<bos>`` is 1, ``<eos>`` is 2 and ``<unk>`` is 3.
    """

    # Immutable default so instances can never share (and mutate) one list.
    DEFAULT_SPECIAL_TOKENS = ('<pad>', '<bos>', '<eos>', '<unk>')

    def __init__(self, token_list=None, special_tokens=None):
        """Create a vocabulary.

        Args:
            token_list: iterable of regular characters (may be None/empty).
            special_tokens: iterable of special tokens; defaults to
                ``<pad>``, ``<bos>``, ``<eos>``, ``<unk>``.
        """
        if special_tokens is None:
            special_tokens = self.DEFAULT_SPECIAL_TOKENS
        # Copy so that later mutation of the caller's list (or of this
        # attribute) cannot leak into other instances.
        self.special_tokens = list(special_tokens)
        self.idx_to_char = self.special_tokens + list(token_list or [])
        self.char_to_idx = {ch: i for i, ch in enumerate(self.idx_to_char)}

    @classmethod
    def create_from_texts(cls, text_list):
        """Build a vocabulary from the sorted set of characters in ``text_list``."""
        unique_chars = sorted({char for text in text_list for char in text})
        return cls(token_list=unique_chars)

    @classmethod
    def create_from_file(cls, file_path, src_col='src', tgt_col='tgt', is_csv=True):
        """Build a vocabulary from a two-column data file.

        Args:
            file_path: path to the data file.
            src_col, tgt_col: column names assigned to the (header-less) CSV.
            is_csv: read with pandas as CSV when True, otherwise parse the
                file as tab-separated lines and use the first two fields.
        """
        if is_csv:
            # dtype=str keeps digit-only cells as text; without it pandas
            # infers numbers, which cannot be iterated character by character.
            df = pd.read_csv(file_path, header=None, names=[src_col, tgt_col], dtype=str)
            texts = df[src_col].dropna().tolist() + df[tgt_col].dropna().tolist()
        else:
            texts = []
            with open(file_path, encoding='utf-8') as f:
                for line in f:
                    parts = line.strip().split('\t')
                    if len(parts) >= 2:
                        texts.extend([parts[0], parts[1]])

        return cls.create_from_texts(texts)

    def save(self, path):
        """Write the index-to-character list to ``path`` as JSON."""
        with open(path, 'w', encoding='utf-8') as f:
            json.dump(self.idx_to_char, f, ensure_ascii=False)

    @classmethod
    def load(cls, path):
        """Load a vocabulary previously written by :meth:`save`.

        Only the index-to-character list is stored, so the loaded vocabulary
        uses the default special tokens.
        """
        with open(path, encoding='utf-8') as f:
            idx_to_char = json.load(f)

        vocab = cls(token_list=[])
        vocab.idx_to_char = idx_to_char
        vocab.char_to_idx = {c: i for i, c in enumerate(idx_to_char)}
        return vocab

    def tokenize(self, text, add_bos=False, add_eos=False):
        """Convert ``text`` to a list of indices.

        Unknown characters map to ``<unk>``; ``<bos>``/``<eos>`` are added on
        request.
        """
        lookup = self.char_to_idx
        unk = lookup['<unk>']

        indices = [lookup['<bos>']] if add_bos else []
        indices.extend(lookup.get(c, unk) for c in text)
        if add_eos:
            indices.append(lookup['<eos>'])
        return indices

    def detokenize(self, indices, remove_special=True, join=True):
        """Convert a sequence of indices back to text.

        Args:
            indices: list of ints, or any object with ``tolist()``.
            remove_special: drop special tokens from the result.
            join: return a single string when True, else a list of tokens.

        Indices outside ``[0, len(vocab))`` are skipped.
        """
        if hasattr(indices, 'tolist'):
            indices = indices.tolist()

        size = len(self.idx_to_char)
        # The lower bound matters: a negative id would otherwise wrap around
        # and silently decode to a character from the end of the vocabulary.
        chars = [self.idx_to_char[i] for i in indices if 0 <= i < size]

        if remove_special:
            specials = set(self.special_tokens)
            chars = [c for c in chars if c not in specials]

        return ''.join(chars) if join else chars

    def batch_detokenize(self, batch_indices, remove_special=True):
        """Decode each index sequence in ``batch_indices`` to a string."""
        return [self.detokenize(seq, remove_special=remove_special) for seq in batch_indices]

    def get_statistics(self):
        """Return total size, special-token count and regular-character count."""
        return {
            'total_size': len(self.idx_to_char),
            'special_tokens': len(self.special_tokens),
            'character_count': len(self.idx_to_char) - len(self.special_tokens)
        }

    def __len__(self):
        return len(self.idx_to_char)

    @property
    def pad_id(self): return self.char_to_idx['<pad>']

    @property
    def bos_id(self): return self.char_to_idx['<bos>']

    @property
    def eos_id(self): return self.char_to_idx['<eos>']

    @property
    def unk_id(self): return self.char_to_idx['<unk>']

    @property
    def vocab_size(self): return len(self.idx_to_char)

# Data processing
class TransliterationDataset(Dataset):
    """In-memory dataset of (source ids, target ids) tensor pairs."""

    def __init__(self, file_path, source_vocab, target_vocab, dataset_type='dakshina'):
        # Only the Dakshina lexicon layout is understood; reject anything else up front.
        if dataset_type != 'dakshina':
            raise ValueError(f"Unsupported dataset type: {dataset_type}")

        self.dataset_type = dataset_type

        def encode(vocab, text):
            # Every sequence is wrapped in <bos> ... <eos>.
            return torch.tensor(vocab.tokenize(text, add_bos=True, add_eos=True), dtype=torch.long)

        self.examples = [
            (encode(source_vocab, source_text), encode(target_vocab, target_text))
            for source_text, target_text in self._read_tsv_file(file_path)
        ]

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, idx):
        return self.examples[idx]

    def _read_tsv_file(self, path):
        """Yield (source, target) text pairs from a tab-separated file.

        The file stores the target in the first field and the source in the
        second; lines with fewer than two fields are skipped.
        """
        with open(path, encoding='utf-8') as handle:
            for raw_line in handle:
                fields = raw_line.strip().split('\t')
                if len(fields) < 2:
                    continue
                target_text, source_text = fields[0], fields[1]
                yield source_text, target_text

def create_batches(batch, src_vocab, tgt_vocab):
    """Collate (source, target) tensor pairs into padded batch tensors.

    Returns ``(padded sources, source lengths, padded targets)``; each side is
    padded with its own vocabulary's pad id.
    """
    source_seqs, target_seqs = zip(*batch)

    # True (unpadded) source lengths, in batch order.
    lengths = torch.tensor(list(map(len, source_seqs)), dtype=torch.long)

    padded_sources = pad_sequence(source_seqs, batch_first=True, padding_value=src_vocab.pad_id)
    padded_targets = pad_sequence(target_seqs, batch_first=True, padding_value=tgt_vocab.pad_id)
    return padded_sources, lengths, padded_targets

def load_data(
        language='te',
        dataset_type='dakshina',
        dataset_path=None,
        batch_size=64,
        device='cpu',
        worker_count=2,
        prefetch_factor=4,
        persistent_workers=True,
        cache_dir='./cache',
        use_cached_vocab=True
    ):
    """Load the train/dev/test transliteration loaders and both vocabularies.

    Args:
        language: language code used in the lexicon file names.
        dataset_type: used in the vocabulary cache file name.
        dataset_path: directory with the ``*.translit.sampled.*.tsv`` files;
            defaults to ``/content/dakshina_dataset_v1.0/<language>/lexicons``.
        batch_size: batch size for every loader.
        device: ``str`` or ``torch.device``; pinned memory is enabled when it
            is a CUDA device.
        worker_count: number of DataLoader worker processes (0 = load in the
            main process).
        prefetch_factor: batches prefetched per worker; ignored when
            ``worker_count`` is 0.
        persistent_workers: keep workers alive between epochs (only when
            ``worker_count`` > 0).
        cache_dir: directory for the pickled vocabulary cache.
        use_cached_vocab: read/write the vocabulary cache.

    Returns:
        ``(data_loaders, src_vocab, tgt_vocab)`` where ``data_loaders`` maps
        ``'train'``, ``'dev'`` and ``'test'`` to DataLoaders. Only the train
        loader is shuffled.
    """
    from functools import partial

    if dataset_path is None:
        dataset_path = os.path.join(
            '/content/dakshina_dataset_v1.0',
            language, 'lexicons'
        )

    def split_file(split):
        return os.path.join(dataset_path, f"{language}.translit.sampled.{split}.tsv")

    # Create cache directory if it doesn't exist
    vocab_cache_path = None
    if use_cached_vocab:
        os.makedirs(cache_dir, exist_ok=True)
        vocab_cache_path = os.path.join(cache_dir, f"{language}_{dataset_type}_vocab.pkl")

    if vocab_cache_path is not None and os.path.exists(vocab_cache_path):
        print(f"Loading cached vocabularies from {vocab_cache_path}")
        # SECURITY: unpickling executes arbitrary code; only point cache_dir
        # at a location whose contents you trust.
        with open(vocab_cache_path, 'rb') as f:
            src_vocab, tgt_vocab = pickle.load(f)
    else:
        # Build vocabularies from the train and dev splits
        all_src, all_tgt = [], []

        for split in ['train', 'dev']:
            with open(split_file(split), encoding='utf-8') as f:
                for line in f:
                    parts = line.strip().split('\t')
                    if len(parts) >= 2:
                        all_src.append(parts[1])  # Dakshina format has target, source
                        all_tgt.append(parts[0])

        src_vocab = CharacterVocabulary.create_from_texts(all_src)
        tgt_vocab = CharacterVocabulary.create_from_texts(all_tgt)

        if vocab_cache_path is not None:
            with open(vocab_cache_path, 'wb') as f:
                pickle.dump((src_vocab, tgt_vocab), f)

    use_workers = worker_count > 0
    loader_config = dict(
        batch_size=batch_size,
        num_workers=worker_count,
        persistent_workers=persistent_workers and use_workers,
        # Callers pass torch.device objects (and possibly 'cuda:0'), neither of
        # which equals the string 'cuda'; compare the device type instead.
        pin_memory=(torch.device(device).type == 'cuda')
    )
    # DataLoader rejects prefetch_factor when no worker processes are used.
    if use_workers:
        loader_config['prefetch_factor'] = prefetch_factor

    # A partial of the module-level collate function stays picklable, which
    # worker processes started with "spawn" require; a lambda does not.
    collate = partial(create_batches, src_vocab=src_vocab, tgt_vocab=tgt_vocab)

    data_loaders = {}
    for split_name in ('train', 'dev', 'test'):
        dataset = TransliterationDataset(split_file(split_name), src_vocab, tgt_vocab, dataset_type='dakshina')
        data_loaders[split_name] = DataLoader(
            dataset,
            shuffle=(split_name == 'train'),
            collate_fn=collate,
            **loader_config
        )

    return data_loaders, src_vocab, tgt_vocab

# Model Components
class RNNEncoder(nn.Module):
    """Embedding + (optionally bidirectional, multi-layer) RNN/GRU/LSTM encoder.

    For a bidirectional encoder the final forward and backward states of each
    layer are averaged, so the returned state always has ``num_layers``
    entries of size ``hidden_dim`` and can initialise a unidirectional decoder
    with the same layer count.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers=1,
                 rnn_type='LSTM', dropout=0.0, bidirectional=False):
        """
        Args:
            vocab_size: number of source tokens.
            embedding_dim: size of the token embeddings.
            hidden_dim: hidden size per direction.
            num_layers: number of stacked recurrent layers.
            rnn_type: one of ``'LSTM'``, ``'GRU'``, ``'RNN'``.
            dropout: inter-layer dropout; only applied when ``num_layers > 1``.
            bidirectional: run the recurrent layers in both directions.

        Raises:
            ValueError: if ``rnn_type`` is not supported.
        """
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.bidirectional = bidirectional
        self.rnn_type = rnn_type
        self.num_layers = num_layers
        self.hidden_dim = hidden_dim

        # Per-timestep output size is doubled if bidirectional
        self.output_dim = hidden_dim * 2 if bidirectional else hidden_dim

        rnn_classes = {'LSTM': nn.LSTM, 'GRU': nn.GRU, 'RNN': nn.RNN}
        if rnn_type not in rnn_classes:
            raise ValueError(f"Unsupported RNN type: {rnn_type}")

        self.rnn = rnn_classes[rnn_type](embedding_dim,
                                       hidden_dim,
                                       num_layers=num_layers,
                                       dropout=dropout if num_layers > 1 else 0.0,
                                       batch_first=True,
                                       bidirectional=bidirectional)

    @staticmethod
    def _average_directions(state):
        """Average the forward and backward final states of each layer.

        ``state`` is ``[num_layers * 2, batch, hidden_dim]``. PyTorch orders
        it layer-major: (layer0 forward, layer0 backward, layer1 forward, ...),
        so the two directions of one layer sit at even/odd neighbouring
        indices, not in the first/second half of the tensor.
        """
        return (state[0::2] + state[1::2]) / 2

    def forward(self, inputs, lengths):
        """Encode a padded batch.

        Args:
            inputs: ``[batch_size, seq_len]`` token ids.
            lengths: ``[batch_size]`` true sequence lengths.

        Returns:
            ``(output, hidden_states)`` where ``output`` is
            ``[batch_size, max(lengths), hidden_dim * directions]`` and
            ``hidden_states`` is ``[num_layers, batch_size, hidden_dim]``
            (a ``(h, c)`` tuple of such tensors for LSTM).
        """
        embedded = self.embedding(inputs)  # [batch_size, seq_len, embedding_dim]
        # Packing makes the RNN stop at each sequence's true length so the
        # final states are not polluted by padding positions.
        packed_input = pack_padded_sequence(embedded, lengths.cpu(), batch_first=True, enforce_sorted=False)
        packed_output, hidden_states = self.rnn(packed_input)
        output, _ = pad_packed_sequence(packed_output, batch_first=True)

        if self.bidirectional:
            if self.rnn_type == 'LSTM':
                # LSTM carries both a hidden and a cell state
                h_n, c_n = hidden_states
                hidden_states = (self._average_directions(h_n),
                                 self._average_directions(c_n))
            else:
                # GRU/RNN only have a hidden state
                hidden_states = self._average_directions(hidden_states)

        return output, hidden_states

class RNNDecoder(nn.Module):
    """Single-step recurrent decoder without attention."""

    def __init__(self, vocab_size, embedding_dim, encoder_hidden_dim, decoder_hidden_dim,
                 num_layers=1, rnn_type="LSTM", dropout=0.0):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn_type = rnn_type

        cell_cls = {"LSTM": nn.LSTM, "GRU": nn.GRU, "RNN": nn.RNN}.get(rnn_type)
        if cell_cls is None:
            raise ValueError(f"Unsupported RNN type: {rnn_type}")

        # The recurrent stack consumes only the token embedding; inter-layer
        # dropout is meaningful only with more than one layer.
        inter_layer_dropout = dropout if num_layers > 1 else 0.0
        self.rnn = cell_cls(
            embedding_dim,
            decoder_hidden_dim,
            num_layers=num_layers,
            dropout=inter_layer_dropout,
            batch_first=True,
        )

        # The projection sees the RNN output concatenated with the embedding.
        self.output_layer = nn.Linear(decoder_hidden_dim + embedding_dim, vocab_size)

    def forward(self, input_token, hidden, encoder_outputs, mask):
        """Decode one step.

        input_token     : [batch_size] current input token ids
        hidden          : recurrent state going into this step
        encoder_outputs : accepted for interface parity, not used here
        mask            : accepted for interface parity, not used here

        Returns (logits [batch_size, vocab_size], new hidden state, None);
        the trailing None stands in for attention weights.
        """
        token_embedding = self.embedding(input_token)  # [batch_size, embedding_dim]

        # Feed the step as a length-1 sequence, then drop the time axis again.
        step_output, hidden = self.rnn(token_embedding.unsqueeze(1), hidden)
        features = torch.cat((step_output.squeeze(1), token_embedding), dim=1)

        return self.output_layer(features), hidden, None

class Seq2SeqModel(nn.Module):
    """Encoder-decoder wrapper for training (teacher forcing) and greedy decoding."""

    def __init__(self, encoder, decoder, pad_idx, device='cpu'):
        """
        Args:
            encoder: module called as ``encoder(src, src_lengths)`` returning
                ``(encoder_outputs, hidden)``.
            decoder: module called as
                ``decoder(token, hidden, encoder_outputs, mask)`` returning
                ``(logits, hidden, attention)``; must expose ``output_layer``.
            pad_idx: source padding id, used to build the source mask.
            device: stored for callers; tensors created by this class are
                placed on the device of the inputs.
        """
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.pad_idx = pad_idx
        self.device = device

    def forward(self, src, src_lengths, tgt, teacher_forcing_ratio=0.5):
        """Run the decoder over ``tgt`` and return per-step logits.

        Args:
            src: ``[batch, src_len]`` source ids.
            src_lengths: ``[batch]`` true source lengths.
            tgt: ``[batch, tgt_len]`` target ids starting with ``<bos>``.
            teacher_forcing_ratio: per-step probability of feeding the
                ground-truth token instead of the model's own prediction.

        Returns:
            ``[batch, tgt_len - 1, vocab]`` logits; step ``t`` predicts
            ``tgt[:, t + 1]``.
        """
        encoder_outputs, hidden = self.encoder(src, src_lengths)
        # True for real tokens, False for padding
        mask = (src != self.pad_idx)

        batch_size, target_length = tgt.size()
        output_dim = self.decoder.output_layer.out_features

        # Allocate on the targets' device rather than self.device, so a model
        # built with the default device='cpu' still works after .to('cuda').
        outputs = torch.zeros(batch_size, target_length - 1, output_dim, device=tgt.device)

        # First input to the decoder is the <bos> token
        decoder_input = tgt[:, 0]

        for t in range(1, target_length):
            decoder_output, hidden, _ = self.decoder(decoder_input, hidden, encoder_outputs, mask)
            outputs[:, t - 1] = decoder_output

            # The coin is flipped once per step for the whole batch.
            if random.random() < teacher_forcing_ratio:
                decoder_input = tgt[:, t]
            else:
                decoder_input = decoder_output.argmax(1)

        return outputs

    def generate(self, src, src_lengths, tgt_vocab, max_len=50):
        """Greedy-decode up to ``max_len`` tokens per source sequence.

        Once a sequence has produced ``<eos>`` every later position for that
        sequence is filled with ``<pad>``, so nothing generated after the end
        of a sequence leaks into the decoded text. Decoding stops as soon as
        every sequence has finished.

        Args:
            src: ``[batch, src_len]`` source ids.
            src_lengths: ``[batch]`` true source lengths.
            tgt_vocab: object exposing ``bos_id``, ``eos_id`` and ``pad_id``.
            max_len: maximum number of decoding steps.

        Returns:
            ``[batch, steps]`` long tensor of token ids with ``steps <= max_len``
            (``steps`` is 0 when ``max_len`` is not positive).
        """
        encoder_outputs, hidden = self.encoder(src, src_lengths)
        mask = (src != self.pad_idx)

        batch_size = src.size(0)
        device = src.device

        # First input is the <bos> token
        decoder_input = torch.full((batch_size,), tgt_vocab.bos_id, device=device, dtype=torch.long)
        finished = torch.zeros(batch_size, dtype=torch.bool, device=device)

        generated_tokens = []
        for _ in range(max_len):
            decoder_output, hidden, _ = self.decoder(decoder_input, hidden, encoder_outputs, mask)

            next_token = decoder_output.argmax(1)
            # Sequences that already ended only emit padding from here on.
            next_token = next_token.masked_fill(finished, tgt_vocab.pad_id)
            generated_tokens.append(next_token.unsqueeze(1))

            finished = finished | (next_token == tgt_vocab.eos_id)
            decoder_input = next_token

            if finished.all():
                break

        if not generated_tokens:
            # torch.cat cannot concatenate an empty list
            return torch.empty((batch_size, 0), dtype=torch.long, device=device)

        return torch.cat(generated_tokens, dim=1)

# Training and evaluation utilities
def calculate_accuracy(model, data_loader, tgt_vocab, src_vocab, device):
    """Compute exact-match accuracy with greedy decoding and collect the details.

    A prediction is cut at its first ``<eos>`` before being compared, so
    anything the decoder emits after the end of the sequence does not count
    against it.

    Args:
        model: object with ``eval()`` and
            ``generate(src, src_lengths, tgt_vocab, max_len=...)``.
        data_loader: iterable of ``(src, src_lengths, tgt)`` tensor batches.
        tgt_vocab: target vocabulary (``detokenize`` and ``eos_id`` are used).
        src_vocab: source vocabulary (``detokenize`` is used).
        device: device the batches are moved to.

    Returns:
        ``(accuracy, correct, incorrect)`` where ``correct`` and ``incorrect``
        are ``(sources, targets, predictions)`` tuples of string lists.
        Accuracy is 0.0 when the loader yields no examples.
    """
    model.eval()
    eos_id = tgt_vocab.eos_id
    correct = total = 0

    correct_sources, correct_targets, correct_predictions = [], [], []
    incorrect_sources, incorrect_targets, incorrect_predictions = [], [], []

    with torch.no_grad():
        for src, src_lengths, tgt in data_loader:
            src, src_lengths, tgt = (x.to(device) for x in (src, src_lengths, tgt))
            predictions = model.generate(src, src_lengths, tgt_vocab, max_len=tgt.size(1))

            # One host transfer per batch instead of three per example.
            prediction_rows = predictions.cpu().tolist()
            target_rows = tgt[:, 1:].cpu().tolist()  # Skip <bos>
            source_rows = src.cpu().tolist()

            for pred_ids, tgt_ids, src_ids in zip(prediction_rows, target_rows, source_rows):
                # Keep only what was generated before the first <eos>.
                if eos_id in pred_ids:
                    pred_ids = pred_ids[:pred_ids.index(eos_id)]

                predicted_text = tgt_vocab.detokenize(pred_ids)
                target_text = tgt_vocab.detokenize(tgt_ids)
                source_text = src_vocab.detokenize(src_ids)

                if predicted_text == target_text:
                    correct += 1
                    correct_sources.append(source_text)
                    correct_targets.append(target_text)
                    correct_predictions.append(predicted_text)
                else:
                    incorrect_sources.append(source_text)
                    incorrect_targets.append(target_text)
                    incorrect_predictions.append(predicted_text)

            total += src.size(0)

    accuracy = correct / total if total else 0.0
    return (
        accuracy,
        (correct_sources, correct_targets, correct_predictions),
        (incorrect_sources, incorrect_targets, incorrect_predictions)
    )

def save_predictions(src_list, tgt_list, pred_list, file_name):
    """Write aligned source/target/prediction rows to a CSV file and return its path."""
    header = ['Source', 'Target', 'Predicted']
    rows = zip(src_list, tgt_list, pred_list)

    with open(file_name, mode='w', newline='', encoding='utf-8') as out_file:
        csv_out = csv.writer(out_file)
        csv_out.writerow(header)
        csv_out.writerows(rows)

    return file_name

def train_model(
    model,
    data_loaders,
    src_vocab,
    tgt_vocab,
    device,
    config,
    save_path=None,
    log_to_wandb=True
):
    """Fit ``model`` and report its accuracy on the test split.

    ``config`` supplies ``optimizer``, ``learning_rate``, ``epochs`` and
    ``teacher_forcing``. After every epoch the train/dev losses and
    exact-match accuracies are printed (and logged to wandb when enabled).
    When ``save_path`` is given, the weights are saved each time the dev
    accuracy improves. Returns ``(model, test_accuracy)``.
    """
    loss_fn = nn.CrossEntropyLoss(ignore_index=tgt_vocab.pad_id)

    # 'nadam' selects NAdam; 'adam' and any other name use Adam.
    optimizer_cls = optim.NAdam if config['optimizer'].lower() == 'nadam' else optim.Adam
    optimizer = optimizer_cls(model.parameters(), lr=config['learning_rate'])

    def sequence_loss(src, src_lengths, tgt, forcing_ratio):
        # Logits for steps 1..T-1 are scored against the targets without <bos>.
        logits = model(src, src_lengths, tgt, teacher_forcing_ratio=forcing_ratio)
        return loss_fn(logits.reshape(-1, logits.size(-1)), tgt[:, 1:].reshape(-1))

    num_epochs = config['epochs']
    best_val_acc = 0.0

    for epoch in tqdm(range(1, num_epochs + 1), desc="Epochs", position=0):
        # ---- training pass ----
        model.train()
        running_loss = 0.0
        train_bar = tqdm(data_loaders['train'], desc=f"Train {epoch}", leave=False, position=1)
        for src, src_lengths, tgt in train_bar:
            src = src.to(device)
            src_lengths = src_lengths.to(device)
            tgt = tgt.to(device)

            optimizer.zero_grad()
            loss = sequence_loss(src, src_lengths, tgt, config['teacher_forcing'])
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            running_loss += loss.item()
        train_bar.close()
        train_loss = running_loss / len(data_loaders['train'])

        # ---- validation loss (the model always consumes its own predictions) ----
        val_bar = tqdm(data_loaders['dev'], desc=f"Val {epoch}", leave=False, position=1)
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for src, src_lengths, tgt in val_bar:
                src = src.to(device)
                src_lengths = src_lengths.to(device)
                tgt = tgt.to(device)
                val_loss += sequence_loss(src, src_lengths, tgt, 0.0).item()
        val_bar.close()
        val_loss /= len(data_loaders['dev'])

        # ---- exact-match accuracy on train and dev ----
        train_acc = calculate_accuracy(model, data_loaders['train'], tgt_vocab, src_vocab, device)[0]
        val_acc, val_correct, val_incorrect = calculate_accuracy(
            model, data_loaders['dev'], tgt_vocab, src_vocab, device
        )

        # ---- checkpoint on improvement (only when a save path was given) ----
        if val_acc > best_val_acc and save_path:
            best_val_acc = val_acc
            torch.save(model.state_dict(), save_path)
            print(f"New best model saved with validation accuracy: {val_acc:.4f}")

            # Dump the dev predictions on the last epoch and on every fifth epoch.
            is_milestone = epoch == num_epochs or epoch % 5 == 0
            if is_milestone:
                save_predictions(*val_correct, f"correct_predictions_epoch_{epoch}.csv")
                save_predictions(*val_incorrect, f"incorrect_predictions_epoch_{epoch}.csv")

        # ---- reporting ----
        print(f"Epoch {epoch}/{config['epochs']}:")
        print(f"  Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}")
        print(f"  Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}")

        if log_to_wandb:
            epoch_metrics = {
                'epoch': epoch,
                'train_loss': train_loss,
                'validation_loss': val_loss,
                'train_accuracy': train_acc,
                'validation_accuracy': val_acc
            }
            wandb.log(epoch_metrics)

    # Final evaluation on the test split with the weights from the last epoch.
    test_acc = calculate_accuracy(model, data_loaders['test'], tgt_vocab, src_vocab, device)[0]
    print(f"Final test accuracy: {test_acc:.4f}")

    if log_to_wandb:
        wandb.log({'test_accuracy': test_acc})

    return model, test_acc

# Hyperparameter sweep configuration
def get_sweep_config():
    """Return the wandb sweep definition (Bayesian search maximising validation accuracy)."""

    def choices(*candidates):
        # wandb expects a discrete search space as {'values': [...]}
        return {'values': list(candidates)}

    architecture = {
        'embedding_dim': choices(128, 256, 512),
        'hidden_dim': choices(128, 256, 512, 1024),
        'num_layers': choices(1, 2, 3, 4),
        'rnn_type': choices('RNN', 'GRU', 'LSTM'),
        'bidirectional': choices(True, False),
    }
    training = {
        'dropout': choices(0.0, 0.1, 0.2, 0.3, 0.5),
        'learning_rate': choices(1e-4, 2e-4, 5e-4, 8e-4, 1e-3),
        'batch_size': choices(32, 64, 128),
        'epochs': choices(10, 15, 20),
        'teacher_forcing': choices(0.3, 0.5, 0.7, 1.0),
        'optimizer': choices('Adam', 'NAdam'),
        'seed': choices(42, 43, 44, 45, 46),
    }

    return {
        'method': 'bayes',
        'name': 'Transliteration_without_Attention',
        'metric': {'name': 'validation_accuracy', 'goal': 'maximize'},
        'parameters': {**architecture, **training},
    }

def run_sweep_objective():
    """Objective function for a wandb sweep: build, train and evaluate one model.

    Reads the hyperparameters chosen by the sweep from ``wandb.config``,
    makes sure the Dakshina dataset is available, trains a Telugu
    transliteration model and lets ``train_model`` log the metrics.

    Raises:
        subprocess.CalledProcessError: if downloading or extracting the
            dataset fails.
    """
    # Local import: only needed for the one-off dataset download.
    import subprocess

    # Initialize wandb run and get config
    wandb.init()
    config = wandb.config

    # Convert to a normal dictionary for our function
    experiment_config = {
        'language': 'te',  # Telugu
        'rnn_type': config.rnn_type,
        'embedding_dim': config.embedding_dim,
        'hidden_dim': config.hidden_dim,
        'num_layers': config.num_layers,
        'dropout': config.dropout,
        'bidirectional': config.bidirectional,
        'batch_size': config.batch_size,
        'epochs': config.epochs,
        'learning_rate': config.learning_rate,
        'teacher_forcing': config.teacher_forcing,
        'optimizer': config.optimizer,
        'seed': config.seed
    }

    # Set seeds for reproducibility
    set_random_seeds(experiment_config['seed'])

    # Set device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")

    # Download and extract the dataset if necessary.
    # Plain subprocess calls (instead of IPython "!" shell magics) keep this
    # function valid Python outside a notebook, and check=True surfaces a
    # failed download instead of failing later on a missing file.
    # NOTE(review): the archive is downloaded and extracted in the current
    # working directory while the check looks under /content — this assumes
    # the working directory is /content (the Colab default); confirm.
    if not os.path.exists('/content/dakshina_dataset_v1.0'):
        print("Downloading Dakshina dataset...")
        subprocess.run(
            ['wget', 'https://storage.googleapis.com/gresearch/dakshina/dakshina_dataset_v1.0.tar'],
            check=True,
        )
        subprocess.run(['tar', 'xopf', 'dakshina_dataset_v1.0.tar'], check=True)

    # Create a unique run name based on config
    run_name = f"{experiment_config['rnn_type']}_{experiment_config['num_layers']}l_{experiment_config['embedding_dim']}e_{experiment_config['hidden_dim']}h_" \
               f"{'bid' if experiment_config['bidirectional'] else 'uni'}_{experiment_config['dropout']}d_" \
               f"{experiment_config['teacher_forcing']}tf_{experiment_config['optimizer']}"
    wandb.run.name = run_name

    # Load data
    print(f"Loading {experiment_config['language']} data...")
    data_loaders, src_vocab, tgt_vocab = load_data(
        language=experiment_config['language'],
        batch_size=experiment_config['batch_size'],
        device=device
    )

    # Create model components
    print("Building model...")
    encoder = RNNEncoder(
        src_vocab.vocab_size,
        experiment_config['embedding_dim'],
        experiment_config['hidden_dim'],
        num_layers=experiment_config['num_layers'],
        rnn_type=experiment_config['rnn_type'],
        dropout=experiment_config['dropout'],
        bidirectional=experiment_config['bidirectional']
    ).to(device)

    # Calculate encoder output dimension (doubled if bidirectional)
    encoder_output_dim = experiment_config['hidden_dim'] * 2 if experiment_config['bidirectional'] else experiment_config['hidden_dim']

    decoder = RNNDecoder(
        tgt_vocab.vocab_size,
        experiment_config['embedding_dim'],
        encoder_output_dim,
        experiment_config['hidden_dim'],
        num_layers=experiment_config['num_layers'],
        rnn_type=experiment_config['rnn_type'],
        dropout=experiment_config['dropout']
    ).to(device)

    model = Seq2SeqModel(encoder, decoder, pad_idx=src_vocab.pad_id, device=device).to(device)

    # Train the model; train_model also evaluates on the test split and logs it
    print("Training model...")
    model_save_path = f"model_{run_name}.pt"

    train_model(
        model=model,
        data_loaders=data_loaders,
        src_vocab=src_vocab,
        tgt_vocab=tgt_vocab,
        device=device,
        config=experiment_config,
        save_path=model_save_path,
        log_to_wandb=True
    )

    # Wandb finish happens automatically when this function returns

def run_transliteration_experiment(config=None, use_wandb=True, run_sweep=False, sweep_count=20):
    """Run one transliteration training experiment, or launch a W&B sweep.

    Args:
        config: Hyperparameter dict for a single run. ``None`` selects the
            defaults below; a partial dict is completed with those defaults.
            Not used when ``run_sweep`` is true.
        use_wandb: Whether the single run is tracked with Weights & Biases.
            Not consulted on the sweep path, which always goes through wandb.
        run_sweep: When true, create a sweep from ``get_sweep_config()`` and
            let a wandb agent call ``run_sweep_objective`` instead of
            training directly.
        sweep_count: Number of runs the sweep agent executes.

    Returns:
        ``(model, src_vocab, tgt_vocab)`` for a single run, or
        ``(None, None, None)`` once a sweep has been dispatched.

    Raises:
        subprocess.CalledProcessError: If downloading or extracting the
            dataset archive exits with a non-zero status.
    """
    # Imported locally so the block does not depend on the file's import cell.
    import subprocess

    if run_sweep:
        # Run a hyperparameter sweep
        sweep_config = get_sweep_config()
        sweep_id = wandb.sweep(sweep_config, project="DA6401_Assignment_3")
        wandb.agent(sweep_id, function=run_sweep_objective, count=sweep_count)
        return None, None, None

    # Defaults for a single experiment; caller-supplied keys take precedence.
    default_config = {
        'language': 'te',  # Telugu
        'rnn_type': 'LSTM',
        'embedding_dim': 256,
        'hidden_dim': 512,
        'num_layers': 2,
        'dropout': 0.3,
        'bidirectional': True,
        'batch_size': 64,
        'epochs': 10,
        'learning_rate': 0.001,
        'teacher_forcing': 0.5,
        'optimizer': 'Adam',
        'seed': 42,
    }
    # Merging (rather than replacing) lets callers override a few keys
    # without hitting a KeyError on the ones they left out.
    overrides = {} if config is None else dict(config)
    config = {**default_config, **overrides}

    # Set seeds for reproducibility
    set_random_seeds(config['seed'])

    # Set device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")

    # Initialize wandb
    if use_wandb:
        run_name = f"{config['rnn_type']}_{config['num_layers']}l_{config['embedding_dim']}e_{config['hidden_dim']}h_" \
                   f"{'bid' if config['bidirectional'] else 'uni'}_{config['dropout']}d_" \
                   f"{config['teacher_forcing']}tf_{config['optimizer']}"

        wandb.init(
            project="DA6401_Assignment_3",
            name=run_name,
            config=config
        )

    try:
        # Download and extract the dataset if necessary.
        # NOTE(review): the existence check uses the Colab path /content/...
        # while the archive is fetched/extracted in the current working
        # directory -- these coincide on Colab; confirm before running elsewhere.
        if not os.path.exists('/content/dakshina_dataset_v1.0'):
            print("Downloading Dakshina dataset...")
            # check=True surfaces a failed download/extract here instead of
            # as a confusing missing-file error later on.
            subprocess.run(
                ["wget", "https://storage.googleapis.com/gresearch/dakshina/dakshina_dataset_v1.0.tar"],
                check=True,
            )
            subprocess.run(["tar", "xopf", "dakshina_dataset_v1.0.tar"], check=True)

        # Load data
        print(f"Loading {config['language']} data...")
        data_loaders, src_vocab, tgt_vocab = load_data(
            language=config['language'],
            batch_size=config['batch_size'],
            device=device
        )

        # Create model components
        print("Building model...")
        encoder = RNNEncoder(
            src_vocab.vocab_size,
            config['embedding_dim'],
            config['hidden_dim'],
            num_layers=config['num_layers'],
            rnn_type=config['rnn_type'],
            dropout=config['dropout'],
            bidirectional=config['bidirectional']
        ).to(device)

        # Calculate encoder output dimension (doubled if bidirectional)
        encoder_output_dim = config['hidden_dim'] * 2 if config['bidirectional'] else config['hidden_dim']

        decoder = RNNDecoder(
            tgt_vocab.vocab_size,
            config['embedding_dim'],
            encoder_output_dim,
            config['hidden_dim'],
            num_layers=config['num_layers'],
            rnn_type=config['rnn_type'],
            dropout=config['dropout']
        ).to(device)

        model = Seq2SeqModel(encoder, decoder, pad_idx=src_vocab.pad_id, device=device).to(device)

        # Train the model
        print("Training model...")
        model_save_path = f"transliteration_model_{config['language']}_{config['rnn_type']}.pt"

        model, test_acc = train_model(
            model=model,
            data_loaders=data_loaders,
            src_vocab=src_vocab,
            tgt_vocab=tgt_vocab,
            device=device,
            config=config,
            save_path=model_save_path,
            log_to_wandb=use_wandb
        )

        print(f"Training complete! Final test accuracy: {test_acc:.4f}")
        print(f"Model saved to {model_save_path}")
    except BaseException:
        # Also covers KeyboardInterrupt (interrupting a notebook cell):
        # close the run as failed rather than leaving it open, then re-raise.
        if use_wandb:
            wandb.finish(exit_code=1)
        raise

    # Finish the wandb run
    if use_wandb:
        wandb.finish()

    return model, src_vocab, tgt_vocab

# Entry point: only fires when this module is executed as the main program
if __name__ == "__main__":
    # Default action: launch a 20-run hyperparameter sweep.
    # For a single training run with the default configuration, call
    # run_transliteration_experiment(use_wandb=True) instead.
    run_transliteration_experiment(
        use_wandb=True,
        run_sweep=True,
        sweep_count=20,
    )

[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mcs24m040[0m ([33mcs24m040-iit-madras[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Create sweep with ID: 2z2823jo
Sweep URL: https://wandb.ai/cs24m040-iit-madras/DA6401_Assignment_3/sweeps/2z2823jo


[34m[1mwandb[0m: Agent Starting Run: 5tofmmua with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	bidirectional: True
[34m[1mwandb[0m: 	dropout: 0
[34m[1mwandb[0m: 	embedding_dim: 512
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	hidden_dim: 128
[34m[1mwandb[0m: 	learning_rate: 0.0005
[34m[1mwandb[0m: 	num_layers: 1
[34m[1mwandb[0m: 	optimizer: Adam
[34m[1mwandb[0m: 	rnn_type: GRU
[34m[1mwandb[0m: 	seed: 42
[34m[1mwandb[0m: 	teacher_forcing: 0.7


Using device: cuda
Downloading Dakshina dataset...
--2025-05-20 10:00:53--  https://storage.googleapis.com/gresearch/dakshina/dakshina_dataset_v1.0.tar
Resolving storage.googleapis.com (storage.googleapis.com)... 142.250.101.207, 142.250.141.207, 142.251.2.207, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|142.250.101.207|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2008340480 (1.9G) [application/x-tar]
Saving to: ‘dakshina_dataset_v1.0.tar’


2025-05-20 10:01:02 (224 MB/s) - ‘dakshina_dataset_v1.0.tar’ saved [2008340480/2008340480]

Loading te data...
Building model...
Training model...


Epochs:   0%|          | 0/15 [00:00<?, ?it/s]

Train 1:   0%|          | 0/458 [00:00<?, ?it/s]

Val 1:   0%|          | 0/45 [00:00<?, ?it/s]

New best model saved with validation accuracy: 0.0174
Epoch 1/15:
  Train Loss: 2.1723, Train Acc: 0.0194
  Val Loss: 2.2186, Val Acc: 0.0174


Train 2:   0%|          | 0/458 [00:00<?, ?it/s]

Val 2:   0%|          | 0/45 [00:00<?, ?it/s]

New best model saved with validation accuracy: 0.0873
Epoch 2/15:
  Train Loss: 1.2993, Train Acc: 0.0926
  Val Loss: 1.8053, Val Acc: 0.0873


Train 3:   0%|          | 0/458 [00:00<?, ?it/s]

Val 3:   0%|          | 0/45 [00:00<?, ?it/s]

Epoch 3/15:
  Train Loss: 0.9632, Train Acc: 0.0845
  Val Loss: 1.5359, Val Acc: 0.0816


Train 4:   0%|          | 0/458 [00:00<?, ?it/s]

Val 4:   0%|          | 0/45 [00:00<?, ?it/s]

New best model saved with validation accuracy: 0.0910
Epoch 4/15:
  Train Loss: 0.7884, Train Acc: 0.0855
  Val Loss: 1.4174, Val Acc: 0.0910


Train 5:   0%|          | 0/458 [00:00<?, ?it/s]

Val 5:   0%|          | 0/45 [00:00<?, ?it/s]

New best model saved with validation accuracy: 0.0980
Epoch 5/15:
  Train Loss: 0.6773, Train Acc: 0.0794
  Val Loss: 1.2766, Val Acc: 0.0980


Train 6:   0%|          | 0/458 [00:00<?, ?it/s]

Val 6:   0%|          | 0/45 [00:00<?, ?it/s]

New best model saved with validation accuracy: 0.1036
Epoch 6/15:
  Train Loss: 0.5929, Train Acc: 0.0966
  Val Loss: 1.2586, Val Acc: 0.1036


Train 7:   0%|          | 0/458 [00:00<?, ?it/s]

Val 7:   0%|          | 0/45 [00:00<?, ?it/s]

New best model saved with validation accuracy: 0.1131
Epoch 7/15:
  Train Loss: 0.5426, Train Acc: 0.0806
  Val Loss: 1.2037, Val Acc: 0.1131


Train 8:   0%|          | 0/458 [00:00<?, ?it/s]

Val 8:   0%|          | 0/45 [00:00<?, ?it/s]

New best model saved with validation accuracy: 0.1241
Epoch 8/15:
  Train Loss: 0.4958, Train Acc: 0.1169
  Val Loss: 1.1753, Val Acc: 0.1241


Train 9:   0%|          | 0/458 [00:00<?, ?it/s]

Val 9:   0%|          | 0/45 [00:00<?, ?it/s]

Epoch 9/15:
  Train Loss: 0.4645, Train Acc: 0.0160
  Val Loss: 1.1628, Val Acc: 0.0320


Train 10:   0%|          | 0/458 [00:00<?, ?it/s]

Val 10:   0%|          | 0/45 [00:00<?, ?it/s]

New best model saved with validation accuracy: 0.1318
Epoch 10/15:
  Train Loss: 0.4339, Train Acc: 0.1172
  Val Loss: 1.1408, Val Acc: 0.1318


Train 11:   0%|          | 0/458 [00:00<?, ?it/s]

Val 11:   0%|          | 0/45 [00:00<?, ?it/s]

New best model saved with validation accuracy: 0.1585
Epoch 11/15:
  Train Loss: 0.4028, Train Acc: 0.1813
  Val Loss: 1.1265, Val Acc: 0.1585


Train 12:   0%|          | 0/458 [00:00<?, ?it/s]

Val 12:   0%|          | 0/45 [00:00<?, ?it/s]

New best model saved with validation accuracy: 0.3039
Epoch 12/15:
  Train Loss: 0.3846, Train Acc: 0.4112
  Val Loss: 1.1430, Val Acc: 0.3039


Train 13:   0%|          | 0/458 [00:00<?, ?it/s]

Val 13:   0%|          | 0/45 [00:00<?, ?it/s]

Epoch 13/15:
  Train Loss: 0.3643, Train Acc: 0.2375
  Val Loss: 1.1406, Val Acc: 0.1804


Train 14:   0%|          | 0/458 [00:00<?, ?it/s]

Val 14:   0%|          | 0/45 [00:00<?, ?it/s]

Epoch 14/15:
  Train Loss: 0.3484, Train Acc: 0.2958
  Val Loss: 1.0864, Val Acc: 0.2143


Train 15:   0%|          | 0/458 [00:00<?, ?it/s]

Val 15:   0%|          | 0/45 [00:00<?, ?it/s]

Epoch 15/15:
  Train Loss: 0.3368, Train Acc: 0.3962
  Val Loss: 1.1086, Val Acc: 0.2870
Final test accuracy: 0.2868


0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
test_accuracy,▁
train_accuracy,▁▂▂▂▂▂▂▃▁▃▄█▅▆█
train_loss,█▅▃▃▂▂▂▂▁▁▁▁▁▁▁
validation_accuracy,▁▃▃▃▃▃▃▄▁▄▄█▅▆█
validation_loss,█▅▄▃▂▂▂▂▁▁▁▁▁▁▁

0,1
epoch,15.0
test_accuracy,0.28676
train_accuracy,0.39619
train_loss,0.33682
validation_accuracy,0.287
validation_loss,1.10855


[34m[1mwandb[0m: Agent Starting Run: abw72wwm with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	bidirectional: True
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embedding_dim: 512
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	hidden_dim: 256
[34m[1mwandb[0m: 	learning_rate: 0.0002
[34m[1mwandb[0m: 	num_layers: 1
[34m[1mwandb[0m: 	optimizer: Adam
[34m[1mwandb[0m: 	rnn_type: GRU
[34m[1mwandb[0m: 	seed: 44
[34m[1mwandb[0m: 	teacher_forcing: 0.7


Using device: cuda
Loading te data...
Loading cached vocabularies from ./cache/te_dakshina_vocab.pkl
Building model...
Training model...


Epochs:   0%|          | 0/15 [00:00<?, ?it/s]

Train 1:   0%|          | 0/1830 [00:00<?, ?it/s]

Val 1:   0%|          | 0/178 [00:00<?, ?it/s]

New best model saved with validation accuracy: 0.1311
Epoch 1/15:
  Train Loss: 1.6150, Train Acc: 0.1387
  Val Loss: 1.5996, Val Acc: 0.1311


Train 2:   0%|          | 0/1830 [00:00<?, ?it/s]

Val 2:   0%|          | 0/178 [00:00<?, ?it/s]

Epoch 2/15:
  Train Loss: 0.7385, Train Acc: 0.0318
  Val Loss: 1.2014, Val Acc: 0.0753


Train 3:   0%|          | 0/1830 [00:00<?, ?it/s]

Val 3:   0%|          | 0/178 [00:00<?, ?it/s]

New best model saved with validation accuracy: 0.2997
Epoch 3/15:
  Train Loss: 0.5138, Train Acc: 0.3384
  Val Loss: 1.0710, Val Acc: 0.2997


Train 4:   0%|          | 0/1830 [00:00<?, ?it/s]

Val 4:   0%|          | 0/178 [00:00<?, ?it/s]

New best model saved with validation accuracy: 0.3908
Epoch 4/15:
  Train Loss: 0.4069, Train Acc: 0.4799
  Val Loss: 0.9749, Val Acc: 0.3908


Train 5:   0%|          | 0/1830 [00:00<?, ?it/s]

Val 5:   0%|          | 0/178 [00:00<?, ?it/s]

Epoch 5/15:
  Train Loss: 0.3410, Train Acc: 0.5046
  Val Loss: 0.9794, Val Acc: 0.3827


Train 6:   0%|          | 0/1830 [00:00<?, ?it/s]

Val 6:   0%|          | 0/178 [00:00<?, ?it/s]

Epoch 6/15:
  Train Loss: 0.2952, Train Acc: 0.4761
  Val Loss: 0.9434, Val Acc: 0.3567


Train 7:   0%|          | 0/1830 [00:00<?, ?it/s]

Val 7:   0%|          | 0/178 [00:00<?, ?it/s]

New best model saved with validation accuracy: 0.4346
Epoch 7/15:
  Train Loss: 0.2590, Train Acc: 0.6191
  Val Loss: 0.9591, Val Acc: 0.4346


Train 8:   0%|          | 0/1830 [00:00<?, ?it/s]

Val 8:   0%|          | 0/178 [00:00<?, ?it/s]

Epoch 8/15:
  Train Loss: 0.2300, Train Acc: 0.3216
  Val Loss: 0.9867, Val Acc: 0.2613


Train 9:   0%|          | 0/1830 [00:00<?, ?it/s]

Val 9:   0%|          | 0/178 [00:00<?, ?it/s]

Epoch 9/15:
  Train Loss: 0.2055, Train Acc: 0.2939
  Val Loss: 0.9850, Val Acc: 0.2180


Train 10:   0%|          | 0/1830 [00:00<?, ?it/s]

Val 10:   0%|          | 0/178 [00:00<?, ?it/s]

Epoch 10/15:
  Train Loss: 0.1844, Train Acc: 0.6250
  Val Loss: 0.9781, Val Acc: 0.4281


Train 11:   0%|          | 0/1830 [00:00<?, ?it/s]

Val 11:   0%|          | 0/178 [00:00<?, ?it/s]

Epoch 11/15:
  Train Loss: 0.1697, Train Acc: 0.6616
  Val Loss: 1.0052, Val Acc: 0.4260


Train 12:   0%|          | 0/1830 [00:00<?, ?it/s]

Val 12:   0%|          | 0/178 [00:00<?, ?it/s]

Epoch 12/15:
  Train Loss: 0.1539, Train Acc: 0.4487
  Val Loss: 1.0135, Val Acc: 0.2840


Train 13:   0%|          | 0/1830 [00:00<?, ?it/s]

Val 13:   0%|          | 0/178 [00:00<?, ?it/s]

Epoch 13/15:
  Train Loss: 0.1400, Train Acc: 0.3559
  Val Loss: 1.0084, Val Acc: 0.2646


Train 14:   0%|          | 0/1830 [00:00<?, ?it/s]

Val 14:   0%|          | 0/178 [00:00<?, ?it/s]

Epoch 14/15:
  Train Loss: 0.1276, Train Acc: 0.2888
  Val Loss: 1.0544, Val Acc: 0.2643


Train 15:   0%|          | 0/1830 [00:00<?, ?it/s]

Val 15:   0%|          | 0/178 [00:00<?, ?it/s]

Epoch 15/15:
  Train Loss: 0.1185, Train Acc: 0.3872
  Val Loss: 1.0769, Val Acc: 0.3210
Final test accuracy: 0.3283


0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
test_accuracy,▁
train_accuracy,▂▁▄▆▆▆█▄▄██▆▅▄▅
train_loss,█▄▃▂▂▂▂▂▁▁▁▁▁▁▁
validation_accuracy,▂▁▅▇▇▆█▅▄██▅▅▅▆
validation_loss,█▄▂▁▁▁▁▁▁▁▂▂▂▂▂

0,1
epoch,15.0
test_accuracy,0.32835
train_accuracy,0.38721
train_loss,0.11848
validation_accuracy,0.32096
validation_loss,1.0769


[34m[1mwandb[0m: Agent Starting Run: kglyx313 with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	bidirectional: False
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embedding_dim: 512
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_dim: 256
[34m[1mwandb[0m: 	learning_rate: 0.0005
[34m[1mwandb[0m: 	num_layers: 1
[34m[1mwandb[0m: 	optimizer: NAdam
[34m[1mwandb[0m: 	rnn_type: GRU
[34m[1mwandb[0m: 	seed: 46
[34m[1mwandb[0m: 	teacher_forcing: 0.3


Using device: cuda
Loading te data...
Loading cached vocabularies from ./cache/te_dakshina_vocab.pkl
Building model...
Training model...


Epochs:   0%|          | 0/10 [00:00<?, ?it/s]

Train 1:   0%|          | 0/915 [00:00<?, ?it/s]

Val 1:   0%|          | 0/89 [00:00<?, ?it/s]

New best model saved with validation accuracy: 0.1406
Epoch 1/10:
  Train Loss: 1.7608, Train Acc: 0.1643
  Val Loss: 1.2337, Val Acc: 0.1406


Train 2:   0%|          | 0/915 [00:00<?, ?it/s]

Val 2:   0%|          | 0/89 [00:00<?, ?it/s]

New best model saved with validation accuracy: 0.2794
Epoch 2/10:
  Train Loss: 0.8544, Train Acc: 0.3306
  Val Loss: 0.9548, Val Acc: 0.2794


Train 3:   0%|          | 0/915 [00:00<?, ?it/s]

Val 3:   0%|          | 0/89 [00:00<?, ?it/s]

New best model saved with validation accuracy: 0.3192
Epoch 3/10:
  Train Loss: 0.6323, Train Acc: 0.4197
  Val Loss: 0.8471, Val Acc: 0.3192


Train 4:   0%|          | 0/915 [00:00<?, ?it/s]

Val 4:   0%|          | 0/89 [00:00<?, ?it/s]

New best model saved with validation accuracy: 0.3417
Epoch 4/10:
  Train Loss: 0.5277, Train Acc: 0.4494
  Val Loss: 0.8181, Val Acc: 0.3417


Train 5:   0%|          | 0/915 [00:00<?, ?it/s]

Val 5:   0%|          | 0/89 [00:00<?, ?it/s]

New best model saved with validation accuracy: 0.3672
Epoch 5/10:
  Train Loss: 0.4616, Train Acc: 0.5046
  Val Loss: 0.8105, Val Acc: 0.3672


Train 6:   0%|          | 0/915 [00:00<?, ?it/s]

Val 6:   0%|          | 0/89 [00:00<?, ?it/s]

Epoch 6/10:
  Train Loss: 0.4132, Train Acc: 0.5014
  Val Loss: 0.7935, Val Acc: 0.3623


Train 7:   0%|          | 0/915 [00:00<?, ?it/s]

Val 7:   0%|          | 0/89 [00:00<?, ?it/s]

Epoch 7/10:
  Train Loss: 0.3715, Train Acc: 0.5039
  Val Loss: 0.7785, Val Acc: 0.3516


Train 8:   0%|          | 0/915 [00:00<?, ?it/s]

Val 8:   0%|          | 0/89 [00:00<?, ?it/s]

New best model saved with validation accuracy: 0.4228
Epoch 8/10:
  Train Loss: 0.3420, Train Acc: 0.6437
  Val Loss: 0.7972, Val Acc: 0.4228


Train 9:   0%|          | 0/915 [00:00<?, ?it/s]

Val 9:   0%|          | 0/89 [00:00<?, ?it/s]

Epoch 9/10:
  Train Loss: 0.3213, Train Acc: 0.4728
  Val Loss: 0.8073, Val Acc: 0.3217


Train 10:   0%|          | 0/915 [00:00<?, ?it/s]

Val 10:   0%|          | 0/89 [00:00<?, ?it/s]

Epoch 10/10:
  Train Loss: 0.2991, Train Acc: 0.6346
  Val Loss: 0.7968, Val Acc: 0.4139
Final test accuracy: 0.3999


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
train_accuracy,▁▃▅▅▆▆▆█▆█
train_loss,█▄▃▂▂▂▁▁▁▁
validation_accuracy,▁▄▅▆▇▆▆█▅█
validation_loss,█▄▂▂▁▁▁▁▁▁

0,1
epoch,10.0
test_accuracy,0.39986
train_accuracy,0.63459
train_loss,0.29909
validation_accuracy,0.41387
validation_loss,0.79682


[34m[1mwandb[0m: Agent Starting Run: i2rcc2kn with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	bidirectional: False
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embedding_dim: 512
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_dim: 128
[34m[1mwandb[0m: 	learning_rate: 0.0005
[34m[1mwandb[0m: 	num_layers: 2
[34m[1mwandb[0m: 	optimizer: Adam
[34m[1mwandb[0m: 	rnn_type: RNN
[34m[1mwandb[0m: 	seed: 43
[34m[1mwandb[0m: 	teacher_forcing: 0.5


Using device: cuda
Loading te data...
Loading cached vocabularies from ./cache/te_dakshina_vocab.pkl
Building model...
Training model...


Epochs:   0%|          | 0/10 [00:00<?, ?it/s]

Train 1:   0%|          | 0/1830 [00:00<?, ?it/s]

Val 1:   0%|          | 0/178 [00:00<?, ?it/s]

Epoch 1/10:
  Train Loss: 2.7240, Train Acc: 0.0001
  Val Loss: 2.9773, Val Acc: 0.0000


Train 2:   0%|          | 0/1830 [00:00<?, ?it/s]

Val 2:   0%|          | 0/178 [00:00<?, ?it/s]

Epoch 2/10:
  Train Loss: 2.5651, Train Acc: 0.0001
  Val Loss: 2.9340, Val Acc: 0.0000


Train 3:   0%|          | 0/1830 [00:00<?, ?it/s]

Val 3:   0%|          | 0/178 [00:00<?, ?it/s]

Epoch 3/10:
  Train Loss: 2.5038, Train Acc: 0.0000
  Val Loss: 2.9133, Val Acc: 0.0000


Train 4:   0%|          | 0/1830 [00:00<?, ?it/s]

Val 4:   0%|          | 0/178 [00:00<?, ?it/s]

Epoch 4/10:
  Train Loss: 2.4461, Train Acc: 0.0002
  Val Loss: 2.7988, Val Acc: 0.0000


Train 5:   0%|          | 0/1830 [00:00<?, ?it/s]

Val 5:   0%|          | 0/178 [00:00<?, ?it/s]

Epoch 5/10:
  Train Loss: 2.4054, Train Acc: 0.0001
  Val Loss: 2.8271, Val Acc: 0.0000


Train 6:   0%|          | 0/1830 [00:00<?, ?it/s]

Val 6:   0%|          | 0/178 [00:00<?, ?it/s]

Epoch 6/10:
  Train Loss: 2.3721, Train Acc: 0.0002
  Val Loss: 2.7842, Val Acc: 0.0000


Train 7:   0%|          | 0/1830 [00:00<?, ?it/s]

Val 7:   0%|          | 0/178 [00:00<?, ?it/s]

Epoch 7/10:
  Train Loss: 2.3361, Train Acc: 0.0002
  Val Loss: 2.7213, Val Acc: 0.0000


Train 8:   0%|          | 0/1830 [00:00<?, ?it/s]

Val 8:   0%|          | 0/178 [00:00<?, ?it/s]

New best model saved with validation accuracy: 0.0007
Epoch 8/10:
  Train Loss: 2.2634, Train Acc: 0.0002
  Val Loss: 2.6129, Val Acc: 0.0007


Train 9:   0%|          | 0/1830 [00:00<?, ?it/s]

Val 9:   0%|          | 0/178 [00:00<?, ?it/s]

Epoch 9/10:
  Train Loss: 2.2180, Train Acc: 0.0003
  Val Loss: 2.6735, Val Acc: 0.0002


Train 10:   0%|          | 0/1830 [00:00<?, ?it/s]

Val 10:   0%|          | 0/178 [00:00<?, ?it/s]

Epoch 10/10:
  Train Loss: 2.1948, Train Acc: 0.0002
  Val Loss: 2.5865, Val Acc: 0.0007
Final test accuracy: 0.0000


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
train_accuracy,▃▄▁▅▄▅▄▅█▅
train_loss,█▆▅▄▄▃▃▂▁▁
validation_accuracy,▁▁▁▁▁▁▁█▃█
validation_loss,█▇▇▅▅▅▃▁▃▁

0,1
epoch,10.0
test_accuracy,0.0
train_accuracy,0.00022
train_loss,2.19477
validation_accuracy,0.0007
validation_loss,2.58652


[34m[1mwandb[0m: Agent Starting Run: p6gl2qgu with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	bidirectional: False
[34m[1mwandb[0m: 	dropout: 0.5
[34m[1mwandb[0m: 	embedding_dim: 256
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	hidden_dim: 128
[34m[1mwandb[0m: 	learning_rate: 0.0008
[34m[1mwandb[0m: 	num_layers: 1
[34m[1mwandb[0m: 	optimizer: NAdam
[34m[1mwandb[0m: 	rnn_type: LSTM
[34m[1mwandb[0m: 	seed: 44
[34m[1mwandb[0m: 	teacher_forcing: 1


Using device: cuda
Loading te data...
Loading cached vocabularies from ./cache/te_dakshina_vocab.pkl
Building model...
Training model...


Epochs:   0%|          | 0/15 [00:00<?, ?it/s]

Train 1:   0%|          | 0/915 [00:00<?, ?it/s]

Val 1:   0%|          | 0/89 [00:00<?, ?it/s]

New best model saved with validation accuracy: 0.0651
Epoch 1/15:
  Train Loss: 1.3434, Train Acc: 0.0246
  Val Loss: 2.1916, Val Acc: 0.0651


Train 2:   0%|          | 0/915 [00:00<?, ?it/s]

Val 2:   0%|          | 0/89 [00:00<?, ?it/s]

Epoch 2/15:
  Train Loss: 0.5614, Train Acc: 0.0246
  Val Loss: 1.8501, Val Acc: 0.0649


Train 3:   0%|          | 0/915 [00:00<?, ?it/s]

Val 3:   0%|          | 0/89 [00:00<?, ?it/s]

Epoch 3/15:
  Train Loss: 0.4114, Train Acc: 0.0248
  Val Loss: 1.6947, Val Acc: 0.0535


Train 4:   0%|          | 0/915 [00:00<?, ?it/s]

Val 4:   0%|          | 0/89 [00:00<?, ?it/s]

New best model saved with validation accuracy: 0.0663
Epoch 4/15:
  Train Loss: 0.3376, Train Acc: 0.0327
  Val Loss: 1.7024, Val Acc: 0.0663


Train 5:   0%|          | 0/915 [00:00<?, ?it/s]

Val 5:   0%|          | 0/89 [00:00<?, ?it/s]

Epoch 5/15:
  Train Loss: 0.2926, Train Acc: 0.0276
  Val Loss: 1.6716, Val Acc: 0.0558


Train 6:   0%|          | 0/915 [00:00<?, ?it/s]

Val 6:   0%|          | 0/89 [00:00<?, ?it/s]

New best model saved with validation accuracy: 0.0677
Epoch 6/15:
  Train Loss: 0.2602, Train Acc: 0.0376
  Val Loss: 1.6743, Val Acc: 0.0677


Train 7:   0%|          | 0/915 [00:00<?, ?it/s]

Val 7:   0%|          | 0/89 [00:00<?, ?it/s]

Epoch 7/15:
  Train Loss: 0.2359, Train Acc: 0.0351
  Val Loss: 1.6993, Val Acc: 0.0619


Train 8:   0%|          | 0/915 [00:00<?, ?it/s]

Val 8:   0%|          | 0/89 [00:00<?, ?it/s]

New best model saved with validation accuracy: 0.0704
Epoch 8/15:
  Train Loss: 0.2150, Train Acc: 0.0403
  Val Loss: 1.7026, Val Acc: 0.0704


Train 9:   0%|          | 0/915 [00:00<?, ?it/s]

Val 9:   0%|          | 0/89 [00:00<?, ?it/s]

New best model saved with validation accuracy: 0.0871
Epoch 9/15:
  Train Loss: 0.1993, Train Acc: 0.0531
  Val Loss: 1.6872, Val Acc: 0.0871


Train 10:   0%|          | 0/915 [00:00<?, ?it/s]

Val 10:   0%|          | 0/89 [00:00<?, ?it/s]

New best model saved with validation accuracy: 0.1024
Epoch 10/15:
  Train Loss: 0.1854, Train Acc: 0.0623
  Val Loss: 1.7864, Val Acc: 0.1024


Train 11:   0%|          | 0/915 [00:00<?, ?it/s]

Val 11:   0%|          | 0/89 [00:00<?, ?it/s]

New best model saved with validation accuracy: 0.1446
Epoch 11/15:
  Train Loss: 0.1731, Train Acc: 0.0901
  Val Loss: 1.7756, Val Acc: 0.1446


Train 12:   0%|          | 0/915 [00:00<?, ?it/s]

Val 12:   0%|          | 0/89 [00:00<?, ?it/s]

New best model saved with validation accuracy: 0.1795
Epoch 12/15:
  Train Loss: 0.1625, Train Acc: 0.1198
  Val Loss: 1.8135, Val Acc: 0.1795


Train 13:   0%|          | 0/915 [00:00<?, ?it/s]

Val 13:   0%|          | 0/89 [00:00<?, ?it/s]

New best model saved with validation accuracy: 0.2031
Epoch 13/15:
  Train Loss: 0.1532, Train Acc: 0.1513
  Val Loss: 1.8278, Val Acc: 0.2031


Train 14:   0%|          | 0/915 [00:00<?, ?it/s]

Val 14:   0%|          | 0/89 [00:00<?, ?it/s]

New best model saved with validation accuracy: 0.2733
Epoch 14/15:
  Train Loss: 0.1452, Train Acc: 0.2675
  Val Loss: 1.8216, Val Acc: 0.2733


Train 15:   0%|          | 0/915 [00:00<?, ?it/s]

Val 15:   0%|          | 0/89 [00:00<?, ?it/s]

Epoch 15/15:
  Train Loss: 0.1371, Train Acc: 0.2530
  Val Loss: 1.8543, Val Acc: 0.2525
Final test accuracy: 0.2290


0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
test_accuracy,▁
train_accuracy,▁▁▁▁▁▁▁▁▂▂▃▄▅██
train_loss,█▃▃▂▂▂▂▁▁▁▁▁▁▁▁
validation_accuracy,▁▁▁▁▁▁▁▂▂▃▄▅▆█▇
validation_loss,█▃▁▁▁▁▁▁▁▃▂▃▃▃▃

0,1
epoch,15.0
test_accuracy,0.22899
train_accuracy,0.25301
train_loss,0.13713
validation_accuracy,0.25251
validation_loss,1.85434


[34m[1mwandb[0m: Agent Starting Run: yoqyf9dc with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	bidirectional: False
[34m[1mwandb[0m: 	dropout: 0.1
[34m[1mwandb[0m: 	embedding_dim: 256
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_dim: 256
[34m[1mwandb[0m: 	learning_rate: 0.0005
[34m[1mwandb[0m: 	num_layers: 1
[34m[1mwandb[0m: 	optimizer: Adam
[34m[1mwandb[0m: 	rnn_type: GRU
[34m[1mwandb[0m: 	seed: 46
[34m[1mwandb[0m: 	teacher_forcing: 0.5


Using device: cuda
Loading te data...
Loading cached vocabularies from ./cache/te_dakshina_vocab.pkl
Building model...
Training model...


Epochs:   0%|          | 0/10 [00:00<?, ?it/s]

Train 1:   0%|          | 0/1830 [00:00<?, ?it/s]

Val 1:   0%|          | 0/178 [00:00<?, ?it/s]

New best model saved with validation accuracy: 0.2486
Epoch 1/10:
  Train Loss: 1.3685, Train Acc: 0.2754
  Val Loss: 1.1046, Val Acc: 0.2486


Train 2:   0%|          | 0/1830 [00:00<?, ?it/s]

Val 2:   0%|          | 0/178 [00:00<?, ?it/s]

New best model saved with validation accuracy: 0.3451
Epoch 2/10:
  Train Loss: 0.5899, Train Acc: 0.4031
  Val Loss: 0.9394, Val Acc: 0.3451


Train 3:   0%|          | 0/1830 [00:00<?, ?it/s]

Val 3:   0%|          | 0/178 [00:00<?, ?it/s]

New best model saved with validation accuracy: 0.3595
Epoch 3/10:
  Train Loss: 0.4338, Train Acc: 0.4884
  Val Loss: 0.8652, Val Acc: 0.3595


Train 4:   0%|          | 0/1830 [00:00<?, ?it/s]

Val 4:   0%|          | 0/178 [00:00<?, ?it/s]

New best model saved with validation accuracy: 0.4513
Epoch 4/10:
  Train Loss: 0.3563, Train Acc: 0.6240
  Val Loss: 0.8275, Val Acc: 0.4513


Train 5:   0%|          | 0/1830 [00:00<?, ?it/s]

Val 5:   0%|          | 0/178 [00:00<?, ?it/s]

Epoch 5/10:
  Train Loss: 0.3036, Train Acc: 0.5399
  Val Loss: 0.8134, Val Acc: 0.3836


Train 6:   0%|          | 0/1830 [00:00<?, ?it/s]

Val 6:   0%|          | 0/178 [00:00<?, ?it/s]

Epoch 6/10:
  Train Loss: 0.2617, Train Acc: 0.5644
  Val Loss: 0.8644, Val Acc: 0.3757


Train 7:   0%|          | 0/1830 [00:00<?, ?it/s]

Val 7:   0%|          | 0/178 [00:00<?, ?it/s]

Epoch 7/10:
  Train Loss: 0.2324, Train Acc: 0.6165
  Val Loss: 0.8550, Val Acc: 0.3848


Train 8:   0%|          | 0/1830 [00:00<?, ?it/s]

Val 8:   0%|          | 0/178 [00:00<?, ?it/s]

Epoch 8/10:
  Train Loss: 0.2078, Train Acc: 0.5766
  Val Loss: 0.9106, Val Acc: 0.3644


Train 9:   0%|          | 0/1830 [00:00<?, ?it/s]

Val 9:   0%|          | 0/178 [00:00<?, ?it/s]

Epoch 9/10:
  Train Loss: 0.1907, Train Acc: 0.7382
  Val Loss: 0.8979, Val Acc: 0.4510


Train 10:   0%|          | 0/1830 [00:00<?, ?it/s]

Val 10:   0%|          | 0/178 [00:00<?, ?it/s]

Epoch 10/10:
  Train Loss: 0.1709, Train Acc: 0.6594
  Val Loss: 0.9282, Val Acc: 0.3913
Final test accuracy: 0.3774


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
train_accuracy,▁▃▄▆▅▅▆▆█▇
train_loss,█▃▃▂▂▂▁▁▁▁
validation_accuracy,▁▄▅█▆▅▆▅█▆
validation_loss,█▄▂▁▁▂▂▃▃▄

0,1
epoch,10.0
test_accuracy,0.37741
train_accuracy,0.65937
train_loss,0.17093
validation_accuracy,0.39134
validation_loss,0.92817


[34m[1mwandb[0m: Agent Starting Run: m36omgl4 with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	bidirectional: False
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	embedding_dim: 512
[34m[1mwandb[0m: 	epochs: 20
[34m[1mwandb[0m: 	hidden_dim: 128
[34m[1mwandb[0m: 	learning_rate: 0.0008
[34m[1mwandb[0m: 	num_layers: 1
[34m[1mwandb[0m: 	optimizer: NAdam
[34m[1mwandb[0m: 	rnn_type: GRU
[34m[1mwandb[0m: 	seed: 44
[34m[1mwandb[0m: 	teacher_forcing: 1


Using device: cuda
Loading te data...
Loading cached vocabularies from ./cache/te_dakshina_vocab.pkl
Building model...
Training model...


Epochs:   0%|          | 0/20 [00:00<?, ?it/s]

Train 1:   0%|          | 0/1830 [00:00<?, ?it/s]

Val 1:   0%|          | 0/178 [00:00<?, ?it/s]

New best model saved with validation accuracy: 0.0060
Epoch 1/20:
  Train Loss: 1.2626, Train Acc: 0.0034
  Val Loss: 2.4469, Val Acc: 0.0060


Train 2:   0%|          | 0/1830 [00:00<?, ?it/s]

Val 2:   0%|          | 0/178 [00:00<?, ?it/s]

New best model saved with validation accuracy: 0.0151
Epoch 2/20:
  Train Loss: 0.6264, Train Acc: 0.0071
  Val Loss: 2.2398, Val Acc: 0.0151


Train 3:   0%|          | 0/1830 [00:00<?, ?it/s]

Val 3:   0%|          | 0/178 [00:00<?, ?it/s]

New best model saved with validation accuracy: 0.0164
Epoch 3/20:
  Train Loss: 0.5007, Train Acc: 0.0084
  Val Loss: 2.1510, Val Acc: 0.0164


Train 4:   0%|          | 0/1830 [00:00<?, ?it/s]

Val 4:   0%|          | 0/178 [00:00<?, ?it/s]

New best model saved with validation accuracy: 0.0171
Epoch 4/20:
  Train Loss: 0.4372, Train Acc: 0.0097
  Val Loss: 2.2828, Val Acc: 0.0171


Train 5:   0%|          | 0/1830 [00:00<?, ?it/s]

Val 5:   0%|          | 0/178 [00:00<?, ?it/s]

New best model saved with validation accuracy: 0.0197
Epoch 5/20:
  Train Loss: 0.3979, Train Acc: 0.0126
  Val Loss: 2.2755, Val Acc: 0.0197


Train 6:   0%|          | 0/1830 [00:00<?, ?it/s]

Val 6:   0%|          | 0/178 [00:00<?, ?it/s]

Epoch 6/20:
  Train Loss: 0.3722, Train Acc: 0.0109
  Val Loss: 2.2979, Val Acc: 0.0146


Train 7:   0%|          | 0/1830 [00:00<?, ?it/s]

Val 7:   0%|          | 0/178 [00:00<?, ?it/s]

New best model saved with validation accuracy: 0.0227
Epoch 7/20:
  Train Loss: 0.3487, Train Acc: 0.0169
  Val Loss: 2.2745, Val Acc: 0.0227


Train 8:   0%|          | 0/1830 [00:00<?, ?it/s]

Val 8:   0%|          | 0/178 [00:00<?, ?it/s]

Epoch 8/20:
  Train Loss: 0.3306, Train Acc: 0.0151
  Val Loss: 2.3367, Val Acc: 0.0206


Train 9:   0%|          | 0/1830 [00:00<?, ?it/s]

Val 9:   0%|          | 0/178 [00:00<?, ?it/s]

Epoch 9/20:
  Train Loss: 0.3151, Train Acc: 0.0111
  Val Loss: 2.3898, Val Acc: 0.0160


Train 10:   0%|          | 0/1830 [00:00<?, ?it/s]

Val 10:   0%|          | 0/178 [00:00<?, ?it/s]

Epoch 10/20:
  Train Loss: 0.3022, Train Acc: 0.0166
  Val Loss: 2.3540, Val Acc: 0.0227


Train 11:   0%|          | 0/1830 [00:00<?, ?it/s]

Val 11:   0%|          | 0/178 [00:00<?, ?it/s]

Epoch 11/20:
  Train Loss: 0.2893, Train Acc: 0.0154
  Val Loss: 2.4077, Val Acc: 0.0225


Train 12:   0%|          | 0/1830 [00:00<?, ?it/s]

Val 12:   0%|          | 0/178 [00:00<?, ?it/s]

New best model saved with validation accuracy: 0.0391
Epoch 12/20:
  Train Loss: 0.2787, Train Acc: 0.0323
  Val Loss: 2.3467, Val Acc: 0.0391


Train 13:   0%|          | 0/1830 [00:00<?, ?it/s]

Val 13:   0%|          | 0/178 [00:00<?, ?it/s]

Epoch 13/20:
  Train Loss: 0.2680, Train Acc: 0.0279
  Val Loss: 2.3467, Val Acc: 0.0362


Train 14:   0%|          | 0/1830 [00:00<?, ?it/s]

Val 14:   0%|          | 0/178 [00:00<?, ?it/s]

Epoch 14/20:
  Train Loss: 0.2591, Train Acc: 0.0271
  Val Loss: 2.4558, Val Acc: 0.0352


Train 15:   0%|          | 0/1830 [00:00<?, ?it/s]

Val 15:   0%|          | 0/178 [00:00<?, ?it/s]

Epoch 15/20:
  Train Loss: 0.2508, Train Acc: 0.0190
  Val Loss: 2.3590, Val Acc: 0.0208


Train 16:   0%|          | 0/1830 [00:00<?, ?it/s]

Val 16:   0%|          | 0/178 [00:00<?, ?it/s]

Epoch 16/20:
  Train Loss: 0.2447, Train Acc: 0.0219
  Val Loss: 2.5260, Val Acc: 0.0239


Train 17:   0%|          | 0/1830 [00:00<?, ?it/s]

Val 17:   0%|          | 0/178 [00:00<?, ?it/s]

Epoch 17/20:
  Train Loss: 0.2374, Train Acc: 0.0208
  Val Loss: 2.4101, Val Acc: 0.0246


Train 18:   0%|          | 0/1830 [00:00<?, ?it/s]

Val 18:   0%|          | 0/178 [00:00<?, ?it/s]

Epoch 18/20:
  Train Loss: 0.2295, Train Acc: 0.0190
  Val Loss: 2.4196, Val Acc: 0.0201


Train 19:   0%|          | 0/1830 [00:00<?, ?it/s]

Val 19:   0%|          | 0/178 [00:00<?, ?it/s]

Epoch 19/20:
  Train Loss: 0.2235, Train Acc: 0.0202
  Val Loss: 2.4301, Val Acc: 0.0206


Train 20:   0%|          | 0/1830 [00:00<?, ?it/s]

Val 20:   0%|          | 0/178 [00:00<?, ?it/s]

Epoch 20/20:
  Train Loss: 0.2168, Train Acc: 0.0222
  Val Loss: 2.5076, Val Acc: 0.0220
Final test accuracy: 0.0259


0,1
epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
test_accuracy,▁
train_accuracy,▁▂▂▃▃▃▄▄▃▄▄█▇▇▅▅▅▅▅▆
train_loss,█▄▃▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁
validation_accuracy,▁▃▃▃▄▃▅▄▃▅▅█▇▇▄▅▅▄▄▄
validation_loss,▇▃▁▃▃▄▃▄▅▅▆▅▅▇▅█▆▆▆█

0,1
epoch,20.0
test_accuracy,0.02593
train_accuracy,0.0222
train_loss,0.21685
validation_accuracy,0.022
validation_loss,2.50763


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 3uyrug7n with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	bidirectional: True
[34m[1mwandb[0m: 	dropout: 0
[34m[1mwandb[0m: 	embedding_dim: 256
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	hidden_dim: 256
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	num_layers: 1
[34m[1mwandb[0m: 	optimizer: Adam
[34m[1mwandb[0m: 	rnn_type: RNN
[34m[1mwandb[0m: 	seed: 43
[34m[1mwandb[0m: 	teacher_forcing: 0.5


Using device: cuda
Loading te data...
Loading cached vocabularies from ./cache/te_dakshina_vocab.pkl
Building model...
Training model...


Epochs:   0%|          | 0/15 [00:00<?, ?it/s]

Train 1:   0%|          | 0/915 [00:00<?, ?it/s]

Val 1:   0%|          | 0/89 [00:00<?, ?it/s]

Epoch 1/15:
  Train Loss: 2.7059, Train Acc: 0.0004
  Val Loss: 2.6857, Val Acc: 0.0000


Train 2:   0%|          | 0/915 [00:00<?, ?it/s]

Val 2:   0%|          | 0/89 [00:00<?, ?it/s]

New best model saved with validation accuracy: 0.0004
Epoch 2/15:
  Train Loss: 2.3298, Train Acc: 0.0011
  Val Loss: 2.5334, Val Acc: 0.0004


Train 3:   0%|          | 0/915 [00:00<?, ?it/s]

Val 3:   0%|          | 0/89 [00:00<?, ?it/s]

New best model saved with validation accuracy: 0.0014
Epoch 3/15:
  Train Loss: 2.1961, Train Acc: 0.0023
  Val Loss: 2.4109, Val Acc: 0.0014


Train 4:   0%|          | 0/915 [00:00<?, ?it/s]

Val 4:   0%|          | 0/89 [00:00<?, ?it/s]

New best model saved with validation accuracy: 0.0042
Epoch 4/15:
  Train Loss: 2.0499, Train Acc: 0.0042
  Val Loss: 2.2903, Val Acc: 0.0042


Train 5:   0%|          | 0/915 [00:00<?, ?it/s]

Val 5:   0%|          | 0/89 [00:00<?, ?it/s]

New best model saved with validation accuracy: 0.0084
Epoch 5/15:
  Train Loss: 1.9123, Train Acc: 0.0091
  Val Loss: 2.1741, Val Acc: 0.0084


Train 6:   0%|          | 0/915 [00:00<?, ?it/s]

Val 6:   0%|          | 0/89 [00:00<?, ?it/s]

New best model saved with validation accuracy: 0.0146
Epoch 6/15:
  Train Loss: 1.7824, Train Acc: 0.0180
  Val Loss: 2.0822, Val Acc: 0.0146


Train 7:   0%|          | 0/915 [00:00<?, ?it/s]

Val 7:   0%|          | 0/89 [00:00<?, ?it/s]

New best model saved with validation accuracy: 0.0216
Epoch 7/15:
  Train Loss: 1.6728, Train Acc: 0.0290
  Val Loss: 1.9997, Val Acc: 0.0216


Train 8:   0%|          | 0/915 [00:00<?, ?it/s]

Val 8:   0%|          | 0/89 [00:00<?, ?it/s]

New best model saved with validation accuracy: 0.0324
Epoch 8/15:
  Train Loss: 1.5794, Train Acc: 0.0390
  Val Loss: 1.9012, Val Acc: 0.0324


Train 9:   0%|          | 0/915 [00:00<?, ?it/s]

Val 9:   0%|          | 0/89 [00:00<?, ?it/s]

New best model saved with validation accuracy: 0.0482
Epoch 9/15:
  Train Loss: 1.4832, Train Acc: 0.0560
  Val Loss: 1.8173, Val Acc: 0.0482


Train 10:   0%|          | 0/915 [00:00<?, ?it/s]

Val 10:   0%|          | 0/89 [00:00<?, ?it/s]

New best model saved with validation accuracy: 0.0593
Epoch 10/15:
  Train Loss: 1.4036, Train Acc: 0.0699
  Val Loss: 1.7572, Val Acc: 0.0593


Train 11:   0%|          | 0/915 [00:00<?, ?it/s]

Val 11:   0%|          | 0/89 [00:00<?, ?it/s]

New best model saved with validation accuracy: 0.0670
Epoch 11/15:
  Train Loss: 1.3329, Train Acc: 0.0803
  Val Loss: 1.7095, Val Acc: 0.0670


Train 12:   0%|          | 0/915 [00:00<?, ?it/s]

Val 12:   0%|          | 0/89 [00:00<?, ?it/s]

New best model saved with validation accuracy: 0.0867
Epoch 12/15:
  Train Loss: 1.2592, Train Acc: 0.1042
  Val Loss: 1.6632, Val Acc: 0.0867


Train 13:   0%|          | 0/915 [00:00<?, ?it/s]

Val 13:   0%|          | 0/89 [00:00<?, ?it/s]

New best model saved with validation accuracy: 0.1008
Epoch 13/15:
  Train Loss: 1.2038, Train Acc: 0.1220
  Val Loss: 1.5960, Val Acc: 0.1008


Train 14:   0%|          | 0/915 [00:00<?, ?it/s]

Val 14:   0%|          | 0/89 [00:00<?, ?it/s]

New best model saved with validation accuracy: 0.1126
Epoch 14/15:
  Train Loss: 1.1469, Train Acc: 0.1366
  Val Loss: 1.5485, Val Acc: 0.1126


Train 15:   0%|          | 0/915 [00:00<?, ?it/s]

Val 15:   0%|          | 0/89 [00:00<?, ?it/s]

New best model saved with validation accuracy: 0.1262
Epoch 15/15:
  Train Loss: 1.0994, Train Acc: 0.1569
  Val Loss: 1.4884, Val Acc: 0.1262
Final test accuracy: 0.1194


0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
test_accuracy,▁
train_accuracy,▁▁▁▁▁▂▂▃▃▄▅▆▆▇█
train_loss,█▆▆▅▅▄▃▃▃▂▂▂▁▁▁
validation_accuracy,▁▁▁▁▁▂▂▃▄▄▅▆▇▇█
validation_loss,█▇▆▆▅▄▄▃▃▃▂▂▂▁▁

0,1
epoch,15.0
test_accuracy,0.11937
train_accuracy,0.15693
train_loss,1.09945
validation_accuracy,0.12617
validation_loss,1.4884


[34m[1mwandb[0m: Agent Starting Run: 7s4aa18p with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	bidirectional: True
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embedding_dim: 128
[34m[1mwandb[0m: 	epochs: 20
[34m[1mwandb[0m: 	hidden_dim: 128
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	num_layers: 1
[34m[1mwandb[0m: 	optimizer: NAdam
[34m[1mwandb[0m: 	rnn_type: RNN
[34m[1mwandb[0m: 	seed: 43
[34m[1mwandb[0m: 	teacher_forcing: 0.3


Using device: cuda
Loading te data...
Loading cached vocabularies from ./cache/te_dakshina_vocab.pkl
Building model...
Training model...


Epochs:   0%|          | 0/20 [00:00<?, ?it/s]

Train 1:   0%|          | 0/1830 [00:00<?, ?it/s]

Val 1:   0%|          | 0/178 [00:00<?, ?it/s]

New best model saved with validation accuracy: 0.0002
Epoch 1/20:
  Train Loss: 2.8711, Train Acc: 0.0002
  Val Loss: 2.6661, Val Acc: 0.0002


Train 2:   0%|          | 0/1830 [00:00<?, ?it/s]

Val 2:   0%|          | 0/178 [00:00<?, ?it/s]

Epoch 2/20:
  Train Loss: 2.4961, Train Acc: 0.0005
  Val Loss: 2.5540, Val Acc: 0.0000


Train 3:   0%|          | 0/1830 [00:00<?, ?it/s]

Val 3:   0%|          | 0/178 [00:00<?, ?it/s]

New best model saved with validation accuracy: 0.0004
Epoch 3/20:
  Train Loss: 2.4058, Train Acc: 0.0008
  Val Loss: 2.4784, Val Acc: 0.0004


Train 4:   0%|          | 0/1830 [00:00<?, ?it/s]

Val 4:   0%|          | 0/178 [00:00<?, ?it/s]

New best model saved with validation accuracy: 0.0007
Epoch 4/20:
  Train Loss: 2.3259, Train Acc: 0.0014
  Val Loss: 2.4130, Val Acc: 0.0007


Train 5:   0%|          | 0/1830 [00:00<?, ?it/s]

Val 5:   0%|          | 0/178 [00:00<?, ?it/s]

New best model saved with validation accuracy: 0.0019
Epoch 5/20:
  Train Loss: 2.2475, Train Acc: 0.0023
  Val Loss: 2.3146, Val Acc: 0.0019


Train 6:   0%|          | 0/1830 [00:00<?, ?it/s]

Val 6:   0%|          | 0/178 [00:00<?, ?it/s]

New best model saved with validation accuracy: 0.0026
Epoch 6/20:
  Train Loss: 2.1703, Train Acc: 0.0039
  Val Loss: 2.2278, Val Acc: 0.0026


Train 7:   0%|          | 0/1830 [00:00<?, ?it/s]

Val 7:   0%|          | 0/178 [00:00<?, ?it/s]

New best model saved with validation accuracy: 0.0044
Epoch 7/20:
  Train Loss: 2.0872, Train Acc: 0.0068
  Val Loss: 2.1626, Val Acc: 0.0044


Train 8:   0%|          | 0/1830 [00:00<?, ?it/s]

Val 8:   0%|          | 0/178 [00:00<?, ?it/s]

New best model saved with validation accuracy: 0.0069
Epoch 8/20:
  Train Loss: 2.0123, Train Acc: 0.0096
  Val Loss: 2.0959, Val Acc: 0.0069


Train 9:   0%|          | 0/1830 [00:00<?, ?it/s]

Val 9:   0%|          | 0/178 [00:00<?, ?it/s]

Epoch 9/20:
  Train Loss: 1.9478, Train Acc: 0.0116
  Val Loss: 2.0396, Val Acc: 0.0069


Train 10:   0%|          | 0/1830 [00:00<?, ?it/s]

Val 10:   0%|          | 0/178 [00:00<?, ?it/s]

New best model saved with validation accuracy: 0.0095
Epoch 10/20:
  Train Loss: 1.8934, Train Acc: 0.0145
  Val Loss: 1.9885, Val Acc: 0.0095


Train 11:   0%|          | 0/1830 [00:00<?, ?it/s]

Val 11:   0%|          | 0/178 [00:00<?, ?it/s]

New best model saved with validation accuracy: 0.0171
Epoch 11/20:
  Train Loss: 1.8358, Train Acc: 0.0189
  Val Loss: 1.9450, Val Acc: 0.0171


Train 12:   0%|          | 0/1830 [00:00<?, ?it/s]

Val 12:   0%|          | 0/178 [00:00<?, ?it/s]

New best model saved with validation accuracy: 0.0178
Epoch 12/20:
  Train Loss: 1.7874, Train Acc: 0.0241
  Val Loss: 1.8963, Val Acc: 0.0178


Train 13:   0%|          | 0/1830 [00:00<?, ?it/s]

Val 13:   0%|          | 0/178 [00:00<?, ?it/s]

New best model saved with validation accuracy: 0.0231
Epoch 13/20:
  Train Loss: 1.7336, Train Acc: 0.0282
  Val Loss: 1.8542, Val Acc: 0.0231


Train 14:   0%|          | 0/1830 [00:00<?, ?it/s]

Val 14:   0%|          | 0/178 [00:00<?, ?it/s]

New best model saved with validation accuracy: 0.0317
Epoch 14/20:
  Train Loss: 1.6915, Train Acc: 0.0329
  Val Loss: 1.8176, Val Acc: 0.0317


Train 15:   0%|          | 0/1830 [00:00<?, ?it/s]

Val 15:   0%|          | 0/178 [00:00<?, ?it/s]

New best model saved with validation accuracy: 0.0355
Epoch 15/20:
  Train Loss: 1.6473, Train Acc: 0.0401
  Val Loss: 1.7720, Val Acc: 0.0355


Train 16:   0%|          | 0/1830 [00:00<?, ?it/s]

Val 16:   0%|          | 0/178 [00:00<?, ?it/s]

New best model saved with validation accuracy: 0.0371
Epoch 16/20:
  Train Loss: 1.5998, Train Acc: 0.0438
  Val Loss: 1.7398, Val Acc: 0.0371


Train 17:   0%|          | 0/1830 [00:00<?, ?it/s]

Val 17:   0%|          | 0/178 [00:00<?, ?it/s]

New best model saved with validation accuracy: 0.0375
Epoch 17/20:
  Train Loss: 1.5615, Train Acc: 0.0507
  Val Loss: 1.7239, Val Acc: 0.0375


Train 18:   0%|          | 0/1830 [00:00<?, ?it/s]

Val 18:   0%|          | 0/178 [00:00<?, ?it/s]

New best model saved with validation accuracy: 0.0445
Epoch 18/20:
  Train Loss: 1.5264, Train Acc: 0.0553
  Val Loss: 1.6681, Val Acc: 0.0445


Train 19:   0%|          | 0/1830 [00:00<?, ?it/s]

Val 19:   0%|          | 0/178 [00:00<?, ?it/s]

New best model saved with validation accuracy: 0.0509
Epoch 19/20:
  Train Loss: 1.4915, Train Acc: 0.0611
  Val Loss: 1.6382, Val Acc: 0.0509


Train 20:   0%|          | 0/1830 [00:00<?, ?it/s]

Val 20:   0%|          | 0/178 [00:00<?, ?it/s]

New best model saved with validation accuracy: 0.0558
Epoch 20/20:
  Train Loss: 1.4560, Train Acc: 0.0667
  Val Loss: 1.6306, Val Acc: 0.0558
Final test accuracy: 0.0602


0,1
epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
test_accuracy,▁
train_accuracy,▁▁▁▁▁▁▂▂▂▃▃▄▄▄▅▆▆▇▇█
train_loss,█▆▆▅▅▅▄▄▃▃▃▃▂▂▂▂▂▁▁▁
validation_accuracy,▁▁▁▁▁▁▂▂▂▂▃▃▄▅▅▆▆▇▇█
validation_loss,█▇▇▆▆▅▅▄▄▃▃▃▃▂▂▂▂▁▁▁

0,1
epoch,20.0
test_accuracy,0.06021
train_accuracy,0.06675
train_loss,1.45602
validation_accuracy,0.05578
validation_loss,1.63058


[34m[1mwandb[0m: Agent Starting Run: 05cxplpv with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	bidirectional: True
[34m[1mwandb[0m: 	dropout: 0
[34m[1mwandb[0m: 	embedding_dim: 128
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_dim: 512
[34m[1mwandb[0m: 	learning_rate: 0.0005
[34m[1mwandb[0m: 	num_layers: 1
[34m[1mwandb[0m: 	optimizer: NAdam
[34m[1mwandb[0m: 	rnn_type: RNN
[34m[1mwandb[0m: 	seed: 42
[34m[1mwandb[0m: 	teacher_forcing: 0.7


Using device: cuda
Loading te data...
Loading cached vocabularies from ./cache/te_dakshina_vocab.pkl
Building model...
Training model...


Epochs:   0%|          | 0/10 [00:00<?, ?it/s]

Train 1:   0%|          | 0/1830 [00:00<?, ?it/s]

Val 1:   0%|          | 0/178 [00:00<?, ?it/s]

New best model saved with validation accuracy: 0.1156
Epoch 1/10:
  Train Loss: 1.4337, Train Acc: 0.1427
  Val Loss: 1.5242, Val Acc: 0.1156


Train 2:   0%|          | 0/1830 [00:00<?, ?it/s]

Val 2:   0%|          | 0/178 [00:00<?, ?it/s]

New best model saved with validation accuracy: 0.1645
Epoch 2/10:
  Train Loss: 0.7735, Train Acc: 0.2366
  Val Loss: 1.3703, Val Acc: 0.1645


Train 3:   0%|          | 0/1830 [00:00<?, ?it/s]

Val 3:   0%|          | 0/178 [00:00<?, ?it/s]

Epoch 3/10:
  Train Loss: 0.5910, Train Acc: 0.1078
  Val Loss: 1.3226, Val Acc: 0.0940


Train 4:   0%|          | 0/1830 [00:00<?, ?it/s]

Val 4:   0%|          | 0/178 [00:00<?, ?it/s]

Epoch 4/10:
  Train Loss: 0.4910, Train Acc: 0.2010
  Val Loss: 1.3219, Val Acc: 0.1494


Train 5:   0%|          | 0/1830 [00:00<?, ?it/s]

Val 5:   0%|          | 0/178 [00:00<?, ?it/s]

New best model saved with validation accuracy: 0.1897
Epoch 5/10:
  Train Loss: 0.4288, Train Acc: 0.3030
  Val Loss: 1.3396, Val Acc: 0.1897


Train 6:   0%|          | 0/1830 [00:00<?, ?it/s]

Val 6:   0%|          | 0/178 [00:00<?, ?it/s]

Epoch 6/10:
  Train Loss: 0.3843, Train Acc: 0.2690
  Val Loss: 1.3261, Val Acc: 0.1775


Train 7:   0%|          | 0/1830 [00:00<?, ?it/s]

Val 7:   0%|          | 0/178 [00:00<?, ?it/s]

Epoch 7/10:
  Train Loss: 0.3506, Train Acc: 0.2812
  Val Loss: 1.3818, Val Acc: 0.1714


Train 8:   0%|          | 0/1830 [00:00<?, ?it/s]

Val 8:   0%|          | 0/178 [00:00<?, ?it/s]

Epoch 8/10:
  Train Loss: 0.3215, Train Acc: 0.3325
  Val Loss: 1.3998, Val Acc: 0.1885


Train 9:   0%|          | 0/1830 [00:00<?, ?it/s]

Val 9:   0%|          | 0/178 [00:00<?, ?it/s]

Epoch 9/10:
  Train Loss: 0.2985, Train Acc: 0.2690
  Val Loss: 1.4343, Val Acc: 0.1402


Train 10:   0%|          | 0/1830 [00:00<?, ?it/s]

Val 10:   0%|          | 0/178 [00:00<?, ?it/s]

New best model saved with validation accuracy: 0.1978
Epoch 10/10:
  Train Loss: 0.2804, Train Acc: 0.3994
  Val Loss: 1.4168, Val Acc: 0.1978
Final test accuracy: 0.1891


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
train_accuracy,▂▄▁▃▆▅▅▆▅█
train_loss,█▄▃▂▂▂▁▁▁▁
validation_accuracy,▂▆▁▅▇▇▆▇▄█
validation_loss,█▃▁▁▂▁▃▄▅▄

0,1
epoch,10.0
test_accuracy,0.18914
train_accuracy,0.3994
train_loss,0.28037
validation_accuracy,0.19778
validation_loss,1.41678


[34m[1mwandb[0m: Agent Starting Run: o5wwraw8 with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	bidirectional: True
[34m[1mwandb[0m: 	dropout: 0.5
[34m[1mwandb[0m: 	embedding_dim: 256
[34m[1mwandb[0m: 	epochs: 20
[34m[1mwandb[0m: 	hidden_dim: 1024
[34m[1mwandb[0m: 	learning_rate: 0.0008
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: NAdam
[34m[1mwandb[0m: 	rnn_type: GRU
[34m[1mwandb[0m: 	seed: 44
[34m[1mwandb[0m: 	teacher_forcing: 0.3


Using device: cuda
Loading te data...
Loading cached vocabularies from ./cache/te_dakshina_vocab.pkl
Building model...
Training model...


Epochs:   0%|          | 0/20 [00:00<?, ?it/s]

Train 1:   0%|          | 0/1830 [00:00<?, ?it/s]

Val 1:   0%|          | 0/178 [00:00<?, ?it/s]

New best model saved with validation accuracy: 0.4007
Epoch 1/20:
  Train Loss: 0.9567, Train Acc: 0.4782
  Val Loss: 0.7584, Val Acc: 0.4007


Train 2:   0%|          | 0/1830 [00:00<?, ?it/s]

Val 2:   0%|          | 0/178 [00:00<?, ?it/s]

New best model saved with validation accuracy: 0.4457
Epoch 2/20:
  Train Loss: 0.5557, Train Acc: 0.5667
  Val Loss: 0.7161, Val Acc: 0.4457


Train 3:   0%|          | 0/1830 [00:00<?, ?it/s]

Val 3:   0%|          | 0/178 [00:00<?, ?it/s]

Epoch 3/20:
  Train Loss: 0.4977, Train Acc: 0.5833
  Val Loss: 0.7209, Val Acc: 0.4286


Train 4:   0%|          | 0/1830 [00:00<?, ?it/s]

Val 4:   0%|          | 0/178 [00:00<?, ?it/s]

Epoch 4/20:
  Train Loss: 0.4861, Train Acc: 0.5133
  Val Loss: 0.7239, Val Acc: 0.3991


Train 5:   0%|          | 0/1830 [00:00<?, ?it/s]

Val 5:   0%|          | 0/178 [00:00<?, ?it/s]

Epoch 5/20:
  Train Loss: 0.4850, Train Acc: 0.5066
  Val Loss: 0.7158, Val Acc: 0.3752


Train 6:   0%|          | 0/1830 [00:00<?, ?it/s]

Val 6:   0%|          | 0/178 [00:00<?, ?it/s]

Epoch 6/20:
  Train Loss: 0.4916, Train Acc: 0.5182
  Val Loss: 0.7310, Val Acc: 0.3882


Train 7:   0%|          | 0/1830 [00:00<?, ?it/s]