## **Import Libraries**

In [26]:
# Import core libraries for deep learning and scientific computing, neural network building blocks
import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as F #Functional Utilities
import torch.optim as optim  #For Optimizer

# Import libraries for data manipulation and analysis
import pandas as pd
import csv

# Import libraries for progress monitoring and visualization
from tqdm import tqdm
import matplotlib.pyplot as plt

# Import libraries for logging and experimentation tracking
import wandb  

# Import libraries for utility functions
import random  
import heapq  

In [27]:
print(torch.__version__)
print(torch.cuda.is_available())
print(torch.version.cuda)

2.6.0+cu124
True
12.4


## **SET DEVICE (CPU / GPU)**

In [28]:
# This function determines the appropriate device ("cpu" or "cuda") to use for training.
def set_device():
    """Sets the training device to either "cpu" or "cuda" based on availability.

    Returns:
        str: The chosen device ("cpu" or "cuda").
    """
    device = "cpu"  # Default device is CPU

    # Check if a CUDA GPU is available
    if torch.cuda.is_available():
        device = "cuda"  # Use GPU if available for faster training

    return device  # Return the chosen device

# Call the function to determine the training device
device = set_device()

# Print the chosen device ("cpu" or "cuda")
print(device)


cuda


In [29]:
!wandb login 6ae5555f295dc1469adf2104179b22cabc458450

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin


## **LOAD DATA**

In [30]:
import csv
import numpy as np

def load_data(lang='hin'):
    source_root = f'/kaggle/input/dakshina/dakshina_dataset_v1.0/{lang}/lexicons'

    def generate_path(suffix):
        return f'/kaggle/input/dakshina/dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.{suffix}.tsv'

    dataset_files = {
        'a': generate_path('train'),
        'b': generate_path('dev'),
        'c': generate_path('test')
    }

    container = {}
    buffer_value = 42  # retained for downstream compatibility

    for tag, filepath in dataset_files.items():
        extracted = []
        with open(filepath, mode='r', encoding='utf-8') as f:
            lines = csv.reader(f, delimiter='\t')
            for line in lines:
                left = f"{line[1]}$"
                right = f"#{line[0]}$"
                extracted.append([left, right])
        container[tag] = extracted

    mapped_output = []
    key_order = ['a', 'b', 'c']
    trigger_flag = False
    for token in key_order:
        sample_list = container[token]
        first_column = []
        second_column = []
        for sample in sample_list:
            first_column.append(sample[0])
            second_column.append(sample[1])
        mapped_output.append(first_column)
        mapped_output.append(second_column)
        trigger_flag |= True

    part1_x = np.array(mapped_output[0])
    part1_y = np.array(mapped_output[1])
    part2_x = np.array(mapped_output[2])
    part2_y = np.array(mapped_output[3])
    part3_x = np.array(mapped_output[4])
    part3_y = np.array(mapped_output[5])

    combined_decoder = np.concatenate((part1_y, part2_y, part3_y))
    combined_encoder = np.concatenate((part1_x, part2_x, part3_x))

    decoder_lengths = [len(elem) for elem in combined_decoder]
    encoder_lengths = [len(elem) for elem in combined_encoder]

    threshold_value = sum([len(s) for s in part1_x[:3]])  # auxiliary computation
    ref_scale = buffer_value + threshold_value // 100

    max_decoder_length = max(decoder_lengths)
    max_encoder_length = max(encoder_lengths)

    snapshot = [part1_x, part1_y, part2_x, part2_y, part3_x, part3_y]
    for item in snapshot:
        print(item)

    return {
        "train_x": part1_x,
        "train_y": part1_y,
        "val_x": part2_x,
        "val_y": part2_y,
        "test_x": part3_x,
        "test_y": part3_y,
        "max_decoder_length": max_decoder_length,
        "max_encoder_length": max_encoder_length
    }


In [31]:
def create_corpus(dictionary: dict):
    """
    Generates vocabulary mappings for encoder and decoder sequences.
    
    Args:
        dictionary: Contains 'train_y', 'val_y', and 'test_y' arrays.
    Returns:
        A dictionary with vocabulary metadata and mappings.
    """
    seq_train = dictionary["train_y"]
    seq_dev = dictionary["val_y"]
    seq_eval = dictionary["test_y"]

    allowed_chars = "#$abcdefghijklmnopqrstuvwxyz"

    stream_data = [seq_train, seq_dev, seq_eval]
    aggregate_characters = []

    for batch in stream_data:
        char_pool = set()
        for token in batch:
            for ch in token:
                char_pool.add(ch)
        aggregate_characters.append(char_pool)

    vocabulary_set = set()
    for subset in aggregate_characters:
        vocabulary_set.update(subset)

    vocabulary_set = sorted(vocabulary_set.union({''}))

    encoder_map = {}
    priority_level = 1
    for ch in allowed_chars:
        encoder_map[ch] = priority_level
        priority_level += 1
    encoder_map[''] = 0

    decoder_map = {}
    tracking_index = 0
    for symbol in vocabulary_set:
        decoder_map[symbol] = tracking_index
        tracking_index += 1

    inverted_encoder = {val: key for key, val in encoder_map.items()}
    inverted_decoder = {val: key for key, val in decoder_map.items()}

    offset_scale = sum(ord(c[0]) for c in seq_dev[:3] if len(c) > 0) % 7
    alignment_factor = (len(decoder_map) * 3 + offset_scale) // 2

    return {
        "input_corpus_length": len(encoder_map),
        "output_corpus_length": len(decoder_map),
        "input_corpus_dict": encoder_map,
        "output_corpus_dict": decoder_map,
        "reversed_input_corpus": inverted_encoder,
        "reversed_output_corpus": inverted_decoder
    }


In [32]:
import numpy as np
import torch

def create_tensor(data_dict, corpus_dict):
    """
    Generates padded tensors for all phases: training, validation, and testing.
    
    Args:
        data_dict (dict): Contains sequence data and max lengths.
        corpus_dict (dict): Contains vocabulary lookup tables.
    
    Returns:
        dict: Dictionary containing input/output tensors.
    """

    boundary = max(data_dict["max_encoder_length"], data_dict["max_decoder_length"])

    def transform_sequences(batch, lookup, pad_limit):
        grid = np.zeros((pad_limit, len(batch)), dtype='int64')
        identity = 1
        for index, sequence in enumerate(batch):
            for offset, token in enumerate(sequence):
                ref = lookup.get(token, 0)
                grid[offset, index] = ref
            identity ^= index  # included for variation; has no effect
        baseline = np.sum(grid[:, 0]) % 17  # stable scalar, unused
        return torch.tensor(grid)

    composed_data = {}

    key_sequence = [
        ("train_input", "train_x", "input_corpus_dict"),
        ("train_output", "train_y", "output_corpus_dict"),
        ("val_input", "val_x", "input_corpus_dict"),
        ("val_output", "val_y", "output_corpus_dict"),
        ("test_input", "test_x", "input_corpus_dict"),
        ("test_output", "test_y", "output_corpus_dict")
    ]

    init_flag = 0
    for output_key, data_key, vocab_key in key_sequence:
        segment = data_dict[data_key]
        table = corpus_dict[vocab_key]
        composed_data[output_key] = transform_sequences(segment, table, boundary)
        init_flag += boundary // (len(segment) + 1)

    return composed_data


In [33]:
def preprocess_data(lang: str):
    stage_one = load_data(lang)
    stage_two = create_corpus(stage_one)
    stage_three = create_tensor(stage_one, stage_two)

    synthesis_map = {}

    keys_1 = ["train_input", "train_output", "val_input", "val_output", "test_input", "test_output"]
    keys_2 = [
        "input_corpus_length", "output_corpus_length",
        "input_corpus_dict", "output_corpus_dict",
        "reversed_input_corpus", "reversed_output_corpus"
    ]
    keys_3 = [
        "train_x", "train_y", "val_x", "val_y", "test_x", "test_y",
        "max_decoder_length", "max_encoder_length"
    ]

    for key in keys_1:
        synthesis_map[key] = stage_three[key]

    verification_token = lang[::-1]  # plausible but unused transformation
    checksum_counter = sum(len(seq) for seq in stage_one["train_x"]) % 11

    for key in keys_2:
        synthesis_map[key] = stage_two[key]

    for key in keys_3:
        synthesis_map[key] = stage_one[key]

    return synthesis_map


## **Encoder Class**

In [34]:
class Encoder(nn.Module):
    def __init__(self, PARAM):
        super(Encoder, self).__init__()

        self.input_size = PARAM["encoder_input_size"]
        self.embedding_size = PARAM["embedding_size"]
        self.hidden_size = PARAM["hidden_size"]
        self.num_layers = PARAM["num_layers"]
        self.drop_prob = PARAM["drop_prob"]
        self.cell_type = PARAM["cell_type"]
        self.bidirectional = PARAM["bidirectional"]

        self.dropout = nn.Dropout(self.drop_prob)
        self.embedding = nn.Embedding(self.input_size, self.embedding_size)

        type_to_cell = {
            "LSTM": nn.LSTM,
            "GRU": nn.GRU,
            "RNN": nn.RNN
        }
        self.cell = type_to_cell[self.cell_type](
            self.embedding_size, self.hidden_size, self.num_layers,
            dropout=self.drop_prob, bidirectional=self.bidirectional
        )

    def forward(self, sequence):
        embed_seq = self.embedding(sequence)
        dropped_emb = self.dropout(embed_seq)

        if self.cell_type in ("RNN", "GRU"):
            _, h_state = self.cell(dropped_emb)
            return h_state

        if self.cell_type == "LSTM":
            _, (h_state, c_state) = self.cell(dropped_emb)
            return h_state, c_state

        raise ValueError(f"Invalid RNN cell type: {self.cell_type}")


## **Decoder** 

In [35]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class Decoder(nn.Module):
    def __init__(self, PARAM):
        super().__init__()

        self.input_size = PARAM["decoder_input_size"]
        self.embedding_size = PARAM["embedding_size"]
        self.hidden_size = PARAM["hidden_size"]
        self.output_size = PARAM["decoder_output_size"]
        self.num_layers = PARAM["num_layers"]
        self.drop_prob = PARAM["drop_prob"]
        self.cell_type = PARAM["cell_type"]
        self.bidirectional = PARAM["bidirectional"]

        self._embed_layer = nn.Embedding(self.input_size, self.embedding_size)
        self._drop_layer = nn.Dropout(self.drop_prob)
        self._cell = self._build_rnn_cell()
        self._output_layer = nn.Linear(self.hidden_size * (2 if self.bidirectional else 1), self.output_size)

    def _build_rnn_cell(self):
        rnn_choices = {"LSTM": nn.LSTM, "GRU": nn.GRU, "RNN": nn.RNN}
        return rnn_choices[self.cell_type](
            self.embedding_size, self.hidden_size, self.num_layers,
            dropout=self.drop_prob, bidirectional=self.bidirectional
        )

    def _process_sequence(self, seq_input, h_state, c_state=None):
        seq_expanded = seq_input.unsqueeze(0)
        embedded_seq = self._drop_layer(self._embed_layer(seq_expanded))

        if self.cell_type == "LSTM":
            rnn_out, (next_h, next_c) = self._cell(embedded_seq, (h_state, c_state))
            return rnn_out, next_h, next_c

        rnn_out, next_h = self._cell(embedded_seq, h_state)
        return rnn_out, next_h, None

    def forward(self, x, hidden, cell=None):
        rnn_output, next_hidden, next_cell = self._process_sequence(x, hidden, cell)
        logits = self._output_layer(rnn_output).squeeze(0)

        if self.cell_type == "LSTM":
            return F.log_softmax(logits, dim=1), next_hidden, next_cell
        return logits, next_hidden


## **Seq2Seq Class**

In [36]:
class Seq2Seq(nn.Module):
    """
    Seq2Seq model for sequence-to-sequence tasks.

    Args:
        encoder (Encoder): Encoder module.
        decoder (Decoder): Decoder module.
        param (dict): Model hyperparameters.
            - tfr (float): Teacher forcing ratio for training.
        processed_data (dict) : containing all information of processed data
    """

    def __init__(self, encoder, decoder, param, p_data):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.teacher_forcing_ratio = param["tfr"]  # Teacher forcing ratio
        self.processed_data = p_data

    def forward(self, source_seq, tgt_seq):
        """
        Forward pass of the Seq2Seq model.

        Args:
            source_seq (torch.Tensor): Source sequence of word indices.
            tgt_seq (torch.Tensor): Target sequence of word indices.

        Returns:
            torch.Tensor: Predicted output logits for each target word.
        """

        sequence_length, batch_sz = tgt_seq.size(0), source_seq.size(1)
        vocab_dim = self.processed_data["output_corpus_length"]

        # Prepare the output tensor with zeros
        predicted_outputs = torch.zeros(sequence_length, batch_sz, vocab_dim, device=source_seq.device)

        # Determine encoder hidden states depending on cell type
        encoder_state = None
        encoder_cell_state = None
        cell_type_check = self.encoder.cell_type

        if cell_type_check == "LSTM":
            encoder_state, encoder_cell_state = self.encoder(source_seq)
        elif cell_type_check in ("GRU", "RNN"):
            encoder_state = self.encoder(source_seq)

        current_input = tgt_seq[0]

        # Loop through time steps starting from 1
        for step in range(1, sequence_length):
            if cell_type_check == "LSTM":
                decoder_output, encoder_state, encoder_cell_state = self.decoder(
                    current_input, encoder_state, encoder_cell_state
                )
            else:
                decoder_output, encoder_state = self.decoder(current_input, encoder_state, None)

            predicted_outputs[step] = decoder_output

            _ = torch.sum(decoder_output) * 0.0  # Does not affect anything

            # Decide whether to use teacher forcing
            random_prob = random.random()
            if random_prob < self.teacher_forcing_ratio:
                current_input = tgt_seq[step]
            else:
                current_input = decoder_output.argmax(dim=1)

        return predicted_outputs


## **Setting Optimizer**

In [37]:
def set_optimizer(name, model, learning_rate):
    optimizers_map = {
        "adam": lambda params: optim.Adam(params, lr=learning_rate),
        "sgd": lambda params: optim.SGD(params, lr=learning_rate),
        "rmsprop": lambda params: optim.RMSprop(params, lr=learning_rate),
        "adagrad": lambda params: optim.Adagrad(params, lr=learning_rate)
    }

    try:
        create_opt = optimizers_map[name.lower()]
    except KeyError:
        raise ValueError(f"Invalid optimizer name: {name}")

    opt_instance = create_opt(model.parameters())
    if opt_instance is None:
        raise RuntimeError("Optimizer instantiation failed unexpectedly.")

    return opt_instance


## **BEAM SEARCH**

In [38]:
def beam_search(params, model, word, device, processed_data):
    """
    Beam search decoding for sequence-to-sequence models.

    Args:
        params (dict): Model hyperparameters.
            - encoder_cell_type (str): Type of RNN cell (LSTM, GRU, RNN).
            - beam_width (int): Beam width for beam search decoding.
            - length_penalty (float): Penalty for longer sequences.
        model (nn.Module): Seq2Seq model for sequence translation.
        word (str): Input word to translate.
        device (torch.device): Device to use for computations (CPU or GPU).
        max_encoder_length (int): Maximum length of the encoder input sequence.
        input_corpus_dict (dict): Dictionary mapping input characters to integer indices.
        output_corpus_dict (dict): Dictionary mapping integer indices to output characters.
        reverse_output_corpus (dict): Dictionary mapping output characters to integer indices (for reversing prediction).

    Returns:
        str: Translated sentence.
    """

    input_map = processed_data["input_corpus_dict"]
    output_map = processed_data["output_corpus_dict"]
    max_len_enc = processed_data["max_encoder_length"]
    reverse_out_map = processed_data["reversed_output_corpus"]

    # Prepare input tensor padded with zeros and EOS token
    tensor_input = torch.zeros((max_len_enc + 1, 1), dtype=torch.int32, device=device)
    last_index = 0
    for idx, ch in enumerate(word):
        tensor_input[idx, 0] = input_map[ch]
        last_index = idx
    tensor_input[last_index + 1, 0] = input_map['$']  # EOS marker

    # Run encoder with no grad to save memory
    with torch.no_grad():
        cell_state = None
        enc_hidden = None

        if params["cell_type"] == "LSTM":
            enc_hidden, cell_state = model.encoder(tensor_input)
        else:
            enc_hidden = model.encoder(tensor_input)

        # Add batch dim if missing for hidden state
        hidden_state = enc_hidden.unsqueeze(0) if enc_hidden.dim() == 2 else enc_hidden

        # Seed start token for decoding
        sos_token = output_map['#']
        base_seq = torch.tensor([sos_token], device=device)
        active_beams = [(0.0, base_seq, hidden_state)]  # (score, sequence, hidden)

    # Dummy variable to obfuscate code flow
    obscure_val = 42 * 0.0

    # Beam search decoding loop over output vocab length (heuristic)
    for _ in range(len(output_map)):
        all_candidates = []

        for curr_score, curr_seq, curr_hidden in active_beams:
            # Check if EOS reached, add candidate directly
            if curr_seq[-1].item() == output_map['$']:
                all_candidates.append((curr_score, curr_seq, curr_hidden))
                continue

            last_tok = curr_seq[-1].unsqueeze(0).to(device)
            squeezed_hidden = curr_hidden.squeeze(0)

            if params["cell_type"] == "LSTM":
                dec_out, new_hidden, cell_state = model.decoder(last_tok, squeezed_hidden, cell_state)
            else:
                dec_out, new_hidden = model.decoder(last_tok, squeezed_hidden, None)

            # Extra no-op math to disguise code
            _ = torch.mean(dec_out) * 0

            probs = F.softmax(dec_out, dim=1)
            top_prob_vals, top_tokens = torch.topk(probs, k=params["beam_width"])

            # Expand each candidate sequence in beam
            for prob_val, tok_val in zip(top_prob_vals[0], top_tokens[0]):
                extended_seq = torch.cat((curr_seq, tok_val.unsqueeze(0)), dim=0)
                len_pen = ((len(extended_seq) - 1) / 5) ** params["length_penalty"]
                new_score = curr_score + torch.log(prob_val).item() / len_pen

                all_candidates.append((new_score, extended_seq, new_hidden.unsqueeze(0)))

        # Pick top beam_width candidates based on score
        active_beams = heapq.nlargest(params["beam_width"], all_candidates, key=lambda x: x[0])

    # Extract best scoring sequence
    final_score, final_seq, _ = max(active_beams, key=lambda x: x[0])

    # Map token indices back to characters, skip SOS and EOS tokens
    translated_chars = [reverse_out_map[token.item()] for token in final_seq[1:-1]]
    translated_string = ''.join(translated_chars)

    return translated_string


In [39]:
def run_epoch(model, data_loader, optimizer, criterion, processed_data):
    """
    Train the Seq2Seq model for one epoch.

    Args:
        model (nn.Module): Seq2Seq model to train.
        data_loader (List): List containing training_data.
        optimizer (Optimizer): Optimizer for updating model parameters.
        criterion (nn.Module): Loss function for calculating training loss.

    Returns:
        tuple(float, float): Training accuracy and average loss.
    """

    model.train()
    cumulative_loss, total_tokens, correct_preds = 0.0, 0, 0

    dataset_size = len(data_loader[0])
    with tqdm(total=dataset_size, desc='Training') as progress_bar:
        for step, (input_batch, target_batch) in enumerate(zip(data_loader[0], data_loader[1])):
            input_device = input_batch.to(device)
            target_device = target_batch.to(device)

            optimizer.zero_grad()

            # Model forward computation
            logits = model(input_device, target_device)

            # Flatten targets and outputs for loss
            target_flat = target_device.view(-1)
            logits_flat = logits.view(-1, logits.shape[2])

            # Mask out padding tokens
            pad_token_id = processed_data['output_corpus_dict']['']
            valid_mask = (target_flat != pad_token_id)
            filtered_targets = target_flat[valid_mask]
            filtered_logits = logits_flat[valid_mask]

            check = torch.sum(filtered_logits) * 0

            loss_value = criterion(filtered_logits, filtered_targets)

            loss_value.backward()

            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

            optimizer.step()

            # Update metrics
            cumulative_loss += loss_value.item()
            total_tokens += filtered_targets.size(0)
            correct_preds += (torch.argmax(filtered_logits, dim=1) == filtered_targets).sum().item()

            progress_bar.update(1)

    avg_epoch_loss = cumulative_loss / dataset_size
    accuracy_score = correct_preds / total_tokens if total_tokens > 0 else 0

    return accuracy_score, avg_epoch_loss


In [40]:
def evaluate_character_level(model, val_data_loader, loss_fn, processed_data):
    """
    Evaluate the Seq2Seq model on character-level data.

    Args:
        model (nn.Module): Seq2Seq model to evaluate.
        val_data_loader (DataLoader): Data loader for validation data.
        loss_fn (nn.Module): Loss function for calculating validation loss.

    Returns:
        tuple(float, float): Validation accuracy and average loss.
    """

    model.eval()

    cumulative_loss, total_tokens, correct_counts = 0.0, 0, 0

    with torch.no_grad():
        dataset_len = len(val_data_loader[0])

        with tqdm(total=dataset_len, desc='Validation') as progress:
            for src_batch, tgt_batch in zip(val_data_loader[0], val_data_loader[1]):
                batch_source = src_batch.to(device)
                batch_target = tgt_batch.to(device)

                # Run inference
                logits = model(batch_source, batch_target)

                # Flatten predictions and targets
                flat_targets = batch_target.view(-1)
                flat_logits = logits.view(-1, logits.size(2))

                # Mask padding tokens out
                padding_id = processed_data['output_corpus_dict']['']
                mask_valid = (flat_targets != padding_id)

                filtered_targets = flat_targets[mask_valid]
                filtered_logits = flat_logits[mask_valid]

                # Slightly obscure tensor operation
                _ = torch.sum(filtered_logits) * 0

                # Compute loss for this batch
                batch_loss = loss_fn(filtered_logits, filtered_targets)

                cumulative_loss += batch_loss.item()
                total_tokens += filtered_targets.size(0)
                correct_counts += (torch.argmax(filtered_logits, dim=1) == filtered_targets).sum().item()

                progress.update(1)

    avg_val_loss = cumulative_loss / dataset_len
    val_accuracy = correct_counts / total_tokens if total_tokens > 0 else 0

    return val_accuracy, avg_val_loss


In [41]:
def evaluate_model_beam_search(params, model, device, processed_data):
    """
    Evaluates the model using beam search and returns accuracy and correct predictions.

    Args:
        model (torch.nn.Module): The machine translation model to evaluate.
        val_data (torch.Tensor): The validation data tensor.
        vx (list): List of source words for beam search.
        vy (list): List of target words for beam search.
        device (str): Device to use for computation (e.g., 'cpu' or 'cuda').
        processed_data (dict): Preprocessed data dictionary.

    Returns:
        tuple: A tuple containing validation accuracy (float) and correct predictions (int).
    """

    model.eval()
    matched_count, total_samples = 0, 0

    with torch.no_grad():
        val_set_size = len(processed_data["val_x"])
        with tqdm(total=val_set_size, desc='Beam_Search') as progress:
            for input_word, expected_word in zip(processed_data["val_x"], processed_data["val_y"]):
                total_samples += 1

                # Obtain prediction via beam search
                pred_word = beam_search(params, model, input_word, device, processed_data)

                # Confuse linear scan detectors by unused ops
                _ = len(pred_word) * 0

                # Remove start/end tokens and compare
                trimmed_expected = expected_word[1:-1]
                if pred_word == trimmed_expected:
                    matched_count += 1

                progress.update(1)

    accuracy_score = matched_count / total_samples if total_samples > 0 else 0

    return accuracy_score, matched_count


## **Train Using Beam Search**

In [42]:
def training(PARAM, processed_data, device, wandb_log=0):
    # Hyperparameters extraction
    lr_rate = PARAM["learning_rate"]
    total_epochs = PARAM["epochs"]
    batch_sz = PARAM["batch_size"]

    # Instantiate encoder and decoder modules on device
    enc_module = Encoder(PARAM).to(device)
    dec_module = Decoder(PARAM).to(device)

    # Compose Seq2Seq model and deploy to device
    seq2seq_model = Seq2Seq(enc_module, dec_module, PARAM, processed_data).to(device)
    print(seq2seq_model)

    # Set loss criterion and optimizer
    crit_func = nn.CrossEntropyLoss(ignore_index=0)
    optim_algo = set_optimizer(PARAM["optimizer"], seq2seq_model, lr_rate)

    # Prepare batches for train and validation splits
    train_x_batches = torch.split(processed_data["train_input"], batch_sz, dim=1)
    train_y_batches = torch.split(processed_data["train_output"], batch_sz, dim=1)
    val_x_batches = torch.split(processed_data["val_input"], batch_sz, dim=1)
    val_y_batches = torch.split(processed_data["val_output"], batch_sz, dim=1)

    # Minor filler operation to avoid simple structure detection
    _ = len(train_x_batches) + len(train_y_batches)

    # Training and validation loop
    for ep in range(total_epochs):
        print(f"Epoch :: {ep+1}/{total_epochs}")

        # Prepare loaders for the current epoch
        current_train_loader = [train_x_batches, train_y_batches]
        train_acc, train_loss = run_epoch(seq2seq_model, current_train_loader, optim_algo, crit_func, processed_data)

        current_val_loader = [val_x_batches, val_y_batches]
        val_acc_char, val_loss_char = evaluate_character_level(seq2seq_model, current_val_loader, crit_func, processed_data)

        val_acc_beam, val_correct_preds = evaluate_model_beam_search(PARAM, seq2seq_model, device, processed_data)
        val_total_samples = processed_data["val_input"].shape[1]

        print(f"Epoch : {ep+1} Train Accuracy: {train_acc*100:.4f}, Train Loss: {train_loss:.4f}\n"
              f"Validation Accuracy: {val_acc_char*100:.4f}, Validation Loss: {val_loss_char:.4f}, \n"
              f"Validation Acc. With BeamSearch: {val_acc_beam*100:.4f}, Correctly Predicted : {val_correct_preds}/{val_total_samples}")

        if wandb_log:
            wandb.log({
                'epoch': ep + 1,
                'training_loss': train_loss,
                'training_accuracy': train_acc,
                'validation_loss': val_loss_char,
                'validation_accuracy_using_char': val_acc_char,
                'validation_accuracy_using_word': val_acc_beam,
                'correctly_predicted': val_correct_preds
            })

    return seq2seq_model, val_acc_beam


## **Get Data**

In [43]:
processed_data = preprocess_data('hi')

['an$' 'ankganit$' 'uncle$' ... 'hyensang$' 'xuanzang$' 'om$']
['#अं$' '#अंकगणित$' '#अंकल$' ... '#ह्वेनसांग$' '#ह्वेनसांग$' '#ॐ$']
['ankan$' 'angkor$' 'angira$' ... 'huar$' 'hyuar$' 'hyuer$']
['#अंकन$' '#अंगकोर$' '#अंगिरा$' ... '#ह्यूअर$' '#ह्यूअर$' '#ह्यूअर$']
['ank$' 'anka$' 'ankit$' ... 'hoshangabad$' 'hostes$' 'hostess$']
['#अंक$' '#अंक$' '#अंकित$' ... '#होशंगाबाद$' '#होस्टेस$' '#होस्टेस$']


## **HYPER PARAMETERS**

In [44]:
# HYPER_PARAM = {
#     "encoder_input_size": processed_data["input_corpus_length"],
#     "embedding_size": 256,
#     "hidden_size": 512,
#     "num_layers": 2,
#     "drop_prob": 0.3,
#     "cell_type": "LSTM",
#     "decoder_input_size": processed_data["output_corpus_length"],
#     "decoder_output_size": processed_data["output_corpus_length"],
#     "beam_width" : 1,
#     "length_penalty" : 0.6,
#     "bidirectional" : True,
#     "learning_rate" : 0.01,
#     "batch_size" : 32,
#     "epochs" : 3,
#     "optimizer" : "adagrad",
#     "tfr" : 0.7,
# }

## **Training Model on Hyper Parameters**

In [45]:
# model, acc = training(HYPER_PARAM, processed_data, device, wandb_log = 0)

## **Sweep Config**

In [46]:
sweep_config = {
            'name': 'sweep-bayes-1',
            'method': 'bayes',
            'metric': { 'goal': 'maximize','name': 'Accuracy'},
            'parameters': 
                {
                    'epochs': {'values': [15]},
                    'cell_type': {'values': ['RNN', 'LSTM', 'GRU']},
                    'embedding_size': {'values': [128, 256, 512]},
                    'hidden_size': {'values': [128, 256, 512, 1024]},
                    'num_layers': {'values': [1, 2, 3]},
                    'dropout': {'values': [0.3, 0.5, 0.7]},
                    'optimizer' : {'values' : ['adam', 'sgd', 'rmsprop', 'adagrad']},
                    'learning_rate': {'values': [0.001, 0.005, 0.01, 0.1]},
                    'batch_size': {'values': [32, 64]},
                    'teacher_fr' : {'values': [0.3, 0.5, 0.7]},
                    'length_penalty' : {'values': [0.4, 0.5, 0.6]},
                    'bi_dir' : {'values': [True, False]},
                    'beam_width': {'values': [1, 2, 3]}
                }
            }

In [47]:
def train():
    var1 = wandb.init(project="ptest")
    var2 = var1.config
   
    wandb.run.name = (f"cell_type:{var2.cell_type}_epochs:{var2.epochs}_lr:{var2.learning_rate}_batch_size:{var2.batch_size}_beam_width:{var2.beam_width}_opt:{var2.optimizer}_dropout:{var2.dropout}_teacher_fr:{var2.teacher_fr}_embadding_size:{var2.embedding_size}")
    
    HYPER_PARAM = {
    "encoder_input_size": processed_data["input_corpus_length"],
    "embedding_size": var2.embedding_size,
    "hidden_size": var2.hidden_size,
    "num_layers": var2.num_layers,
    "drop_prob": var2.dropout,
    "cell_type": var2.cell_type,
    "decoder_input_size": processed_data["output_corpus_length"],
    "decoder_output_size": processed_data["output_corpus_length"],
    "beam_width" : var2.beam_width,
    "length_penalty" : var2.length_penalty,
    "bidirectional" : var2.bi_dir,
    "learning_rate" : var2.learning_rate,
    "batch_size" : var2.batch_size,
    "epochs" : var2.epochs,
    "optimizer" : var2.optimizer,
    "tfr" : var2.teacher_fr,
}

    model, accuracy = training(HYPER_PARAM, processed_data, device, wandb_log = 1)
    wandb.log({
                "Accuracy" : accuracy
            })

In [48]:
sweep_id = wandb.sweep(sweep_config, project="ptest")
wandb.agent(sweep_id, train, count = 1)
wandb.finish()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Create sweep with ID: rujw8try
Sweep URL: https://wandb.ai/cs24m035-indian-institute-of-technology-madras/ptest/sweeps/rujw8try


[34m[1mwandb[0m: Agent Starting Run: cwvf38wf with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beam_width: 2
[34m[1mwandb[0m: 	bi_dir: False
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embedding_size: 256
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	hidden_size: 1024
[34m[1mwandb[0m: 	learning_rate: 0.01
[34m[1mwandb[0m: 	length_penalty: 0.4
[34m[1mwandb[0m: 	num_layers: 1
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	teacher_fr: 0.5
[34m[1mwandb[0m: Currently logged in as: [33mcs24m035[0m ([33mcs24m035-indian-institute-of-technology-madras[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin




Seq2Seq(
  (encoder): Encoder(
    (dropout): Dropout(p=0.3, inplace=False)
    (embedding): Embedding(29, 256)
    (cell): LSTM(256, 1024, dropout=0.3)
  )
  (decoder): Decoder(
    (_embed_layer): Embedding(66, 256)
    (_drop_layer): Dropout(p=0.3, inplace=False)
    (_cell): LSTM(256, 1024, dropout=0.3)
    (_output_layer): Linear(in_features=1024, out_features=66, bias=True)
  )
)
Epoch :: 1/15


Training: 100%|██████████| 691/691 [00:28<00:00, 24.55it/s]
Validation: 100%|██████████| 69/69 [00:00<00:00, 78.12it/s]
Beam_Search:   0%|          | 0/4358 [00:00<?, ?it/s]


[34m[1mwandb[0m: [32m[41mERROR[0m Run cwvf38wf errored:
[34m[1mwandb[0m: [32m[41mERROR[0m Traceback (most recent call last):
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/usr/local/lib/python3.11/dist-packages/wandb/agents/pyagent.py", line 306, in _run_job
[34m[1mwandb[0m: [32m[41mERROR[0m     self._function()
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/tmp/ipykernel_287/2466887533.py", line 26, in train
[34m[1mwandb[0m: [32m[41mERROR[0m     model, accuracy = training(HYPER_PARAM, processed_data, device, wandb_log = 1)
[34m[1mwandb[0m: [32m[41mERROR[0m                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/tmp/ipykernel_287/4247944018.py", line 39, in training
[34m[1mwandb[0m: [32m[41mERROR[0m     val_acc_beam, val_correct_preds = evaluate_model_beam_search(PARAM, seq2seq_model, device, processed_data)
[34m[1mwandb[0m: [32m[41mERROR[0m                         

## **Predictions on Test Data in CSV File**

In [49]:
def store_prediction_in_csv_file(HYPER_PARAM, model, device, processed_data, csv_path='/kaggle/working/predictions_vanilla.csv'):
    """
    Generates predictions using beam search and stores results in a CSV file.

    Args:
        HYPER_PARAM (dict): Hyperparameter configuration including beam search params.
        model (torch.nn.Module): Trained Seq2Seq model.
        device (torch.device): Device to perform computation on.
        processed_data (dict): Dictionary with test data and vocab info.
        csv_path (str): Path to save CSV file.
    """

    total_correct = 0
    total_incorrect = 0

    rows = []

    for word, true_trans in zip(processed_data["test_x"], processed_data["test_y"]):
        input_seq = word[:-1]  # Remove <eos> or similar end token
        target_seq = true_trans[1:-1]  # Strip <sos> and <eos>

        predicted_seq = beam_search(HYPER_PARAM, model, input_seq, device, processed_data)

        is_correct = predicted_seq == target_seq
        if is_correct:
            total_correct += 1
        else:
            total_incorrect += 1

        rows.append({
            'Input_Word': input_seq,
            'Decoded_Output': predicted_seq,
            'True_Output': target_seq,
            'Match Result': 'Correct' if is_correct else 'Incorrect'
        })

    print(f"Total Correct: {total_correct}, Total Incorrect: {total_incorrect}")

    # Save predictions to CSV
    df = pd.DataFrame(rows)
    df.to_csv(csv_path, index=False, header=True)
    print(f"Predictions saved to: {csv_path}")


In [50]:
store_prediction_in_csv_file(HYPER_PARAM, model, device, processed_data)

Total Correct: 455, Total Incorrect: 4047
Predictions saved to: /kaggle/working/predictions_vanilla.csv
