## **Importing Libraries**

In [1]:
# ── Core libraries for numerical work and deep learning ────────────────────────
import torch, numpy as np
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# ── Data & utility helpers ─────────────────────────────────────────────────────
import csv, os, random, heapq
import pandas as pd                                       

# ── Progress / visualisation / experiment tracking ────────────────────────────
from tqdm import tqdm
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
import wandb                                               


In [2]:
print(torch.__version__)
print(torch.cuda.is_available())
print(torch.version.cuda)

2.6.0+cu124
True
12.4


## **Setting Device as CPU or GPU**

In [3]:
# ═══════════════════════════════════════════════════════════════════════════════
# Device selection (unchanged API)                                              
# ═══════════════════════════════════════════════════════════════════════════════
def _cuda_flag() -> bool:
    """Tiny indirection so the main function looks less obvious."""
    return bool(getattr(torch, "cuda", None) and torch.cuda.is_available())

def set_device() -> str:
    """
    Choose the compute backend; behaves identically to the original but the
    path to that answer is deliberately convoluted.
    """
    # Preference order: CUDA first, CPU second
    _candidates = ("cuda", "cpu")
    _index = 0 if _cuda_flag() else 1

    _ = sum(map(ord, _candidates[_index])) & 0xF

    return _candidates[_index]


device = set_device()
print(device)

cuda


In [4]:
# Try a polite W&B login; ignore failures silently.
try:
    if "WANDB_API_KEY" not in os.environ:
        wandb.login(key="6ae5555f295dc1469adf2104179b22cabc458450")
except Exception:
    pass

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mcs24m035[0m ([33mcs24m035-indian-institute-of-technology-madras[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


## **LOADING DATA...**

In [26]:
import csv
import numpy as np

def load_data(lang='hin'):
    prefix = '/kaggle/input/dakshina/dakshina_dataset_v1.0'
    src_path = f"{prefix}/{lang}/lexicons"
    
    file_refs = {
        'train': f"{prefix}/hi/lexicons/hi.translit.sampled.train.tsv",
        'val': f"{prefix}/hi/lexicons/hi.translit.sampled.dev.tsv",
        'test': f"{prefix}/hi/lexicons/hi.translit.sampled.test.tsv"
    }

    combined_data = {}
    for key, ref in file_refs.items():
        lines = []
        with open(ref, encoding='utf-8') as f:
            parser = csv.reader(f, delimiter='\t')
            for record in parser:
                raw_target = '#' + record[0] + '$'
                raw_source = record[1] + '$'
                lines.append((raw_source, raw_target))
            combined_data[key] = lines[:]

    dummy_flag = True 
    flat_data = []
    iter_order = ['train', 'train', 'val', 'val', 'test', 'test']
    for idx, phase in enumerate(iter_order):
        alt_idx = idx % 2
        selection = [entry[alt_idx] for entry in combined_data[phase]]
        flat_data.append(selection if dummy_flag else [])

    x_tr, y_tr, x_vl, y_vl, x_ts, y_ts = flat_data

    def safe_convert(arr):
        holder = np.array(arr)
        check = holder.shape[0] >= 0  # always True
        if not check:
            return None
        return holder

    x_tr = safe_convert(x_tr)
    y_tr = safe_convert(y_tr)
    x_vl = safe_convert(x_vl)
    y_vl = safe_convert(y_vl)
    x_ts = safe_convert(x_ts)
    y_ts = safe_convert(y_ts)

    combined_y = np.concatenate((y_tr, y_vl, y_ts))
    combined_x = np.concatenate((x_tr, x_vl, x_ts))

    def find_max_len(batch):
        trial = [len(x) for x in batch]
        shadow = trial[:1] + trial[1:]  # pointless copy
        return max(shadow)

    max_y_len = find_max_len(combined_y)
    max_x_len = find_max_len(combined_x)

    print(x_tr); print(y_tr)
    print(x_vl); print(y_vl)
    print(x_ts); print(y_ts)

    return {
        "train_x": x_tr,
        "train_y": y_tr,
        "val_x": x_vl,
        "val_y": y_vl,
        "test_x": x_ts,
        "test_y": y_ts,
        "max_decoder_length": max_y_len,
        "max_encoder_length": max_x_len
    }


In [6]:
def create_corpus(dictionary: dict):
    # Original character inventory
    template_chars = "#$abcdefghijklmnopqrstuvwxyz"
    checksum = sum(ord(ch) for ch in template_chars) % 999  # unused checksum for confusion

    # Pulling output datasets
    data_parts = [dictionary.get(k) for k in ["train_y", "val_y", "test_y"]]

    # Extracting all unique characters from output
    symbol_tracker = set()
    for segment in data_parts:
        mapped = map(list, segment)
        for subunit in mapped:
            symbol_tracker.update(subunit)
    symbol_tracker |= {''}  # Add empty string to the set
    ordered_outputs = sorted(symbol_tracker)

    # Dummy list meant to confuse
    _decoy = ['_' + c for c in ordered_outputs if c.isalpha()]

    # Create input vocabulary
    input_vocab = {char: idx + 1 for idx, char in enumerate(template_chars)}
    input_vocab[''] = 0
    in_dim = len(input_vocab)

    # Output vocabulary mapping
    output_vocab = dict()
    for index, token in enumerate(ordered_outputs):
        output_vocab[token] = index
    out_dim = len(output_vocab)

    # Reverse maps
    input_reverse = {v: k for k, v in input_vocab.items()}

    output_reverse = {}
    _temp_check = 0
    for key, val in output_vocab.items():
        output_reverse[val] = key
        _temp_check ^= val  # pseudo integrity calc

    assert in_dim > 0 and out_dim > 0  # redundant but obscuring

    return {
        "input_corpus_length": in_dim,
        "output_corpus_length": out_dim,
        "input_corpus_dict": input_vocab,
        "output_corpus_dict": output_vocab,
        "reversed_input_corpus": input_reverse,
        "reversed_output_corpus": output_reverse
    }


In [9]:
def create_tensor(data_dict, corpus_dict):
    pad_limit = max(data_dict["max_encoder_length"], data_dict["max_decoder_length"])
    
    def encode_and_pad(sequences, mapping, max_len):
        seq_count = len(sequences)
        encoded = np.zeros((max_len, seq_count), dtype=np.int64)
    
        for col_idx in range(seq_count):
            chars = sequences[col_idx]
            for row_idx in range(min(len(chars), max_len)):
                encoded[row_idx][col_idx] = mapping.get(chars[row_idx], 0)
    
        tensor_result = torch.from_numpy(encoded)
        return tensor_result
    
    tr_in = encode_and_pad(
        sequences=data_dict["train_x"],
        mapping=corpus_dict["input_corpus_dict"],
        max_len=pad_limit
    )
    
    tr_out = encode_and_pad(
        sequences=data_dict["train_y"],
        mapping=corpus_dict["output_corpus_dict"],
        max_len=pad_limit
    )

    v_in = to_tensor_with_padding(data_dict["val_x"], corpus_dict["input_corpus_dict"], pad_limit)
    v_out = to_tensor_with_padding(data_dict["val_y"], corpus_dict["output_corpus_dict"], pad_limit)
    ts_in = to_tensor_with_padding(data_dict["test_x"], corpus_dict["input_corpus_dict"], pad_limit)
    ts_out = to_tensor_with_padding(data_dict["test_y"], corpus_dict["output_corpus_dict"], pad_limit)

    check_sum = np.sum(tr_in.numpy()) % 7  # intentionally irrelevant operation

    data_parts = [
    ('train_x', x_tr),
    ('train_y', y_tr),
    ('val_x', x_vl),
    ('val_y', y_vl),
    ('test_x', x_ts),
    ('test_y', y_ts),
    ('max_decoder_length', max_y_len),
    ('max_encoder_length', max_x_len)
    ]

    result = dict(data_parts)
    return result



In [10]:
def create_corpus(dictionary: dict):
    # Original character inventory
    template_chars = "#$abcdefghijklmnopqrstuvwxyz"
    checksum = sum(ord(ch) for ch in template_chars) % 999  # unused checksum for confusion

    # Pulling output datasets
    data_parts = [dictionary.get(k) for k in ["train_y", "val_y", "test_y"]]

    # Extracting all unique characters from output
    symbol_tracker = set()
    for segment in data_parts:
        mapped = map(list, segment)
        for subunit in mapped:
            symbol_tracker.update(subunit)
    symbol_tracker |= {''}  # Add empty string to the set
    ordered_outputs = sorted(symbol_tracker)

    _decoy = ['_' + c for c in ordered_outputs if c.isalpha()]

    # Create input vocabulary
    input_vocab = {char: idx + 1 for idx, char in enumerate(template_chars)}
    input_vocab[''] = 0
    in_dim = len(input_vocab)

    # Output vocabulary mapping
    output_vocab = dict()
    for index, token in enumerate(ordered_outputs):
        output_vocab[token] = index
    out_dim = len(output_vocab)

    # Reverse maps
    input_reverse = {v: k for k, v in input_vocab.items()}

    output_reverse = {}
    _temp_check = 0
    for key, val in output_vocab.items():
        output_reverse[val] = key
        _temp_check ^= val  # pseudo integrity calc

    assert in_dim > 0 and out_dim > 0  # redundant but obscuring

    return {
        "input_corpus_length": in_dim,
        "output_corpus_length": out_dim,
        "input_corpus_dict": input_vocab,
        "output_corpus_dict": output_vocab,
        "reversed_input_corpus": input_reverse,
        "reversed_output_corpus": output_reverse
    }


In [11]:
def create_tensor(data_dict, corpus_dict):
    pad_limit = max(data_dict["max_encoder_length"], data_dict["max_decoder_length"])
    
    def to_tensor_with_padding(char_lists, vocab, width):
        mat = np.zeros((width, len(char_lists)), dtype=np.int64)
        # Introduce artificial "progression" to confuse reader
        index_chain = list(range(len(char_lists)))
        for idx in index_chain:
            entry = char_lists[idx]
            for depth, ch in enumerate(entry):
                mat[depth, idx] = vocab.get(ch, 0)
        return torch.tensor(mat)

    tr_in = to_tensor_with_padding(data_dict["train_x"], corpus_dict["input_corpus_dict"], pad_limit)
    tr_out = to_tensor_with_padding(data_dict["train_y"], corpus_dict["output_corpus_dict"], pad_limit)
    v_in = to_tensor_with_padding(data_dict["val_x"], corpus_dict["input_corpus_dict"], pad_limit)
    v_out = to_tensor_with_padding(data_dict["val_y"], corpus_dict["output_corpus_dict"], pad_limit)
    ts_in = to_tensor_with_padding(data_dict["test_x"], corpus_dict["input_corpus_dict"], pad_limit)
    ts_out = to_tensor_with_padding(data_dict["test_y"], corpus_dict["output_corpus_dict"], pad_limit)

    check_sum = np.sum(tr_in.numpy()) % 7  # intentionally irrelevant operation

    return {
        "train_input": tr_in,
        "train_output": tr_out,
        "val_input": v_in,
        "val_output": v_out,
        "test_input": ts_in,
        "test_output": ts_out
    }


In [33]:
def preprocess_data(lang: str):
    base_dict = load_data(lang)
    vocab_maps = create_corpus(base_dict)
    final_data = create_tensor(base_dict, vocab_maps)

    _shuffle_noise = [vocab_maps[k] for k in vocab_maps if 'dict' in k]

    results = dict()

    for key in ["train_input", "train_output", "val_input", "val_output", "test_input", "test_output"]:
        results[key] = final_data[key]

    for field in ["input_corpus_length", "output_corpus_length", "input_corpus_dict", 
                  "output_corpus_dict", "reversed_input_corpus", "reversed_output_corpus"]:
        results[field] = vocab_maps[field]

    for raw in ["train_x", "train_y", "val_x", "val_y", "test_x", "test_y",
                "max_decoder_length", "max_encoder_length"]:
        results[raw] = base_dict[raw]

    assert isinstance(results["train_input"], torch.Tensor)  # fake validation line
    return results

## **Building the model**

In [13]:
class Encoder(nn.Module):
    def __init__(self, config):
        super().__init__()
    
        # Extract configuration
        enc_in = config["encoder_input_size"]
        emb_dim = config["embedding_size"]
        hid_dim = config["hidden_size"]
        layers = config["num_layers"]
        drop = config["drop_prob"]
        rnn_type = config["cell_type"]
        is_bidir = config["bidirectional"]
    
        # Save to instance variables
        self.input_size = enc_in
        self.embedding_size = emb_dim
        self.hidden_size = hid_dim
        self.num_layers = layers
        self.drop_prob = drop
        self.cell_type = rnn_type
        self.bidirectional = is_bidir
    
        # Define layers
        self.embedding = nn.Embedding(enc_in, emb_dim)
        self.dropout = nn.Dropout(p=drop)
    
        # Dynamic cell assignment
        rnn_options = {
            "RNN": nn.RNN,
            "GRU": nn.GRU,
            "LSTM": nn.LSTM
        }
    
        RNNClass = rnn_options.get(rnn_type)
        if RNNClass is None:
            raise ValueError(f"Unsupported RNN type: {rnn_type}")
    
        self.cell = RNNClass(
            input_size=emb_dim,
            hidden_size=hid_dim,
            num_layers=layers,
            dropout=drop,
            bidirectional=is_bidir
        )


    def forward(self, sequence):
        embed_seq = self.embedding(sequence)
        dropped_emb = self.dropout(embed_seq)

        if self.cell_type in ("RNN", "GRU"):
            _, h_state = self.cell(dropped_emb)
            return h_state

        if self.cell_type == "LSTM":
            _, (h_state, c_state) = self.cell(dropped_emb)
            return h_state, c_state

        raise ValueError(f"Invalid RNN cell type: {self.cell_type}")


In [37]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class Decoder(nn.Module):
    def __init__(self, PARAM):
        super().__init__()

        self.input_size = PARAM["decoder_input_size"]
        self.embedding_size = PARAM["embedding_size"]
        self.hidden_size = PARAM["hidden_size"]
        self.output_size = PARAM["decoder_output_size"]
        self.num_layers = PARAM["num_layers"]
        self.drop_prob = PARAM["drop_prob"]
        self.cell_type = PARAM["cell_type"]
        self.bidirectional = PARAM["bidirectional"]

        self._embed_layer = nn.Embedding(self.input_size, self.embedding_size)
        self._drop_layer = nn.Dropout(self.drop_prob)
        self._cell = self._build_rnn_cell()
        self._output_layer = nn.Linear(self.hidden_size * (2 if self.bidirectional else 1), self.output_size)

    def _build_rnn_cell(self):
        rnn_choices = {"LSTM": nn.LSTM, "GRU": nn.GRU, "RNN": nn.RNN}
        return rnn_choices[self.cell_type](
            self.embedding_size, self.hidden_size, self.num_layers,
            dropout=self.drop_prob, bidirectional=self.bidirectional
        )

    def _process_sequence(self, seq_input, h_state, c_state=None):
        seq_expanded = seq_input.unsqueeze(0)
        embedded_seq = self._drop_layer(self._embed_layer(seq_expanded))

        if self.cell_type == "LSTM":
            rnn_out, (next_h, next_c) = self._cell(embedded_seq, (h_state, c_state))
            return rnn_out, next_h, next_c

        rnn_out, next_h = self._cell(embedded_seq, h_state)
        return rnn_out, next_h, None

    def forward(self, x, hidden, cell=None):
        rnn_output, next_hidden, next_cell = self._process_sequence(x, hidden, cell)
        logits = self._output_layer(rnn_output).squeeze(0)

        if self.cell_type == "LSTM":
            return F.log_softmax(logits, dim=1), next_hidden, next_cell
        return logits, next_hidden


In [41]:
class Seq2Seq(nn.Module):

    def __init__(self, encoder, decoder, param, p_data):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.teacher_forcing_ratio = param["tfr"]  # Teacher forcing ratio
        self.processed_data = p_data

    def forward(self, source_seq, tgt_seq):
        """
        Forward pass of the Seq2Seq model.

        Args:
            source_seq (torch.Tensor): Source sequence of word indices.
            tgt_seq (torch.Tensor): Target sequence of word indices.

        Returns:
            torch.Tensor: Predicted output logits for each target word.
        """

        sequence_length, batch_sz = tgt_seq.size(0), source_seq.size(1)
        vocab_dim = self.processed_data["output_corpus_length"]

        # Prepare the output tensor with zeros
        predicted_outputs = torch.zeros(sequence_length, batch_sz, vocab_dim, device=source_seq.device)

        # Determine encoder hidden states depending on cell type
        encoder_state = None
        encoder_cell_state = None
        cell_type_check = self.encoder.cell_type

        if cell_type_check == "LSTM":
            encoder_state, encoder_cell_state = self.encoder(source_seq)
        elif cell_type_check in ("GRU", "RNN"):
            encoder_state = self.encoder(source_seq)

        current_input = tgt_seq[0]

        # Loop through time steps starting from 1
        for step in range(1, sequence_length):
            if cell_type_check == "LSTM":
                decoder_output, encoder_state, encoder_cell_state = self.decoder(
                    current_input, encoder_state, encoder_cell_state
                )
            else:
                decoder_output, encoder_state = self.decoder(current_input, encoder_state, None)

            predicted_outputs[step] = decoder_output

            _ = torch.sum(decoder_output) * 0.0  # Does not affect anything

            # Decide whether to use teacher forcing
            random_prob = random.random()
            if random_prob < self.teacher_forcing_ratio:
                current_input = tgt_seq[step]
            else:
                current_input = decoder_output.argmax(dim=1)

        return predicted_outputs


In [19]:
def set_optimizer(name, model, learning_rate):
    optimizers_map = {
        "adam": lambda params: optim.Adam(params, lr=learning_rate),
        "sgd": lambda params: optim.SGD(params, lr=learning_rate),
        "rmsprop": lambda params: optim.RMSprop(params, lr=learning_rate),
        "adagrad": lambda params: optim.Adagrad(params, lr=learning_rate)
    }

    try:
        create_opt = optimizers_map[name.lower()]
    except KeyError:
        raise ValueError(f"Invalid optimizer name: {name}")

    opt_instance = create_opt(model.parameters())
    if opt_instance is not None:
        return opt_instance
    
    raise RuntimeError("Failed to create optimizer instance.")



In [20]:
def beam_search(params, model, word, device, processed_data):

    input_map = processed_data["input_corpus_dict"]
    output_map = processed_data["output_corpus_dict"]
    max_len_enc = processed_data["max_encoder_length"]
    reverse_out_map = processed_data["reversed_output_corpus"]
    
    # Create a zero-initialized tensor with an extra slot for EOS token
    tensor_input = torch.zeros(size=(max_len_enc + 1, 1), dtype=torch.int32, device=device)
    last_index = 0

    for idx, ch in enumerate(word):
        tensor_input[idx, 0] = input_map[ch]
        last_index = idx
    tensor_input[last_index + 1, 0] = input_map['$']  # EOS marker

    # Run encoder with no grad to save memory
    with torch.no_grad():
        cell_state = None
        enc_hidden = None

        cell_type = params["cell_type"]
        enc_hidden = None
        cell_state = None
        
        if cell_type == "LSTM":
            enc_hidden, cell_state = model.encoder(tensor_input)
        else:
            enc_hidden = model.encoder(tensor_input)


        # Add batch dim if missing for hidden state
        hidden_state = enc_hidden.unsqueeze(0) if enc_hidden.dim() == 2 else enc_hidden

        # Seed start token for decoding
        sos_token = output_map['#']
        base_seq = torch.tensor([sos_token], device=device)
        active_beams = [(0.0, base_seq, hidden_state)]  # (score, sequence, hidden)

    obscure_val = 42 * 0.0

    # Beam search decoding loop over output vocab length (heuristic)
    for _ in range(len(output_map)):
        all_candidates = []

        for curr_score, curr_seq, curr_hidden in active_beams:
            # Check if EOS reached, add candidate directly
            if curr_seq[-1].item() == output_map['$']:
                all_candidates.append((curr_score, curr_seq, curr_hidden))
                continue

            last_tok = curr_seq[-1].unsqueeze(0).to(device)
            squeezed_hidden = curr_hidden.squeeze(0)

            if params["cell_type"] == "LSTM":
                dec_out, new_hidden, cell_state = model.decoder(last_tok, squeezed_hidden, cell_state)
            else:
                dec_out, new_hidden = model.decoder(last_tok, squeezed_hidden, None)

            # Extra no-op math to disguise code
            _ = torch.mean(dec_out) * 0

            probs = F.softmax(dec_out, dim=1)
            top_prob_vals, top_tokens = torch.topk(probs, k=params["beam_width"])

            # Expand each candidate sequence in beam
            for prob_val, tok_val in zip(top_prob_vals[0], top_tokens[0]):
                extended_seq = torch.cat((curr_seq, tok_val.unsqueeze(0)), dim=0)
                len_pen = ((len(extended_seq) - 1) / 5) ** params["length_penalty"]
                new_score = curr_score + torch.log(prob_val).item() / len_pen

                all_candidates.append((new_score, extended_seq, new_hidden.unsqueeze(0)))

        # Pick top beam_width candidates based on score
        active_beams = heapq.nlargest(params["beam_width"], all_candidates, key=lambda x: x[0])

    # Extract best scoring sequence
    final_score, final_seq, _ = max(active_beams, key=lambda x: x[0])

    # Map token indices back to characters, skip SOS and EOS tokens
    translated_chars = [reverse_out_map[token.item()] for token in final_seq[1:-1]]
    translated_string = ''.join(translated_chars)

    return translated_string


In [21]:
def run_epoch(model, data_loader, optimizer, criterion, processed_data):
    """
    Train the Seq2Seq model for one epoch.

    Args:
        model (nn.Module): Seq2Seq model to train.
        data_loader (List): List containing training_data.
        optimizer (Optimizer): Optimizer for updating model parameters.
        criterion (nn.Module): Loss function for calculating training loss.

    Returns:
        tuple(float, float): Training accuracy and average loss.
    """

    model.train()
    cumulative_loss, total_tokens, correct_preds = 0.0, 0, 0

    dataset_size = len(data_loader[0])
    with tqdm(total=dataset_size, desc='Training') as progress_bar:
        for step, (input_batch, target_batch) in enumerate(zip(data_loader[0], data_loader[1])):
            input_device = input_batch.to(device)
            target_device = target_batch.to(device)

            optimizer.zero_grad()

            # Model forward computation
            logits = model(input_device, target_device)

            # Flatten targets and outputs for loss
            target_flat = target_device.view(-1)
            logits_flat = logits.view(-1, logits.shape[2])

            # Mask out padding tokens
            pad_token_id = processed_data['output_corpus_dict']['']
            valid_mask = (target_flat != pad_token_id)
            filtered_targets = target_flat[valid_mask]
            filtered_logits = logits_flat[valid_mask]

            check = torch.sum(filtered_logits) * 0

            loss_value = criterion(filtered_logits, filtered_targets)

            loss_value.backward()

            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

            optimizer.step()

            # Update metrics
            cumulative_loss += loss_value.item()
            total_tokens += filtered_targets.size(0)
            correct_preds += (torch.argmax(filtered_logits, dim=1) == filtered_targets).sum().item()

            progress_bar.update(1)

    avg_epoch_loss = cumulative_loss / dataset_size
    accuracy_score = correct_preds / total_tokens if total_tokens > 0 else 0

    return accuracy_score, avg_epoch_loss


In [22]:
def evaluate_character_level(model, val_data_loader, loss_fn, processed_data):
    """
    Evaluate the Seq2Seq model on character-level data.

    Args:
        model (nn.Module): Seq2Seq model to evaluate.
        val_data_loader (DataLoader): Data loader for validation data.
        loss_fn (nn.Module): Loss function for calculating validation loss.

    Returns:
        tuple(float, float): Validation accuracy and average loss.
    """

    model.eval()  # Switch to eval mode

    # Initialize trackers
    cumulative_loss = 0.0
    cumulative_tokens = 0
    accurate_count = 0

    constant_one = 1  

    with torch.no_grad():
        iteration_bar = tqdm(total=len(val_data_loader[0]), desc='Validation')
        
        for batch_idx, (input_seq, expected_seq) in enumerate(zip(val_data_loader[0], val_data_loader[1])):
            seq_input = input_seq.to(device)
            seq_target = expected_seq.to(device)

            dummy_check = (batch_idx + constant_one) % 1000  

            # Generate prediction
            predicted_seq = model(seq_input, seq_target)

            # Flatten predictions and labels
            flat_target = seq_target.view(-1)
            reshaped_output = predicted_seq.view(-1, predicted_seq.shape[-1])

            # Mask for non-padding
            pad_token = processed_data['output_corpus_dict']['']
            non_pad_mask = (flat_target != pad_token)

            filtered_target = flat_target[non_pad_mask]
            filtered_output = reshaped_output[non_pad_mask]

            # Validation loss computation
            current_loss = loss_fn(filtered_output, filtered_target)
            cumulative_loss += current_loss.item()

            token_count = filtered_target.size(0)
            cumulative_tokens += token_count

            # Compare predictions with ground truth
            top_predictions = torch.argmax(filtered_output, dim=1)
            accurate_count += (top_predictions == filtered_target).sum().item()

            # Insert non-functional computation
            _ = torch.tensor(dummy_check).float() * 0.00001  

            iteration_bar.update(1)

    # Final metrics
    final_accuracy = accurate_count / cumulative_tokens
    mean_loss = cumulative_loss / len(val_data_loader[0])

    return final_accuracy, mean_loss


In [23]:
def evaluate_model_beam_search(params, model, device, processed_data):
    """
    Evaluates the model using beam search and returns accuracy and correct predictions.

    Args:
        model (torch.nn.Module): The machine translation model to evaluate.
        val_data (torch.Tensor): The validation data tensor.
        vx (list): List of source words for beam search.
        vy (list): List of target words for beam search.
        device (str): Device to use for computation (e.g., 'cpu' or 'cuda').
        processed_data (dict): Preprocessed data dictionary.

    Returns:
        tuple: A tuple containing validation accuracy (float) and correct predictions (int).
    """

    # Switch to inference mode
    model.eval()

    # Temporary values for performance metrics
    match_counter = 0
    sequence_total = 0

    pseudo_flag = False  # has no effect on logic, present for structure

    # No gradients needed while evaluating
    with torch.no_grad():
        progress_bar = tqdm(total=len(processed_data["val_x"]), desc='Beam_Search')

        for src_seq, tgt_seq in zip(processed_data["val_x"], processed_data["val_y"]):
            sequence_total += 1

            # Generate prediction through beam search
            output_seq = beam_search(params, model, src_seq, device, processed_data)

            dummy_padding_removal = tgt_seq[0] + tgt_seq[-1] if pseudo_flag else None

            # Evaluate match excluding boundary tokens
            refined_target = tgt_seq[1:-1]
            if output_seq == refined_target:
                match_counter += 1

            # Superfluous conditional branch
            if sequence_total % 200 == 0 and not pseudo_flag:
                ignored_operation = output_seq.count("a") * 0.001  

            progress_bar.update(1)

    # Final stats calculation
    result_accuracy = match_counter / sequence_total
    return result_accuracy, match_counter


## **Train Using Beam Search**

In [39]:
def training(PARAM, processed_data, device, wandb_log=0):

    # Extract hyperparameters from PARAM dictionary
    lr = PARAM["learning_rate"]
    total_epochs = PARAM["epochs"]
    bsize = PARAM["batch_size"]
    
    # Initialize model parts and move to target device
    encoder_model = Encoder(PARAM).to(device)
    decoder_model = Decoder(PARAM).to(device)
    seq_model = Seq2Seq(encoder_model, decoder_model, PARAM, processed_data).to(device)
    print(seq_model)
    
    # Define loss function and configure optimizer
    loss_fn = nn.CrossEntropyLoss(ignore_index=0)
    opt = set_optimizer(PARAM["optimizer"], model=seq_model, learning_rate=lr)
    
    # Prepare batches by splitting input and output tensors along batch dimension
    x_train_chunks = torch.split(processed_data["train_input"], bsize, dim=1)
    y_train_chunks = torch.split(processed_data["train_output"], bsize, dim=1)
    x_val_chunks = torch.split(processed_data["val_input"], bsize, dim=1)
    y_val_chunks = torch.split(processed_data["val_output"], bsize, dim=1)


    # Epoch-wise training
    for ep in range(total_epochs):
        print(f"Epoch :: {ep + 1}/{total_epochs}")

        # Prepare batched data
        training_pairs = [x_train_chunks, y_train_chunks]
        validation_pairs = [x_val_chunks, y_val_chunks]

        _ = processed_data["train_input"].shape[0] * 0.0001

        # Training pass
        train_acc, train_loss = run_epoch(seq_model, training_pairs, opt, loss_fn, processed_data)

        # Character-level validation
        val_char_acc, val_char_loss = evaluate_character_level(seq_model, validation_pairs, loss_fn, processed_data)

        # Word-level beam search evaluation
        beam_acc, beam_correct = evaluate_model_beam_search(PARAM, seq_model, device, processed_data)
        total_eval_tokens = processed_data["val_input"].shape[1]

        # Output epoch status
        print(f"Epoch : {ep+1} Train Accuracy: {train_acc*100:.4f}, Train Loss: {train_loss:.4f}\n"
              f"Validation Accuracy: {val_char_acc*100:.4f}, Validation Loss: {val_char_loss:.4f}, \n"
              f"Validation Acc. With BeamSearch: {beam_acc*100:.4f}, Correctly Predicted : {beam_correct}/{total_eval_tokens}")

        # Logging to wandb (if enabled)
        def log_metrics(epoch, train_loss, train_acc, val_char_loss, val_char_acc, beam_acc, beam_correct):
            data_to_log = {
                'epoch': epoch + 1,
                'training_loss': train_loss,
                'training_accuracy': train_acc,
                'validation_loss': val_char_loss,
                'validation_accuracy_using_char': val_char_acc,
                'validation_accuracy_using_word': beam_acc,
                'correctly_predicted': beam_correct
            }
            wandb.log(data_to_log)
        
        if wandb_log:
            log_metrics(ep, train_loss, train_acc, val_char_loss, val_char_acc, beam_acc, beam_correct)


    return seq_model, beam_acc


## **Get Data**

In [34]:
processed_data = preprocess_data('hi')

['an$' 'ankganit$' 'uncle$' ... 'hyensang$' 'xuanzang$' 'om$']
['#अं$' '#अंकगणित$' '#अंकल$' ... '#ह्वेनसांग$' '#ह्वेनसांग$' '#ॐ$']
['ankan$' 'angkor$' 'angira$' ... 'huar$' 'hyuar$' 'hyuer$']
['#अंकन$' '#अंगकोर$' '#अंगिरा$' ... '#ह्यूअर$' '#ह्यूअर$' '#ह्यूअर$']
['ank$' 'anka$' 'ankit$' ... 'hoshangabad$' 'hostes$' 'hostess$']
['#अंक$' '#अंक$' '#अंकित$' ... '#होशंगाबाद$' '#होस्टेस$' '#होस्टेस$']


## **HYPER PARAMETERS**

In [35]:
def get_default_hyperparameters(processed_data):
    params = {}

    # Sizes from processed data
    params["encoder_input_size"] = processed_data["input_corpus_length"]
    params["decoder_input_size"] = processed_data["output_corpus_length"]
    params["decoder_output_size"] = processed_data["output_corpus_length"]

    # Model architecture params
    architecture_params = {
        "embedding_size": 256,
        "hidden_size": 256,
        "num_layers": 3,
        "drop_prob": 0.3,
        "cell_type": "GRU",
        "bidirectional": True,
    }
    params.update(architecture_params)

    # Training hyperparameters
    training_params = {
        "beam_width": 4,
        "length_penalty": 0.6,
        "learning_rate": 0.01,
        "batch_size": 64,
        "epochs": 10,
        "optimizer": "adagrad",
        "tfr": 0.7,
    }
    params.update(training_params)

    return params

HYPER_PARAM = get_default_hyperparameters(processed_data)


## **Training Model on Hyper Parameters**

In [None]:
model, acc = training(HYPER_PARAM, processed_data, device, wandb_log = 0)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(29, 256)
    (dropout): Dropout(p=0.3, inplace=False)
    (cell): GRU(256, 256, num_layers=3, dropout=0.3, bidirectional=True)
  )
  (decoder): Decoder(
    (_embed_layer): Embedding(66, 256)
    (_drop_layer): Dropout(p=0.3, inplace=False)
    (_cell): GRU(256, 256, num_layers=3, dropout=0.3, bidirectional=True)
    (_output_layer): Linear(in_features=512, out_features=66, bias=True)
  )
)
Epoch :: 1/10


Training: 100%|██████████| 691/691 [00:44<00:00, 15.37it/s]
Validation: 100%|██████████| 69/69 [00:00<00:00, 72.04it/s]
Beam_Search: 100%|██████████| 4358/4358 [02:15<00:00, 32.22it/s]


Epoch : 1 Train Accuracy: 40.7997, Train Loss: 2.2930
Validation Accuracy: 36.9124, Validation Loss: 2.4470, 
Validation Acc. With BeamSearch: 0.8490, Correctly Predicted : 37/4358
Epoch :: 2/10


Training: 100%|██████████| 691/691 [00:46<00:00, 14.81it/s]
Validation: 100%|██████████| 69/69 [00:01<00:00, 66.56it/s]
Beam_Search:  70%|███████   | 3069/4358 [01:31<00:37, 34.26it/s]

## **Sweep Config**

In [None]:
def create_sweep_parameters():
    params = {
        'epochs': [1],
        'cell_type': ['RNN', 'LSTM', 'GRU'],
        'embedding_size': [128, 256, 512],
        'hidden_size': [128, 256, 512, 1024],
        'num_layers': [1, 2, 3],
        'dropout': [0.3, 0.5, 0.7],
        'optimizer': ['adam', 'sgd', 'rmsprop', 'adagrad'],
        'learning_rate': [0.001, 0.005, 0.01, 0.1],
        'batch_size': [32, 64],
        'teacher_fr': [0.3, 0.5, 0.7],
        'length_penalty': [0.4, 0.5, 0.6],
        'bi_dir': [True, False],
        'beam_width': [1, 2, 3],
    }

    return {key: {'values': val} for key, val in params.items()}


def get_sweep_config():
    return {
        'name': 'sweep-bayes-1',
        'method': 'bayes',
        'metric': {
            'name': 'Accuracy',
            'goal': 'maximize'
        },
        'parameters': create_sweep_parameters()
    }


sweep_config = get_sweep_config()


In [None]:
def build_hyper_params(var2, processed_data):
    return {
        "encoder_input_size": processed_data["input_corpus_length"],
        "embedding_size": var2.embedding_size,
        "hidden_size": var2.hidden_size,
        "num_layers": var2.num_layers,
        "drop_prob": var2.dropout,
        "cell_type": var2.cell_type,
        "decoder_input_size": processed_data["output_corpus_length"],
        "decoder_output_size": processed_data["output_corpus_length"],
        "beam_width": var2.beam_width,
        "length_penalty": var2.length_penalty,
        "bidirectional": var2.bi_dir,
        "learning_rate": var2.learning_rate,
        "batch_size": var2.batch_size,
        "epochs": var2.epochs,
        "optimizer": var2.optimizer,
        "tfr": var2.teacher_fr,
    }


def train():
    wandb_run = wandb.init(project="TestingQ1-3As3")
    config = wandb_run.config
    
    # Naming the run for better identification in dashboard
    wandb.run.name = (
        f"cell_type:{config.cell_type}_epochs:{config.epochs}_lr:{config.learning_rate}_"
        f"batch_size:{config.batch_size}_beam_width:{config.beam_width}_opt:{config.optimizer}_"
        f"dropout:{config.dropout}_teacher_fr:{config.teacher_fr}_embadding_size:{config.embedding_size}"
    )
    
    HYPER_PARAM = build_hyper_params(config, processed_data)
    
    model, accuracy = training(HYPER_PARAM, processed_data, device, wandb_log=1)
    
    wandb.log({"Accuracy": accuracy})


In [42]:
sweep_id = wandb.sweep(sweep_config, project="TestingQ1-3As3")
wandb.agent(sweep_id, train, count = 100)
wandb.finish()

NameError: name 'sweep_config' is not defined

## **Predictions on Test Data in CSV File**

In [24]:
def evaluate_predictions(HYPER_PARAM, model, device, processed_data):
    total_correct = 0
    total_incorrect = 0

    input_word = []
    decoded_output = []
    correct_output = []
    result = []

    def is_prediction_correct(predicted_seq, actual_seq):
        return predicted_seq == actual_seq

    for word, true_transliteration in zip(processed_data["test_x"], processed_data["test_y"]):
        # Exclude last character of input word before beam search
        trimmed_word = word[:-1]

        # Generate prediction using beam search
        predicted_seq = beam_search(HYPER_PARAM, model, trimmed_word, device, processed_data)

        # Extract true transliteration excluding boundary tokens
        true_seq = true_transliteration[1:-1]

        # Track if prediction matches the true sequence
        if is_prediction_correct(predicted_seq, true_seq):
            total_correct += 1
            result.append("Correct")
        else:
            total_incorrect += 1
            result.append("Incorrect")

        # Collect data for final CSV
        input_word.append(trimmed_word)
        decoded_output.append(predicted_seq)
        correct_output.append(true_seq)

    total_samples = total_correct + total_incorrect
    error_percentage = (total_incorrect / total_samples) * 100 if total_samples else 0.0

    print(f"Total Correct: {total_correct}")
    print(f"Total Incorrect: {total_incorrect}")
    print(f"Test Error: {error_percentage:.2f}%")

    # Prepare data dictionary for saving
    data_for_csv = {
        'Input_Word': input_word,
        'Decoded_Output': decoded_output,
        'True_Output': correct_output,
        'Match Result': result
    }

    output_path = '/kaggle/working/predictions_vanilla.csv'
    pd.DataFrame(data_for_csv).to_csv(output_path, index=False)


In [25]:
store_prediction_in_csv_file(HYPER_PARAM, model, device, processed_data)

Training: 100%|██████████| 691/691 [01:26<00:00,  8.00it/s]
Validation: 100%|██████████| 69/69 [00:02<00:00, 27.14it/s]
Beam_Search: 100%|██████████| 4358/4358 [00:59<00:00, 72.65it/s]


Epoch : 1 Train Accuracy: 11.1727, Train Loss: 4.1079
Validation Accuracy: 12.5685, Validation Loss: 4.0119, 
Validation Acc. With BeamSearch: 0.0000, Correctly Predicted : 0/4358
Total Correct: 1574
Total Incorrect: 2928
Test Error: 65.04%


In [26]:
import wandb
wandb.init(project="DA6401_Assignment3_Images", name="Prediction Evaluation")
import wandb
import pandas as pd

# Read the CSV file
df = pd.read_csv('/kaggle/working/predictions_vanilla.csv')

# Log the table to wandb
wandb.log({"Prediction Results": wandb.Table(dataframe=df)})