In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import torch
from torch.utils.data import Dataset, DataLoader
# Input data files are available in the read-only "../input/" directory.
# For example, running this cell (by clicking Run or pressing Shift+Enter) with the print below uncommented will list all files under the input directory

import os
# Walk every file under /kaggle/input. The print is commented out, so the
# loop body does nothing and no paths are shown.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        # Uncomment to list each input file path:
        # print(os.path.join(dirname, filename))
        pass

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [3]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader

# 1. Load the splits
def load_dakshina_splits(lang_code='hi', base_dir='dakshina_dataset_v1.0'):
    """Read the train, dev and test transliteration lexicons for one language.

    Each split is a headerless tab-separated file located at
    ``{base_dir}/{lang_code}/lexicons/{lang_code}.translit.sampled.{split}.tsv``.
    Its three columns are read as ``native``, ``latin`` and ``count``; NA
    detection is disabled so no field is turned into NaN, and the ``count``
    column is dropped before returning.

    Args:
        lang_code: Language directory / file prefix (default ``'hi'``).
        base_dir: Root directory of the dataset.

    Returns:
        Tuple ``(train_df, dev_df, test_df)`` of DataFrames with the columns
        ``native`` and ``latin``.
    """
    lexicon_prefix = f"{base_dir}/{lang_code}/lexicons/{lang_code}.translit.sampled."

    def _read_split(split_name):
        # One lexicon file -> DataFrame without the frequency column.
        frame = pd.read_csv(
            f"{lexicon_prefix}{split_name}.tsv",
            sep='\t',
            header=None,
            names=['native', 'latin', 'count'],
            na_filter=False,
        )
        return frame.drop(columns=["count"])

    return tuple(_read_split(split_name) for split_name in ('train', 'dev', 'test'))

# 2. Preprocessing: add <start> and <end> tokens, tokenize as characters
def preprocess_df(df):
    """Add space-separated character-token columns to ``df`` in place.

    For each of the ``native`` and ``latin`` columns a ``*_proc`` column is
    written whose value is ``<start>``, then every character of the stripped
    text, then ``<end>``, all joined by single spaces.

    Args:
        df: DataFrame with ``native`` and ``latin`` columns. It is modified.

    Returns:
        The same DataFrame object, now with ``native_proc`` and ``latin_proc``.
    """
    def _tokenise(value):
        symbols = ['<start>', *str(value).strip(), '<end>']
        return ' '.join(symbols)

    for column in ('native', 'latin'):
        df[f'{column}_proc'] = df[column].apply(_tokenise)
    return df

# 3. Build vocabulary from training data only
def build_vocab(texts):
    """Build token<->index lookup tables from whitespace-tokenised texts.

    The four special tokens always take indices 0-3 in the order
    ``<pad>``, ``<unk>``, ``<start>``, ``<end>``; every other token found in
    ``texts`` follows in sorted order.

    Args:
        texts: Iterable of strings whose tokens are separated by whitespace.

    Returns:
        Tuple ``(char2idx, idx2char)`` of dictionaries.
    """
    reserved = ['<pad>', '<unk>', '<start>', '<end>']
    seen = {token for line in texts for token in line.split()}
    ordered = reserved + sorted(seen.difference(reserved))
    idx2char = dict(enumerate(ordered))
    char2idx = {symbol: position for position, symbol in idx2char.items()}
    return char2idx, idx2char

# 4. Convert text to padded sequences of indices
def texts_to_sequences(texts, vocab, max_len=None):
    """Convert whitespace-tokenised texts into a right-padded index matrix.

    Args:
        texts: Iterable of strings whose tokens are separated by whitespace.
        vocab: Mapping token -> index. Must contain ``'<unk>'`` (used for
            tokens missing from the mapping) and ``'<pad>'`` (used as filler).
        max_len: Row width. When falsy (``None`` or ``0``) the length of the
            longest sequence is used.

    Returns:
        Tuple ``(array, max_len)`` where ``array`` has one row per text and
        ``max_len`` columns. For empty ``texts`` an integer array with zero
        rows is returned instead of failing.

    Raises:
        ValueError: If a sequence is longer than the requested ``max_len``.
            Padding cannot shorten a row, so the result would be ragged.
    """
    seqs = [[vocab.get(c, vocab['<unk>']) for c in text.split()] for text in texts]

    if not max_len:
        # default=0 keeps an empty input from crashing max().
        max_len = max((len(seq) for seq in seqs), default=0)

    longest = max((len(seq) for seq in seqs), default=0)
    if longest > max_len:
        raise ValueError(
            f"sequence of length {longest} does not fit into max_len={max_len}"
        )

    if not seqs:
        return np.empty((0, max_len), dtype=int), max_len

    pad_idx = vocab['<pad>']
    padded_seqs = [seq + [pad_idx] * (max_len - len(seq)) for seq in seqs]
    return np.array(padded_seqs), max_len

# 5. PyTorch Dataset
class TransliterationDataset(Dataset):
    """Paired source/target index sequences exposed as a map-style dataset.

    Both inputs are converted to ``torch.LongTensor`` once at construction.
    Each item is a dict with four entries:

    * ``source``        - the full source row,
    * ``target``        - the full target row,
    * ``target_input``  - the target row without its last position,
    * ``target_output`` - the target row without its first position.
    """

    def __init__(self, src_seqs, trg_seqs):
        self.src, self.trg = torch.LongTensor(src_seqs), torch.LongTensor(trg_seqs)

    def __len__(self):
        return self.src.shape[0]

    def __getitem__(self, idx):
        source_row = self.src[idx]
        target_row = self.trg[idx]
        return dict(
            source=source_row,
            target=target_row,
            target_input=target_row[:-1],   # drops the final position
            target_output=target_row[1:],   # drops the first position
        )

# 6. Main function to prepare everything
def prepare_dakshina_data(base_dir, lang_code='hi', batch_size=64):
    """Load, tokenise and batch the lexicon splits for one language.

    The Latin spelling is the source side and the native spelling the target
    side. Vocabularies are built from the training split only, while padding
    widths are the longest tokenised sequence found in any of the three
    splits, so every split shares the same tensor widths.

    Args:
        base_dir: Dataset root passed on to ``load_dakshina_splits``.
        lang_code: Language code passed on to ``load_dakshina_splits``.
        batch_size: Batch size for all three loaders.

    Returns:
        Dict with the keys ``train_loader`` (shuffled), ``dev_loader``,
        ``test_loader``, ``src_vocab``, ``trg_vocab``, ``src_idx2char``,
        ``trg_idx2char``, ``src_max_len`` and ``trg_max_len``.
    """
    split_names = ('train', 'dev', 'test')
    frames = {
        name: preprocess_df(frame)
        for name, frame in zip(split_names, load_dakshina_splits(lang_code, base_dir))
    }

    # Vocabularies come from the training split only.
    src_vocab, src_idx2char = build_vocab(frames['train']['latin_proc'])
    trg_vocab, trg_idx2char = build_vocab(frames['train']['native_proc'])

    def _longest(column):
        # Longest token count of `column` over all splits.
        return max(
            frames[name][column].apply(lambda text: len(text.split())).max()
            for name in split_names
        )

    src_max_len = _longest('latin_proc')
    trg_max_len = _longest('native_proc')

    def _make_dataset(frame):
        src_ids, _ = texts_to_sequences(frame['latin_proc'], src_vocab, src_max_len)
        trg_ids, _ = texts_to_sequences(frame['native_proc'], trg_vocab, trg_max_len)
        return TransliterationDataset(src_ids, trg_ids)

    datasets = {name: _make_dataset(frames[name]) for name in split_names}

    result = {
        'train_loader': DataLoader(datasets['train'], batch_size=batch_size, shuffle=True),
        'dev_loader': DataLoader(datasets['dev'], batch_size=batch_size),
        'test_loader': DataLoader(datasets['test'], batch_size=batch_size),
    }
    result.update(
        src_vocab=src_vocab,
        trg_vocab=trg_vocab,
        src_idx2char=src_idx2char,
        trg_idx2char=trg_idx2char,
        src_max_len=src_max_len,
        trg_max_len=trg_max_len,
    )
    return result

# Example usage
# Dataset root on Kaggle; the trailing slash is kept as written.
PATH = "/kaggle/input/dakshina/dakshina_dataset_v1.0/"
# `data` is the notebook-wide dict of loaders, vocabularies and padding
# widths; the training and evaluation cells below read it as a global.
data = prepare_dakshina_data(lang_code='hi', batch_size=64,base_dir = PATH)
print(f"Source vocab size: {len(data['src_vocab'])}")
print(f"Target vocab size: {len(data['trg_vocab'])}")
print(f"Train batches: {len(data['train_loader'])}")
print(f"Dev batches: {len(data['dev_loader'])}")
print(f"Test batches: {len(data['test_loader'])}")

# Sanity-check one training batch. target_input / target_output are one
# position shorter than the full target row because each drops one end.
batch = next(iter(data['train_loader']))
print("Source batch shape:", batch['source'].shape)
print("Target input batch shape:", batch['target_input'].shape)
print("Target output batch shape:", batch['target_output'].shape)


Source vocab size: 30
Target vocab size: 67
Train batches: 691
Dev batches: 69
Test batches: 71
Source batch shape: torch.Size([64, 22])
Target input batch shape: torch.Size([64, 20])
Target output batch shape: torch.Size([64, 20])


In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
import wandb
from tqdm import tqdm
import numpy as np

class Seq2Seq(nn.Module):
    """Encoder-decoder sequence model built from RNN, LSTM or GRU layers.

    The encoder reads embedded source indices; its final recurrent state
    initialises the decoder, which emits one target position per step and
    projects each step onto the target vocabulary.

    Encoder and decoder may have different depths. The encoder state is
    adapted to the decoder depth by keeping the top layers when the encoder
    is deeper, or by repeating the top encoder layer when it is shallower.

    Args:
        src_vocab_size: Number of source symbols.
        trg_vocab_size: Number of target symbols.
        embed_dim: Embedding width for both sides.
        hidden_size: Recurrent state width for both sides.
        enc_layers: Number of stacked encoder layers.
        dec_layers: Number of stacked decoder layers.
        cell_type: ``'rnn'``, ``'lstm'`` or ``'gru'`` (case-insensitive).
            Any other value raises ``KeyError``.
        dropout: Inter-layer dropout; applied only to stacks deeper than one.
    """

    def __init__(self, src_vocab_size, trg_vocab_size,
                 embed_dim=256, hidden_size=512,
                 enc_layers=2, dec_layers=2,
                 cell_type='lstm', dropout=0.3):
        super().__init__()

        self.src_vocab_size = src_vocab_size
        self.trg_vocab_size = trg_vocab_size
        self.hidden_size = hidden_size
        self.cell_type = cell_type.lower()
        self.enc_layers = enc_layers
        self.dec_layers = dec_layers

        # Embedding layers
        self.src_embed = nn.Embedding(src_vocab_size, embed_dim)
        self.trg_embed = nn.Embedding(trg_vocab_size, embed_dim)

        # RNN cell selection
        rnn_dict = {
            'rnn': nn.RNN,
            'lstm': nn.LSTM,
            'gru': nn.GRU
        }
        rnn_class = rnn_dict[self.cell_type]

        # Encoder (dropout is only meaningful between stacked layers)
        self.encoder = rnn_class(
            embed_dim, hidden_size, enc_layers,
            dropout=dropout if enc_layers > 1 else 0,
            batch_first=True
        )

        # Decoder
        self.decoder = rnn_class(
            embed_dim, hidden_size, dec_layers,
            dropout=dropout if dec_layers > 1 else 0,
            batch_first=True
        )

        # Final projection onto the target vocabulary
        self.fc = nn.Linear(hidden_size, trg_vocab_size)

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        """Decode ``trg.size(1)`` positions, optionally teacher-forced.

        Args:
            src: Source indices, indexed as (batch, src_len).
            trg: Target indices, indexed as (batch, trg_len). Column 0 is
                fed as the first decoder input.
            teacher_forcing_ratio: Probability, drawn once per step for the
                whole batch, of feeding the reference token instead of the
                model's own argmax as the next input.

        Returns:
            Tensor (batch, trg_len, trg_vocab_size) of logits. Position 0 is
            never written and stays all-zero.
        """
        batch_size = src.size(0)
        trg_len = trg.size(1)

        # Encoder forward
        src_embedded = self.src_embed(src)
        _, hidden = self._run_encoder(src_embedded)

        # Make the state depth match the decoder, whatever the cell type.
        hidden = self._adapt_hidden(hidden, self.dec_layers)

        # Decoder initialization
        inputs = trg[:, 0]
        outputs = torch.zeros(batch_size, trg_len, self.trg_vocab_size, device=src.device)

        # Decoder steps
        for t in range(1, trg_len):
            output, hidden = self._decoder_step(inputs, hidden)
            outputs[:, t] = output

            # Teacher forcing
            use_teacher = torch.rand(1).item() < teacher_forcing_ratio
            top1 = output.argmax(1)
            inputs = trg[:, t] if use_teacher else top1

        return outputs

    @staticmethod
    def _match_layers(state, target_layers):
        """Return ``state`` (layers, batch, hidden) with ``target_layers`` layers."""
        num_layers = state.size(0)
        if num_layers >= target_layers:
            # Deeper (or equal) encoder: keep the top-most layers.
            return state[-target_layers:].contiguous()
        # Shallower encoder: slicing cannot create layers, so repeat the top one.
        filler = state[-1:].expand(target_layers - num_layers, -1, -1)
        return torch.cat([state, filler], dim=0).contiguous()

    def _adapt_hidden(self, hidden, target_layers):
        """Adjust hidden states to match target layer count.

        Accepts either a single state tensor (RNN/GRU) or an ``(h, c)``
        tuple (LSTM) and returns the same kind of object.
        """
        if isinstance(hidden, tuple):  # LSTM case
            return tuple(self._match_layers(part, target_layers) for part in hidden)
        return self._match_layers(hidden, target_layers)  # RNN/GRU case

    def _run_encoder(self, src_embedded):
        """Run the encoder; returns ``(outputs, final_state)``."""
        if self.cell_type == 'lstm':
            outputs, (hidden, cell) = self.encoder(src_embedded)
            return outputs, (hidden, cell)
        else:
            outputs, hidden = self.encoder(src_embedded)
            return outputs, hidden

    def _decoder_step(self, inputs, hidden):
        """Advance the decoder one position.

        Args:
            inputs: Token indices for this step, one per batch element.
            hidden: Decoder state (tensor, or ``(h, c)`` tuple for LSTM).

        Returns:
            ``(logits, new_state)`` with logits indexed (batch, trg_vocab_size).
        """
        trg_embedded = self.trg_embed(inputs).unsqueeze(1)
        # The recurrent module returns the state in the same form it received.
        out, hidden_state = self.decoder(trg_embedded, hidden)
        output = self.fc(out.squeeze(1))
        return output, hidden_state

    def beam_search_decode(self, src, beam_width=3, max_len=20, start_idx=2, end_idx=3):
        """Beam-search decode a single source sequence.

        Puts the module in eval mode (it is not switched back) and runs
        without gradients. Hypotheses are ranked by log-probability divided
        by their length (start token included).

        Args:
            src: Source indices for exactly one sequence, indexed (1, src_len);
                the decoder is stepped with a batch of one.
            beam_width: Number of hypotheses kept after every step.
            max_len: Maximum number of decoding steps.
            start_idx: Index of the start token. The default 2 matches a
                vocabulary whose special tokens are ordered
                ``<pad>, <unk>, <start>, <end>``.
            end_idx: Index of the end token (default 3, same ordering).

        Returns:
            CPU LongTensor indexed (1, length) holding the best hypothesis
            without the start token and without a trailing end token.
        """
        self.eval()
        with torch.no_grad():
            src_embedded = self.src_embed(src)
            _, hidden = self._run_encoder(src_embedded)
            # Same depth adaptation as forward(); without it mismatched
            # encoder/decoder depths fail in the first decoder step.
            hidden = self._adapt_hidden(hidden, self.dec_layers)

            # Each beam entry: (token list, summed log-prob, decoder state)
            beams = [([start_idx], 0.0, hidden)]

            for _ in range(max_len):
                candidates = []
                for seq, score, hidden_state in beams:
                    if seq[-1] == end_idx:
                        # Finished hypotheses are carried over unchanged.
                        candidates.append((seq, score, hidden_state))
                        continue

                    inputs = torch.tensor([seq[-1]], dtype=torch.long, device=src.device)
                    output, new_hidden = self._decoder_step(inputs, hidden_state)
                    log_probs = torch.log_softmax(output, dim=1)
                    # topk cannot return more entries than the vocabulary has.
                    k = min(beam_width, log_probs.size(1))
                    topk_scores, topk_ids = torch.topk(log_probs, k)

                    for i in range(k):
                        candidates.append((
                            seq + [topk_ids[0, i].item()],
                            score + topk_scores[0, i].item(),
                            new_hidden
                        ))

                # Keep top-k candidates by length-normalised score
                candidates.sort(key=lambda cand: cand[1] / len(cand[0]), reverse=True)
                beams = candidates[:beam_width]

                # Stop once every surviving beam has emitted <end>
                if all(seq[-1] == end_idx for seq, _, _ in beams):
                    break

            # Drop <start>; drop the last token only if it really is <end>
            # (a beam cut off by max_len has no end token to strip).
            best_seq = beams[0][0][1:]
            if best_seq and best_seq[-1] == end_idx:
                best_seq = best_seq[:-1]
            return torch.LongTensor(best_seq).unsqueeze(0)

In [5]:
def train(config=None):
    """Train a default-sized ``Seq2Seq`` for 10 epochs with fixed settings.

    Reads the notebook-global ``data`` dict for loaders and vocabularies.
    ``config`` is accepted but not used: the model uses the ``Seq2Seq``
    default hyperparameters, Adam with learning rate 0.0005, and a teacher
    forcing ratio of 0.5 during training.

    After every epoch the dev set is decoded without teacher forcing and the
    mean batch losses plus the accuracy over non-pad target positions are
    printed. The weights are written to ``best_model.pth`` whenever the mean
    dev loss improves.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    pad_idx = data['trg_vocab']['<pad>']
    train_loader = data['train_loader']
    dev_loader = data['dev_loader']

    # Model with the class defaults for every architecture hyperparameter
    model = Seq2Seq(
        src_vocab_size=len(data['src_vocab']),
        trg_vocab_size=len(data['trg_vocab']),
    ).to(device)

    optimizer = optim.Adam(model.parameters(), lr=0.0005)
    criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)
    best_val_loss = float('inf')

    def _flat_logits_and_gold(batch, forcing):
        # Run the model and flatten logits/targets for the loss; position 0
        # of the model output is skipped because it is never predicted.
        src = batch['source'].to(device)
        trg = batch['target'].to(device)
        logits = model(src, trg, teacher_forcing_ratio=forcing)
        logits = logits[:, 1:].reshape(-1, logits.size(-1))
        gold = batch['target_output'].to(device).reshape(-1)
        return logits, gold

    for epoch in range(10):
        # ---- training pass ----
        model.train()
        train_loss = 0
        for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
            optimizer.zero_grad()
            logits, gold = _flat_logits_and_gold(batch, 0.5)
            loss = criterion(logits, gold)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            train_loss += loss.item()

        # ---- validation pass (no teacher forcing) ----
        model.eval()
        val_loss = 0
        correct = 0
        total = 0
        with torch.no_grad():
            for batch in dev_loader:
                logits, gold = _flat_logits_and_gold(batch, 0)
                val_loss += criterion(logits, gold).item()

                # Accuracy over positions that are not padding
                predicted = torch.max(logits, 1)[1]
                mask = gold != pad_idx
                correct += ((predicted == gold) * mask).sum().item()
                total += mask.sum().item()

        avg_train_loss = train_loss / len(train_loader)
        avg_val_loss = val_loss / len(dev_loader)
        val_acc = correct / total

        print("epoch:", epoch,
              "train_loss:", avg_train_loss,
              "val_loss:", avg_val_loss,
              "val_acc:", val_acc)

        # Keep the weights of the epoch with the lowest dev loss
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            torch.save(model.state_dict(), "best_model.pth")


In [5]:
# Runs whichever `train` is currently defined in the kernel, with config=None.
train()

Epoch 1: 100%|██████████| 691/691 [00:29<00:00, 23.59it/s]


epoch: 0 train_loss: 2.4701947839836307 val_loss: 1.6738811765891919 val_acc: 0.49993402823591504


Epoch 2: 100%|██████████| 691/691 [00:28<00:00, 24.10it/s]


epoch: 1 train_loss: 1.0600401615095898 val_loss: 1.2050387842067773 val_acc: 0.6462264150943396


Epoch 3: 100%|██████████| 691/691 [00:28<00:00, 24.06it/s]


epoch: 2 train_loss: 0.7334234876242802 val_loss: 1.0748567460239797 val_acc: 0.6862712758939175


Epoch 4: 100%|██████████| 691/691 [00:28<00:00, 24.06it/s]


epoch: 3 train_loss: 0.5758513196677444 val_loss: 1.0228825133779775 val_acc: 0.7068544662884285


Epoch 5: 100%|██████████| 691/691 [00:28<00:00, 24.02it/s]


epoch: 4 train_loss: 0.4763692596283042 val_loss: 1.0337142952974292 val_acc: 0.7120332497690989


Epoch 6: 100%|██████████| 691/691 [00:28<00:00, 24.22it/s]


epoch: 5 train_loss: 0.3988922502516665 val_loss: 1.0324933671432992 val_acc: 0.72107138144874


Epoch 7: 100%|██████████| 691/691 [00:28<00:00, 24.13it/s]


epoch: 6 train_loss: 0.3296491702110136 val_loss: 1.0404761606368467 val_acc: 0.7294168096054888


Epoch 8: 100%|██████████| 691/691 [00:28<00:00, 23.94it/s]


epoch: 7 train_loss: 0.2856514769378516 val_loss: 1.112315789512966 val_acc: 0.7242710120068611


Epoch 9: 100%|██████████| 691/691 [00:28<00:00, 24.07it/s]


epoch: 8 train_loss: 0.24038467816092 val_loss: 1.1648674866427546 val_acc: 0.7170471038395567


Epoch 10: 100%|██████████| 691/691 [00:28<00:00, 24.02it/s]


epoch: 9 train_loss: 0.2091788559311033 val_loss: 1.172158757413643 val_acc: 0.7247328143554559


In [6]:
def test_model_and_save_predictions(model, test_loader, 
                                  src_vocab, src_idx2char,
                                  trg_vocab, trg_idx2char, 
                                  filename='vanilla_prediction.tsv'):
    model.eval()
    correct = 0
    total = 0
    pad_idx = trg_vocab['<pad>']
    device = next(model.parameters()).device
    
    inputs_list = []
    predictions_list = []
    targets_list = []

    with torch.no_grad():
        for batch in test_loader:
            src = batch['source'].to(device)
            trg = batch['target'].to(device)
            
            # Greedy decoding
            outputs = model(src, trg, teacher_forcing_ratio=0)
            outputs_reshaped = outputs[:, 1:].reshape(-1, outputs.size(-1))
            targets = batch['target_output'].to(device).reshape(-1)
            
            _, predicted = torch.max(outputs_reshaped, 1)
            mask = targets != pad_idx
            correct += ((predicted == targets) * mask).sum().item()
            total += mask.sum().item()
            
            # Convert indices to strings
            for i in range(src.size(0)):
                # Decode SOURCE (Latin) using source vocab
                src_seq = src[i].cpu().tolist()
                src_chars = [src_idx2char[idx] for idx in src_seq 
                           if idx not in [src_vocab['<start>'], src_vocab['<end>'], src_vocab['<pad>']]]
                latin_input = ''.join(src_chars)
                
                # Decode PREDICTION (Devanagari) using target vocab
                pred_seq = outputs[i].argmax(dim=1).cpu().tolist()
                pred_chars = [trg_idx2char[idx] for idx in pred_seq 
                            if idx not in [trg_vocab['<start>'], trg_vocab['<end>'], trg_vocab['<pad>']]]
                devanagari_pred = ''.join(pred_chars)
                
                # Decode TARGET (Devanagari) using target vocab
                trg_seq = trg[i].cpu().tolist()
                trg_chars = [trg_idx2char[idx] for idx in trg_seq 
                           if idx not in [trg_vocab['<start>'], trg_vocab['<end>'], trg_vocab['<pad>']]]
                devanagari_target = ''.join(trg_chars)
                
                inputs_list.append(latin_input)
                predictions_list.append(devanagari_pred)
                targets_list.append(devanagari_target)
    
    # Save to TSV
    import pandas as pd
    df = pd.DataFrame({
        'latin_input': inputs_list,
        'devanagari_prediction': predictions_list,
        'devanagari_target': targets_list
    })
    df.to_csv(filename, sep='\t', index=False, encoding='utf-8')
    
    return correct / total

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# The architecture built here must match the one that produced
# best_model.pth, otherwise load_state_dict raises on mismatched shapes.
best_model = Seq2Seq(
    len(data['src_vocab']),
    len(data['trg_vocab']),
    embed_dim=256,  # Replace with best params from sweep
    hidden_size=512
).to(device)
# map_location lets a checkpoint saved on a GPU load in a CPU-only session.
best_model.load_state_dict(torch.load("best_model.pth", map_location=device))

# Usage: writes vanilla_prediction.tsv and keeps the accuracy over non-pad
# target positions in `test_acc`.
test_acc = test_model_and_save_predictions(
    best_model,
    data['test_loader'],
    data['src_vocab'],  # Source vocab (Latin)
    data['src_idx2char'],
    data['trg_vocab'],  # Target vocab (Devanagari)
    data['trg_idx2char'],
    'vanilla_prediction.tsv'
)


In [7]:
# `test_acc` is the share of non-pad target positions predicted correctly,
# so it is a per-character score rather than a whole-word match rate.
print(f"Test Accuracy: {test_acc:.4f}")


Test Accuracy: 0.7126


In [8]:
# Read the saved predictions back. na_filter=False (as in the dataset loader)
# keeps empty predictions and literal words such as "nan" or "null" as
# strings instead of converting them to NaN.
pred = pd.read_csv("/kaggle/working/vanilla_prediction.tsv", sep='\t', na_filter=False)

In [9]:
# (rows, columns) of the predictions table: one row per test example.
pred.shape

(4502, 3)

In [10]:
# Rows whose predicted string is exactly equal to the target string.
correct_pred = pred[pred["devanagari_prediction"] == pred["devanagari_target"]]

In [11]:
# Whole-word exact-match rate: exactly matching rows / all rows. This is
# stricter than the per-position accuracy stored in `test_acc`.
correct_pred.shape[0]/pred.shape[0]

0.3427365615282097

1

In [46]:
# Start the sample of correct predictions with the first 5 matching rows.
top_matches = correct_pred.head()

In [47]:
# Append matching rows at positions 500-504. Re-running this cell appends
# them again, because `top_matches` is extended in place of itself.
top_matches = pd.concat([top_matches,correct_pred.iloc[500:505]] , axis = 0)

In [48]:
# Append matching rows at positions 1000-1004 (same re-run caveat: the
# cell extends `top_matches` from its current value).
top_matches = pd.concat([top_matches,correct_pred.iloc[1000:1005]] , axis = 0)

In [49]:
# Append the last 5 matching rows to finish the sample.
top_matches = pd.concat([top_matches,correct_pred.tail()] , axis = 0)

In [50]:
# Check the size of the assembled sample (four slices of up to 5 rows each).
top_matches.shape

(20, 3)

In [51]:
# Display the sampled exact matches; the row labels are positions in `pred`.
top_matches

Unnamed: 0,latin_input,devanagari_prediction,devanagari_target
0,ank,अंक,अंक
8,angaarak,अंगारक,अंगारक
20,andha,अंधा,अंधा
21,andhapan,अंधापन,अंधापन
22,andheri,अंधेरी,अंधेरी
1596,dvc,डीवीसी,डीवीसी
1624,domenic,डोमेनिक,डोमेनिक
1626,dolane,डोलने,डोलने
1627,dolne,डोलने,डोलने
1633,drager,ड्रेजर,ड्रेजर


In [29]:
# Last 10 rows of the predictions table, matches and mismatches alike.
pred.tail(10)

Unnamed: 0,latin_input,devanagari_prediction,devanagari_target
4492,have,हवे,हैव
4493,hong,होंग,हॉन्ग
4494,half,हल्फ,हॉफ
4495,hoaf,होफ,हॉफ
4496,hounga,हौंगा,होऊंगा
4497,holding,हॉल्डिंग,होल्डिंग
4498,hoshangabaad,होषंगाबाद,होशंगाबाद
4499,hoshangabad,होषंगाबाद,होशंगाबाद
4500,hostes,हॉस्ट्स,होस्टेस
4501,hostess,हॉस्टेसस,होस्टेस


In [52]:
# Fetch the Weights & Biases API key from Kaggle's secret store so the key
# itself never appears in the notebook. "dl3wandb" is the secret's label.
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
secret_value_0 = user_secrets.get_secret("dl3wandb")


In [53]:
# Authenticate and open a dedicated run for logging the sample table.
wandb.login(key =secret_value_0 )
wandb.init(project="DA6401_assignment3", name="devanagari_match_log")
# Copy the sampled exact matches row by row into a wandb table.
table = wandb.Table(columns=["latin_input","devanagari_prediction", "devanagari_target"])
for _, row in top_matches.iterrows():
    table.add_data(row["latin_input"],row["devanagari_prediction"], row["devanagari_target"])

# Log the table under the key "Sample Matches"
wandb.log({"Sample Matches": table})

# Finish the run
wandb.finish()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mda24s021[0m ([33mda24s021-indian-institute-of-technology-madras[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
def train(config=None):
    """Train one ``Seq2Seq`` configuration inside a Weights & Biases run.

    Opens a wandb run with ``config`` and then reads every hyperparameter
    from ``wandb.config``: ``embed_dim``, ``hidden_size``, ``enc_layers``,
    ``dec_layers``, ``cell_type``, ``dropout``, ``learning_rate``,
    ``teacher_forcing`` and ``epochs``. Loaders and vocabularies come from
    the notebook-global ``data`` dict.

    After each epoch the dev set is decoded without teacher forcing; the mean
    batch losses and the accuracy over non-pad target positions are logged to
    wandb. The weights are written to ``best_model.pth`` whenever this run's
    mean dev loss improves; the file name is the same for every run.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    with wandb.init(config=config):
        hparams = wandb.config
        pad_idx = data['trg_vocab']['<pad>']
        train_loader = data['train_loader']
        dev_loader = data['dev_loader']

        # Model built from the run's hyperparameters
        model = Seq2Seq(
            src_vocab_size=len(data['src_vocab']),
            trg_vocab_size=len(data['trg_vocab']),
            embed_dim=hparams.embed_dim,
            hidden_size=hparams.hidden_size,
            enc_layers=hparams.enc_layers,
            dec_layers=hparams.dec_layers,
            cell_type=hparams.cell_type,
            dropout=hparams.dropout
        ).to(device)

        optimizer = optim.Adam(model.parameters(), lr=hparams.learning_rate)
        criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)
        best_val_loss = float('inf')

        def _flat_logits_and_gold(batch, forcing):
            # Run the model and flatten logits/targets for the loss; output
            # position 0 is skipped because it is never predicted.
            src = batch['source'].to(device)
            trg = batch['target'].to(device)
            logits = model(src, trg, teacher_forcing_ratio=forcing)
            logits = logits[:, 1:].reshape(-1, logits.size(-1))
            gold = batch['target_output'].to(device).reshape(-1)
            return logits, gold

        for epoch in range(hparams.epochs):
            # ---- training pass ----
            model.train()
            train_loss = 0
            for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
                optimizer.zero_grad()
                logits, gold = _flat_logits_and_gold(batch, hparams.teacher_forcing)
                loss = criterion(logits, gold)
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()
                train_loss += loss.item()

            # ---- validation pass (no teacher forcing) ----
            model.eval()
            val_loss = 0
            correct = 0
            total = 0
            with torch.no_grad():
                for batch in dev_loader:
                    logits, gold = _flat_logits_and_gold(batch, 0)
                    val_loss += criterion(logits, gold).item()

                    # Accuracy over positions that are not padding
                    predicted = torch.max(logits, 1)[1]
                    mask = gold != pad_idx
                    correct += ((predicted == gold) * mask).sum().item()
                    total += mask.sum().item()

            avg_train_loss = train_loss / len(train_loader)
            avg_val_loss = val_loss / len(dev_loader)
            val_acc = correct / total

            wandb.log({
                "epoch": epoch,
                "train_loss": avg_train_loss,
                "val_loss": avg_val_loss,
                "val_acc": val_acc
            })

            # Keep the weights of this run's best epoch
            if avg_val_loss < best_val_loss:
                best_val_loss = avg_val_loss
                torch.save(model.state_dict(), "best_model.pth")


In [None]:



# Sweep configuration
# Bayesian hyperparameter search that minimises the logged "val_loss".
# Every key under 'parameters' is read from wandb.config by train().
sweep_config = {
    'method': 'bayes',
    'metric': {'name': 'val_loss', 'goal': 'minimize'},
    'parameters': {
        'embed_dim': {'values': [64, 128, 256]},
        'hidden_size': {'values': [128, 256, 512]},
        'enc_layers': {'values': [1, 2]},
        'dec_layers': {'values': [1, 2]},
        'cell_type': {'values': ['lstm', 'gru', 'rnn']},
        'dropout': {'values': [0.0, 0.2, 0.3]},
        'learning_rate': {'values': [0.001, 0.0005]},
        'teacher_forcing': {'values': [0.5, 0.7]},
        # Fixed for all runs (single 'value', not searched)
        'epochs': {'value': 10}
    }
}

# Log in, register the sweep and start an agent that calls train() per run.
# No `count` is passed to the agent, so the number of runs is not capped here.
wandb.login(key =secret_value_0 )
sweep_id = wandb.sweep(sweep_config, project="DA6401_assignment3")
wandb.agent(sweep_id, train)

# Test function (to run after training)


In [None]:
def test_model(model, test_loader, trg_vocab):
    model.eval()
    correct = 0
    total = 0
    pad_idx = trg_vocab['<pad>']
    
    with torch.no_grad():
        for batch in test_loader:
            src = batch['source'].to(device)
            trg = batch['target'].to(device)
            
            # Greedy decoding
            outputs = model(src, trg, teacher_forcing_ratio=0)
            outputs = outputs[:, 1:].reshape(-1, outputs.size(-1))
            targets = batch['target_output'].to(device).reshape(-1)
            
            _, predicted = torch.max(outputs, 1)
            mask = targets != pad_idx
            correct += ((predicted == targets) * mask).sum().item()
            total += mask.sum().item()
    
    return correct / total

# Load best model and test
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# The architecture built here must match the one that produced
# best_model.pth, otherwise load_state_dict raises on mismatched shapes.
best_model = Seq2Seq(
    len(data['src_vocab']),
    len(data['trg_vocab']),
    embed_dim=256,  # Replace with best params from sweep
    hidden_size=512
).to(device)
# map_location lets a checkpoint saved on a GPU load in a CPU-only session.
best_model.load_state_dict(torch.load("best_model.pth", map_location=device))
test_acc = test_model(best_model, data['test_loader'], data['trg_vocab'])
print(f"Test Accuracy: {test_acc:.4f}")


In [None]:
# data =  prepare_dakshina_data(base_dir = "/kaggle/input/dakshina/dakshina_dataset_v1.0")