<a href="https://colab.research.google.com/github/Sai-sakunthala/Assignment-3/blob/main/assignment_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip install torch wandb pandas tqdm

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [41]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import random
import wandb
import editdistance
import numpy as np

device = 'cuda' if torch.cuda.is_available() else 'cpu'
LANG = 'te'
base_path = f'/content/drive/MyDrive/dakshina_dataset_v1.0/{LANG}/lexicons/'

def read_pairs(filepath, max_len=40):
    pairs = []
    with open(filepath, encoding='utf8') as f:
        for line in f:
            parts = line.strip().split('\t')
            if len(parts) < 2:
                continue
            src, tgt = parts[0], parts[1]
            if len(src) <= max_len and len(tgt) <= max_len:
                pairs.append((src, tgt))
    return pairs

def build_vocab(sequences):
    vocab = {'<pad>':0, '<sos>':1, '<eos>':2}
    idx = 3
    for seq in sequences:
        for ch in seq:
            if ch not in vocab:
                vocab[ch] = idx
                idx += 1
    inv_vocab = {i:c for c,i in vocab.items()}
    return vocab, inv_vocab

def tokenize(seq, vocab):
    return [vocab['<sos>']] + [vocab[ch] for ch in seq] + [vocab['<eos>']]

def pad_seq(seq, max_len, pad_idx=0):
    return seq + [pad_idx] * (max_len - len(seq))

class TransliterationDataset(Dataset):
    def __init__(self, pairs, src_vocab, tgt_vocab):
        self.src_pad = src_vocab['<pad>']
        self.tgt_pad = tgt_vocab['<pad>']
        self.data = []
        for src, tgt in pairs:
            src_t = tokenize(src, src_vocab)
            tgt_t = tokenize(tgt, tgt_vocab)
            self.data.append((src_t, tgt_t))
        self.src_max = max(len(x[0]) for x in self.data)
        self.tgt_max = max(len(x[1]) for x in self.data)

    def __len__(self): return len(self.data)

    def __getitem__(self, idx):
        src, tgt = self.data[idx]
        src = pad_seq(src, self.src_max, self.src_pad)
        tgt = pad_seq(tgt, self.tgt_max, self.tgt_pad)
        return torch.tensor(src), torch.tensor(tgt)

class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout, cell='lstm'):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim)
        rnn_cls = {'rnn': nn.RNN, 'gru': nn.GRU, 'lstm': nn.LSTM}[cell.lower()]
        self.rnn = rnn_cls(emb_dim, hid_dim, n_layers, dropout=dropout if n_layers > 1 else 0, batch_first=True)
        self.cell = cell.lower()

    def forward(self, src):
        embedded = self.embedding(src)
        if self.cell == 'lstm':
            outputs, (hidden, cell) = self.rnn(embedded)
            return hidden, cell
        else:
            outputs, hidden = self.rnn(embedded)
            return hidden, None

class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout, cell='lstm'):
        super().__init__()
        self.embedding = nn.Embedding(output_dim, emb_dim)
        rnn_cls = {'rnn': nn.RNN, 'gru': nn.GRU, 'lstm': nn.LSTM}[cell.lower()]
        self.rnn = rnn_cls(emb_dim, hid_dim, n_layers, dropout=dropout if n_layers > 1 else 0, batch_first=True)
        self.fc_out = nn.Linear(hid_dim, output_dim)
        self.cell = cell.lower()

    def forward(self, input, hidden, cell=None):
        input = input.unsqueeze(1)
        embedded = self.embedding(input)
        if self.cell == 'lstm':
            output, (hidden, cell) = self.rnn(embedded, (hidden, cell))
        else:
            output, hidden = self.rnn(embedded, hidden)
            cell = None
        prediction = self.fc_out(output.squeeze(1))
        return prediction, hidden, cell

class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, tgt, teacher_forcing_ratio=0.5):
        batch_size = src.size(0)
        trg_len = tgt.size(1)
        output_dim = self.decoder.fc_out.out_features

        outputs = torch.zeros(batch_size, trg_len, output_dim).to(self.device)
        hidden, cell = self.encoder(src)
        input = tgt[:, 0]

        for t in range(1, trg_len):
            output, hidden, cell = self.decoder(input, hidden, cell)
            outputs[:, t] = output
            teacher_force = random.random() < teacher_forcing_ratio
            top1 = output.argmax(1)
            input = tgt[:, t] if teacher_force else top1
        return outputs

def calculate_accuracy(preds, y, pad_idx=0):
    max_preds = preds.argmax(dim=2)
    non_pad = y != pad_idx
    correct = (max_preds == y) & non_pad
    return correct.sum().item() / non_pad.sum().item()

def calculate_cer(preds, targets, pad_idx=0):
    cer, total = 0, 0
    for pred, tgt in zip(preds, targets):
        # Remove PAD tokens for evaluation.
        pred = [p for p in pred if p != pad_idx]
        tgt = [t for t in tgt if t != pad_idx]
        cer += editdistance.eval(pred, tgt)
        total += len(tgt)
    return cer / total if total > 0 else 0

def run(config=None):
    with wandb.init(config=config):
        cfg = wandb.config
        cfg.hidden_dim = 2 * cfg.embed_dim if cfg.hidden_dim_config == 'double' else cfg.embed_dim
        sweep_name = f"{cfg.cell_type}_{embed}e_{hidden}h_{cfg.layers}l_" \
             f"{int(cfg.dropout*100)}d_{int(cfg.teacher_forcing*10)}tf_" \
             f"{str(cfg.lr).replace('.', '')}lr"

        wandb.run.name = sweep_name

        max_len = 30

        # Load data (ensure paths match your dataset—full dataset naming assumed)
        train_pairs = read_pairs(base_path + f"{LANG}.translit.sampled.train.tsv", max_len=max_len)
        val_pairs   = read_pairs(base_path + f"{LANG}.translit.sampled.dev.tsv",   max_len=max_len)
        test_pairs  = read_pairs(base_path + f"{LANG}.translit.sampled.test.tsv",  max_len=max_len)

        src_vocab, _ = build_vocab([x[0] for x in train_pairs])
        tgt_vocab, _ = build_vocab([x[1] for x in train_pairs])

        # (Add assertions to ensure special tokens are consistent)
        assert src_vocab['<pad>'] == 0 and tgt_vocab['<pad>'] == 0, "Pad token must be index 0 in both vocabs."

        train_ds = TransliterationDataset(train_pairs, src_vocab, tgt_vocab)
        val_ds   = TransliterationDataset(val_pairs,   src_vocab, tgt_vocab)
        test_ds  = TransliterationDataset(test_pairs,  src_vocab, tgt_vocab)

        # Use drop_last=True to ensure consistent batch sizes.
        train_dl = DataLoader(train_ds, batch_size=cfg.batch_size, shuffle=True, drop_last=True)
        val_dl   = DataLoader(val_ds,   batch_size=cfg.batch_size, drop_last=True)
        test_dl  = DataLoader(test_ds,  batch_size=cfg.batch_size, drop_last=True)

        encoder = Encoder(len(src_vocab), cfg.embed_dim, cfg.hidden_dim, cfg.layers, cfg.dropout, cfg.cell_type).to(device)
        decoder = Decoder(len(tgt_vocab), cfg.embed_dim, cfg.hidden_dim, cfg.layers, cfg.dropout, cfg.cell_type).to(device)
        model = Seq2Seq(encoder, decoder, device).to(device)

        optimizer = torch.optim.Adam(model.parameters(), lr=cfg.lr)
        criterion = nn.CrossEntropyLoss(ignore_index=src_vocab['<pad>'])
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=2, factor=0.5)

        best_val_loss = float('inf')
        patience = 5
        wait = 0

        for epoch in range(cfg.epochs):
            model.train()
            total_loss = 0
            total_acc  = 0

            for src, tgt in train_dl:
                src, tgt = src.to(device), tgt.to(device)
                optimizer.zero_grad()
                output = model(src, tgt, cfg.teacher_forcing)
                out_dim = output.shape[-1]
                loss = criterion(output[:, 1:].reshape(-1, out_dim), tgt[:, 1:].reshape(-1))
                acc = calculate_accuracy(output[:, 1:], tgt[:, 1:], pad_idx=src_vocab['<pad>'])
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
                optimizer.step()

                total_loss += loss.item()
                total_acc  += acc

            avg_train_loss = total_loss / len(train_dl)
            avg_train_acc  = total_acc / len(train_dl)

            model.eval()
            val_loss = 0
            val_acc  = 0
            val_cer  = 0
            with torch.no_grad():
                for src, tgt in val_dl:
                    src, tgt = src.to(device), tgt.to(device)
                    output = model(src, tgt, teacher_forcing_ratio=0)
                    out_dim = output.shape[-1]
                    loss = criterion(output[:, 1:].reshape(-1, out_dim), tgt[:, 1:].reshape(-1))
                    acc = calculate_accuracy(output[:, 1:], tgt[:, 1:], pad_idx=src_vocab['<pad>'])
                    val_loss += loss.item()
                    val_acc  += acc
                    preds = output.argmax(2).tolist()
                    targets = tgt.tolist()
                    val_cer += calculate_cer(preds, targets, pad_idx=src_vocab['<pad>'])

            avg_val_loss = val_loss / len(val_dl)
            avg_val_acc  = val_acc / len(val_dl)
            avg_val_cer  = val_cer / len(val_dl)

            scheduler.step(avg_val_loss)

            wandb.log({
                'train_loss': avg_train_loss,
                'train_accuracy': avg_train_acc,
                'val_loss': avg_val_loss,
                'val_accuracy': avg_val_acc,
                'val_cer': avg_val_cer,
                'epoch': epoch + 1
            })
            print(f"Epoch {epoch+1} | Train Loss: {avg_train_loss:.3f} Acc: {avg_train_acc:.3f} | "
                  f"Val Loss: {avg_val_loss:.3f} Acc: {avg_val_acc:.3f} CER: {avg_val_cer:.3f}")

            # Early Stopping Check
            if avg_val_loss < best_val_loss:
                best_val_loss = avg_val_loss
                wait = 0
                torch.save(model.state_dict(), 'best_model.pt')
            else:
                wait += 1
                if wait >= patience:
                    print("Early stopping triggered.")
                    break

        # After training, load the best model and evaluate on the test set.
        model.load_state_dict(torch.load('best_model.pt'))
        model.eval()
        test_loss = 0
        test_acc  = 0
        test_cer  = 0
        with torch.no_grad():
            for src, tgt in test_dl:
                src, tgt = src.to(device), tgt.to(device)
                output = model(src, tgt, teacher_forcing_ratio=0)
                out_dim = output.shape[-1]
                loss = criterion(output[:, 1:].reshape(-1, out_dim), tgt[:, 1:].reshape(-1))
                acc = calculate_accuracy(output[:, 1:], tgt[:, 1:], pad_idx=src_vocab['<pad>'])
                test_loss += loss.item()
                test_acc  += acc
                preds = output.argmax(2).tolist()
                targets = tgt.tolist()
                test_cer += calculate_cer(preds, targets, pad_idx=src_vocab['<pad>'])
        avg_test_loss = test_loss / len(test_dl)
        avg_test_acc  = test_acc / len(test_dl)
        avg_test_cer  = test_cer / len(test_dl)
        print(f"Test Loss: {avg_test_loss:.3f} | Test Acc: {avg_test_acc:.3f} | Test CER: {avg_test_cer:.3f}")

sweep_config = {
    'method': 'bayes',
    'metric': {'name': 'val_loss', 'goal': 'minimize'},
    'parameters': {
        'embed_dim': {'values': [32, 64, 128]},
        'hidden_dim_config': {'values': ['same', 'double']},
        'layers': {'values': [1, 2]},
        'dropout': {'values': [0.2, 0.3]},
        'lr': {'values': [0.001, 0.0005]},
        'cell_type': {'values': ['rnn', 'gru', 'lstm']},
        'teacher_forcing': {'values': [0.5, 0.7]},
        'batch_size': {'value': 64},
        'epochs': {'value': 1}
    }
}

In [29]:
import wandb
wandb.login()

sweep_id = wandb.sweep(sweep_config, project="dakshina-seq2seq")
wandb.agent(sweep_id, function=run, count=1)

Create sweep with ID: q7u772x7
Sweep URL: https://wandb.ai/sai-sakunthala-indian-institute-of-technology-madras/dakshina-seq2seq/sweeps/q7u772x7


[34m[1mwandb[0m: Agent Starting Run: upniv94b with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	cell_type: gru
[34m[1mwandb[0m: 	device: cpu
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	embed_dim: 128
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	hidden_dim: 128
[34m[1mwandb[0m: 	layers: 1
[34m[1mwandb[0m: 	lr: 0.001
[34m[1mwandb[0m: 	max_len: 60
[34m[1mwandb[0m: 	teacher_forcing: 0.5


Epoch 1 | Train Loss: 1.969 Acc: 0.402 | Val Loss: 1.860 Acc: 0.412 CER: 1.521
Test Loss: 1.888 | Test Acc: 0.403 | Test CER: 1.684


0,1
epoch,▁
train_accuracy,▁
train_loss,▁
val_accuracy,▁
val_cer,▁
val_loss,▁

0,1
epoch,1.0
train_accuracy,0.40152
train_loss,1.9694
val_accuracy,0.41206
val_cer,1.52082
val_loss,1.86028


In [30]:
wandb.finish()