In [None]:

# Import torch and reload your PyTorch model
import torch


In [None]:
import os

dataset_root = "langData/dataset - Copy"   # adjust if needed

# Print every poet folder and what it contains. Entries are sorted so the
# printout is stable between runs, and anything in the root that is not a
# directory is skipped because os.listdir() on a plain file raises
# NotADirectoryError.
for poet in sorted(os.listdir(dataset_root)):
    poet_path = os.path.join(dataset_root, poet)
    if not os.path.isdir(poet_path):
        continue
    print("Poet folder:", poet)
    print("Contains:", os.listdir(poet_path))


In [None]:
# Peek into the first poet folder (alphabetically). Uses dataset_root instead of
# repeating the path literal, and ignores non-directory entries so a stray file
# in the root cannot be picked as the "poet".
poet = [
    name for name in sorted(os.listdir(dataset_root))
    if os.path.isdir(os.path.join(dataset_root, name))
][0]
print("UR files:", os.listdir(os.path.join(dataset_root, poet, "ur")))
print("EN files:", os.listdir(os.path.join(dataset_root, poet, "en")))


In [None]:
# Preview the collected sentence pairs (the original cell ran these three
# statements twice back to back; once is enough).
# NOTE(review): collect_pairs is defined in a later cell of this notebook, so on
# a fresh kernel this cell raises NameError unless that cell has been run first.
raw_pairs = collect_pairs(dataset_root)
print(f"Total pairs: {len(raw_pairs)}")
print("Example:", raw_pairs[:5])


In [None]:
# Cell 1: Imports (run once)
import os
import unicodedata
import re
from collections import defaultdict, Counter
import torch
from sklearn.model_selection import train_test_split
import pickle  # For saving

# Cell 2: Collect pairs
def collect_pairs(dataset_root):
    """Collect line-aligned (Urdu, English) text pairs from a per-poet dataset.

    Expected layout: ``<dataset_root>/<poet>/ur/<name>`` paired with
    ``<dataset_root>/<poet>/en/<name>`` (same file name on both sides), both
    UTF-8 encoded.

    Args:
        dataset_root: Path of the directory that holds one sub-directory per poet.

    Returns:
        A list of ``(ur_line, en_line)`` tuples with surrounding whitespace
        stripped. Pairs where either side is blank are dropped. The list is
        ordered by poet name, then file name, then line number, so the result
        does not depend on the order in which the filesystem lists entries.

    Files are skipped (with a printed message) when they cannot be read or when
    the two sides have a different number of lines.
    """
    pairs = []
    # Sorted listings keep the pair order reproducible; the later seeded
    # train/val/test split shuffles by position, so order matters.
    for poet in sorted(os.listdir(dataset_root)):
        poet_path = os.path.join(dataset_root, poet)
        ur_path = os.path.join(poet_path, 'ur')
        en_path = os.path.join(poet_path, 'en')

        # isdir (not exists): a plain file named 'ur'/'en', or a stray file in
        # the root, would otherwise make os.listdir raise.
        if not (os.path.isdir(ur_path) and os.path.isdir(en_path)):
            continue

        for filename in sorted(os.listdir(ur_path)):
            ur_file = os.path.join(ur_path, filename)
            en_file = os.path.join(en_path, filename)  # same name expected
            if not (os.path.isfile(ur_file) and os.path.isfile(en_file)):
                continue

            try:
                with open(ur_file, 'r', encoding='utf-8') as f_ur, \
                     open(en_file, 'r', encoding='utf-8') as f_en:
                    ur_lines = f_ur.readlines()
                    en_lines = f_en.readlines()
            except (OSError, UnicodeError) as e:
                # Best effort: report unreadable / badly encoded files and move on.
                print(f"Error in {filename}: {e}")
                continue

            if len(ur_lines) != len(en_lines):
                # Lines are paired by position, so a count mismatch means the
                # file cannot be aligned; say so instead of dropping it silently.
                print(f"Skipping {ur_file}: {len(ur_lines)} Urdu lines vs "
                      f"{len(en_lines)} English lines")
                continue

            for ur_line, en_line in zip(ur_lines, en_lines):
                ur_text, en_text = ur_line.strip(), en_line.strip()
                if ur_text and en_text:
                    pairs.append((ur_text, en_text))
    return pairs

# Usage: Replace with your path
# dataset_root is (re)assigned here to the same path the exploration cells use,
# so this cell does not depend on those cells having been run.
dataset_root = 'langData/dataset - Copy'  # Example
raw_pairs = collect_pairs(dataset_root)
print(f"Total pairs: {len(raw_pairs)}")  # Check ~thousands

# Cell 3: Clean text
def clean_text(text, is_urdu=True):
    """Normalise one line of text before tokenisation.

    The text is converted to Unicode NFC form, every run of whitespace is
    collapsed to a single space, and leading/trailing whitespace is removed.
    Punctuation is left untouched.

    Args:
        text: The raw line.
        is_urdu: When False the result is additionally lower-cased (used for
            the Roman-script side); Urdu-script text is returned as-is.

    Returns:
        The cleaned string.
    """
    normalized = unicodedata.normalize('NFC', text)
    collapsed = re.sub(r'\s+', ' ', normalized).strip()
    return collapsed if is_urdu else collapsed.lower()

# Clean both sides of every pair; only the second (Roman-script) side is lower-cased.
cleaned_pairs = [(clean_text(ur, is_urdu=True), clean_text(en, is_urdu=False)) for ur, en in raw_pairs]

# Cell 4: Splits
# Two-stage split: half of the pairs go to training, then the remaining half is
# divided evenly, i.e. 50% train / 25% validation / 25% test. random_state=42
# fixes the shuffle so the split is repeatable for a given input order.
train_pairs, val_test_pairs = train_test_split(cleaned_pairs, train_size=0.5, random_state=42)
val_pairs, test_pairs = train_test_split(val_test_pairs, test_size=0.5, random_state=42)
print(f"Train: {len(train_pairs)}, Val: {len(val_pairs)}, Test: {len(test_pairs)}")

# Cell 5: BPE from scratch (full impl)
class BPE:
    """Byte-level byte-pair-encoding tokenizer trained from scratch.

    Token ids 0-255 are the raw UTF-8 byte values. Each learned merge receives
    the next free id, starting at 256. No ids are reserved for padding or for
    start/end-of-sequence markers.

    Attributes:
        vocab_size: Upper bound on the vocabulary size (256 bytes + merges).
        merges: Maps a pair of token ids to the id of the merged token, in the
            order the merges were learned.
        vocab: Maps every token id to the byte string it stands for.
    """

    def __init__(self, vocab_size=10000):  # Adjust size
        self.vocab_size = vocab_size
        self.merges = {}
        self.vocab = {}

    def get_stats(self, byte_arr):
        """Count how often each adjacent pair of token ids occurs in ``byte_arr``."""
        count = defaultdict(int)
        # zip stops at the shorter argument, so this yields every adjacent pair.
        for pair in zip(byte_arr, byte_arr[1:]):
            count[pair] += 1
        return count

    def merge(self, text_bytes, pair, new_byte):
        """Return a copy of ``text_bytes`` with each occurrence of ``pair`` replaced by ``new_byte``.

        The scan is left to right and non-overlapping.
        """
        new_bytes = []
        i = 0
        last = len(text_bytes) - 1
        while i < len(text_bytes):
            if i < last and text_bytes[i] == pair[0] and text_bytes[i + 1] == pair[1]:
                new_bytes.append(new_byte)
                i += 2
            else:
                new_bytes.append(text_bytes[i])
                i += 1
        return new_bytes

    def train(self, corpus):
        """Learn merges from ``corpus`` (an iterable of strings).

        Any previously learned state is discarded, so calling ``train`` again
        retrains from scratch. Training stops early if no adjacent pair is left
        to merge, in which case the vocabulary is smaller than ``vocab_size``.
        """
        all_text = ' '.join(corpus)  # Concat for training
        text_bytes = list(all_text.encode('utf-8'))  # Byte list
        vocab = {i: bytes([i]) for i in range(256)}
        # Build into fresh locals: reusing self.merges would keep merges from
        # an earlier train() call whose ids collide with the new vocabulary.
        merges = {}
        num_merges = self.vocab_size - 256
        for i in range(num_merges):
            stats = self.get_stats(text_bytes)
            if not stats:
                break
            # Most frequent pair; ties go to the pair seen first.
            top_pair = max(stats, key=stats.get)
            new_idx = 256 + i
            text_bytes = self.merge(text_bytes, top_pair, new_idx)
            merges[top_pair] = new_idx
            vocab[new_idx] = vocab[top_pair[0]] + vocab[top_pair[1]]
        self.merges = merges
        self.vocab = vocab

    def encode(self, text):
        """Convert ``text`` to a list of token ids by replaying the learned merges."""
        text_bytes = list(text.encode('utf-8'))
        while len(text_bytes) >= 2:
            stats = self.get_stats(text_bytes)
            # Apply the earliest-learned merge that is present (lowest merged id).
            pair = min(stats, key=lambda p: self.merges.get(p, float('inf')))
            if pair not in self.merges:
                break  # nothing left that can be merged
            new_byte = self.merges[pair]
            text_bytes = self.merge(text_bytes, pair, new_byte)
        return text_bytes  # Token IDs

    def decode(self, tokens):
        """Convert token ids back to text.

        Ids missing from the vocabulary are skipped; byte sequences that are
        not valid UTF-8 are rendered with the Unicode replacement character.
        """
        text = b''.join(self.vocab[t] for t in tokens if t in self.vocab)
        return text.decode('utf-8', errors='replace')

# Train separate BPE for ur and en (on train only)
# Only the training split is used, so validation/test text does not influence
# the learned merges.
ur_corpus = [ur for ur, _ in train_pairs]
en_corpus = [en for _, en in train_pairs]

# Source-side tokenizer (first element of each pair).
ur_bpe = BPE(vocab_size=5000)  # Small for low-resource
ur_bpe.train(ur_corpus)

# Target-side tokenizer (second element of each pair), same vocabulary budget.
en_bpe = BPE(vocab_size=5000)
en_bpe.train(en_corpus)

# Tokenize datasets. NOTE: no SOS/EOS tokens are added here or in any later cell shown, so the decoder is seeded with the first real target token.
def tokenize_pairs(pairs, src_bpe, tgt_bpe):
    """Encode every (source, target) text pair into a pair of token-id lists.

    Args:
        pairs: Iterable of ``(source_text, target_text)`` tuples.
        src_bpe: Tokenizer whose ``encode`` is applied to the source text.
        tgt_bpe: Tokenizer whose ``encode`` is applied to the target text.

    Returns:
        A list of ``(source_ids, target_ids)`` tuples in the input order.
    """
    tokenized = []
    for src_text, tgt_text in pairs:
        src_ids = src_bpe.encode(src_text)
        tgt_ids = tgt_bpe.encode(tgt_text)
        tokenized.append((src_ids, tgt_ids))
    return tokenized

# Encode all three splits with the tokenizers trained on the training split.
train_data = tokenize_pairs(train_pairs, ur_bpe, en_bpe)
val_data = tokenize_pairs(val_pairs, ur_bpe, en_bpe)
test_data = tokenize_pairs(test_pairs, ur_bpe, en_bpe)

# Save
# Both tokenizers are written as one pickled tuple: (ur_bpe, en_bpe).
with open('bpe_models.pkl', 'wb') as f:
    pickle.dump((ur_bpe, en_bpe), f)

In [None]:
import math
import torch
import torch.nn as nn

class Encoder(nn.Module):
    """Bidirectional multi-layer LSTM encoder over embedded source tokens.

    Args:
        input_size: Source vocabulary size (number of embedding rows).
        embedding_dim: Size of each token embedding.
        hidden_size: LSTM hidden size per direction.
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout probability applied to the embeddings and, when there
            is more than one layer, between LSTM layers.
    """

    def __init__(self, input_size, embedding_dim, hidden_size, num_layers=2, dropout=0.1):
        super().__init__()
        self.num_layers = num_layers
        self.hidden_size = hidden_size  # per-direction hidden size
        self.embedding = nn.Embedding(input_size, embedding_dim, padding_idx=0)
        # nn.LSTM only applies dropout between stacked layers; with a single
        # layer a non-zero value has no effect and triggers a UserWarning, so
        # pass 0 in that case.
        self.lstm = nn.LSTM(
            embedding_dim, hidden_size, num_layers,
            bidirectional=True,
            dropout=dropout if num_layers > 1 else 0.0,
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        """Encode a batch of source sequences.

        Args:
            src: Token ids shaped ``[seq_len, batch]``.

        Returns:
            ``(outputs, hidden, cell)`` where ``outputs`` is
            ``[seq_len, batch, hidden_size * 2]`` and ``hidden`` / ``cell`` are
            ``[num_layers, batch, hidden_size * 2]`` with the forward and
            backward final states of each layer concatenated on the last axis.
        """
        batch = src.size(1)
        embedded = self.dropout(self.embedding(src))      # [seq_len, batch, emb]
        outputs, (hidden, cell) = self.lstm(embedded)
        # hidden / cell: [num_layers * 2, batch, hidden_size] (2 for directions)

        # Separate the layer and direction axes -> [num_layers, 2, batch, hidden_size]
        hidden = hidden.view(self.num_layers, 2, batch, self.hidden_size)
        cell = cell.view(self.num_layers, 2, batch, self.hidden_size)

        # Concat forward & backward for each layer -> [num_layers, batch, hidden_size*2]
        hidden = torch.cat((hidden[:, 0, :, :], hidden[:, 1, :, :]), dim=2)
        cell = torch.cat((cell[:, 0, :, :], cell[:, 1, :, :]), dim=2)

        # outputs stays [seq_len, batch, hidden_size*2] (bidirectional output)
        return outputs, hidden, cell


class Decoder(nn.Module):
    """Unidirectional multi-layer LSTM decoder that predicts one token per call.

    Args:
        output_size: Target vocabulary size (embedding rows and logits).
        embedding_dim: Size of each token embedding.
        hidden_size: Per-direction hidden size of the encoder; the decoder LSTM
            itself is ``hidden_size * 2`` wide (exposed as ``rnn_hidden``).
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout probability applied to the embeddings and, when there
            is more than one layer, between LSTM layers.
    """

    def __init__(self, output_size, embedding_dim, hidden_size, num_layers=4, dropout=0.1):
        super().__init__()
        # hidden_size here = per-direction encoder hidden_size; decoder's RNN hidden will be hidden_size*2
        self.num_layers = num_layers
        self.embedding = nn.Embedding(output_size, embedding_dim, padding_idx=0)
        self.rnn_hidden = hidden_size * 2
        # nn.LSTM only applies dropout between stacked layers; with a single
        # layer a non-zero value has no effect and triggers a UserWarning.
        self.lstm = nn.LSTM(
            embedding_dim, self.rnn_hidden, num_layers,
            dropout=dropout if num_layers > 1 else 0.0,
        )
        self.fc = nn.Linear(self.rnn_hidden, output_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, tgt_token, hidden, cell):
        """Run a single decoding step.

        Args:
            tgt_token: Current input token ids, shaped ``[batch]``.
            hidden: LSTM hidden state, ``[num_layers, batch, rnn_hidden]``.
            cell: LSTM cell state, same shape as ``hidden``.

        Returns:
            ``(prediction, hidden, cell)`` where ``prediction`` holds the
            unnormalised scores ``[batch, output_size]`` and ``hidden`` /
            ``cell`` are the updated states.
        """
        tgt = tgt_token.unsqueeze(0)                    # [1, batch]
        embedded = self.dropout(self.embedding(tgt))   # [1, batch, emb]
        output, (hidden, cell) = self.lstm(embedded, (hidden, cell))
        prediction = self.fc(output.squeeze(0))        # [batch, output_size]
        return prediction, hidden, cell


class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that hands the encoder's final state to the decoder.

    If the encoder and decoder differ in layer count or state width, two linear
    "bridge" layers project the encoder's hidden and cell states to the
    decoder's width. If only the layer counts differ, the states are also
    tiled along the layer axis and cut to the decoder's layer count.
    """

    def __init__(self, encoder: Encoder, decoder: Decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

        self.enc_layers = encoder.num_layers
        self.dec_layers = decoder.num_layers

        # Width of the encoder's concatenated (forward+backward) state vs. the
        # width the decoder LSTM expects.
        enc_state_dim = encoder.hidden_size * 2
        dec_state_dim = decoder.rnn_hidden
        same_layout = (self.enc_layers == self.dec_layers) and (enc_state_dim == dec_state_dim)
        if same_layout:
            self.hidden_bridge = None
            self.cell_bridge = None
        else:
            self.hidden_bridge = nn.Linear(enc_state_dim, dec_state_dim)
            self.cell_bridge = nn.Linear(enc_state_dim, dec_state_dim)

    def _fit_layers(self, state):
        """Tile ``state`` along the layer axis and cut it to ``dec_layers`` layers."""
        if self.enc_layers == self.dec_layers:
            return state
        copies = math.ceil(self.dec_layers / self.enc_layers)
        return state.repeat(copies, 1, 1)[:self.dec_layers]

    def forward(self, src, tgt, teacher_forcing_ratio=0.5):
        """Decode ``tgt`` step by step, conditioned on ``src``.

        Args:
            src: Source token ids, ``[src_len, batch]``.
            tgt: Target token ids, ``[tgt_len, batch]``; row 0 is the first
                decoder input.
            teacher_forcing_ratio: Per-step probability of feeding the ground
                truth token instead of the model's own greedy prediction.

        Returns:
            Scores shaped ``[tgt_len, batch, vocab]``. Row 0 is never written
            and stays all zeros.
        """
        tgt_len, batch_size = tgt.shape[0], tgt.shape[1]
        vocab_dim = self.decoder.fc.out_features
        outputs = torch.zeros(tgt_len, batch_size, vocab_dim, device=src.device)

        # Encoder final states: [enc_layers, batch, enc_hidden*2]
        _, hidden, cell = self.encoder(src)

        # Project the feature axis to the decoder's width when a bridge exists.
        if self.hidden_bridge is not None:
            hidden = self.hidden_bridge(hidden)
            cell = self.cell_bridge(cell)

        # Match the decoder's layer count -> [dec_layers, batch, dec_hidden]
        hidden = self._fit_layers(hidden)
        cell = self._fit_layers(cell)

        step_input = tgt[0, :]
        for t in range(1, tgt_len):
            step_scores, hidden, cell = self.decoder(step_input, hidden, cell)
            outputs[t] = step_scores
            # One random draw per time step decides teacher forcing for the whole batch.
            use_teacher = torch.rand(1).item() < teacher_forcing_ratio
            greedy = step_scores.argmax(1)
            step_input = tgt[t] if use_teacher else greedy

        return outputs


# New Section
Sanity Check

In [None]:
# Small shape check. The forward pass runs under torch.no_grad() because no
# gradients are needed here, and the expected output shape is asserted rather
# than only printed.
input_size = len(ur_bpe.vocab)
output_size = len(en_bpe.vocab)
enc = Encoder(input_size, embedding_dim=32, hidden_size=64, num_layers=2)
dec = Decoder(output_size, embedding_dim=32, hidden_size=64, num_layers=4)
model = Seq2Seq(enc, dec)
src = torch.randint(0, input_size, (10, 8))   # [src_len, batch=8]
tgt = torch.randint(0, output_size, (12, 8))  # [tgt_len, batch=8]
with torch.no_grad():
    out = model(src, tgt)                     # should run without hidden-size errors
print("out.shape:", out.shape)  # (tgt_len, batch, vocab)
assert out.shape == (12, 8, output_size), out.shape


In [None]:
# Cell: Data preparation
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

class TranslationDataset(Dataset):
    """Dataset over pre-tokenised ``(source_ids, target_ids)`` pairs.

    Args:
        data: Indexable collection of ``(source_ids, target_ids)`` tuples, each
            a sequence of integer token ids.
    """

    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return the pair at ``idx`` as two 1-D ``torch.long`` tensors (src, tgt)."""
        src_ids, tgt_ids = self.data[idx][0], self.data[idx][1]
        # Explicit dtype: torch.tensor([]) would otherwise default to float32
        # for an empty token list, which cannot index an embedding table.
        return (
            torch.tensor(src_ids, dtype=torch.long),
            torch.tensor(tgt_ids, dtype=torch.long),
        )

def collate_fn(batch):
    """Pad a list of ``(src, tgt)`` tensor pairs into two batch tensors.

    Sequences are right-padded with 0 to the longest sequence on each side.

    Returns:
        ``(src_batch, tgt_batch)``, each shaped ``[max_len, batch]``.
    """
    def pad_time_major(sequences):
        # batch_first=False keeps time on axis 0, as the model expects.
        return pad_sequence(sequences, batch_first=False, padding_value=0)

    sources, targets = zip(*batch)
    return pad_time_major(sources), pad_time_major(targets)

# Seed PyTorch's global RNG so weight initialisation, DataLoader shuffling and
# the teacher-forcing draws are repeatable, in line with the fixed
# random_state=42 used for the dataset split. (Some GPU kernels may still be
# non-deterministic.)
torch.manual_seed(42)

# Loaders (adjust batch_size later)
batch_size = 32
train_loader = DataLoader(TranslationDataset(train_data), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(TranslationDataset(val_data), batch_size=batch_size, collate_fn=collate_fn)
test_loader = DataLoader(TranslationDataset(test_data), batch_size=batch_size, collate_fn=collate_fn)

# Cell: Training
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Hypers (vary in experiments)
input_size = len(ur_bpe.vocab)  # Src vocab
output_size = len(en_bpe.vocab)  # Tgt vocab
embedding_dim = 256
hidden_size = 512  # per-direction encoder size; the decoder LSTM is twice as wide
encoder_layers = 2
decoder_layers = 4
dropout = 0.1
learning_rate = 1e-3
num_epochs = 20  # Adjust

encoder = Encoder(input_size, embedding_dim, hidden_size, encoder_layers, dropout)
decoder = Decoder(output_size, embedding_dim, hidden_size, decoder_layers, dropout)
model = Seq2Seq(encoder, decoder).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
# ignore_index=0 matches the padding value used by collate_fn, so padded
# target positions do not contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=0)  # Pad=0

def train(model, loader, optimizer, criterion, clip=1):
    """Run one optimisation pass over ``loader`` and return the mean batch loss.

    Batches are moved to the module-level ``device``. The model is called with
    its default teacher-forcing ratio, and gradients are clipped to a total
    norm of ``clip`` before each optimiser step.

    Args:
        model: Called as ``model(src, tgt)``; returns ``[tgt_len, batch, vocab]``.
        loader: Iterable of ``(src, tgt)`` batches, each ``[seq_len, batch]``.
        optimizer: Optimiser over ``model.parameters()``.
        criterion: Loss taking ``(scores [N, vocab], targets [N])``.
        clip: Maximum gradient norm.

    Returns:
        Sum of the per-batch losses divided by the number of batches.
    """
    model.train()
    running_loss = 0
    for src_batch, tgt_batch in loader:
        src_batch = src_batch.to(device)
        tgt_batch = tgt_batch.to(device)

        optimizer.zero_grad()
        scores = model(src_batch, tgt_batch)
        vocab_dim = scores.shape[-1]

        # Time step 0 is skipped on both sides, then time and batch are flattened.
        flat_scores = scores[1:].view(-1, vocab_dim)
        flat_targets = tgt_batch[1:].view(-1)

        batch_loss = criterion(flat_scores, flat_targets)
        batch_loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        running_loss += batch_loss.item()
    return running_loss / len(loader)

def evaluate(model, loader, criterion):
    """Return the mean batch loss of ``model`` over ``loader`` without updating it.

    The model is put in eval mode, gradients are disabled, and decoding uses a
    teacher-forcing ratio of 0 (the model is fed its own predictions). Batches
    are moved to the module-level ``device``.

    Args:
        model: Called as ``model(src, tgt, 0)``; returns ``[tgt_len, batch, vocab]``.
        loader: Iterable of ``(src, tgt)`` batches, each ``[seq_len, batch]``.
        criterion: Loss taking ``(scores [N, vocab], targets [N])``.

    Returns:
        Sum of the per-batch losses divided by the number of batches.
    """
    model.eval()
    running_loss = 0
    with torch.no_grad():
        for src_batch, tgt_batch in loader:
            src_batch = src_batch.to(device)
            tgt_batch = tgt_batch.to(device)

            scores = model(src_batch, tgt_batch, 0)  # No teacher forcing
            vocab_dim = scores.shape[-1]

            # Time step 0 is skipped on both sides, then time and batch are flattened.
            flat_scores = scores[1:].view(-1, vocab_dim)
            flat_targets = tgt_batch[1:].view(-1)

            running_loss += criterion(flat_scores, flat_targets).item()
    return running_loss / len(loader)

# Train loop
# One training pass and one validation pass per epoch; both losses are printed
# so over-fitting shows up as a widening gap between them.
for epoch in range(num_epochs):
    train_loss = train(model, train_loader, optimizer, criterion)
    val_loss = evaluate(model, val_loader, criterion)
    print(f'Epoch {epoch}: Train Loss {train_loss:.3f}, Val Loss {val_loss:.3f}')

# Save model
# Only the weights (state_dict) are stored; the Encoder/Decoder/Seq2Seq objects
# must be rebuilt with the same hyper-parameters before loading them back.
torch.save(model.state_dict(), 'model.pt')

In [None]:
# Cell: Metrics + Experiments
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import math
from jiwer import cer, wer  # CER and WER
import pandas as pd
import time

# NOTE(review): nothing visible in this notebook calls an NLTK tokenizer (BLEU is
# fed str.split() tokens), so this download may be unnecessary - confirm before
# removing it.
nltk.download('punkt')

def calculate_metrics(model, loader, tgt_bpe):
    """Score ``model`` on ``loader`` with BLEU, perplexity, CER and WER.

    Decoding uses a teacher-forcing ratio of 0. The function reads the
    module-level ``device`` and ``criterion``.

    For every sentence, time step 0 of the model output is dropped (the
    Seq2Seq forward pass never writes to it) so that predictions line up with
    ``tgt[1:]``. Positions where the reference is padding (id 0) are removed
    from both reference and hypothesis before decoding, so padding does not
    leak into the compared texts. Sentences whose reference is empty after
    this step are skipped.

    Args:
        model: Called as ``model(src, tgt, 0)``; returns ``[tgt_len, batch, vocab]``.
        loader: Iterable of ``(src, tgt)`` batches, each ``[seq_len, batch]``.
        tgt_bpe: Target tokenizer providing ``decode(list_of_ids) -> str``.

    Returns:
        Dict with keys ``"BLEU"``, ``"Perplexity"``, ``"CER"`` and ``"WER"``,
        each the mean of the per-sentence values.

    Raises:
        ValueError: If the loader yields no sentence with a non-empty reference.
    """
    pad_id = 0  # padding value used when batching; also the loss's ignored index
    smoothing = SmoothingFunction().method1

    model.eval()
    bleus, perplexities, cers, wers = [], [], [], []
    with torch.no_grad():
        for src, tgt in loader:
            src, tgt = src.to(device), tgt.to(device)
            output = model(src, tgt, 0)
            for i in range(src.shape[1]):
                ref_ids = tgt[1:, i]
                step_scores = output[1:, i, :]   # aligned with ref_ids

                keep = ref_ids != pad_id
                if not bool(keep.any()):
                    # Nothing to score; the loss over zero targets would be NaN.
                    continue

                pred = step_scores.argmax(1)[keep].cpu().tolist()
                ref = ref_ids[keep].cpu().tolist()

                hyp_text = tgt_bpe.decode(pred)
                ref_text = tgt_bpe.decode(ref)
                if not ref_text.strip():
                    continue

                bleus.append(sentence_bleu([ref_text.split()], hyp_text.split(),
                                           smoothing_function=smoothing))

                # Perplexity: exp of this sentence's mean loss (padding ignored
                # by the criterion).
                loss = criterion(step_scores, ref_ids)
                perplexities.append(math.exp(loss.item()))

                # CER / WER
                if hyp_text.strip():
                    cers.append(cer(ref_text, hyp_text))
                    wers.append(wer(ref_text, hyp_text))
                else:
                    # Empty hypothesis: every reference unit is a deletion.
                    cers.append(1.0)
                    wers.append(1.0)

    if not bleus:
        raise ValueError("calculate_metrics: loader produced no sentence with a non-empty reference")

    count = len(bleus)
    return {
        "BLEU": sum(bleus) / count,
        "Perplexity": sum(perplexities) / count,
        "CER": sum(cers) / count,
        "WER": sum(wers) / count,
    }


def run_experiment(embedding_dim, hidden_size, encoder_layers=2, decoder_layers=4,
                   dropout=0.1, lr=1e-3, batch_size=32, exp_name=""):
    """Train a fresh model with the given hyper-parameters and evaluate it on the test split.

    Reads the module-level ``train_data`` / ``val_data`` / ``test_data``,
    ``input_size`` / ``output_size``, ``num_epochs``, ``criterion``, ``device``
    and ``en_bpe``. The module-level ``train_loader``, ``val_loader`` and
    ``test_loader`` are rebound to loaders that use ``batch_size``.

    Args:
        embedding_dim: Embedding size for encoder and decoder.
        hidden_size: Per-direction encoder hidden size.
        encoder_layers: Number of encoder LSTM layers.
        decoder_layers: Number of decoder LSTM layers.
        dropout: Dropout probability for both networks.
        lr: Adam learning rate.
        batch_size: Batch size for all three loaders.
        exp_name: Label used in the per-epoch progress line.

    Returns:
        ``(metrics, history)``: ``metrics`` is the dict from
        ``calculate_metrics`` plus a ``"Test Loss"`` entry; ``history`` holds
        the per-epoch ``"train_loss"`` and ``"val_loss"`` lists.
    """
    global train_loader, val_loader, test_loader

    def make_loader(data, shuffle=False):
        return DataLoader(TranslationDataset(data), batch_size=batch_size,
                          shuffle=shuffle, collate_fn=collate_fn)

    # Re-create loaders for this experiment's batch size (only training is shuffled).
    train_loader = make_loader(train_data, shuffle=True)
    val_loader = make_loader(val_data)
    test_loader = make_loader(test_data)

    # Build model
    encoder = Encoder(input_size, embedding_dim, hidden_size, encoder_layers, dropout).to(device)
    decoder = Decoder(output_size, embedding_dim, hidden_size, decoder_layers, dropout).to(device)
    model = Seq2Seq(encoder, decoder).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    # Train
    train_losses, val_losses = [], []
    for epoch in range(num_epochs):
        train_loss = train(model, train_loader, optimizer, criterion)
        val_loss = evaluate(model, val_loader, criterion)
        train_losses.append(train_loss)
        val_losses.append(val_loss)
        print(f"[{exp_name}] Epoch {epoch}: Train {train_loss:.3f}, Val {val_loss:.3f}")
    history = {"train_loss": train_losses, "val_loss": val_losses}

    # Final eval
    test_loss = evaluate(model, test_loader, criterion)
    metrics = calculate_metrics(model, test_loader, en_bpe)
    metrics["Test Loss"] = test_loss
    return metrics, history


def run_all_experiments():
    """Run the five predefined configurations and tabulate their test metrics.

    Each configuration is passed to ``run_experiment``. Its metrics dict is
    extended with the configuration name and the wall-clock time of the run.

    Returns:
        A pandas DataFrame with one row per experiment.
    """
    experiments = [
        {"name": "Small baseline", "embedding_dim": 128, "hidden_size": 256,
         "encoder_layers": 1, "decoder_layers": 2, "dropout": 0.1, "lr": 1e-3, "batch_size": 32},
        {"name": "Medium balanced", "embedding_dim": 256, "hidden_size": 512,
         "encoder_layers": 2, "decoder_layers": 3, "dropout": 0.3, "lr": 5e-4, "batch_size": 64},
        {"name": "Large model", "embedding_dim": 512, "hidden_size": 512,
         "encoder_layers": 4, "decoder_layers": 4, "dropout": 0.3, "lr": 5e-4, "batch_size": 64},
        {"name": "High dropout", "embedding_dim": 256, "hidden_size": 256,
         "encoder_layers": 2, "decoder_layers": 2, "dropout": 0.5, "lr": 1e-3, "batch_size": 32},
        {"name": "Stable slow", "embedding_dim": 256, "hidden_size": 512,
         "encoder_layers": 3, "decoder_layers": 4, "dropout": 0.3, "lr": 1e-4, "batch_size": 128},
    ]

    banner = "="*70
    results = []
    for i, config in enumerate(experiments, 1):
        print("\n" + banner)
        print(f"Running Experiment {i}/{len(experiments)}: {config['name']}")
        print(config)
        print(banner)

        # Every key except "name" matches a run_experiment parameter name.
        hyperparams = {key: value for key, value in config.items() if key != "name"}

        started = time.time()
        metrics, history = run_experiment(exp_name=config["name"], **hyperparams)
        elapsed = time.time() - started

        metrics["Name"] = config["name"]
        metrics["Time (s)"] = elapsed
        results.append(metrics)

    df = pd.DataFrame(results)
    print("\n=== Final Results ===")
    print(df)
    return df


In [None]:
# Runs every predefined configuration (each trains for num_epochs epochs) and
# keeps the resulting metrics table.
df_results = run_all_experiments()
