In [1]:
import torch
print("PyTorch:", torch.__version__)
print("CUDA available?", torch.cuda.is_available())


PyTorch: 2.8.0+cu126
CUDA available? True


In [2]:
!rm -rf /content/drive
from google.colab import drive
drive.mount('/content/drive', force_remount=True)


Mounted at /content/drive


In [3]:
from pathlib import Path
PROJECT = Path("/content/drive/MyDrive/25F-7801")
print(PROJECT)


/content/drive/MyDrive/25F-7801


In [4]:
from pathlib import Path

subdirs = [
    "data/raw", "data/processed",
    "src/bpe", "src/model",
    "checkpoints", "results", "docs"
]

for sd in subdirs:
    (PROJECT/sd).mkdir(parents=True, exist_ok=True)

print("Folders created under:", PROJECT)


Folders created under: /content/drive/MyDrive/25F-7801


In [5]:
from pathlib import Path

for path in PROJECT.rglob("*"):
    if path.is_dir():
        level = len(path.relative_to(PROJECT).parts)
        print("  " * level + path.name + "/")


  data/
  src/
  checkpoints/
  results/
  docs/
    raw/
    processed/
    bpe/
    model/


In [6]:
import os
os.chdir(PROJECT)
!pwd
!ls -la


/content/drive/MyDrive/25F-7801
total 20
drwx------ 2 root root 4096 Sep 21 11:45 checkpoints
drwx------ 4 root root 4096 Sep 21 11:45 data
drwx------ 2 root root 4096 Sep 21 11:45 docs
drwx------ 2 root root 4096 Sep 21 11:45 results
drwx------ 4 root root 4096 Sep 21 11:45 src


In [7]:
!git clone https://github.com/amir9ume/urdu_ghazals_rekhta.git


Cloning into 'urdu_ghazals_rekhta'...
remote: Enumerating objects: 112, done.[K
remote: Counting objects: 100% (9/9), done.[K
remote: Compressing objects: 100% (3/3), done.[K
remote: Total 112 (delta 7), reused 6 (delta 6), pack-reused 103 (from 1)[K
Receiving objects: 100% (112/112), 2.03 MiB | 19.64 MiB/s, done.
Resolving deltas: 100% (51/51), done.


In [8]:
!ls -R urdu_ghazals_rekhta | head -50


urdu_ghazals_rekhta:
dataset
LICENSE
README.md
rekhta_parser.ipynb
sample_dataset

urdu_ghazals_rekhta/dataset:
dataset.zip
README.md

urdu_ghazals_rekhta/sample_dataset:
english_transliteration
hi-hazaaron-khvaahishen-aisii-ki-har-khvaahish-pe-dam-nikle-mirza-ghalib-ghazals
ur-hazaaron-khvaahishen-aisii-ki-har-khvaahish-pe-dam-nikle-mirza-ghalib-ghazals


In [10]:
# GitHub se dataset clone
!rm -rf /content/urdu_ghazals_rekhta
!git clone --depth 1 https://github.com/amir9ume/urdu_ghazals_rekhta /content/urdu_ghazals_rekhta

# raw mein copy
import shutil, os
shutil.rmtree(os.path.join(PROJECT, "data/raw"), ignore_errors=True)
shutil.copytree("/content/urdu_ghazals_rekhta", os.path.join(PROJECT, "data/raw"))
print("Copied to:", os.path.join(PROJECT, "data/raw"))


Cloning into '/content/urdu_ghazals_rekhta'...
remote: Enumerating objects: 12, done.[K
remote: Counting objects: 100% (12/12), done.[K
remote: Compressing objects: 100% (12/12), done.[K
remote: Total 12 (delta 0), reused 8 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (12/12), 1.94 MiB | 5.29 MiB/s, done.
Copied to: /content/drive/MyDrive/25F-7801/data/raw


In [14]:
!unzip -q /content/drive/MyDrive/25F-7801/data/raw/dataset/dataset.zip -d /content/drive/MyDrive/25F-7801/data/raw/dataset/


In [15]:
!ls -R /content/drive/MyDrive/25F-7801/data/raw/dataset| head -50


/content/drive/MyDrive/25F-7801/data/raw/dataset:
dataset
dataset.zip
__MACOSX
README.md

/content/drive/MyDrive/25F-7801/data/raw/dataset/dataset:
ahmad-faraz
akbar-allahabadi
allama-iqbal
altaf-hussain-hali
ameer-khusrau
bahadur-shah-zafar
dagh-dehlvi
fahmida-riaz
faiz-ahmad-faiz
firaq-gorakhpuri
gulzar
habib-jalib
jaan-nisar-akhtar
jaun-eliya
javed-akhtar
jigar-moradabadi
kaifi-azmi
meer-anees
meer-taqi-meer
mirza-ghalib
mohsin-naqvi
naji-shakir
naseer-turabi
nazm-tabatabai
nida-fazli
noon-meem-rashid
parveen-shakir
sahir-ludhianvi
wali-mohammad-wali
waseem-barelvi

/content/drive/MyDrive/25F-7801/data/raw/dataset/dataset/ahmad-faraz:
en
hi
ur

/content/drive/MyDrive/25F-7801/data/raw/dataset/dataset/ahmad-faraz/en:
aankh-se-duur-na-ho-dil-se-utar-jaaegaa-ahmad-faraz-ghazals
aashiqii-men-miir-jaise-khvaab-mat-dekhaa-karo-ahmad-faraz-ghazals
ab-aur-kyaa-kisii-se-maraasim-badhaaen-ham-ahmad-faraz-ghazals
abhii-kuchh-aur-karishme-gazal-ke-dekhte-hain-ahmad-faraz-ghazals
ab-ke-ham-bichh

In [16]:

!ls -R /content/drive/MyDrive/25F-7801/data/raw/dataset/dataset/ahmad-faraz | head -50


/content/drive/MyDrive/25F-7801/data/raw/dataset/dataset/ahmad-faraz:
en
hi
ur

/content/drive/MyDrive/25F-7801/data/raw/dataset/dataset/ahmad-faraz/en:
aankh-se-duur-na-ho-dil-se-utar-jaaegaa-ahmad-faraz-ghazals
aashiqii-men-miir-jaise-khvaab-mat-dekhaa-karo-ahmad-faraz-ghazals
ab-aur-kyaa-kisii-se-maraasim-badhaaen-ham-ahmad-faraz-ghazals
abhii-kuchh-aur-karishme-gazal-ke-dekhte-hain-ahmad-faraz-ghazals
ab-ke-ham-bichhde-to-shaayad-kabhii-khvaabon-men-milen-ahmad-faraz-ghazals
ab-ke-tajdiid-e-vafaa-kaa-nahiin-imkaan-jaanaan-ahmad-faraz-ghazals
ab-kyaa-sochen-kyaa-haalaat-the-kis-kaaran-ye-zahr-piyaa-hai-ahmad-faraz-ghazals
ab-shauq-se-ki-jaan-se-guzar-jaanaa-chaahiye-ahmad-faraz-ghazals
agarche-zor-havaaon-ne-daal-rakkhaa-hai-ahmad-faraz-ghazals
aisaa-hai-ki-sab-khvaab-musalsal-nahiin-hote-ahmad-faraz-ghazals
aise-chup-hain-ki-ye-manzil-bhii-kadii-ho-jaise-ahmad-faraz-ghazals
ajab-junuun-e-masaafat-men-ghar-se-niklaa-thaa-ahmad-faraz-ghazals
avval-avval-kii-dostii-hai-abhii-ahmad-far

In [None]:
!pwd

/content/drive/MyDrive/25F-7801


In [18]:
# ===== preprocessing.py =====
from pathlib import Path
import re, unicodedata, random, shutil
from collections import Counter
import json

# ---- Project Paths ----
PROJECT = Path("/content/drive/MyDrive/25F-7801")
RAW_BASE = PROJECT/"data/raw"
RAW_ROOT = RAW_BASE/"dataset/dataset"
OUT = PROJECT/"data/processed"
OUT.mkdir(parents=True, exist_ok=True)

# ---- Config ----
# If True, isolated roman izāfa tokens like "e" or "i" will be removed from roman text.
# Set to False to keep them (safer).
REMOVE_IZAF = False

# ---- Normalization ----
_tatweel = re.compile(r"[ـ]+")
_zw_chars = "[\u200C\u200D]"
_ur_space = re.compile(r"\s+")

# hyphen variants and splitting regex (used consistently everywhere)
_hyphen_variants = "[-‐-–—]+"   # ASCII hyphen + some unicode variants
_hyphen_re = re.compile(_hyphen_variants)
_split_on_hyphen_or_space = re.compile(r"\s+|"+_hyphen_variants)

URDU_MAP = {
    "\u064A": "\u06CC",  # ARABIC YEH → FARSI YEH
    "\u0643": "\u06A9",  # ARABIC KAF → KEHEH
}

def normalize_urdu(s: str) -> str:
    if not s:
        return s
    s = unicodedata.normalize("NFKC", s)
    s = "".join(URDU_MAP.get(ch, ch) for ch in s)
    s = _tatweel.sub("", s)
    s = re.sub(_zw_chars, "", s)
    s = _ur_space.sub(" ", s.strip())
    return s

def normalize_roman(s: str) -> str:
    if not s:
        return s
    s = s.strip().lower()
    # convert hyphens to spaces so "karam-e-be" -> "karam e be"
    s = _hyphen_re.sub(" ", s)
    # optionally remove isolated izāfa tokens like 'e' or 'i'
    if REMOVE_IZAF:
        s = re.sub(r'\b(e|i)\b', ' ', s)
    # collapse whitespace
    s = re.sub(r"\s+", " ", s).strip()
    return s

# ---- BPE Core ----
SPACE = "▁"
SPECIALS = ["<pad>", "<bos>", "<eos>", "<unk>"]
PAD,BOS,EOS,UNK = range(4)

def _words(lines, is_urdu=False):
    """
    Yield token sequences for BPE learning.
    Each yielded seq is: [SPACE] + list(characters of token)
    Tokens are split on spaces or hyphens (consistent across pipeline).
    """
    for ln in lines:
        ln = normalize_urdu(ln) if is_urdu else normalize_roman(ln)
        if not ln:
            continue
        # split on whitespace or hyphen variants
        for w in _split_on_hyphen_or_space.split(ln):
            if not w:
                continue
            yield [SPACE] + list(w)

def _count_pairs(seqs):
    pairs = Counter()
    for seq in seqs:
        for i in range(len(seq)-1):
            pairs[(seq[i], seq[i+1])] += 1
    return pairs

def _merge_seq(seq, pair):
    a,b = pair
    out=[]; i=0
    while i < len(seq):
        if i+1 < len(seq) and seq[i]==a and seq[i+1]==b:
            out.append(a+b); i+=2
        else:
            out.append(seq[i]); i+=1
    return out
#voc size 8000
def learn_bpe(train_lines, vocab_size=200, min_count=2, is_urdu=False, verbose_every=100):
    seqs = list(_words(train_lines, is_urdu=is_urdu))
    symbols = Counter()
    for s in seqs: symbols.update(s)
    itos = SPECIALS + sorted(symbols.keys())
    stoi = {t:i for i,t in enumerate(itos)}
    merges = []
    step=0
    while len(itos) < vocab_size:
        pairs = _count_pairs(seqs)
        if not pairs: break
        (a,b), c = pairs.most_common(1)[0]
        if c < min_count: break
        seqs = [_merge_seq(s, (a,b)) for s in seqs]
        tok = a+b
        if tok not in stoi:
            stoi[tok] = len(itos); itos.append(tok)
        merges.append((a,b))
        step+=1
        if verbose_every and step%verbose_every==0:
            print(f"[BPE] step={step} merge=({a},{b}) freq={c} vocab={len(itos)}")
    print(f"[BPE] Done. merges={len(merges)} vocab={len(itos)}")
    return {"itos": itos, "merges": merges, "vocab_size": len(itos)}

def apply_bpe(line, model, add_bos_eos=True, is_urdu=False, debug=False):
    merges = model["merges"]
    itos = model["itos"]
    stoi = {t:i for i,t in enumerate(itos)}
    line = normalize_urdu(line) if is_urdu else normalize_roman(line)
    # split tokens consistently
    words = [] if not line else [w for w in _split_on_hyphen_or_space.split(line) if w]
    pieces=[]
    if words:
        for w in words:
            seq = [SPACE] + list(w)
            for a,b in merges:
                seq = _merge_seq(seq, (a,b))
            if debug: print(f"[ENC] word={w} -> pieces={seq}")
            pieces.extend(seq)
    ids = [stoi.get(p, UNK) for p in pieces]
    if add_bos_eos: ids = [BOS] + ids + [EOS]
    if debug: print(f"[ENC] ids={ids}")
    return ids

def save_bpe(model, path):
    with open(path,"w",encoding="utf-8") as f:
        json.dump(model,f,ensure_ascii=False)
def load_bpe(path):
    with open(path,encoding="utf-8") as f:
        return json.load(f)

# ---- Preview helper ----
def preview_word_pairs(u_line, r_line, max_pairs=10):
    u_line_n = normalize_urdu(u_line)
    r_line_n = normalize_roman(r_line)
    u_words = [w for w in _split_on_hyphen_or_space.split(u_line_n) if w]
    r_words = [w for w in _split_on_hyphen_or_space.split(r_line_n) if w]
    if not u_words or not r_words:
        return
    if abs(len(u_words)-len(r_words))>2:
        return
    m = min(len(u_words), len(r_words), max_pairs)
    print("[PAIR preview]")
    for i in range(m):
        print(f'  "{u_words[i]}" -> {r_words[i]}')

# ---- Collect Urdu–Roman line pairs ----
pairs = []
poets = sorted([p for p in RAW_ROOT.iterdir() if p.is_dir()])
print("Poets found:", len(poets))

for poet in poets:
    ur_dir = poet/"ur"
    en_dir = poet/"en"
    if not ur_dir.exists() or not en_dir.exists():
        continue
    ur_files = {p.name:p for p in ur_dir.iterdir() if p.is_file()}
    en_files = {p.name:p for p in en_dir.iterdir() if p.is_file()}
    shared = sorted(set(ur_files) & set(en_files))
    for name in shared:
        ur_text = ur_files[name].read_text(encoding="utf-8",errors="ignore")
        en_text = en_files[name].read_text(encoding="utf-8",errors="ignore")
        ur_lines = [normalize_urdu(x) for x in ur_text.splitlines() if x.strip()]
        en_lines = [normalize_roman(x) for x in en_text.splitlines() if x.strip()]
        m = min(len(ur_lines), len(en_lines))
        for i in range(m):
            u, r = ur_lines[i], en_lines[i]
            if 2 <= len(u.split()) <= 60 and 1 <= len(r.split()) <= 70:
                pairs.append((u,r))
                # preview some random pairs
                if random.random() < 0.0005:
                    preview_word_pairs(u,r)

print("Total line pairs collected:", len(pairs))
assert pairs, "No pairs collected!"

# ---- Train/Val/Test Split ----
random.seed(42); random.shuffle(pairs)
n=len(pairs); n_train=int(0.5*n); n_val=int(0.25*n)
train=pairs[:n_train]; val=pairs[n_train:n_train+n_val]; test=pairs[n_train+n_val:]

def dump(split,name):
    (OUT/f"{name}.src").write_text("\n".join(u for u,_ in split)+"\n", encoding="utf-8")
    (OUT/f"{name}.tgt").write_text("\n".join(r for _,r in split)+"\n", encoding="utf-8")

dump(train,"train"); dump(val,"val"); dump(test,"test")
print("Saved splits:", OUT, "sizes:", len(train), len(val), len(test))

# ---- Optional: train BPE on both sides ----
urdu_lines=[u for u,_ in train]
roman_lines=[r for _,r in train]
#voc size 2000
ur_model=learn_bpe(urdu_lines, vocab_size=200, is_urdu=True, verbose_every=200)
ro_model=learn_bpe(roman_lines, vocab_size=200, is_urdu=False, verbose_every=200)
save_bpe(ur_model, OUT/"bpe_ur.json")
save_bpe(ro_model, OUT/"bpe_ro.json")

# ---- Quick Debug ----
sample_u,sample_r = pairs[0]
print("\nSample Encoding:")
print("Urdu:", sample_u)
apply_bpe(sample_u, ur_model, is_urdu=True, debug=True)
print("Roman:", sample_r)
apply_bpe(sample_r, ro_model, is_urdu=False, debug=True)
preview_word_pairs(sample_u,sample_r)


Poets found: 30
[PAIR preview]
  "پھر" -> phir
  "کسی" -> kisī
  "کام" -> kaam
  "کا" -> kā
  "باقی" -> baaqī
  "نہیں" -> nahīñ
  "رہتا" -> rahtā
  "انساں" -> insāñ
[PAIR preview]
  "جو" -> jo
  "ذکر" -> zikr
  "آتا" -> aatā
  "ہے" -> hai
  "آخرت" -> āḳhirat
  "کا" -> kā
  "تو" -> to
  "آپ" -> aap
  "ہوتے" -> hote
  "ہیں" -> haiñ
[PAIR preview]
  "وہ" -> vo
  "کون" -> kaun
  "تھا" -> thā
  "کہ" -> ki
  "تمہیں" -> tumheñ
  "جس" -> jis
  "نے" -> ne
  "بے" -> bevafā
  "وفا" -> jaanā
[PAIR preview]
  "اسی" -> isī
  "سے" -> se
  "تو" -> to
  "سر" -> sar
  "آنکھوں" -> āñkhoñ
  "پر" -> par
  "مرا" -> mirā
  "دیوان" -> dīvān
  "لیتے" -> lete
  "ہیں" -> haiñ
[PAIR preview]
  "شام" -> shaam
  "سے" -> se
  "آنکھ" -> aañkh
  "میں" -> meñ
  "نمی" -> namī
  "سی" -> sī
  "ہے" -> hai
[PAIR preview]
  "ایذا" -> īzā
  "دہی" -> dahī
  "کی" -> kī
  "داد" -> daad
  "جو" -> jo
  "پاتا" -> paatā
  "رہا" -> rahā
  "ہوں" -> huuñ
  "میں" -> maiñ
[PAIR preview]
  "آتی" -> aatī
  "ہے" -> hai
  "ہم" -> ham
  "کو" 

In [19]:

# ---- Encode splits into ID files ----
DATA = OUT  # same as OUT
src_model = load_bpe(DATA/"bpe_ur.json")   # Urdu model
tgt_model = load_bpe(DATA/"bpe_ro.json")   # Roman model

def encode_file_to_ids(in_path, out_path, model, is_urdu=False):
    lines = Path(in_path).read_text(encoding="utf-8").splitlines()
    with open(out_path, "w", encoding="utf-8") as f:
        for ln in lines:
            ids = apply_bpe(ln, model, add_bos_eos=True, is_urdu=is_urdu)
            f.write(" ".join(map(str, ids)) + "\n")

# Urdu source side
encode_file_to_ids(DATA/"train.src", DATA/"train.src.ids", src_model, is_urdu=True)
encode_file_to_ids(DATA/"val.src",   DATA/"val.src.ids",   src_model, is_urdu=True)
encode_file_to_ids(DATA/"test.src",  DATA/"test.src.ids",  src_model, is_urdu=True)

# Roman target side
encode_file_to_ids(DATA/"train.tgt", DATA/"train.tgt.ids", tgt_model, is_urdu=False)
encode_file_to_ids(DATA/"val.tgt",   DATA/"val.tgt.ids",   tgt_model, is_urdu=False)
encode_file_to_ids(DATA/"test.tgt",  DATA/"test.tgt.ids",  tgt_model, is_urdu=False)

# Quick check
!sed -n '1,3p' "{DATA}/train.src.ids"; echo "-----"; sed -n '1,3p' "{DATA}/train.tgt.ids"


1 126 53 27 7 154 78 68 64 33 13 27 13 96 74 32 13 106 81 67 183 60 2
1 70 13 96 61 32 48 84 137 67 14 81 64 115 54 76 18 35 75 69 68 2
1 59 22 20 121 105 13 17 23 72 64 36 59 36 63 60 86 54 135 25 75 35 20 62 13 110 102 2
-----
1 171 13 8 16 33 5 79 21 71 52 49 8 24 35 33 35 50 75 8 13 35 108 74 57 114 15 61 2
1 60 35 50 54 13 14 90 183 57 87 74 160 17 113 137 20 92 59 52 2
1 47 48 11 138 92 17 16 33 88 49 123 77 21 46 16 146 151 12 172 97 122 20 186 51 35 29 48 105 2


In [20]:


# ===== BiLSTM Encoder (2 layers) + LSTM Decoder (4 layers) =====
import torch
import torch.nn as nn
import json
from pathlib import Path

# --- specials (same as preprocessing) ---
PAD, BOS, EOS, UNK = 0, 1, 2, 3

# --- Encoder: 2-layer BiLSTM ---
class Encoder(nn.Module):
    def __init__(self, vocab_size, emb_dim=256, hid=512, n_layers=2, dropout=0.3):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, emb_dim, padding_idx=PAD)
        # BiLSTM -> hidden per direction = hid//2, concat => hid
        self.rnn = nn.LSTM(
            emb_dim, hid // 2, num_layers=n_layers, batch_first=True,
            dropout=dropout if n_layers > 1 else 0.0, bidirectional=True
        )

    def forward(self, x):
        emb = self.emb(x)                  # (B, T, E)
        out, (h, c) = self.rnn(emb)       # out: (B, T, H)  h/c: (2*n_layers, B, H/2)
        return out, (h, c)

# --- Decoder: 4-layer LSTM ---
class Decoder(nn.Module):
    def __init__(self, vocab_size, emb_dim=256, hid=512, n_layers=4, dropout=0.3):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, emb_dim, padding_idx=PAD)
        self.rnn = nn.LSTM(
            emb_dim, hid, num_layers=n_layers, batch_first=True,
            dropout=dropout if n_layers > 1 else 0.0
        )
        self.fc = nn.Linear(hid, vocab_size)

    def forward(self, y_in, state):
        emb = self.emb(y_in)               # (B, T_in, E)
        out, state = self.rnn(emb, state)  # (B, T_in, H)
        logits = self.fc(out)              # (B, T_in, V)
        return logits, state

# --- Seq2Seq wrapper + bridge (encoder -> decoder states) ---
class Seq2Seq(nn.Module):
    def __init__(self, src_vocab, tgt_vocab, emb=256, hid=512, enc_layers=2, dec_layers=4, dropout=0.3):
        super().__init__()
        self.encoder = Encoder(src_vocab, emb, hid, enc_layers, dropout)
        self.decoder = Decoder(tgt_vocab, emb, hid, dec_layers, dropout)
        self.bridge_h = nn.Linear(hid, hid)
        self.bridge_c = nn.Linear(hid, hid)

    def init_dec_state(self, h, c):
        # take last forward/backward enc states and concat
        H_enc = torch.cat([h[-2], h[-1]], dim=-1)  # (B, H)
        C_enc = torch.cat([c[-2], c[-1]], dim=-1)  # (B, H)
        dec_h0 = self.bridge_h(H_enc).unsqueeze(0).repeat(self.decoder.rnn.num_layers, 1, 1)
        dec_c0 = self.bridge_c(C_enc).unsqueeze(0).repeat(self.decoder.rnn.num_layers, 1, 1)
        return dec_h0, dec_c0

    def forward(self, src, tgt_in):
        enc_out, (h, c) = self.encoder(src)
        dec_state = self.init_dec_state(h, c)
        logits, _ = self.decoder(tgt_in, dec_state)   # (B, T_in, V)
        return logits


# ===== Instantiate with your actual vocab sizes =====
PROJECT = Path("/content/drive/MyDrive/25F-7801")
BPE_DIR = PROJECT / "data/processed"

src_vocab = len(json.load(open(BPE_DIR/"bpe_ur.json", encoding="utf-8"))["itos"])
tgt_vocab = len(json.load(open(BPE_DIR/"bpe_ro.json", encoding="utf-8"))["itos"])

model = Seq2Seq(
    src_vocab=src_vocab, tgt_vocab=tgt_vocab,
    emb=256, hid=512, enc_layers=2, dec_layers=4, dropout=0.3
)

# quick shape test
B, Tsrc, Ttgt = 4, 20, 18
x = torch.randint(low=0, high=src_vocab, size=(B, Tsrc))
y_in = torch.randint(low=0, high=tgt_vocab, size=(B, Ttgt))
with torch.no_grad():
    out = model(x, y_in)
print("OK → logits shape:", tuple(out.shape))



OK → logits shape: (4, 18, 200)


In [21]:


# ===== Dataset + Dataloader for NMT =====
import torch
from torch.utils.data import Dataset, DataLoader
from pathlib import Path

PAD, BOS, EOS, UNK = 0, 1, 2, 3  # same specials as preprocessing + model

# --- helper to read .ids files (already BPE-encoded) ---
def read_ids(path):
    with open(path, encoding="utf-8") as f:
        return [list(map(int, ln.strip().split())) for ln in f if ln.strip()]

# --- Dataset class ---
class NMTDataset(Dataset):
    def __init__(self, src_ids_path, tgt_ids_path, max_len=128):
        self.src = read_ids(src_ids_path)
        self.tgt = read_ids(tgt_ids_path)
        assert len(self.src) == len(self.tgt), "Mismatch between src and tgt lines!"
        self.max_len = max_len

    def __len__(self):
        return len(self.src)

    def __getitem__(self, idx):
        s = self.src[idx][:self.max_len]
        t = self.tgt[idx][:self.max_len]
        return torch.tensor(s, dtype=torch.long), torch.tensor(t, dtype=torch.long)

# --- Collate fn (for padding in batch) ---
def collate(batch):
    srcs, tgts = zip(*batch)
    max_s = max(len(x) for x in srcs)
    max_t = max(len(x) for x in tgts)
    src_pad = torch.full((len(batch), max_s), PAD, dtype=torch.long)
    tgt_pad = torch.full((len(batch), max_t), PAD, dtype=torch.long)
    for i, (s, t) in enumerate(zip(srcs, tgts)):
        src_pad[i, :len(s)] = s
        tgt_pad[i, :len(t)] = t
    return src_pad, tgt_pad

# ===== Load train/val/test sets =====
PROJECT = Path("/content/drive/MyDrive/25F-7801")
DATA = PROJECT / "data/processed"

train_set = NMTDataset(DATA/"train.src.ids", DATA/"train.tgt.ids")
val_set   = NMTDataset(DATA/"val.src.ids", DATA/"val.tgt.ids")
test_set  = NMTDataset(DATA/"test.src.ids", DATA/"test.tgt.ids")

train_loader = DataLoader(train_set, batch_size=64, shuffle=True, collate_fn=collate)
val_loader   = DataLoader(val_set, batch_size=64, shuffle=False, collate_fn=collate)
test_loader  = DataLoader(test_set, batch_size=64, shuffle=False, collate_fn=collate)

# quick check
for src, tgt in train_loader:
    print("src batch:", src.shape)
    print("tgt batch:", tgt.shape)
    break


src batch: torch.Size([64, 30])
tgt batch: torch.Size([64, 33])


In [58]:
import torch.nn as nn, math, json
from pathlib import Path

# === Vocab Sizes ===
with open(DATA / "bpe_ur.json", encoding="utf-8") as f:
    src_vocab = len(json.load(f)["itos"])
with open(DATA / "bpe_ro.json", encoding="utf-8") as f:
    tgt_vocab = len(json.load(f)["itos"])

# === Device ===
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)

# === Model ===
model = model.to(device)

# === Loss & Optimizer ===
criterion = nn.CrossEntropyLoss(ignore_index=PAD)
optim = torch.optim.Adam(model.parameters(), lr=5e-4)



# === Training / Validation Loop ===
def run_epoch(loader, train=True):
    model.train(mode=train)
    total, steps = 0.0, 0

    for src, tgt in loader:
        src, tgt = src.to(device), tgt.to(device)

        # Teacher forcing
        inp  = tgt[:, :-1]  # decoder input
        gold = tgt[:, 1:].contiguous().view(-1)  # expected output

        logits = model(src, inp).contiguous().view(-1, tgt_vocab)
        loss = criterion(logits, gold)

        if train:
            optim.zero_grad()
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optim.step()

        total += loss.item()
        steps += 1

    return total / max(steps, 1)

# === Checkpoint Saving ===
best_path = PROJECT / "checkpoints" / "best.pt"
best_path.parent.mkdir(parents=True, exist_ok=True)

best_val = float("inf")
EPOCHS = 15

for ep in range(1, EPOCHS + 1):
    tr = run_epoch(train_loader, train=True)
    vl = run_epoch(val_loader,   train=False)
    ppl = math.exp(min(vl, 20))  # clamp to avoid overflow

    print(f"Epoch {ep:02d} | train_loss {tr:.3f} | val_loss {vl:.3f} | val_ppl {ppl:.2f}")

    if vl < best_val:
        best_val = vl
        torch.save(
            {
                "model": model.state_dict(),
                "src_vocab": src_vocab,
                "tgt_vocab": tgt_vocab,
            },
            best_path,
        )
        print("  ↳ saved:", best_path)


Using device: cuda
Epoch 01 | train_loss 0.406 | val_loss 1.277 | val_ppl 3.59
  ↳ saved: /content/drive/MyDrive/25F-7801/checkpoints/best.pt
Epoch 02 | train_loss 0.365 | val_loss 1.290 | val_ppl 3.63
Epoch 03 | train_loss 0.333 | val_loss 1.294 | val_ppl 3.65
Epoch 04 | train_loss 0.307 | val_loss 1.311 | val_ppl 3.71
Epoch 05 | train_loss 0.285 | val_loss 1.313 | val_ppl 3.72
Epoch 06 | train_loss 0.263 | val_loss 1.323 | val_ppl 3.75
Epoch 07 | train_loss 0.245 | val_loss 1.315 | val_ppl 3.72
Epoch 08 | train_loss 0.226 | val_loss 1.337 | val_ppl 3.81
Epoch 09 | train_loss 0.209 | val_loss 1.351 | val_ppl 3.86
Epoch 10 | train_loss 0.199 | val_loss 1.351 | val_ppl 3.86
Epoch 11 | train_loss 0.182 | val_loss 1.369 | val_ppl 3.93
Epoch 12 | train_loss 0.170 | val_loss 1.368 | val_ppl 3.93
Epoch 13 | train_loss 0.159 | val_loss 1.369 | val_ppl 3.93
Epoch 14 | train_loss 0.147 | val_loss 1.370 | val_ppl 3.94
Epoch 15 | train_loss 0.144 | val_loss 1.383 | val_ppl 3.99


In [24]:
# ====== RUN EXPERIMENTS (no evaluation) ======
import math, time, json, torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from pathlib import Path
import pandas as pd

# --- paths ---
PROJECT  = Path("/content/drive/MyDrive/25F-7801")
DATA     = PROJECT/"data/processed"
CKPT_DIR = PROJECT/"checkpoints"; CKPT_DIR.mkdir(parents=True, exist_ok=True)
RES_DIR  = PROJECT/"results";     RES_DIR.mkdir(parents=True, exist_ok=True)

# --- quick checks ---
assert "Seq2Seq" in globals(), "Seq2Seq not defined — re-run your model-architecture cell."
for p in ["train.src.ids","train.tgt.ids","val.src.ids","val.tgt.ids"]:
    assert (DATA/p).exists(), f"Missing {p} — re-run BPE encode step."

PAD, BOS, EOS, UNK = 0, 1, 2, 3
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", device)

# --- vocab sizes ---
src_vocab = len(json.load(open(DATA/"bpe_ur.json", encoding="utf-8"))["itos"])
tgt_vocab = len(json.load(open(DATA/"bpe_ro.json", encoding="utf-8"))["itos"])

# --- dataset/dataloader helpers ---
def read_ids(path):
    with open(path, encoding="utf-8") as f:
        return [list(map(int, ln.strip().split())) for ln in f if ln.strip()]

class NMTDataset(Dataset):
    def __init__(self, src_path, tgt_path, max_len=128):
        self.src = read_ids(src_path); self.tgt = read_ids(tgt_path)
        assert len(self.src)==len(self.tgt)
        self.max_len = max_len
    def __len__(self): return len(self.src)
    def __getitem__(self, i):
        s = self.src[i][:self.max_len]; t = self.tgt[i][:self.max_len]
        return torch.tensor(s), torch.tensor(t)

def collate(batch):
    srcs, tgts = zip(*batch)
    max_s=max(len(x) for x in srcs); max_t=max(len(x) for x in tgts)
    S=torch.full((len(batch),max_s), PAD, dtype=torch.long)
    T=torch.full((len(batch),max_t), PAD, dtype=torch.long)
    for i,(s,t) in enumerate(zip(srcs,tgts)):
        S[i,:len(s)]=s; T[i,:len(t)]=t
    return S,T

# --- one experiment: train & return metrics ---
def run_experiment(tag, emb, hid, enc_layers, dec_layers, dropout, lr, batch, epochs=8, max_len=128):
    train_ds=NMTDataset(DATA/"train.src.ids", DATA/"train.tgt.ids", max_len=max_len)
    val_ds  =NMTDataset(DATA/"val.src.ids",   DATA/"val.tgt.ids",   max_len=max_len)
    train_ld=DataLoader(train_ds, batch_size=batch, shuffle=True,  collate_fn=collate)
    val_ld  =DataLoader(val_ds,   batch_size=batch, shuffle=False, collate_fn=collate)

    model = Seq2Seq(src_vocab, tgt_vocab, emb=emb, hid=hid,
                    enc_layers=enc_layers, dec_layers=dec_layers, dropout=dropout).to(device)
    crit  = nn.CrossEntropyLoss(ignore_index=PAD)
    optim = torch.optim.AdamW(model.parameters(), lr=lr)

    def run_epoch(loader, train=True):
        model.train(mode=train); total=0.0; steps=0
        for src, tgt in loader:
            src, tgt = src.to(device), tgt.to(device)
            inp  = tgt[:, :-1]
            gold = tgt[:, 1:].contiguous().view(-1)
            logits = model(src, inp).contiguous().view(-1, tgt_vocab)
            loss = crit(logits, gold)
            if train:
                optim.zero_grad(); loss.backward()
                nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                optim.step()
            total += float(loss); steps += 1
        return total/max(steps,1)

    best_val=float("inf"); t0=time.time()
    for ep in range(1, epochs+1):
        tr = run_epoch(train_ld, True)
        vl = run_epoch(val_ld,   False)
        ppl = math.exp(min(vl, 20))
        print(f"[{tag}] Ep {ep:02d} | train {tr:.3f} | val {vl:.3f} | ppl {ppl:.2f}")
        if vl < best_val:
            best_val = vl
            torch.save({"model": model.state_dict(),
                        "src_vocab": src_vocab,
                        "tgt_vocab": tgt_vocab}, CKPT_DIR/f"best_{tag}.pt")

    dur = round(time.time()-t0,1)
    return dict(tag=tag, emb=emb, hid=hid, enc_layers=enc_layers, dec_layers=dec_layers,
                dropout=dropout, lr=lr, batch=batch, epochs=epochs, max_len=max_len,
                val_loss=round(best_val,4), val_ppl=round(math.exp(min(best_val,20)),2),
                time_s=dur, ckpt=str(CKPT_DIR/f"best_{tag}.pt"))

# --- define experiments ---
configs = [
    dict(tag="A_emb128_hid256_do0.1", emb=128, hid=256, enc_layers=2, dec_layers=4, dropout=0.1, lr=1e-3, batch=32, epochs=10),
    dict(tag="B_emb256_hid512_do0.3", emb=256, hid=512, enc_layers=2, dec_layers=4, dropout=0.3, lr=5e-4, batch=64, epochs=10),
    dict(tag="C_emb512_hid512_do0.5", emb=512, hid=512, enc_layers=2, dec_layers=4, dropout=0.5, lr=1e-4, batch=64, epochs=10),
]

# --- run all and save CSV ---
rows=[]
for cfg in configs:
    print("\n====== Running:", cfg["tag"], "======")
    rows.append(run_experiment(**cfg))

df = pd.DataFrame(rows)
display(df)
out_csv = RES_DIR/"experiments_val_only.csv"
df.to_csv(out_csv, index=False)
print("Saved:", out_csv)


Device: cuda



Consider using tensor.detach() first. (Triggered internally at /pytorch/torch/csrc/autograd/generated/python_variable_methods.cpp:835.)
  total += float(loss); steps += 1


[A_emb128_hid256_do0.1] Ep 01 | train 4.762 | val 4.440 | ppl 84.73
[A_emb128_hid256_do0.1] Ep 02 | train 4.042 | val 3.735 | ppl 41.90
[A_emb128_hid256_do0.1] Ep 03 | train 3.523 | val 3.332 | ppl 27.99
[A_emb128_hid256_do0.1] Ep 04 | train 3.174 | val 3.045 | ppl 21.00
[A_emb128_hid256_do0.1] Ep 05 | train 2.893 | val 2.816 | ppl 16.70
[A_emb128_hid256_do0.1] Ep 06 | train 2.669 | val 2.659 | ppl 14.28
[A_emb128_hid256_do0.1] Ep 07 | train 2.488 | val 2.514 | ppl 12.35
[A_emb128_hid256_do0.1] Ep 08 | train 2.335 | val 2.413 | ppl 11.17
[A_emb128_hid256_do0.1] Ep 09 | train 2.197 | val 2.329 | ppl 10.27
[A_emb128_hid256_do0.1] Ep 10 | train 2.071 | val 2.245 | ppl 9.44

[B_emb256_hid512_do0.3] Ep 01 | train 4.706 | val 4.488 | ppl 88.91
[B_emb256_hid512_do0.3] Ep 02 | train 4.064 | val 3.608 | ppl 36.89
[B_emb256_hid512_do0.3] Ep 03 | train 3.252 | val 3.082 | ppl 21.81
[B_emb256_hid512_do0.3] Ep 04 | train 2.823 | val 2.693 | ppl 14.78
[B_emb256_hid512_do0.3] Ep 05 | train 2.439 | va

Unnamed: 0,tag,emb,hid,enc_layers,dec_layers,dropout,lr,batch,epochs,max_len,val_loss,val_ppl,time_s,ckpt
0,A_emb128_hid256_do0.1,128,256,2,4,0.1,0.001,32,10,128,2.2446,9.44,54.4,/content/drive/MyDrive/25F-7801/checkpoints/be...
1,B_emb256_hid512_do0.3,256,512,2,4,0.3,0.0005,64,10,128,1.6657,5.29,102.3,/content/drive/MyDrive/25F-7801/checkpoints/be...
2,C_emb512_hid512_do0.5,512,512,2,4,0.5,0.0001,64,10,128,3.3916,29.71,110.9,/content/drive/MyDrive/25F-7801/checkpoints/be...


Saved: /content/drive/MyDrive/25F-7801/results/experiments_val_only.csv


In [25]:
!pip -q install sacrebleu python-Levenshtein
import torch, json, math, sacrebleu, Levenshtein
from pathlib import Path
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn

# --- paths ---
PROJECT = Path("/content/drive/MyDrive/25F-7801")
DATA    = PROJECT/"data/processed"
CKPT    = PROJECT/"checkpoints/best.pt"

# --- specials ---
PAD, BOS, EOS, UNK = 0,1,2,3
device = "cuda" if torch.cuda.is_available() else "cpu"

# --- model class must exist ---
assert "Seq2Seq" in globals(), "Seq2Seq class missing — re-run the model-architecture cell."

# --- vocab sizes (directly from processed folder) ---
src_vocab = len(json.load(open(DATA/"bpe_ur.json", encoding="utf-8"))["itos"])
tgt_vocab = len(json.load(open(DATA/"bpe_ro.json", encoding="utf-8"))["itos"])

# --- dataset & collate ---
def read_ids(path):
    with open(path, encoding="utf-8") as f:
        return [list(map(int, ln.strip().split())) for ln in f if ln.strip()]

class NMTDataset(Dataset):
    def __init__(self, src_path, tgt_path, max_len=128):
        self.src = read_ids(src_path); self.tgt = read_ids(tgt_path)
        assert len(self.src)==len(self.tgt)
        self.max_len = max_len
    def __len__(self): return len(self.src)
    def __getitem__(self, i):
        s = self.src[i][:self.max_len]; t = self.tgt[i][:self.max_len]
        return torch.tensor(s), torch.tensor(t)

def collate(batch):
    srcs, tgts = zip(*batch)
    max_s=max(len(x) for x in srcs); max_t=max(len(x) for x in tgts)
    S=torch.full((len(batch),max_s), PAD, dtype=torch.long)
    T=torch.full((len(batch),max_t), PAD, dtype=torch.long)
    for i,(s,t) in enumerate(zip(srcs,tgts)): S[i,:len(s)]=s; T[i,:len(t)]=t
    return S,T

# --- load test data ---
test_ds = NMTDataset(DATA/"test.src.ids", DATA/"test.tgt.ids", max_len=128)
test_ld = DataLoader(test_ds, batch_size=64, shuffle=False, collate_fn=collate)

# --- build model & load checkpoint ---
# ⚠️ adapt emb/hid/layers to match the training config you used
model = Seq2Seq(src_vocab, tgt_vocab, emb=256, hid=512,
                enc_layers=2, dec_layers=4, dropout=0.3).to(device)

ckpt = torch.load(CKPT, map_location=device)
model.load_state_dict(ckpt["model"])
model.eval()

# --- detokenize (target side) ---
tgt_itos = json.load(open(DATA/"bpe_ro.json", encoding="utf-8"))["itos"]
def ids_to_text(ids):
    toks = [tgt_itos[i] if i < len(tgt_itos) else "<unk>" for i in ids]
    toks = [t for t in toks if t not in ["<pad>","<bos>","<eos>","<unk>"]]
    s=""
    for t in toks:
        s += " "+t[1:] if t.startswith("▁") else t
    return s.strip()

# --- greedy decoding ---
def greedy_decode(src_ids, max_len=128):
    src = torch.tensor([src_ids], dtype=torch.long, device=device)
    with torch.no_grad():
        enc_out,(h,c)=model.encoder(src)
        H=torch.cat([h[-2],h[-1]],dim=-1); C=torch.cat([c[-2],c[-1]],dim=-1)
        dh=model.bridge_h(H).unsqueeze(0).repeat(model.decoder.rnn.num_layers,1,1)
        dc=model.bridge_c(C).unsqueeze(0).repeat(model.decoder.rnn.num_layers,1,1)
        y=torch.tensor([[BOS]], dtype=torch.long, device=device)
        out=[]
        for _ in range(max_len):
            logits,(dh,dc)=model.decoder(y,(dh,dc))
            nid=logits[:,-1,:].argmax(dim=-1).item()
            if nid==EOS: break
            out.append(nid)
            y=torch.cat([y, torch.tensor([[nid]], device=device)], dim=1)
    return out

# --- test perplexity ---
criterion = nn.CrossEntropyLoss(ignore_index=PAD)
def test_loss_perplexity():
    model.eval()
    total, steps = 0.0, 0
    with torch.no_grad():
        for src, tgt in test_ld:
            src, tgt = src.to(device), tgt.to(device)
            inp  = tgt[:, :-1]
            gold = tgt[:, 1:].contiguous().view(-1)
            logits = model(src, inp).contiguous().view(-1, tgt_vocab)
            loss = criterion(logits, gold)
            total += float(loss); steps += 1
    avg = total / max(steps,1)
    ppl = math.exp(min(avg, 20))
    return avg, ppl

# --- full evaluation ---
test_src_ids = read_ids(DATA/"test.src.ids")
test_tgt_ids = read_ids(DATA/"test.tgt.ids")

hyps, refs = [], []
for s_ids, t_ids in zip(test_src_ids, test_tgt_ids):
    hyp_ids = greedy_decode(s_ids)
    hyps.append(ids_to_text(hyp_ids))
    refs.append(ids_to_text([i for i in t_ids if i not in (PAD,BOS,EOS)]))

bleu = sacrebleu.corpus_bleu(hyps, [refs]).score

cer_sum, chars = 0, 0
for h, r in zip(hyps, refs):
    cer_sum += Levenshtein.distance(h, r)
    chars   += max(len(r), 1)
cer = 100.0 * cer_sum / max(chars,1)

tst_loss, tst_ppl = test_loss_perplexity()
print(f"TEST — BLEU: {bleu:.2f} | CER: {cer:.2f}% | Loss: {tst_loss:.3f} | PPL: {tst_ppl:.2f}")

# --- sample outputs ---
print("\nQualitative examples:")
for i in range(min(5, len(hyps))):
    print(f"[{i+1}]")
    print("PRED:", hyps[i])
    print("REF :", refs[i])
    print("---")


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m159.9/159.9 kB[0m [31m16.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m75.7 MB/s[0m eta [36m0:00:00[0m
[?25hTEST — BLEU: 0.49 | CER: 73.14% | Loss: 1.299 | PPL: 3.67

Qualitative examples:
[1]
PRED: maiñ hī nahīñ sar
REF : maiñ akelā hī nahīñ barbād sab
---
[2]
PRED: baddā e tujh
REF : ba.ad muddat ke ye ai 'dāġh' samajh meñ aayā
---
[3]
PRED: tujh o vālā
REF : tam.īz e lāla o gul se hai nāla e bulbul
---
[4]
PRED: mohte ye zarā
REF : muñh khole haiñ ye zaḳhm jo bismil ke chaar pāñch
---
[5]
PRED: barbā.eñ
REF : barbād e mohabbat kī duā saath liye jā
---


StreamLit

In [52]:
from pathlib import Path

PROJECT = Path("/content/drive/MyDrive/25F-7801")
checks = {
    "best_ckpt" : PROJECT/"checkpoints"/"best.pt",
    "bpe_ur"    : PROJECT/"data"/"processed"/"bpe_ur.json",
    "bpe_ro"    : PROJECT/"data"/"processed"/"bpe_ro.json",
    "train_ids" : PROJECT/"data"/"processed"/"train.src.ids"
}

for name, p in checks.items():
    print(f"{name:10s} => {p}  exists? {p.exists()}")


best_ckpt  => /content/drive/MyDrive/25F-7801/checkpoints/best.pt  exists? True
bpe_ur     => /content/drive/MyDrive/25F-7801/data/processed/bpe_ur.json  exists? True
bpe_ro     => /content/drive/MyDrive/25F-7801/data/processed/bpe_ro.json  exists? True
train_ids  => /content/drive/MyDrive/25F-7801/data/processed/train.src.ids  exists? True


In [53]:
import shutil
from pathlib import Path

PROJECT = Path("/content/drive/MyDrive/25F-7801")
DEPLOY = Path("/content/deploy_app")

# remove any old deploy dir and create fresh layout
if DEPLOY.exists():
    shutil.rmtree(DEPLOY)
(DEPLOY/"data"/"processed").mkdir(parents=True, exist_ok=True)
(DEPLOY/"checkpoints").mkdir(parents=True, exist_ok=True)

# source files (from your project)
src_bpe_ur = PROJECT/"data"/"processed"/"bpe_ur.json"
src_bpe_ro = PROJECT/"data"/"processed"/"bpe_ro.json"
best_ckpt   = PROJECT/"checkpoints"/"best.pt"

# copy into deploy folder
shutil.copy2(src_bpe_ur, DEPLOY/"data"/"processed"/"bpe_ur.json")
shutil.copy2(src_bpe_ro, DEPLOY/"data"/"processed"/"bpe_ro.json")
shutil.copy2(best_ckpt,  DEPLOY/"checkpoints"/"best.pt")

# list files in deploy tree
print("Files in /content/deploy_app/:")
for p in sorted(DEPLOY.rglob("*")):
    print(" ", p.relative_to(DEPLOY))


Files in /content/deploy_app/:
  checkpoints
  checkpoints/best.pt
  data
  data/processed
  data/processed/bpe_ro.json
  data/processed/bpe_ur.json


In [54]:
req = """\
streamlit
torch
# optional for eval if needed:
# sacrebleu
# python-Levenshtein
"""
open("/content/deploy_app/requirements.txt","w",encoding="utf-8").write(req)
print("Wrote requirements.txt")


Wrote requirements.txt


In [55]:
from pathlib import Path
DEPLOY = Path("/content/deploy_app")
APP_PY = DEPLOY/"app.py"

app_code = r'''\
import json, re, torch
import torch.nn as nn
import streamlit as st
from pathlib import Path

# ---------- Paths ----------
ROOT = Path(".")
BPE_DIR = ROOT/"data"/"processed"
CKPT    = ROOT/"checkpoints"/"best.pt"

PAD, BOS, EOS, UNK = 0,1,2,3
SPACE = "▁"
SPECIALS = {"<pad>", "<bos>", "<eos>", "<unk>"}

# ---------- BPE utils ----------
def _merge_seq(seq, pair):
    a,b = pair; out=[]; i=0
    while i < len(seq):
        if i+1 < len(seq) and seq[i]==a and seq[i+1]==b:
            out.append(a+b); i+=2
        else:
            out.append(seq[i]); i+=1
    return out

def load_bpe(path):
    with open(path, encoding="utf-8") as f:
        return json.load(f)

def apply_bpe(line, model, add_bos_eos=True):
    # simple bpe applicator consistent with your preprocessing (lowercase + split on spaces)
    merges = model["merges"]
    itos   = model["itos"]
    stoi   = {t:i for i,t in enumerate(itos)}
    line = re.sub(r"\s+"," ", line.strip().lower())
    pieces=[]
    if line:
        for w in line.split(" "):
            seq=[SPACE]+list(w)
            for a,b in merges:
                seq=_merge_seq(seq,(a,b))
            pieces.extend(seq)
    ids=[stoi.get(p, UNK) for p in pieces]
    return [BOS]+ids+[EOS] if add_bos_eos else ids

def ids_to_text(ids, itos):
    toks=[itos[i] if 0<=i<len(itos) else "<unk>" for i in ids]
    toks=[t for t in toks if t not in SPECIALS]
    s=""
    for t in toks:
        s+=(" "+t[1:] if t.startswith(SPACE) else t)
    return s.strip()

# ---------- Model (same architecture you used) ----------
class Encoder(nn.Module):
    def __init__(self, vocab_size, emb=256, hid=512, layers=2, dropout=0.3):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, emb, padding_idx=PAD)
        self.rnn = nn.LSTM(emb, hid//2, num_layers=layers, batch_first=True,
                           dropout=dropout if layers>1 else 0.0, bidirectional=True)
    def forward(self, x):
        emb = self.emb(x)
        out, (h,c) = self.rnn(emb)
        return out, (h,c)

class Decoder(nn.Module):
    def __init__(self, vocab_size, emb=256, hid=512, layers=4, dropout=0.3):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, emb, padding_idx=PAD)
        self.rnn = nn.LSTM(emb, hid, num_layers=layers, batch_first=True,
                           dropout=dropout if layers>1 else 0.0)
        self.fc = nn.Linear(hid, vocab_size)
    def forward(self, y, state):
        emb = self.emb(y)
        out, state = self.rnn(emb, state)
        return self.fc(out), state

class Seq2Seq(nn.Module):
    def __init__(self, src_vocab, tgt_vocab, emb=256, hid=512, enc_layers=2, dec_layers=4, dropout=0.3):
        super().__init__()
        self.encoder = Encoder(src_vocab, emb, hid, enc_layers, dropout)
        self.decoder = Decoder(tgt_vocab, emb, hid, dec_layers, dropout)
        self.bridge_h = nn.Linear(hid, hid)
        self.bridge_c = nn.Linear(hid, hid)

    def init_dec(self, h, c):
        H = torch.cat([h[-2], h[-1]], dim=-1)
        C = torch.cat([c[-2], c[-1]], dim=-1)
        dh = self.bridge_h(H).unsqueeze(0).repeat(self.decoder.rnn.num_layers, 1, 1)
        dc = self.bridge_c(C).unsqueeze(0).repeat(self.decoder.rnn.num_layers, 1, 1)
        return dh, dc

    @torch.no_grad()
    def greedy(self, src_ids, max_len=128, device="cpu"):
        src = torch.tensor([src_ids], dtype=torch.long, device=device)
        enc_out, (h,c) = self.encoder(src)
        dh, dc = self.init_dec(h, c)
        y = torch.tensor([[BOS]], dtype=torch.long, device=device)
        out = []
        for _ in range(max_len):
            logits, (dh, dc) = self.decoder(y, (dh, dc))
            nid = logits[:, -1, :].argmax(dim=-1).item()
            if nid == EOS:
                break
            out.append(nid)
            y = torch.cat([y, torch.tensor([[nid]], device=device)], dim=1)
        return out

# ---------- Load artifacts ----------
device = "cuda" if torch.cuda.is_available() else "cpu"

src_bpe = load_bpe(BPE_DIR/"bpe_ur.json")
tgt_bpe = load_bpe(BPE_DIR/"bpe_ro.json")
tgt_itos = tgt_bpe["itos"]

# try to read src/tgt sizes from checkpoint if present, else from bpe files
state = torch.load(CKPT, map_location=device)
src_vocab = state.get("src_vocab", len(src_bpe["itos"])) if isinstance(state, dict) else len(src_bpe["itos"])
tgt_vocab = state.get("tgt_vocab", len(tgt_bpe["itos"])) if isinstance(state, dict) else len(tgt_bpe["itos"])

model = Seq2Seq(src_vocab, tgt_vocab).to(device)
# load state dict — supports the format {"model": state_dict} used earlier
if isinstance(state, dict) and "model" in state:
    model.load_state_dict(state["model"])
else:
    model.load_state_dict(state)

model.eval()

# ---------- UI ----------
st.set_page_config(page_title="Urdu → Roman Urdu", page_icon="📝")
st.title("Urdu → Roman Urdu Translator")
st.caption("BiLSTM Encoder–Decoder with BPE (from scratch) · PyTorch")

inp = st.text_area("Urdu input:", height=140, value="میں تم سے محبت کرتا ہوں")
max_len = st.slider("Max output length", 32, 256, 128, step=16)

if st.button("Translate"):
    if not inp.strip():
        st.warning("Please enter Urdu text.")
    else:
        try:
            src_ids = apply_bpe(inp, src_bpe, add_bos_eos=True)
            pred_ids = model.greedy(src_ids, max_len=max_len, device=device)
            out = ids_to_text(pred_ids, tgt_itos)
            st.subheader("Roman Urdu")
            st.write(out if out else "(empty)")
        except Exception as e:
            st.error("Error during translation: " + str(e))
'''

APP_PY.write_text(app_code, encoding="utf-8")
print("Wrote", APP_PY)


Wrote /content/deploy_app/app.py


In [56]:
# quick local check (does not start streamlit)
import json, torch
from pathlib import Path

DEPLOY = Path("/content/deploy_app")
DATA = DEPLOY/"data"/"processed"
CKPT  = DEPLOY/"checkpoints"/"best.pt"

print("Files in deploy:", list(DATA.iterdir()), CKPT.exists())

# load BPEs and model (same logic as app)
src_bpe = json.load(open(DATA/"bpe_ur.json", encoding="utf-8"))
tgt_bpe = json.load(open(DATA/"bpe_ro.json", encoding="utf-8"))
print("BPE sizes:", len(src_bpe["itos"]), len(tgt_bpe["itos"]))

# try to instantiate model and load state
import importlib.util, sys
# We will run a tiny snippet similar to app.py (reuse the Seq2Seq class by importing the app file is messy),
# so just attempt loading the checkpoint dict to ensure it's readable:
ck = torch.load(CKPT, map_location="cpu")
print("Checkpoint keys:", list(ck.keys()) if isinstance(ck, dict) else "raw state")


Files in deploy: [PosixPath('/content/deploy_app/data/processed/bpe_ur.json'), PosixPath('/content/deploy_app/data/processed/bpe_ro.json')] True
BPE sizes: 200 200
Checkpoint keys: ['model', 'src_vocab', 'tgt_vocab']


In [57]:
import shutil
from pathlib import Path

base_dir = Path("/content")
deploy_dir = base_dir / "deploy_app"
zip_path = base_dir / "app_bundle.zip"

# remove old if exists
if zip_path.exists():
    zip_path.unlink()

# create archive
shutil.make_archive(base_name=str(zip_path.with_suffix('')), format="zip",
                    root_dir=deploy_dir.parent, base_dir=deploy_dir.name)

print("✅ Bundle created at:", zip_path)


✅ Bundle created at: /content/app_bundle.zip


In [33]:
!pip install pyngrok


Collecting pyngrok
  Downloading pyngrok-7.3.0-py3-none-any.whl.metadata (8.1 kB)
Downloading pyngrok-7.3.0-py3-none-any.whl (25 kB)
Installing collected packages: pyngrok
Successfully installed pyngrok-7.3.0


In [39]:
!ngrok config add-authtoken 330zWQS47ftvB2Rr0gmHOYCsNGu_2uV8bk1gEgfkWaeLLHUzQ

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [40]:
from pyngrok import ngrok

public_url = ngrok.connect(8501)
print("🔗 Public URL:", public_url)


🔗 Public URL: NgrokTunnel: "https://0f5edb5b75be.ngrok-free.app" -> "http://localhost:8501"


In [41]:
!streamlit run app.py --server.port 8501 &>/dev/null&


In [43]:
from pyngrok import ngrok

# 1) Disconnect any existing tunnels
for t in ngrok.get_tunnels():
    try:
        ngrok.disconnect(t.public_url)
    except Exception:
        pass

# 2) Kill the ngrok agent to reset session
ngrok.kill()




In [46]:
from pyngrok import ngrok

# Start tunnel to the running service on 8501
public_url = ngrok.connect(addr="http://127.0.0.1:8501", bind_tls=True)
print("Public URL:", public_url)


Public URL: NgrokTunnel: "https://8ae55b3697ae.ngrok-free.app" -> "http://127.0.0.1:8501"


In [None]:
%%writefile /content/deploy_app/requirements.txt
streamlit
torch
torchvision
torchaudio
numpy


Writing /content/deploy_app/requirements.txt
