# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate every file below the mounted input directory and echo its full path.
input_files = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import pandas as pd
import re


In [None]:
import os
# Lazily enumerate every file below the mounted input directory and echo its full path.
input_files = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)


In [None]:
# Read both competition splits and report their (rows, columns) shapes.
train = pd.read_csv("/kaggle/input/deep-past-initiative-machine-translation/train.csv")
test  = pd.read_csv("/kaggle/input/deep-past-initiative-machine-translation/test.csv")

for label, frame in (("Train:", train), ("Test :", test)):
    print(label, frame.shape)

# Last expression of the cell: the notebook renders the first rows.
train.head()


In [None]:
# Peek at the first training pair: the source transliteration, a separator
# line, then the translation stored in the same row.
print(train['transliteration'].iloc[0])
print("----")
print(train['translation'].iloc[0])


In [None]:
# Translation table: Unicode subscript digits -> plain ASCII digits.
sub_map = str.maketrans("₀₁₂₃₄₅₆₇₈₉", "0123456789")

def clean_transliteration(text):
    """Normalize one transliteration string.

    Values flagged by ``pd.isna`` become ``""``. Otherwise subscript digits
    are mapped to ASCII digits, every literal ``(d)`` is rewritten to ``D_``,
    and whitespace runs are collapsed to one space with the ends trimmed.
    """
    if pd.isna(text):
        return ""

    # digits first, then the determinative marker, then whitespace
    with_markers = re.sub(r"\(d\)", "D_", text.translate(sub_map))
    return re.sub(r"\s+", " ", with_markers).strip()


In [None]:
# Before/after check of clean_transliteration on the first training row.
sample = train['transliteration'].iloc[0]
print(sample)
print("----")
print(clean_transliteration(sample))


In [None]:
def clean_translation(text):
    """Normalize one translation string.

    Values flagged by ``pd.isna`` become ``""``. Curly quotes are replaced by
    their straight ASCII forms, runs of dots or commas shrink to a single
    character, and whitespace is collapsed and trimmed.
    """
    if pd.isna(text):
        return ""

    # typographic quotes -> ASCII quotes
    for fancy, plain in (("“", '"'), ("”", '"'), ("’", "'")):
        text = text.replace(fancy, plain)

    # repeated punctuation, then whitespace (order matters for the result)
    for pattern, replacement in ((r"\.{2,}", "."), (r",{2,}", ","), (r"\s+", " ")):
        text = re.sub(pattern, replacement, text)

    return text.strip()


In [None]:
# Before/after check of clean_translation on the first training row.
sample_t = train['translation'].iloc[0]
print(sample_t)
print("----")
print(clean_translation(sample_t))


In [None]:
# If src/tgt columns already exist, this won't break anything.
# Create them if missing.

# Create the working columns only when they are absent. Missing cells are
# blanked before the string cast: astype(str) on its own turns NaN into the
# literal text "nan", which would then be tokenized and trained on.
if "src" not in train.columns:
    # Prefer an existing clean_src column; otherwise fall back to the raw transliteration.
    train["src"] = train.get("clean_src", train["transliteration"]).fillna("").astype(str)

if "tgt" not in train.columns:
    # Prefer an existing clean_tgt column; otherwise fall back to the raw translation.
    train["tgt"] = train.get("clean_tgt", train["translation"]).fillna("").astype(str)

print("Columns now include src/tgt?", "src" in train.columns, "tgt" in train.columns)
print(train[["src","tgt"]].head(2))


In [None]:
import os
os.makedirs("/kaggle/working/spm", exist_ok=True)

src_path = "/kaggle/working/spm/src.txt"
tgt_path = "/kaggle/working/spm/tgt.txt"

# One sentence per line: embedded newlines are flattened to spaces so every
# row of the frame stays on exactly one line of its file.
for out_path, column in ((src_path, "src"), (tgt_path, "tgt")):
    with open(out_path, "w", encoding="utf-8") as handle:
        for sentence in train[column].astype(str):
            handle.write(sentence.replace("\n", " ") + "\n")

print("Wrote:", src_path, tgt_path)


In [None]:
!pip -q install sentencepiece
import sentencepiece as spm


In [None]:
import sentencepiece as spm

# The tokenizer is shared by both languages, so it is trained on src.txt and
# tgt.txt concatenated. The joint corpus is built right here because no
# earlier cell defines joint_path; without this a top-to-bottom run stops
# with a NameError.
joint_path = "/kaggle/working/spm/joint.txt"
with open(joint_path, "w", encoding="utf-8") as joint_file:
    for part_path in (src_path, tgt_path):
        with open(part_path, "r", encoding="utf-8") as part_file:
            joint_file.write(part_file.read())

spm.SentencePieceTrainer.train(
    input=joint_path,
    model_prefix="/kaggle/working/spm/oa_joint_unigram",
    vocab_size=5000,          # <= 5058, the limit reported by an earlier failed run
    model_type="unigram",
    character_coverage=1.0,
    user_defined_symbols=["<gap>", "<big_gap>", "D_", "<morph>"],
    hard_vocab_limit=False    # treat vocab_size as a soft limit so that error cannot recur
)

print("Trained SentencePiece model with vocab_size=5000.")


In [None]:
import os, sentencepiece as spm

# Show what the SentencePiece step left on disk, then load the trained model.
print("Files in /kaggle/working/spm:")
print(os.listdir("/kaggle/working/spm"))

# `sp` is the tokenizer object the later cells read as a global.
sp = spm.SentencePieceProcessor()
sp.load("/kaggle/working/spm/oa_joint_unigram.model")

print("Tokenizer loaded   Vocab:", sp.get_piece_size())


In [None]:
# Show (at most) the first 60 subword pieces of the first source and target sentence.
for heading, column in (("SRC pieces:", "src"), ("\nTGT pieces:", "tgt")):
    print(heading)
    print(sp.encode(train[column].iloc[0], out_type=str)[:60])


In [None]:
import numpy as np
from sklearn.model_selection import train_test_split

def encode_with_sp(text):
    """Tokenize `text` with the global `sp` model and wrap the ids as BOS, ids..., EOS."""
    return [sp.bos_id(), *sp.encode(text, out_type=int), sp.eos_id()]

# Encode every row once up front; the datasets below only index into these lists.
src_ids = list(map(encode_with_sp, train["src"].astype(str)))
tgt_ids = list(map(encode_with_sp, train["tgt"].astype(str)))

# Fixed-seed 90/10 split over row positions.
idx = np.arange(len(train))
train_idx, val_idx = train_test_split(idx, test_size=0.1, random_state=42)

print("Train:", len(train_idx), "Val:", len(val_idx))


In [None]:
import torch
from torch.utils.data import Dataset, DataLoader

# Use the GPU when torch can see one, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# NOTE(review): padding reuses the EOS id, so code that skips PAD_ID
# (e.g. a loss built with ignore_index=PAD_ID) also skips real EOS tokens.
PAD_ID = sp.eos_id()

class MTDataset(Dataset):
    """Subset of two parallel sequence lists, selected by `idxs` (in that order)."""

    def __init__(self, src, tgt, idxs):
        # Copy the chosen rows once so item access is plain list indexing.
        self.src, self.tgt = [], []
        for row in idxs:
            self.src.append(src[row])
            self.tgt.append(tgt[row])

    def __len__(self):
        return len(self.src)

    def __getitem__(self, i):
        return (self.src[i], self.tgt[i])

def collate_fn(batch, max_src=512, max_tgt=256, pad_id=None):
    """Pad a list of (src_ids, tgt_ids) pairs into two LongTensors.

    Args:
        batch: non-empty sequence of (src, tgt) pairs of token-id lists.
        max_src: maximum ids kept per source sequence (expected >= 1).
        max_tgt: maximum ids kept per target sequence (expected >= 1).
        pad_id: fill value for short rows; ``None`` means the module-level PAD_ID.

    Returns:
        ``(src_pad, tgt_pad)``, each of shape (len(batch), longest kept length).
    """
    if pad_id is None:
        pad_id = PAD_ID

    def _clip(ids, limit):
        # A plain ids[:limit] cuts the closing token off an over-long
        # sequence; keep the final id in the last slot instead so every row
        # still ends the way the encoder wrote it.
        if len(ids) <= limit:
            return list(ids)
        return list(ids[:max(limit - 1, 0)]) + list(ids[-1:])

    def _pad(rows):
        width = max(len(row) for row in rows)
        out = torch.full((len(rows), width), pad_id, dtype=torch.long)
        for i, row in enumerate(rows):
            out[i, :len(row)] = torch.tensor(row, dtype=torch.long)
        return out

    src, tgt = zip(*batch)
    src_pad = _pad([_clip(s, max_src) for s in src])
    tgt_pad = _pad([_clip(t, max_tgt) for t in tgt])
    return src_pad, tgt_pad

# Both loaders share batch size and collation; only the training one is shuffled,
# so validation batches follow val_idx order.
loader_kwargs = dict(batch_size=32, collate_fn=collate_fn)
train_loader = DataLoader(MTDataset(src_ids, tgt_ids, train_idx), shuffle=True, **loader_kwargs)
val_loader   = DataLoader(MTDataset(src_ids, tgt_ids, val_idx), shuffle=False, **loader_kwargs)

print("Batches:", len(train_loader), len(val_loader))


In [None]:
import torch.nn as nn
import math

# Number of pieces in the loaded tokenizer; passed to the model as its vocab_size.
VOCAB = sp.get_piece_size()

class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position signals to a batch-first tensor, then apply dropout.

    Args:
        d_model: size of the last (feature) dimension; odd sizes are supported.
        max_len: number of positions precomputed; inputs must not be longer.
        dropout: dropout probability applied after the addition.
    """

    def __init__(self, d_model, max_len=4096, dropout=0.1):
        super().__init__()
        self.dropout = nn.Dropout(dropout)
        pos = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = pos * div  # (max_len, ceil(d_model / 2))
        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one cosine column fewer than sine
        # columns; slicing keeps the assignment shapes equal.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        # Buffer: moves with .to(device) but is not a trainable parameter.
        self.register_buffer("pe", pe.unsqueeze(0))

    def forward(self, x):
        """Return dropout(x + positions) for x indexed as (batch, position, feature)."""
        return self.dropout(x + self.pe[:, :x.size(1)])

class TinyTransformerMT(nn.Module):
    """Encoder-decoder Transformer that embeds source and target ids with one shared table."""

    def __init__(self, vocab_size, d_model=256, nhead=4, layers=3, dim_ff=1024, dropout=0.1):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, d_model)
        self.pos = PositionalEncoding(d_model, dropout=dropout)
        self.tf = nn.Transformer(
            d_model=d_model,
            nhead=nhead,
            num_encoder_layers=layers,
            num_decoder_layers=layers,
            dim_feedforward=dim_ff,
            dropout=dropout,
            batch_first=True,
        )
        self.out = nn.Linear(d_model, vocab_size)

    def _embed(self, ids):
        # token embedding followed by positional signal + dropout
        return self.pos(self.emb(ids))

    def forward(self, src_ids, tgt_in_ids):
        """Return logits of shape (batch, target length, vocab) for teacher-forced targets."""
        encoder_in = self._embed(src_ids)
        decoder_in = self._embed(tgt_in_ids)
        steps = tgt_in_ids.size(1)
        # True above the diagonal: a position may not attend to later ones.
        causal = torch.ones(steps, steps, device=encoder_in.device).triu(diagonal=1).bool()
        return self.out(self.tf(encoder_in, decoder_in, tgt_mask=causal))

model = TinyTransformerMT(VOCAB).to(device)


In [None]:
import torch.nn as nn
import math

# Number of pieces in the loaded tokenizer; passed to the model as its vocab_size.
VOCAB = sp.get_piece_size()

class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position signals to a batch-first tensor, then apply dropout.

    Args:
        d_model: size of the last (feature) dimension; odd sizes are supported.
        max_len: number of positions precomputed; inputs must not be longer.
        dropout: dropout probability applied after the addition.
    """

    def __init__(self, d_model, max_len=4096, dropout=0.1):
        super().__init__()
        self.dropout = nn.Dropout(dropout)
        pos = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = pos * div  # (max_len, ceil(d_model / 2))
        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one cosine column fewer than sine
        # columns; slicing keeps the assignment shapes equal.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        # Buffer: moves with .to(device) but is not a trainable parameter.
        self.register_buffer("pe", pe.unsqueeze(0))

    def forward(self, x):
        """Return dropout(x + positions) for x indexed as (batch, position, feature)."""
        return self.dropout(x + self.pe[:, :x.size(1)])

class TinyTransformerMT(nn.Module):
    """Encoder-decoder Transformer that embeds source and target ids with one shared table."""

    def __init__(self, vocab_size, d_model=256, nhead=4, layers=3, dim_ff=1024, dropout=0.1):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, d_model)
        self.pos = PositionalEncoding(d_model, dropout=dropout)
        self.tf = nn.Transformer(
            d_model=d_model,
            nhead=nhead,
            num_encoder_layers=layers,
            num_decoder_layers=layers,
            dim_feedforward=dim_ff,
            dropout=dropout,
            batch_first=True,
        )
        self.out = nn.Linear(d_model, vocab_size)

    def _embed(self, ids):
        # token embedding followed by positional signal + dropout
        return self.pos(self.emb(ids))

    def forward(self, src_ids, tgt_in_ids):
        """Return logits of shape (batch, target length, vocab) for teacher-forced targets."""
        encoder_in = self._embed(src_ids)
        decoder_in = self._embed(tgt_in_ids)
        steps = tgt_in_ids.size(1)
        # True above the diagonal: a position may not attend to later ones.
        causal = torch.ones(steps, steps, device=encoder_in.device).triu(diagonal=1).bool()
        return self.out(self.tf(encoder_in, decoder_in, tgt_mask=causal))

model = TinyTransformerMT(VOCAB).to(device)


In [None]:
import torch.optim as optim

# Targets carrying this value are skipped by the loss. It is deliberately not
# PAD_ID: padding reuses the EOS id, so ignoring PAD_ID would also drop every
# genuine end-of-sequence target and the model would never be trained to stop.
IGNORE_INDEX = -100

criterion = nn.CrossEntropyLoss(ignore_index=IGNORE_INDEX)
optimizer = optim.AdamW(model.parameters(), lr=3e-4)

def _mask_padding(tgt_out):
    """Return a copy of `tgt_out` whose padding positions hold IGNORE_INDEX."""
    is_pad = tgt_out == PAD_ID
    if PAD_ID == sp.eos_id():
        # The first PAD_ID-valued entry of a row is the real EOS; only the
        # entries after it are padding.
        is_pad = is_pad & (is_pad.cumsum(dim=1) > 1)
    return tgt_out.masked_fill(is_pad, IGNORE_INDEX)

def run_epoch(loader, train_mode=True):
    """Run one pass over `loader` and return the mean per-batch loss.

    Args:
        loader: iterable of (src, tgt) id tensors, as produced by collate_fn.
        train_mode: when True the optimizer is stepped; when False the model
            is put in eval mode and no gradients are recorded.
    """
    model.train(train_mode)
    total = 0.0
    # Evaluation needs no autograd graph; skipping it saves memory and time.
    with torch.set_grad_enabled(train_mode):
        for src, tgt in loader:
            src, tgt = src.to(device), tgt.to(device)
            # teacher forcing: feed tokens [0, T-1), predict tokens [1, T)
            tgt_in, tgt_out = tgt[:, :-1], _mask_padding(tgt[:, 1:])

            logits = model(src, tgt_in)
            loss = criterion(logits.reshape(-1, logits.size(-1)), tgt_out.reshape(-1))

            if train_mode:
                optimizer.zero_grad()
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()

            total += loss.item()
    return total / len(loader)

for ep in range(3):
    tr = run_epoch(train_loader, True)
    va = run_epoch(val_loader, False)
    print(f"Epoch {ep+1}: train_loss={tr:.4f} val_loss={va:.4f}")


In [None]:
!pip -q install sacrebleu
import sacrebleu


In [None]:
import torch

@torch.no_grad()
def beam_decode_batch(src, beam_size=5, max_len=128, length_penalty=0.8):
    """Beam-search one hypothesis per source row.

    Every source row keeps its own `beam_size` hypotheses (the previous
    version ranked whole-batch beams by a batch-mean score, so rows could not
    choose different continuations). A hypothesis that emits EOS is frozen:
    it is only ever extended with further EOS at no cost.

    Args:
        src: (batch, src_len) tensor of source ids.
        beam_size: hypotheses kept per row.
        max_len: maximum number of generated tokens after BOS.
        length_penalty: exponent of the length normalisation used to pick the
            final hypothesis (score / length ** length_penalty).

    Returns:
        (batch, length) LongTensor; each row starts with BOS and, once it has
        produced EOS, contains only EOS afterwards.
    """
    model.eval()
    src = src.to(device)
    bos, eos = sp.bos_id(), sp.eos_id()
    batch = src.size(0)
    neg_inf = float("-inf")

    # Row b * beam_size + k of src_rep belongs to hypothesis k of sample b.
    src_rep = src.repeat_interleave(beam_size, dim=0)
    seqs = torch.full((batch, beam_size, 1), bos, dtype=torch.long, device=device)
    # All hypotheses start identical, so only the first one is alive at step 0.
    scores = torch.full((batch, beam_size), neg_inf, device=device)
    scores[:, 0] = 0.0
    finished = torch.zeros(batch, beam_size, dtype=torch.bool, device=device)
    lengths = torch.ones(batch, beam_size, dtype=torch.long, device=device)

    for _ in range(max_len):
        logits = model(src_rep, seqs.reshape(batch * beam_size, -1))
        logp = torch.log_softmax(logits[:, -1], dim=-1).reshape(batch, beam_size, -1)
        vocab = logp.size(-1)

        # Frozen hypotheses: EOS costs nothing, everything else is impossible.
        eos_logp = logp[..., eos].masked_fill(finished, 0.0)
        logp = logp.masked_fill(finished.unsqueeze(-1), neg_inf)
        logp[..., eos] = eos_logp

        # Best beam_size continuations per sample over all (hypothesis, token) pairs.
        candidates = (scores.unsqueeze(-1) + logp).reshape(batch, -1)
        scores, flat = candidates.topk(beam_size, dim=-1)
        parent = torch.div(flat, vocab, rounding_mode="floor")
        token = flat % vocab

        history = seqs.gather(1, parent.unsqueeze(-1).expand(-1, -1, seqs.size(-1)))
        seqs = torch.cat([history, token.unsqueeze(-1)], dim=-1)
        was_finished = finished.gather(1, parent)
        lengths = lengths.gather(1, parent) + (~was_finished).long()
        finished = was_finished | (token == eos)

        if finished.all():
            break

    # Length-normalise only for the final choice, where lengths actually differ.
    normalised = scores / lengths.float().pow(length_penalty)
    best = normalised.argmax(dim=-1)
    return seqs[torch.arange(batch, device=device), best]


In [None]:
def _ids_to_text(ids):
    """Decode generated ids, discarding everything from the first EOS onwards."""
    eos = sp.eos_id()
    if eos in ids:
        ids = ids[:ids.index(eos)]
    return sp.decode(ids)

@torch.no_grad()
def eval_on_val(max_batches=10):
    """Beam-decode the first `max_batches` validation batches and print BLEU, chrF++ and their geometric mean.

    References are read from train["tgt"] via val_idx, which matches the
    loader order because val_loader is built without shuffling.
    """
    preds, refs = [], []
    seen = 0
    for i, (src, _) in enumerate(val_loader):
        if i >= max_batches:
            break
        pred_ids = beam_decode_batch(src, beam_size=5, max_len=128, length_penalty=0.8)
        for row in pred_ids.tolist():
            # Tokens generated after EOS are not part of the hypothesis.
            preds.append(_ids_to_text(row))
            refs.append(train["tgt"].iloc[val_idx[seen]])
            seen += 1

    bleu = sacrebleu.corpus_bleu(preds, [refs])
    chrf = sacrebleu.corpus_chrf(preds, [refs], word_order=2)
    final = (bleu.score * chrf.score) ** 0.5

    print(f"BLEU  : {bleu.score:.2f}")
    print(f"chrF++: {chrf.score:.2f}")
    print(f"FINAL : {final:.2f}")

eval_on_val(max_batches=10)


In [None]:
!pip install -q sacrebleu


In [None]:
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
from nltk.translate.chrf_score import corpus_chrf

def compute_metrics(preds, refs):
    """Score `preds` against one reference each with NLTK.

    Returns a ``(bleu, chrf)`` pair, both scaled to 0-100. BLEU is computed on
    whitespace tokens with smoothing method 4; chrF receives the raw strings.
    """
    hypothesis_tokens = [hyp.split() for hyp in preds]
    reference_tokens = [[ref.split()] for ref in refs]  # one reference per hypothesis

    smoother = SmoothingFunction().method4
    bleu_score = corpus_bleu(reference_tokens, hypothesis_tokens, smoothing_function=smoother)
    chrf_score = corpus_chrf(refs, preds)

    return bleu_score * 100, chrf_score * 100


In [None]:
import os, re
import pandas as pd
import sentencepiece as spm

# 1) Reload the data: a kernel restart discards every variable, so this cell
#    rebuilds its inputs from disk instead of relying on earlier cells.
train = pd.read_csv("/kaggle/input/deep-past-initiative-machine-translation/train.csv")
test  = pd.read_csv("/kaggle/input/deep-past-initiative-machine-translation/test.csv")

# 2) Cleaning (same as your proven good version)
# Translation table: Unicode subscript digits -> plain ASCII digits.
sub_map = str.maketrans("₀₁₂₃₄₅₆₇₈₉", "0123456789")

def clean_transliteration(text):
    """Normalize one transliteration value to a clean string.

    Missing values become ``""`` (``str(NaN)`` would otherwise inject the
    literal word "nan" into the corpus). Other values are cast to ``str``,
    subscript digits are mapped to ASCII, ``(d)`` is rewritten to ``D_`` and
    whitespace is collapsed and trimmed.
    """
    if pd.isna(text):
        return ""
    text = str(text).translate(sub_map)
    text = re.sub(r"\(d\)", "D_", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

def clean_translation(text):
    """Normalize one translation value to a clean string.

    Missing values become ``""`` (``str(NaN)`` would otherwise inject the
    literal word "nan" into the corpus). Other values are cast to ``str``,
    curly quotes are replaced by straight ones and whitespace is collapsed
    and trimmed.
    """
    if pd.isna(text):
        return ""
    text = str(text).replace("“", '"').replace("”", '"').replace("’", "'")
    text = re.sub(r"\s+", " ", text).strip()
    return text

train["src"] = train["transliteration"].apply(clean_transliteration)
train["tgt"] = train["translation"].apply(clean_translation)

# 3) Build spm files
os.makedirs("/kaggle/working/spm", exist_ok=True)

src_path   = "/kaggle/working/spm/src.txt"
tgt_path   = "/kaggle/working/spm/tgt.txt"
joint_path = "/kaggle/working/spm/joint.txt"

# One cleaned sentence per line, one file per language.
for out_path, column in ((src_path, "src"), (tgt_path, "tgt")):
    with open(out_path, "w", encoding="utf-8") as handle:
        for sentence in train[column]:
            handle.write(sentence.replace("\n", " ") + "\n")

# joint.txt is simply src.txt followed by tgt.txt.
with open(joint_path, "w", encoding="utf-8") as joint_file:
    for part_path in (src_path, tgt_path):
        with open(part_path, "r", encoding="utf-8") as part_file:
            joint_file.write(part_file.read())

print(" src.txt / tgt.txt / joint.txt created")

# 4) Train sentencepiece
spm_options = dict(
    input=joint_path,
    model_prefix="/kaggle/working/spm/oa_joint_unigram",
    vocab_size=5000,
    model_type="unigram",
    character_coverage=1.0,
    user_defined_symbols=["<gap>", "<big_gap>", "D_", "<morph>"],
    hard_vocab_limit=False,
)
spm.SentencePieceTrainer.train(**spm_options)
print(" SentencePiece trained")

# 5) Load tokenizer
sp = spm.SentencePieceProcessor()
sp.load("/kaggle/working/spm/oa_joint_unigram.model")
print(" Tokenizer loaded | vocab:", sp.get_piece_size())


In [None]:
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

# Use the GPU when torch can see one, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# NOTE(review): padding reuses the EOS id, so code that skips PAD_ID
# (e.g. a loss built with ignore_index=PAD_ID) also skips real EOS tokens.
PAD_ID = sp.eos_id()

def encode_with_sp(text):
    """Tokenize `text` with the global `sp` model and wrap the ids as BOS, ids..., EOS."""
    return [sp.bos_id(), *sp.encode(text, out_type=int), sp.eos_id()]

# Encode every row once up front; the datasets below only index into these lists.
src_ids = list(map(encode_with_sp, train["src"].astype(str)))
tgt_ids = list(map(encode_with_sp, train["tgt"].astype(str)))

# Fixed-seed 90/10 split over row positions.
idx = np.arange(len(train))
train_idx, val_idx = train_test_split(idx, test_size=0.1, random_state=42)

class MTDataset(Dataset):
    """Subset of two parallel sequence lists, selected by `idxs` (in that order)."""

    def __init__(self, src, tgt, idxs):
        # Copy the chosen rows once so item access is plain list indexing.
        self.src, self.tgt = [], []
        for row in idxs:
            self.src.append(src[row])
            self.tgt.append(tgt[row])

    def __len__(self):
        return len(self.src)

    def __getitem__(self, i):
        return (self.src[i], self.tgt[i])

def collate_fn(batch, max_src=512, max_tgt=256, pad_id=None):
    """Pad a list of (src_ids, tgt_ids) pairs into two LongTensors.

    Args:
        batch: non-empty sequence of (src, tgt) pairs of token-id lists.
        max_src: maximum ids kept per source sequence (expected >= 1).
        max_tgt: maximum ids kept per target sequence (expected >= 1).
        pad_id: fill value for short rows; ``None`` means the module-level PAD_ID.

    Returns:
        ``(src_pad, tgt_pad)``, each of shape (len(batch), longest kept length).
    """
    if pad_id is None:
        pad_id = PAD_ID

    def _clip(ids, limit):
        # A plain ids[:limit] cuts the closing token off an over-long
        # sequence; keep the final id in the last slot instead so every row
        # still ends the way the encoder wrote it.
        if len(ids) <= limit:
            return list(ids)
        return list(ids[:max(limit - 1, 0)]) + list(ids[-1:])

    def _pad(rows):
        width = max(len(row) for row in rows)
        out = torch.full((len(rows), width), pad_id, dtype=torch.long)
        for i, row in enumerate(rows):
            out[i, :len(row)] = torch.tensor(row, dtype=torch.long)
        return out

    src, tgt = zip(*batch)
    src_pad = _pad([_clip(s, max_src) for s in src])
    tgt_pad = _pad([_clip(t, max_tgt) for t in tgt])
    return src_pad, tgt_pad

# Both loaders share batch size and collation; only the training one is shuffled,
# so validation batches follow val_idx order.
loader_kwargs = dict(batch_size=32, collate_fn=collate_fn)
train_loader = DataLoader(MTDataset(src_ids, tgt_ids, train_idx), shuffle=True, **loader_kwargs)
val_loader   = DataLoader(MTDataset(src_ids, tgt_ids, val_idx), shuffle=False, **loader_kwargs)

print(" loaders ready | train batches:", len(train_loader), "val batches:", len(val_loader))


In [None]:
import torch.nn as nn
import math

# Number of pieces in the loaded tokenizer; passed to the model as its vocab_size.
VOCAB = sp.get_piece_size()

class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position signals to a batch-first tensor, then apply dropout.

    Args:
        d_model: size of the last (feature) dimension; odd sizes are supported.
        max_len: number of positions precomputed; inputs must not be longer.
        dropout: dropout probability applied after the addition.
    """

    def __init__(self, d_model, max_len=4096, dropout=0.1):
        super().__init__()
        self.dropout = nn.Dropout(dropout)
        pos = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = pos * div  # (max_len, ceil(d_model / 2))
        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one cosine column fewer than sine
        # columns; slicing keeps the assignment shapes equal.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        # Buffer: moves with .to(device) but is not a trainable parameter.
        self.register_buffer("pe", pe.unsqueeze(0))

    def forward(self, x):
        """Return dropout(x + positions) for x indexed as (batch, position, feature)."""
        return self.dropout(x + self.pe[:, :x.size(1)])

class TinyTransformerMT(nn.Module):
    """Encoder-decoder Transformer that embeds source and target ids with one shared table."""

    def __init__(self, vocab_size, d_model=256, nhead=4, layers=3, dim_ff=1024, dropout=0.1):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, d_model)
        self.pos = PositionalEncoding(d_model, dropout=dropout)
        self.tf = nn.Transformer(
            d_model=d_model,
            nhead=nhead,
            num_encoder_layers=layers,
            num_decoder_layers=layers,
            dim_feedforward=dim_ff,
            dropout=dropout,
            batch_first=True,
        )
        self.out = nn.Linear(d_model, vocab_size)

    def _embed(self, ids):
        # token embedding followed by positional signal + dropout
        return self.pos(self.emb(ids))

    def forward(self, src_ids, tgt_in_ids):
        """Return logits of shape (batch, target length, vocab) for teacher-forced targets."""
        encoder_in = self._embed(src_ids)
        decoder_in = self._embed(tgt_in_ids)
        steps = tgt_in_ids.size(1)
        # True above the diagonal: a position may not attend to later ones.
        causal = torch.ones(steps, steps, device=encoder_in.device).triu(diagonal=1).bool()
        return self.out(self.tf(encoder_in, decoder_in, tgt_mask=causal))

model = TinyTransformerMT(VOCAB).to(device)
print(" model ready on", device)


In [None]:
import torch.optim as optim

# Targets carrying this value are skipped by the loss. It is deliberately not
# PAD_ID: padding reuses the EOS id, so ignoring PAD_ID would also drop every
# genuine end-of-sequence target and the model would never be trained to stop.
IGNORE_INDEX = -100

criterion = nn.CrossEntropyLoss(ignore_index=IGNORE_INDEX)
optimizer = optim.AdamW(model.parameters(), lr=3e-4)

def _mask_padding(tgt_out):
    """Return a copy of `tgt_out` whose padding positions hold IGNORE_INDEX."""
    is_pad = tgt_out == PAD_ID
    if PAD_ID == sp.eos_id():
        # The first PAD_ID-valued entry of a row is the real EOS; only the
        # entries after it are padding.
        is_pad = is_pad & (is_pad.cumsum(dim=1) > 1)
    return tgt_out.masked_fill(is_pad, IGNORE_INDEX)

def run_epoch(loader, train_mode=True):
    """Run one pass over `loader` and return the mean per-batch loss.

    Args:
        loader: iterable of (src, tgt) id tensors, as produced by collate_fn.
        train_mode: when True the optimizer is stepped; when False the model
            is put in eval mode and no gradients are recorded.
    """
    model.train(train_mode)
    total = 0.0
    # Evaluation needs no autograd graph; skipping it saves memory and time.
    with torch.set_grad_enabled(train_mode):
        for src, tgt in loader:
            src, tgt = src.to(device), tgt.to(device)
            # teacher forcing: feed tokens [0, T-1), predict tokens [1, T)
            tgt_in, tgt_out = tgt[:, :-1], _mask_padding(tgt[:, 1:])

            logits = model(src, tgt_in)
            loss = criterion(logits.reshape(-1, logits.size(-1)), tgt_out.reshape(-1))

            if train_mode:
                optimizer.zero_grad()
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()

            total += loss.item()
    return total / len(loader)

for ep in range(3):
    tr = run_epoch(train_loader, True)
    va = run_epoch(val_loader, False)
    print(f"Epoch {ep+1}: train_loss={tr:.4f} val_loss={va:.4f}")

# optional: save weights so next restart doesn't waste time
torch.save(model.state_dict(), "/kaggle/working/model_baseline.pt")
print(" saved: /kaggle/working/model_baseline.pt")


In [None]:
!pip -q install sacrebleu
import sacrebleu
import torch

@torch.no_grad()
def beam_decode_batch(src, beam_size=5, max_len=160, length_penalty=0.8):
    """Beam-search one hypothesis per source row.

    Every source row keeps its own `beam_size` hypotheses (the previous
    version ranked whole-batch beams by a batch-mean score, so rows could not
    choose different continuations). A hypothesis that emits EOS is frozen:
    it is only ever extended with further EOS at no cost.

    Args:
        src: (batch, src_len) tensor of source ids.
        beam_size: hypotheses kept per row.
        max_len: maximum number of generated tokens after BOS.
        length_penalty: exponent of the length normalisation used to pick the
            final hypothesis (score / length ** length_penalty).

    Returns:
        (batch, length) LongTensor; each row starts with BOS and, once it has
        produced EOS, contains only EOS afterwards.
    """
    model.eval()
    src = src.to(device)
    bos, eos = sp.bos_id(), sp.eos_id()
    batch = src.size(0)
    neg_inf = float("-inf")

    # Row b * beam_size + k of src_rep belongs to hypothesis k of sample b.
    src_rep = src.repeat_interleave(beam_size, dim=0)
    seqs = torch.full((batch, beam_size, 1), bos, dtype=torch.long, device=device)
    # All hypotheses start identical, so only the first one is alive at step 0.
    scores = torch.full((batch, beam_size), neg_inf, device=device)
    scores[:, 0] = 0.0
    finished = torch.zeros(batch, beam_size, dtype=torch.bool, device=device)
    lengths = torch.ones(batch, beam_size, dtype=torch.long, device=device)

    for _ in range(max_len):
        logits = model(src_rep, seqs.reshape(batch * beam_size, -1))
        logp = torch.log_softmax(logits[:, -1], dim=-1).reshape(batch, beam_size, -1)
        vocab = logp.size(-1)

        # Frozen hypotheses: EOS costs nothing, everything else is impossible.
        eos_logp = logp[..., eos].masked_fill(finished, 0.0)
        logp = logp.masked_fill(finished.unsqueeze(-1), neg_inf)
        logp[..., eos] = eos_logp

        # Best beam_size continuations per sample over all (hypothesis, token) pairs.
        candidates = (scores.unsqueeze(-1) + logp).reshape(batch, -1)
        scores, flat = candidates.topk(beam_size, dim=-1)
        parent = torch.div(flat, vocab, rounding_mode="floor")
        token = flat % vocab

        history = seqs.gather(1, parent.unsqueeze(-1).expand(-1, -1, seqs.size(-1)))
        seqs = torch.cat([history, token.unsqueeze(-1)], dim=-1)
        was_finished = finished.gather(1, parent)
        lengths = lengths.gather(1, parent) + (~was_finished).long()
        finished = was_finished | (token == eos)

        if finished.all():
            break

    # Length-normalise only for the final choice, where lengths actually differ.
    normalised = scores / lengths.float().pow(length_penalty)
    best = normalised.argmax(dim=-1)
    return seqs[torch.arange(batch, device=device), best]

def _ids_to_text(ids):
    """Decode generated ids, discarding everything from the first EOS onwards."""
    eos = sp.eos_id()
    if eos in ids:
        ids = ids[:ids.index(eos)]
    return sp.decode(ids)

@torch.no_grad()
def eval_val(max_batches=20, beam_size=5, lp=0.8):
    """Beam-decode the first `max_batches` validation batches and print BLEU, chrF++ and their geometric mean.

    References are read from train["tgt"] via val_idx, which matches the
    loader order because val_loader is built without shuffling.
    """
    preds, refs = [], []
    seen = 0
    for b, (src, _) in enumerate(val_loader):
        if b >= max_batches:
            break
        pred_ids = beam_decode_batch(src, beam_size=beam_size, length_penalty=lp)
        for row in pred_ids.tolist():
            # Tokens generated after EOS are not part of the hypothesis.
            preds.append(_ids_to_text(row))
            refs.append(train["tgt"].iloc[val_idx[seen]])
            seen += 1

    bleu = sacrebleu.corpus_bleu(preds, [refs]).score
    chrf = sacrebleu.corpus_chrf(preds, [refs], word_order=2).score
    final = (bleu * chrf) ** 0.5
    print(f"beam={beam_size} lp={lp} | BLEU={bleu:.2f} chrF++={chrf:.2f} FINAL={final:.2f}")

eval_val(max_batches=20, beam_size=5, lp=0.8)


In [None]:


import os, re, pandas as pd, sentencepiece as spm, torch

# 1) Load data (only the training split is needed by this bootstrap cell)
train = pd.read_csv("/kaggle/input/deep-past-initiative-machine-translation/train.csv")

# 2) Clean functions
# Translation table: Unicode subscript digits -> plain ASCII digits.
sub_map = str.maketrans("₀₁₂₃₄₅₆₇₈₉", "0123456789")

def clean_transliteration(t):
    """Normalize one transliteration value to a clean string.

    Missing values become ``""`` (``str(NaN)`` would otherwise inject the
    literal word "nan" into the corpus). Other values are cast to ``str``,
    subscript digits are mapped to ASCII, ``(d)`` is rewritten to ``D_`` and
    whitespace is collapsed and trimmed.
    """
    if pd.isna(t):
        return ""
    t = str(t).translate(sub_map)
    t = re.sub(r"\(d\)", "D_", t)
    t = re.sub(r"\s+", " ", t).strip()
    return t

def clean_translation(t):
    """Normalize one translation value to a clean string.

    Missing values become ``""`` (``str(NaN)`` would otherwise inject the
    literal word "nan" into the corpus). Other values are cast to ``str``,
    curly quotes are replaced by straight ones and whitespace is collapsed
    and trimmed.
    """
    if pd.isna(t):
        return ""
    t = str(t).replace("“", '"').replace("”", '"').replace("’", "'")
    t = re.sub(r"\s+", " ", t).strip()
    return t

train["src"] = train["transliteration"].apply(clean_transliteration)
train["tgt"] = train["translation"].apply(clean_translation)

# 3) Build SPM files
os.makedirs("/kaggle/working/spm", exist_ok=True)

src_path   = "/kaggle/working/spm/src.txt"
tgt_path   = "/kaggle/working/spm/tgt.txt"
joint_path = "/kaggle/working/spm/joint.txt"

with open(src_path, "w", encoding="utf-8") as f:
    f.write("\n".join(train["src"].tolist()))

with open(tgt_path, "w", encoding="utf-8") as f:
    f.write("\n".join(train["tgt"].tolist()))

# joint.txt = src.txt + newline + tgt.txt. The parts are read back with the
# same explicit UTF-8 encoding they were written with (the platform default
# is not guaranteed to match) and inside `with` so the handles get closed.
with open(joint_path, "w", encoding="utf-8") as joint_file:
    with open(src_path, "r", encoding="utf-8") as part_file:
        joint_file.write(part_file.read())
    joint_file.write("\n")
    with open(tgt_path, "r", encoding="utf-8") as part_file:
        joint_file.write(part_file.read())

# 4) Train SentencePiece (hard_vocab_limit=False makes vocab_size a soft limit)
spm.SentencePieceTrainer.train(
    input=joint_path,
    model_prefix="/kaggle/working/spm/oa_joint_unigram",
    vocab_size=5000,
    model_type="unigram",
    character_coverage=1.0,
    user_defined_symbols=["<gap>", "<big_gap>", "D_", "<morph>"],
    hard_vocab_limit=False
)

# 5) Load tokenizer
sp = spm.SentencePieceProcessor()
sp.load("/kaggle/working/spm/oa_joint_unigram.model")

# NOTE(review): padding reuses the EOS id, so code that skips PAD_ID
# (e.g. a loss built with ignore_index=PAD_ID) also skips real EOS tokens.
PAD_ID = sp.eos_id()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print(" BOOTSTRAP DONE")
print("Vocab:", sp.get_piece_size(), "| PAD_ID:", PAD_ID, "| Device:", device)


In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset, DataLoader

def encode_with_sp(text):
    """Tokenize ``str(text)`` with the global `sp` model and wrap the ids as BOS, ids..., EOS."""
    return [sp.bos_id(), *sp.encode(str(text), out_type=int), sp.eos_id()]

# Encode every row once up front; the datasets below only index into these lists.
src_ids = list(map(encode_with_sp, train["src"]))
tgt_ids = list(map(encode_with_sp, train["tgt"]))

# Fixed-seed 90/10 split over row positions.
idx = np.arange(len(train))
train_idx, val_idx = train_test_split(idx, test_size=0.1, random_state=42)

class MTDataset(Dataset):
    """Subset of two parallel sequence lists, selected by `idxs` (in that order)."""

    def __init__(self, src, tgt, idxs):
        # Copy the chosen rows once so item access is plain list indexing.
        self.src, self.tgt = [], []
        for row in idxs:
            self.src.append(src[row])
            self.tgt.append(tgt[row])

    def __len__(self):
        return len(self.src)

    def __getitem__(self, i):
        return (self.src[i], self.tgt[i])

def collate_fn(batch, max_src=512, max_tgt=256, pad_id=None):
    """Pad a list of (src_ids, tgt_ids) pairs into two LongTensors.

    Args:
        batch: non-empty sequence of (src, tgt) pairs of token-id lists.
        max_src: maximum ids kept per source sequence (expected >= 1).
        max_tgt: maximum ids kept per target sequence (expected >= 1).
        pad_id: fill value for short rows; ``None`` means the module-level PAD_ID.

    Returns:
        ``(src_pad, tgt_pad)``, each of shape (len(batch), longest kept length).
    """
    if pad_id is None:
        pad_id = PAD_ID

    def _clip(ids, limit):
        # A plain ids[:limit] cuts the closing token off an over-long
        # sequence; keep the final id in the last slot instead so every row
        # still ends the way the encoder wrote it.
        if len(ids) <= limit:
            return list(ids)
        return list(ids[:max(limit - 1, 0)]) + list(ids[-1:])

    def _pad(rows):
        width = max(len(row) for row in rows)
        out = torch.full((len(rows), width), pad_id, dtype=torch.long)
        for i, row in enumerate(rows):
            out[i, :len(row)] = torch.tensor(row, dtype=torch.long)
        return out

    src, tgt = zip(*batch)
    src_pad = _pad([_clip(s, max_src) for s in src])
    tgt_pad = _pad([_clip(t, max_tgt) for t in tgt])
    return src_pad, tgt_pad

# Both loaders share batch size and collation; only the training one is shuffled,
# so validation batches follow val_idx order.
loader_kwargs = dict(batch_size=32, collate_fn=collate_fn)
train_loader = DataLoader(MTDataset(src_ids, tgt_ids, train_idx), shuffle=True, **loader_kwargs)
val_loader   = DataLoader(MTDataset(src_ids, tgt_ids, val_idx), shuffle=False, **loader_kwargs)

print(" loaders ready | train batches:", len(train_loader), "val batches:", len(val_loader))


In [None]:
import torch.nn as nn
import math

# Number of pieces in the loaded tokenizer; passed to the model as its vocab_size.
VOCAB = sp.get_piece_size()

class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position signals to a batch-first tensor, then apply dropout.

    Args:
        d_model: size of the last (feature) dimension; odd sizes are supported.
        max_len: number of positions precomputed; inputs must not be longer.
        dropout: dropout probability applied after the addition.
    """

    def __init__(self, d_model, max_len=4096, dropout=0.1):
        super().__init__()
        self.dropout = nn.Dropout(dropout)
        pos = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = pos * div  # (max_len, ceil(d_model / 2))
        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one cosine column fewer than sine
        # columns; slicing keeps the assignment shapes equal.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        # Buffer: moves with .to(device) but is not a trainable parameter.
        self.register_buffer("pe", pe.unsqueeze(0))

    def forward(self, x):
        """Return dropout(x + positions) for x indexed as (batch, position, feature)."""
        return self.dropout(x + self.pe[:, :x.size(1)])

class TinyTransformerMT(nn.Module):
    """Encoder-decoder Transformer that embeds source and target ids with one shared table."""

    def __init__(self, vocab_size, d_model=256, nhead=4, layers=3, dim_ff=1024, dropout=0.1):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, d_model)
        self.pos = PositionalEncoding(d_model, dropout=dropout)
        self.tf = nn.Transformer(
            d_model=d_model,
            nhead=nhead,
            num_encoder_layers=layers,
            num_decoder_layers=layers,
            dim_feedforward=dim_ff,
            dropout=dropout,
            batch_first=True,
        )
        self.out = nn.Linear(d_model, vocab_size)

    def _embed(self, ids):
        # token embedding followed by positional signal + dropout
        return self.pos(self.emb(ids))

    def forward(self, src_ids, tgt_in_ids):
        """Return logits of shape (batch, target length, vocab) for teacher-forced targets."""
        encoder_in = self._embed(src_ids)
        decoder_in = self._embed(tgt_in_ids)
        steps = tgt_in_ids.size(1)
        # True above the diagonal: a position may not attend to later ones.
        causal = torch.ones(steps, steps, device=encoder_in.device).triu(diagonal=1).bool()
        return self.out(self.tf(encoder_in, decoder_in, tgt_mask=causal))

model = TinyTransformerMT(VOCAB).to(device)
print(" model ready on", device)


In [None]:
import torch.optim as optim

# Targets carrying this value are skipped by the loss. It is deliberately not
# PAD_ID: padding reuses the EOS id, so ignoring PAD_ID would also drop every
# genuine end-of-sequence target and the model would never be trained to stop.
IGNORE_INDEX = -100

criterion = nn.CrossEntropyLoss(ignore_index=IGNORE_INDEX)
optimizer = optim.AdamW(model.parameters(), lr=3e-4)

def _mask_padding(tgt_out):
    """Return a copy of `tgt_out` whose padding positions hold IGNORE_INDEX."""
    is_pad = tgt_out == PAD_ID
    if PAD_ID == sp.eos_id():
        # The first PAD_ID-valued entry of a row is the real EOS; only the
        # entries after it are padding.
        is_pad = is_pad & (is_pad.cumsum(dim=1) > 1)
    return tgt_out.masked_fill(is_pad, IGNORE_INDEX)

def run_epoch(loader, train_mode=True):
    """Run one pass over `loader` and return the mean per-batch loss.

    Args:
        loader: iterable of (src, tgt) id tensors, as produced by collate_fn.
        train_mode: when True the optimizer is stepped; when False the model
            is put in eval mode and no gradients are recorded.
    """
    model.train(train_mode)
    total = 0.0
    # Evaluation needs no autograd graph; skipping it saves memory and time.
    with torch.set_grad_enabled(train_mode):
        for src, tgt in loader:
            src, tgt = src.to(device), tgt.to(device)
            # teacher forcing: feed tokens [0, T-1), predict tokens [1, T)
            tgt_in, tgt_out = tgt[:, :-1], _mask_padding(tgt[:, 1:])

            logits = model(src, tgt_in)
            loss = criterion(logits.reshape(-1, logits.size(-1)), tgt_out.reshape(-1))

            if train_mode:
                optimizer.zero_grad()
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()

            total += loss.item()
    return total / len(loader)

for ep in range(2):
    tr = run_epoch(train_loader, True)
    va = run_epoch(val_loader, False)
    print(f"Epoch {ep+1}: train_loss={tr:.4f} val_loss={va:.4f}")

print(" training done")


In [None]:
!pip -q install sacrebleu
import sacrebleu
import torch

@torch.no_grad()
def beam_decode_batch(src, beam_size=5, max_len=160, length_penalty=0.8):
    """Beam-search one hypothesis per source row.

    Every source row keeps its own `beam_size` hypotheses (the previous
    version ranked whole-batch beams by a batch-mean score, so rows could not
    choose different continuations). A hypothesis that emits EOS is frozen:
    it is only ever extended with further EOS at no cost.

    Args:
        src: (batch, src_len) tensor of source ids.
        beam_size: hypotheses kept per row.
        max_len: maximum number of generated tokens after BOS.
        length_penalty: exponent of the length normalisation used to pick the
            final hypothesis (score / length ** length_penalty).

    Returns:
        (batch, length) LongTensor; each row starts with BOS and, once it has
        produced EOS, contains only EOS afterwards.
    """
    model.eval()
    src = src.to(device)
    bos, eos = sp.bos_id(), sp.eos_id()
    batch = src.size(0)
    neg_inf = float("-inf")

    # Row b * beam_size + k of src_rep belongs to hypothesis k of sample b.
    src_rep = src.repeat_interleave(beam_size, dim=0)
    seqs = torch.full((batch, beam_size, 1), bos, dtype=torch.long, device=device)
    # All hypotheses start identical, so only the first one is alive at step 0.
    scores = torch.full((batch, beam_size), neg_inf, device=device)
    scores[:, 0] = 0.0
    finished = torch.zeros(batch, beam_size, dtype=torch.bool, device=device)
    lengths = torch.ones(batch, beam_size, dtype=torch.long, device=device)

    for _ in range(max_len):
        logits = model(src_rep, seqs.reshape(batch * beam_size, -1))
        logp = torch.log_softmax(logits[:, -1], dim=-1).reshape(batch, beam_size, -1)
        vocab = logp.size(-1)

        # Frozen hypotheses: EOS costs nothing, everything else is impossible.
        eos_logp = logp[..., eos].masked_fill(finished, 0.0)
        logp = logp.masked_fill(finished.unsqueeze(-1), neg_inf)
        logp[..., eos] = eos_logp

        # Best beam_size continuations per sample over all (hypothesis, token) pairs.
        candidates = (scores.unsqueeze(-1) + logp).reshape(batch, -1)
        scores, flat = candidates.topk(beam_size, dim=-1)
        parent = torch.div(flat, vocab, rounding_mode="floor")
        token = flat % vocab

        history = seqs.gather(1, parent.unsqueeze(-1).expand(-1, -1, seqs.size(-1)))
        seqs = torch.cat([history, token.unsqueeze(-1)], dim=-1)
        was_finished = finished.gather(1, parent)
        lengths = lengths.gather(1, parent) + (~was_finished).long()
        finished = was_finished | (token == eos)

        if finished.all():
            break

    # Length-normalise only for the final choice, where lengths actually differ.
    normalised = scores / lengths.float().pow(length_penalty)
    best = normalised.argmax(dim=-1)
    return seqs[torch.arange(batch, device=device), best]

def _ids_to_text(ids):
    """Decode generated ids, discarding everything from the first EOS onwards."""
    eos = sp.eos_id()
    if eos in ids:
        ids = ids[:ids.index(eos)]
    return sp.decode(ids)

@torch.no_grad()
def eval_val(max_batches=20, beam_size=5, lp=0.8):
    """Beam-decode the first `max_batches` validation batches and score them.

    References are read from train["tgt"] via val_idx, which matches the
    loader order because val_loader is built without shuffling.

    Returns:
        ``(bleu, chrf, final)`` where `final` is the geometric mean of the
        sacrebleu BLEU and chrF++ scores; the same values are also printed.
    """
    preds, refs = [], []
    seen = 0
    for b, (src, _) in enumerate(val_loader):
        if b >= max_batches:
            break
        pred_ids = beam_decode_batch(src, beam_size=beam_size, length_penalty=lp)
        for row in pred_ids.tolist():
            # Tokens generated after EOS are not part of the hypothesis.
            preds.append(_ids_to_text(row))
            refs.append(train["tgt"].iloc[val_idx[seen]])
            seen += 1

    bleu = sacrebleu.corpus_bleu(preds, [refs]).score
    chrf = sacrebleu.corpus_chrf(preds, [refs], word_order=2).score
    final = (bleu * chrf) ** 0.5
    print(f"[BASELINE] beam={beam_size} lp={lp} | BLEU={bleu:.2f} chrF++={chrf:.2f} FINAL={final:.2f}")
    return bleu, chrf, final

eval_val(max_batches=20, beam_size=5, lp=0.8)


In [None]:
import pandas as pd
import re

# Load the lexicon CSV shipped with the competition data and report its
# shape and column names before deciding which columns to use.
lex = pd.read_csv("/kaggle/input/deep-past-initiative-machine-translation/OA_Lexicon_eBL.csv")
print("Lexicon shape:", lex.shape)
print("Columns:", list(lex.columns))

# Last expression of the cell: the notebook renders the first rows.
lex.head()


In [None]:
# Work with lower-cased column names from here on.
cols = [c.lower() for c in lex.columns]
lex.columns = cols

# Best-guess columns: the first column whose name contains one of the hint
# substrings, independently for the source and the target side.
src_col = next(
    (c for c in cols if any(k in c for k in ["translit", "form", "lemma", "akk"])),
    None,
)
tgt_col = next(
    (c for c in cols if any(k in c for k in ["normal", "english", "name", "norm"])),
    None,
)

print("Using columns:", src_col, "->", tgt_col)

# Map stripped source form -> stripped target form, keeping only pairs where
# both sides have at least two characters; later rows win on duplicate keys.
lex_map = {}
if src_col and tgt_col:
    rows = lex[[src_col, tgt_col]].dropna().values
    stripped = ((str(s).strip(), str(t).strip()) for s, t in rows)
    lex_map = {s: t for s, t in stripped if len(s) >= 2 and len(t) >= 2}

print("Lexicon map size:", len(lex_map))


In [None]:
# take top-K keys (speed)
# Cap the alternation at the LEX_K longest keys to keep the regex fast.
LEX_K = 3000
lex_keys = sorted(lex_map, key=len, reverse=True)[:LEX_K]

# Longest keys come first so the alternation prefers the longest match.
alternation = "|".join(re.escape(key) for key in lex_keys)
lex_re = re.compile(r"\b(" + alternation + r")\b")

def lexicon_postprocess(text: str) -> str:
    """Replace every whole-word lexicon key found in `text` with its mapped value."""
    def _swap(match):
        hit = match.group(1)
        return lex_map.get(hit, hit)

    return lex_re.sub(_swap, text)


In [None]:
import torch

@torch.no_grad()
def sample_decode_batch(src, max_len=160, temperature=1.0, topk=40):
    """Draw one top-k sample per source row.

    Args:
        src: (batch, src_len) tensor of source ids.
        max_len: maximum number of generated tokens after BOS.
        temperature: divisor applied to the logits before the softmax.
        topk: number of highest-probability tokens sampled from at each step.

    Returns:
        (batch, length) tensor of ids starting with BOS. A row that has
        produced EOS is only padded with further EOS; previously such rows
        kept sampling until every row emitted EOS in the same step.
    """
    model.eval()
    src = src.to(device)
    eos = sp.eos_id()
    ys = torch.full((src.size(0), 1), sp.bos_id(), device=device)
    finished = torch.zeros(src.size(0), dtype=torch.bool, device=device)

    for _ in range(max_len):
        logits = model(src, ys)[:, -1, :] / temperature
        probs = torch.softmax(logits, dim=-1)

        # Sample inside the top-k set, then map back to vocabulary ids.
        topv, topi = torch.topk(probs, k=min(topk, probs.size(-1)), dim=-1)
        next_id = topi.gather(1, torch.multinomial(topv, 1))
        # Rows that already ended may only repeat EOS.
        next_id = next_id.masked_fill(finished.unsqueeze(1), eos)
        ys = torch.cat([ys, next_id], dim=1)

        finished |= next_id.squeeze(1) == eos
        if finished.all():
            break
    return ys

def rerank_score(s: str) -> float:
    """Heuristic candidate score: stripped length minus formatting penalties.

    Candidates shorter than 8 characters after stripping get -1e9. Otherwise
    each double space costs 10, and each space-before-punctuation (" ,",
    " .", " :") or doubled punctuation ("..", ",,") costs 5.
    """
    stripped = s.strip()
    if len(stripped) < 8:
        return -1e9

    weighted_patterns = (
        (10, ("  ",)),              # ugly spacing
        (5, (" ,", " .", " :")),    # space before punctuation
        (5, ("..", ",,")),          # doubled punctuation
    )
    penalty = sum(
        weight * stripped.count(pattern)
        for weight, patterns in weighted_patterns
        for pattern in patterns
    )
    return len(stripped) - penalty

@torch.no_grad()
def nbest_decode_rerank(src, beam_size=5, lp=0.8, n_samples=4):
    """Return one text per source row, chosen among a beam result and sampled alternatives.

    Candidate 0 is the beam-search output; candidates 1..n_samples come from
    top-k sampling. Each candidate is lexicon-corrected and the one with the
    highest rerank_score wins (ties go to the earliest candidate).

    Generated ids are cut at the first EOS before decoding. Without that,
    whatever a decoder produced after EOS ended up in the text and was even
    rewarded by the length-based rerank score.
    """
    eos = sp.eos_id()

    def _to_texts(id_rows):
        texts = []
        for row in id_rows.tolist():
            if eos in row:
                row = row[:row.index(eos)]
            texts.append(sp.decode(row))
        return texts

    # candidate 0: beam
    cand_lists = [
        _to_texts(beam_decode_batch(src, beam_size=beam_size, max_len=160, length_penalty=lp))
    ]

    # candidates 1..n: sampling
    for _ in range(n_samples):
        cand_lists.append(
            _to_texts(sample_decode_batch(src, max_len=160, temperature=1.0, topk=40))
        )

    final = []
    for options in zip(*cand_lists):
        corrected = [lexicon_postprocess(option) for option in options]
        # max() returns the first best element, like a stable descending sort.
        final.append(max(corrected, key=rerank_score))
    return final


In [None]:
!pip -q install sacrebleu


In [None]:
import sacrebleu
print("sacrebleu version:", sacrebleu.__version__)


In [None]:
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
from nltk.translate.chrf_score import corpus_chrf


In [None]:
def compute_metrics(preds, refs):
    """Score predictions against references.

    Args:
        preds: list of hypothesis strings.
        refs: list of reference strings, aligned with ``preds``.

    Returns:
        ``(bleu, chrf, final)``, where ``bleu`` and ``chrf`` are scaled by 100
        and ``final`` is their geometric mean.
    """
    # BLEU on whitespace tokens, one reference per hypothesis.
    hyp_tokens = [hyp.split() for hyp in preds]
    ref_tokens = [[ref.split()] for ref in refs]
    smoother = SmoothingFunction().method4
    bleu = 100 * corpus_bleu(ref_tokens, hyp_tokens, smoothing_function=smoother)

    # NOTE(review): labelled chrF++ by the callers; confirm that NLTK's
    # corpus_chrf with beta=2 matches the metric you intend to track.
    chrf = 100 * corpus_chrf(refs, preds, beta=2)

    final = (bleu * chrf) ** 0.5
    return bleu, chrf, final


In [None]:
@torch.no_grad()
def eval_val_boosted_fallback(max_batches=20, beam_size=5, lp=0.8, n_samples=4):
    """Evaluate ``nbest_decode_rerank`` on the first ``max_batches`` of ``val_loader``.

    References are read from ``train["tgt"]`` through ``val_idx`` in loader
    order, so ``val_loader`` must iterate ``val_idx`` unshuffled.

    Returns:
        ``(bleu, chrf, final)`` as produced by ``compute_metrics``.
    """
    preds, refs = [], []
    seen = 0  # validation examples consumed so far

    for batch_no, (src, _tgt) in enumerate(val_loader):
        if batch_no >= max_batches:
            break

        texts = nbest_decode_rerank(src, beam_size=beam_size, lp=lp, n_samples=n_samples)
        preds.extend(texts)

        stop = seen + len(texts)
        refs.extend(train["tgt"].iloc[val_idx[pos]] for pos in range(seen, stop))
        seen = stop

    bleu, chrf, final = compute_metrics(preds, refs)
    print(f"[BOOSTED-FALLBACK] beam={beam_size} lp={lp} n_samples={n_samples} | BLEU={bleu:.2f} chrF++={chrf:.2f} FINAL={final:.2f}")
    return bleu, chrf, final


In [None]:
import pandas as pd
import re

# Reload train data
train = pd.read_csv("/kaggle/input/deep-past-initiative-machine-translation/train.csv")

# Recreate cleaned columns (must match bootstrap)
# Unicode subscript digits -> ASCII digits.
sub_map = str.maketrans("₀₁₂₃₄₅₆₇₈₉", "0123456789")


def clean_transliteration(t):
    """Normalize one transliteration value.

    Missing values (``None`` or a float NaN) become ``""`` instead of the
    literal text ``"None"`` / ``"nan"``. Otherwise subscript digits are mapped
    to ASCII, every ``(d)`` is rewritten to ``D_``, and whitespace runs are
    collapsed to single spaces with the ends stripped.
    """
    # NaN is the only float that is not equal to itself.
    if t is None or (isinstance(t, float) and t != t):
        return ""
    t = str(t).translate(sub_map)
    t = re.sub(r"\(d\)", "D_", t)
    return re.sub(r"\s+", " ", t).strip()

def clean_translation(t):
    """Normalize one translation value.

    Missing values (``None`` or a float NaN) become ``""`` instead of the
    literal text ``"None"`` / ``"nan"``. Otherwise curly double quotes and the
    curly apostrophe are replaced by their ASCII forms, and whitespace runs
    are collapsed to single spaces with the ends stripped.
    """
    # NaN is the only float that is not equal to itself.
    if t is None or (isinstance(t, float) and t != t):
        return ""
    t = str(t).replace("“", '"').replace("”", '"').replace("’", "'")
    return re.sub(r"\s+", " ", t).strip()

# Cleaned model input ("src") and target ("tgt") columns, derived element-wise.
train["src"] = train["transliteration"].map(clean_transliteration)
train["tgt"] = train["translation"].map(clean_translation)

# Quick sanity output: frame shape and the first cleaned pair.
print(" train reloaded:", train.shape)
print(train[["src", "tgt"]].head(1))


In [None]:
import os, re, pandas as pd, sentencepiece as spm

# Reload data (safe)
train = pd.read_csv("/kaggle/input/deep-past-initiative-machine-translation/train.csv")

# Unicode subscript digits -> ASCII digits.
sub_map = str.maketrans("₀₁₂₃₄₅₆₇₈₉", "0123456789")


def clean_transliteration(t):
    """Normalize one transliteration value.

    Missing values (``None`` or a float NaN) become ``""`` instead of the
    literal text ``"None"`` / ``"nan"``. Otherwise subscript digits are mapped
    to ASCII, every ``(d)`` is rewritten to ``D_``, and whitespace runs are
    collapsed to single spaces with the ends stripped.
    """
    # NaN is the only float that is not equal to itself.
    if t is None or (isinstance(t, float) and t != t):
        return ""
    t = str(t).translate(sub_map)
    t = re.sub(r"\(d\)", "D_", t)
    return re.sub(r"\s+", " ", t).strip()

def clean_translation(t):
    """Normalize one translation value.

    Missing values (``None`` or a float NaN) become ``""`` instead of the
    literal text ``"None"`` / ``"nan"``. Otherwise curly double quotes and the
    curly apostrophe are replaced by their ASCII forms, and whitespace runs
    are collapsed to single spaces with the ends stripped.
    """
    # NaN is the only float that is not equal to itself.
    if t is None or (isinstance(t, float) and t != t):
        return ""
    t = str(t).replace("“", '"').replace("”", '"').replace("’", "'")
    return re.sub(r"\s+", " ", t).strip()

train["src"] = train["transliteration"].apply(clean_transliteration)
train["tgt"] = train["translation"].apply(clean_translation)

# Build SPM files
os.makedirs("/kaggle/working/spm", exist_ok=True)

src_path   = "/kaggle/working/spm/src.txt"
tgt_path   = "/kaggle/working/spm/tgt.txt"
joint_path = "/kaggle/working/spm/joint.txt"

# The joint corpus is assembled in memory. Re-reading the two files would
# depend on the platform default encoding and leave file handles unclosed.
src_text = "\n".join(train["src"].tolist())
tgt_text = "\n".join(train["tgt"].tolist())

for path, text in (
    (src_path, src_text),
    (tgt_path, tgt_text),
    (joint_path, src_text + "\n" + tgt_text),
):
    with open(path, "w", encoding="utf-8") as f:
        f.write(text)

# Train SentencePiece: one shared unigram model over source and target text.
spm.SentencePieceTrainer.train(
    input=joint_path,
    model_prefix="/kaggle/working/spm/oa_joint_unigram",
    vocab_size=5000,
    model_type="unigram",
    character_coverage=1.0,
    user_defined_symbols=["<gap>", "<big_gap>", "D_", "<morph>"],
    hard_vocab_limit=False
)

# Load tokenizer
sp = spm.SentencePieceProcessor()
sp.load("/kaggle/working/spm/oa_joint_unigram.model")

print(" sp loaded | vocab:", sp.get_piece_size())


In [None]:
def encode_with_sp(text):
    """Tokenize ``str(text)`` with ``sp`` and wrap the ids in BOS ... EOS."""
    return [sp.bos_id(), *sp.encode(str(text), out_type=int), sp.eos_id()]


print("Encoder ready ")


In [None]:
# Tokenize both cleaned columns; encode_with_sp adds BOS/EOS around each row.
src_ids = list(map(encode_with_sp, train["src"]))
tgt_ids = list(map(encode_with_sp, train["tgt"]))

print("Encoded:", len(src_ids), len(tgt_ids))


In [None]:
import os, re, pandas as pd, sentencepiece as spm, torch

# 1) Load data
train = pd.read_csv("/kaggle/input/deep-past-initiative-machine-translation/train.csv")

# Unicode subscript digits -> ASCII digits.
sub_map = str.maketrans("₀₁₂₃₄₅₆₇₈₉", "0123456789")


def clean_transliteration(t):
    """Normalize one transliteration value.

    Missing values (``None`` or a float NaN) become ``""`` instead of the
    literal text ``"None"`` / ``"nan"``. Otherwise subscript digits are mapped
    to ASCII, every ``(d)`` is rewritten to ``D_``, and whitespace runs are
    collapsed to single spaces with the ends stripped.
    """
    # NaN is the only float that is not equal to itself.
    if t is None or (isinstance(t, float) and t != t):
        return ""
    t = str(t).translate(sub_map)
    t = re.sub(r"\(d\)", "D_", t)
    return re.sub(r"\s+", " ", t).strip()

def clean_translation(t):
    """Normalize one translation value.

    Missing values (``None`` or a float NaN) become ``""`` instead of the
    literal text ``"None"`` / ``"nan"``. Otherwise curly double quotes and the
    curly apostrophe are replaced by their ASCII forms, and whitespace runs
    are collapsed to single spaces with the ends stripped.
    """
    # NaN is the only float that is not equal to itself.
    if t is None or (isinstance(t, float) and t != t):
        return ""
    t = str(t).replace("“", '"').replace("”", '"').replace("’", "'")
    return re.sub(r"\s+", " ", t).strip()

train["src"] = train["transliteration"].apply(clean_transliteration)
train["tgt"] = train["translation"].apply(clean_translation)

# 2) Build SentencePiece input
os.makedirs("/kaggle/working/spm", exist_ok=True)

src_path   = "/kaggle/working/spm/src.txt"
tgt_path   = "/kaggle/working/spm/tgt.txt"
joint_path = "/kaggle/working/spm/joint.txt"

# The joint corpus is assembled in memory. Re-reading the two files would
# depend on the platform default encoding and leave file handles unclosed.
src_text = "\n".join(train["src"].tolist())
tgt_text = "\n".join(train["tgt"].tolist())

for path, text in (
    (src_path, src_text),
    (tgt_path, tgt_text),
    (joint_path, src_text + "\n" + tgt_text),
):
    with open(path, "w", encoding="utf-8") as f:
        f.write(text)

# 3) Train + load SentencePiece
spm.SentencePieceTrainer.train(
    input=joint_path,
    model_prefix="/kaggle/working/spm/oa_joint_unigram",
    vocab_size=5000,
    model_type="unigram",
    character_coverage=1.0,
    user_defined_symbols=["<gap>", "<big_gap>", "D_", "<morph>"],
    hard_vocab_limit=False
)

sp = spm.SentencePieceProcessor()
sp.load("/kaggle/working/spm/oa_joint_unigram.model")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# NOTE(review): padding reuses the EOS id, so any loss that ignores PAD_ID by
# target value also ignores genuine EOS targets.
PAD_ID = sp.eos_id()

print(" CELL A DONE | sp loaded | PAD_ID:", PAD_ID)


In [None]:
def encode_with_sp(text):
    """Tokenize ``str(text)`` with ``sp`` and wrap the ids in BOS ... EOS."""
    return [sp.bos_id(), *sp.encode(str(text), out_type=int), sp.eos_id()]


print(" encoder ready")


In [None]:
# Tokenize both cleaned columns; encode_with_sp adds BOS/EOS around each row.
src_ids = list(map(encode_with_sp, train["src"]))
tgt_ids = list(map(encode_with_sp, train["tgt"]))

print("Encoded:", len(src_ids), len(tgt_ids))


In [None]:
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

idx = np.arange(len(train))
train_idx, val_idx = train_test_split(idx, test_size=0.1, random_state=42)

class MTDataset(Dataset):
    """Paired source/target id sequences restricted to the rows in ``idxs``."""

    def __init__(self, src, tgt, idxs):
        # Materialize the selected rows once, in the order given by idxs.
        self.src = [src[pos] for pos in idxs]
        self.tgt = [tgt[pos] for pos in idxs]

    def __len__(self):
        return len(self.src)

    def __getitem__(self, i):
        pair = (self.src[i], self.tgt[i])
        return pair

def collate_fn(batch, max_src=512, max_tgt=256):
    """Turn a list of ``(src_ids, tgt_ids)`` pairs into two padded long tensors.

    Sequences are clipped to ``max_src`` / ``max_tgt`` tokens and padded on
    the right with ``PAD_ID`` up to the longest clipped sequence in the batch.
    """

    def _pad(seqs, limit):
        clipped = [seq[:limit] for seq in seqs]
        width = max(len(seq) for seq in clipped)
        padded = torch.full((len(clipped), width), PAD_ID, dtype=torch.long)
        for row, seq in enumerate(clipped):
            padded[row, :len(seq)] = torch.tensor(seq)
        return padded

    src, tgt = zip(*batch)
    return _pad(src, max_src), _pad(tgt, max_tgt)


# Only the training loader shuffles. Validation order must stay aligned with val_idx.
_loader_kwargs = dict(batch_size=32, collate_fn=collate_fn)
train_loader = DataLoader(MTDataset(src_ids, tgt_ids, train_idx), shuffle=True, **_loader_kwargs)
val_loader = DataLoader(MTDataset(src_ids, tgt_ids, val_idx), shuffle=False, **_loader_kwargs)

print(" loaders ready | train batches:", len(train_loader), "val batches:", len(val_loader))


In [None]:
import torch.nn as nn
import math

VOCAB = sp.get_piece_size()

class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position signals to ``x``, then apply dropout.

    The table is stored as the buffer ``pe`` with shape
    ``(1, max_len, d_model)`` and is sliced along dim 1 to ``x.size(1)``.
    """

    def __init__(self, d_model, max_len=4096, dropout=0.1):
        super().__init__()
        self.dropout = nn.Dropout(dropout)

        positions = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        freqs = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = positions * freqs

        # Even feature columns carry the sine, odd columns the cosine.
        table = torch.zeros(max_len, d_model)
        table[:, 0::2] = torch.sin(angles)
        table[:, 1::2] = torch.cos(angles)
        self.register_buffer("pe", table.unsqueeze(0))

    def forward(self, x):
        seq_len = x.size(1)
        return self.dropout(x + self.pe[:, :seq_len])

class TinyTransformerMT(nn.Module):
    """Small encoder-decoder Transformer with one embedding shared by both sides.

    ``forward`` returns logits with one ``vocab_size`` vector per target
    position. Only a causal target mask is passed to ``nn.Transformer``;
    no padding masks are supplied.
    """

    def __init__(self, vocab_size, d_model=256, nhead=4, layers=3, dim_ff=1024, dropout=0.1):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, d_model)
        self.pos = PositionalEncoding(d_model, dropout=dropout)
        transformer_cfg = dict(
            d_model=d_model,
            nhead=nhead,
            num_encoder_layers=layers,
            num_decoder_layers=layers,
            dim_feedforward=dim_ff,
            dropout=dropout,
            batch_first=True,
        )
        self.tf = nn.Transformer(**transformer_cfg)
        self.out = nn.Linear(d_model, vocab_size)

    def forward(self, src_ids, tgt_in_ids):
        encoder_in = self.pos(self.emb(src_ids))
        decoder_in = self.pos(self.emb(tgt_in_ids))

        steps = tgt_in_ids.size(1)
        # True strictly above the diagonal: a position may not attend to later ones.
        causal = torch.ones(steps, steps, device=encoder_in.device).triu(diagonal=1).bool()

        hidden = self.tf(encoder_in, decoder_in, tgt_mask=causal)
        return self.out(hidden)


model = TinyTransformerMT(VOCAB).to(device)
print(" model ready on", device)


In [None]:
import torch.optim as optim
import torch.nn.functional as F

optimizer = optim.AdamW(model.parameters(), lr=3e-4)


def loss_label_smoothing(logits, target, ignore_index=PAD_ID, eps=0.1):
    """Label-smoothed cross entropy averaged over the non-ignored targets.

    Args:
        logits: ``(B, T, V)`` scores.
        target: ``(B, T)`` gold ids.
        ignore_index: target value excluded from the loss.
        eps: weight of the uniform-smoothing term.

    NOTE(review): ``PAD_ID`` is set to the EOS id elsewhere in this file, so
    with the default ``ignore_index`` EOS targets are excluded as well.
    """
    vocab = logits.size(-1)
    flat_target = target.reshape(-1)
    keep = flat_target != ignore_index

    log_probs = F.log_softmax(logits.reshape(-1, vocab)[keep], dim=-1)
    gold = flat_target[keep].unsqueeze(1)

    nll = -log_probs.gather(1, gold).squeeze(1)
    smooth = -log_probs.mean(dim=-1)  # cross entropy against a uniform target
    return ((1 - eps) * nll + eps * smooth).mean()

def run_epoch(loader, train_mode=True, eps=0.1):
    """Run one pass over ``loader`` and return the mean per-batch loss.

    Uses the module-level ``model``, ``optimizer``, ``sp``, ``device`` and
    ``PAD_ID``.

    Args:
        loader: iterable of ``(src, tgt)`` padded id tensors.
        train_mode: when true the weights are updated; otherwise the model is
            put in eval mode and no gradients are tracked.
        eps: label-smoothing weight passed to ``loss_label_smoothing``.
    """
    model.train(train_mode)
    eos_id = sp.eos_id()
    total = 0.0

    # Gradients are only needed when the weights are being updated.
    with torch.set_grad_enabled(train_mode):
        for src, tgt in loader:
            src, tgt = src.to(device), tgt.to(device)
            tgt_in, tgt_out = tgt[:, :-1], tgt[:, 1:]

            logits = model(src, tgt_in)

            # Padding reuses the EOS id in this notebook, so masking on the
            # target value alone would also drop the real end-of-sequence
            # target and the model would never learn to stop. Keep every
            # non-pad target plus the first EOS of each row (the one whose
            # input token is not itself EOS).
            keep = (tgt_out != PAD_ID) | ((tgt_out == eos_id) & (tgt_in != eos_id))
            # The selection is already done, so disable the value-based mask.
            loss = loss_label_smoothing(logits[keep], tgt_out[keep], ignore_index=-1, eps=eps)

            if train_mode:
                optimizer.zero_grad()
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()

            total += loss.item()
    return total / len(loader)

for ep in range(3):
    tr = run_epoch(train_loader, True, eps=0.1)
    va = run_epoch(val_loader, False, eps=0.1)
    print(f"Epoch {ep+1}: train_loss={tr:.4f} val_loss={va:.4f}")

print(" training done")


In [None]:
import pandas as pd
import numpy as np
import re

# Reload train data
train = pd.read_csv("/kaggle/input/deep-past-initiative-machine-translation/train.csv")

# Recreate cleaned columns (must match earlier)
# Unicode subscript digits -> ASCII digits.
sub_map = str.maketrans("₀₁₂₃₄₅₆₇₈₉", "0123456789")


def clean_transliteration(t):
    """Normalize one transliteration value.

    Missing values (``None`` or a float NaN) become ``""`` instead of the
    literal text ``"None"`` / ``"nan"``. Otherwise subscript digits are mapped
    to ASCII, every ``(d)`` is rewritten to ``D_``, and whitespace runs are
    collapsed to single spaces with the ends stripped.
    """
    # NaN is the only float that is not equal to itself.
    if t is None or (isinstance(t, float) and t != t):
        return ""
    t = str(t).translate(sub_map)
    t = re.sub(r"\(d\)", "D_", t)
    return re.sub(r"\s+", " ", t).strip()

def clean_translation(t):
    """Normalize one translation value.

    Missing values (``None`` or a float NaN) become ``""`` instead of the
    literal text ``"None"`` / ``"nan"``. Otherwise curly double quotes and the
    curly apostrophe are replaced by their ASCII forms, and whitespace runs
    are collapsed to single spaces with the ends stripped.
    """
    # NaN is the only float that is not equal to itself.
    if t is None or (isinstance(t, float) and t != t):
        return ""
    t = str(t).replace("“", '"').replace("”", '"').replace("’", "'")
    return re.sub(r"\s+", " ", t).strip()

# Cleaned model input ("src") and target ("tgt") columns, derived element-wise.
train["src"] = train["transliteration"].map(clean_transliteration)
train["tgt"] = train["translation"].map(clean_translation)

# Quick sanity output: frame shape and the first cleaned pair.
print(" train reloaded:", train.shape)
print(train[["src", "tgt"]].head(1))


In [None]:
def encode_with_sp(text):
    """Tokenize ``str(text)`` with ``sp`` and wrap the ids in BOS ... EOS."""
    return [sp.bos_id(), *sp.encode(str(text), out_type=int), sp.eos_id()]


print(" encode_with_sp defined")


In [None]:


import os, re, pandas as pd, numpy as np, torch, sentencepiece as spm

# ---------- 1. Load data ----------
train = pd.read_csv("/kaggle/input/deep-past-initiative-machine-translation/train.csv")

# Unicode subscript digits -> ASCII digits.
sub_map = str.maketrans("₀₁₂₃₄₅₆₇₈₉", "0123456789")


def clean_transliteration(t):
    """Normalize one transliteration value.

    Missing values (``None`` or a float NaN) become ``""`` instead of the
    literal text ``"None"`` / ``"nan"``. Otherwise subscript digits are mapped
    to ASCII, every ``(d)`` is rewritten to ``D_``, and whitespace runs are
    collapsed to single spaces with the ends stripped.
    """
    # NaN is the only float that is not equal to itself.
    if t is None or (isinstance(t, float) and t != t):
        return ""
    t = str(t).translate(sub_map)
    t = re.sub(r"\(d\)", "D_", t)
    return re.sub(r"\s+", " ", t).strip()

def clean_translation(t):
    """Normalize one translation value.

    Missing values (``None`` or a float NaN) become ``""`` instead of the
    literal text ``"None"`` / ``"nan"``. Otherwise curly double quotes and the
    curly apostrophe are replaced by their ASCII forms, and whitespace runs
    are collapsed to single spaces with the ends stripped.
    """
    # NaN is the only float that is not equal to itself.
    if t is None or (isinstance(t, float) and t != t):
        return ""
    t = str(t).replace("“", '"').replace("”", '"').replace("’", "'")
    return re.sub(r"\s+", " ", t).strip()

train["src"] = train["transliteration"].apply(clean_transliteration)
train["tgt"] = train["translation"].apply(clean_translation)

# ---------- 2. Train SentencePiece ----------
os.makedirs("/kaggle/working/spm", exist_ok=True)

src_path   = "/kaggle/working/spm/src.txt"
tgt_path   = "/kaggle/working/spm/tgt.txt"
joint_path = "/kaggle/working/spm/joint.txt"

# The joint corpus is assembled in memory. Re-reading the two files would
# depend on the platform default encoding and leave file handles unclosed.
src_text = "\n".join(train["src"])
tgt_text = "\n".join(train["tgt"])

for path, text in (
    (src_path, src_text),
    (tgt_path, tgt_text),
    (joint_path, src_text + "\n" + tgt_text),
):
    with open(path, "w", encoding="utf-8") as f:
        f.write(text)

spm.SentencePieceTrainer.train(
    input=joint_path,
    model_prefix="/kaggle/working/spm/oa_joint_unigram",
    vocab_size=5000,
    model_type="unigram",
    character_coverage=1.0,
    user_defined_symbols=["<gap>", "<big_gap>", "D_", "<morph>"],
    hard_vocab_limit=False
)

sp = spm.SentencePieceProcessor()
sp.load("/kaggle/working/spm/oa_joint_unigram.model")

# ---------- 3. Device + PAD ----------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# NOTE(review): padding reuses the EOS id, so any loss that ignores PAD_ID by
# target value also ignores genuine EOS targets.
PAD_ID = sp.eos_id()

# ---------- 4. Encoder ----------
def encode_with_sp(text):
    """Tokenize ``str(text)`` with ``sp`` and wrap the ids in BOS ... EOS."""
    ids = sp.encode(str(text), out_type=int)
    return [sp.bos_id()] + ids + [sp.eos_id()]

# ---------- 5. Encode dataset ----------
src_ids = [encode_with_sp(s) for s in train["src"]]
tgt_ids = [encode_with_sp(s) for s in train["tgt"]]

print(" BOOTSTRAP COMPLETE")
print("Train size:", train.shape)
print("Vocab size:", sp.get_piece_size())
print("Encoded samples:", len(src_ids), len(tgt_ids))


In [None]:
import pandas as pd
import re

# reload train data
train = pd.read_csv("/kaggle/input/deep-past-initiative-machine-translation/train.csv")

# Unicode subscript digits -> ASCII digits.
sub_map = str.maketrans("₀₁₂₃₄₅₆₇₈₉", "0123456789")


def clean_transliteration(t):
    """Normalize one transliteration value.

    Missing values (``None`` or a float NaN) become ``""`` instead of the
    literal text ``"None"`` / ``"nan"``. Otherwise subscript digits are mapped
    to ASCII, every ``(d)`` is rewritten to ``D_``, and whitespace runs are
    collapsed to single spaces with the ends stripped.
    """
    # NaN is the only float that is not equal to itself.
    if t is None or (isinstance(t, float) and t != t):
        return ""
    t = str(t).translate(sub_map)
    t = re.sub(r"\(d\)", "D_", t)
    return re.sub(r"\s+", " ", t).strip()

def clean_translation(t):
    """Normalize one translation value.

    Missing values (``None`` or a float NaN) become ``""`` instead of the
    literal text ``"None"`` / ``"nan"``. Otherwise curly double quotes and the
    curly apostrophe are replaced by their ASCII forms, and whitespace runs
    are collapsed to single spaces with the ends stripped.
    """
    # NaN is the only float that is not equal to itself.
    if t is None or (isinstance(t, float) and t != t):
        return ""
    t = str(t).replace("“", '"').replace("”", '"').replace("’", "'")
    return re.sub(r"\s+", " ", t).strip()

# Cleaned model input ("src") and target ("tgt") columns, derived element-wise.
train["src"] = train["transliteration"].map(clean_transliteration)
train["tgt"] = train["translation"].map(clean_translation)

print(" train loaded:", train.shape)


In [None]:
idx = np.arange(len(train))
train_idx, val_idx = train_test_split(idx, test_size=0.1, random_state=42)


In [None]:
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split


In [None]:
def encode_with_sp(text):
    """Tokenize ``str(text)`` with ``sp`` and wrap the ids in BOS ... EOS."""
    return [sp.bos_id(), *sp.encode(str(text), out_type=int), sp.eos_id()]


print(" encode_with_sp defined")


In [None]:


import os, re, pandas as pd, numpy as np, torch, sentencepiece as spm

# 1) Load data
train = pd.read_csv("/kaggle/input/deep-past-initiative-machine-translation/train.csv")

# Unicode subscript digits -> ASCII digits.
sub_map = str.maketrans("₀₁₂₃₄₅₆₇₈₉", "0123456789")


def clean_transliteration(t):
    """Normalize one transliteration value.

    Missing values (``None`` or a float NaN) become ``""`` instead of the
    literal text ``"None"`` / ``"nan"``. Otherwise subscript digits are mapped
    to ASCII, every ``(d)`` is rewritten to ``D_``, and whitespace runs are
    collapsed to single spaces with the ends stripped.
    """
    # NaN is the only float that is not equal to itself.
    if t is None or (isinstance(t, float) and t != t):
        return ""
    t = str(t).translate(sub_map)
    t = re.sub(r"\(d\)", "D_", t)
    return re.sub(r"\s+", " ", t).strip()

def clean_translation(t):
    """Normalize one translation value.

    Missing values (``None`` or a float NaN) become ``""`` instead of the
    literal text ``"None"`` / ``"nan"``. Otherwise curly double quotes and the
    curly apostrophe are replaced by their ASCII forms, and whitespace runs
    are collapsed to single spaces with the ends stripped.
    """
    # NaN is the only float that is not equal to itself.
    if t is None or (isinstance(t, float) and t != t):
        return ""
    t = str(t).replace("“", '"').replace("”", '"').replace("’", "'")
    return re.sub(r"\s+", " ", t).strip()

train["src"] = train["transliteration"].apply(clean_transliteration)
train["tgt"] = train["translation"].apply(clean_translation)

# 2) SentencePiece
os.makedirs("/kaggle/working/spm", exist_ok=True)

# The corpora contain non-ASCII characters, so the files are written as UTF-8
# explicitly. The joint corpus is assembled in memory, which avoids re-reading
# the files through handles that are never closed.
src_text = "\n".join(train["src"])
tgt_text = "\n".join(train["tgt"])

for path, text in (
    ("/kaggle/working/spm/src.txt", src_text),
    ("/kaggle/working/spm/tgt.txt", tgt_text),
    ("/kaggle/working/spm/joint.txt", src_text + "\n" + tgt_text),
):
    with open(path, "w", encoding="utf-8") as f:
        f.write(text)

spm.SentencePieceTrainer.train(
    input="/kaggle/working/spm/joint.txt",
    model_prefix="/kaggle/working/spm/oa",
    vocab_size=5000,
    model_type="unigram",
    character_coverage=1.0,
    hard_vocab_limit=False
)

sp = spm.SentencePieceProcessor()
sp.load("/kaggle/working/spm/oa.model")

# 3) Encoder
def encode_with_sp(text):
    """Tokenize ``str(text)`` with ``sp`` and wrap the ids in BOS ... EOS."""
    ids = sp.encode(str(text), out_type=int)
    return [sp.bos_id()] + ids + [sp.eos_id()]

# 4) Encode data
src_ids = [encode_with_sp(s) for s in train["src"]]
tgt_ids = [encode_with_sp(s) for s in train["tgt"]]

# NOTE(review): padding reuses the EOS id, so any loss that ignores PAD_ID by
# target value also ignores genuine EOS targets.
PAD_ID = sp.eos_id()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print(" EVERYTHING READY")
print("train:", train.shape)
print("vocab:", sp.get_piece_size())
print("encoded:", len(src_ids), len(tgt_ids))


In [None]:
import pandas as pd, numpy as np, torch, re
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
from nltk.translate.chrf_score import corpus_chrf

# -----------------------
# 1) Build val_loader (if missing)
# -----------------------
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

idx = np.arange(len(train))
train_idx, val_idx = train_test_split(idx, test_size=0.1, random_state=42)

class MTDataset(Dataset):
    """Paired source/target id sequences restricted to the rows in ``idxs``."""

    def __init__(self, src, tgt, idxs):
        # Materialize the selected rows once, in the order given by idxs.
        self.src = [src[pos] for pos in idxs]
        self.tgt = [tgt[pos] for pos in idxs]

    def __len__(self):
        return len(self.src)

    def __getitem__(self, i):
        pair = (self.src[i], self.tgt[i])
        return pair

def collate_fn(batch, max_src=512, max_tgt=256):
    """Turn a list of ``(src_ids, tgt_ids)`` pairs into two padded long tensors.

    Sequences are clipped to ``max_src`` / ``max_tgt`` tokens and padded on
    the right with ``PAD_ID`` up to the longest clipped sequence in the batch.
    """

    def _pad(seqs, limit):
        clipped = [seq[:limit] for seq in seqs]
        width = max(len(seq) for seq in clipped)
        padded = torch.full((len(clipped), width), PAD_ID, dtype=torch.long)
        for row, seq in enumerate(clipped):
            padded[row, :len(seq)] = torch.tensor(seq)
        return padded

    src, tgt = zip(*batch)
    return _pad(src, max_src), _pad(tgt, max_tgt)


# Unshuffled so that batch order stays aligned with val_idx.
val_loader = DataLoader(
    MTDataset(src_ids, tgt_ids, val_idx),
    batch_size=32,
    shuffle=False,
    collate_fn=collate_fn,
)
print(" val_loader batches:", len(val_loader))

# -----------------------
# 2) Safety check: model must exist
# -----------------------
# Stop early with a readable message if no model has been built in this session.
if "model" not in globals():
    raise NameError(" model not defined. Please run your MODEL + TRAINING cell first, then run this cell again.")

# -----------------------
# 3) Lexicon postprocess (optional but helpful)
# -----------------------
lex = pd.read_csv("/kaggle/input/deep-past-initiative-machine-translation/OA_Lexicon_eBL.csv")
lex.columns = [name.lower() for name in lex.columns]


def _first_column_with(hints):
    """Return the first lexicon column whose name contains any hint, else None."""
    for column in lex.columns:
        if any(hint in column for hint in hints):
            return column
    return None


src_col = _first_column_with(["translit", "lemma", "form", "akk"])
tgt_col = _first_column_with(["normal", "english", "name", "norm"])

# Stripped source form -> stripped target form; entries under two characters
# are skipped and a later duplicate key overwrites the earlier value.
lex_map = {}
if src_col and tgt_col:
    _stripped = (
        (str(s).strip(), str(t).strip())
        for s, t in lex[[src_col, tgt_col]].dropna().values
    )
    lex_map = {s: t for s, t in _stripped if len(s) >= 2 and len(t) >= 2}

# Only the LEX_K longest keys go into the regex; no keys means no regex.
LEX_K = 2000
lex_keys = sorted(lex_map, key=len, reverse=True)[:LEX_K]
if lex_keys:
    lex_re = re.compile(r"\b(" + "|".join(re.escape(key) for key in lex_keys) + r")\b")
else:
    lex_re = None


def lexicon_postprocess(text: str) -> str:
    """Replace every lexicon key found in ``text`` with its ``lex_map`` value."""
    if lex_re is None:
        return text

    def _swap(match):
        hit = match.group(1)
        return lex_map.get(hit, hit)

    return lex_re.sub(_swap, text)

# -----------------------
# 4) Beam decode
# -----------------------
@torch.no_grad()
def beam_decode_batch(src, beam_size=5, max_len=160, length_penalty=0.8):
    """Beam search that keeps ``beam_size`` hypotheses for each source row.

    Every sentence has its own beam: candidates are ranked per sentence, not
    by a score averaged over the batch. A hypothesis that has emitted EOS is
    frozen, so it can only be extended with further EOS tokens at no cost.
    Uses the module-level ``model``, ``sp`` and ``device``.

    Args:
        src: 2-D tensor of source token ids, one row per sentence.
        beam_size: hypotheses kept per sentence.
        max_len: maximum number of tokens generated after BOS.
        length_penalty: exponent ``a`` in the ranking score
            ``log_prob / length ** a`` (length counts BOS, as before).

    Returns:
        Long tensor with one row per source row holding the best hypothesis,
        starting with BOS. Rows that finished early are filled with EOS.
    """
    model.eval()
    src = src.to(device)
    bos_id, eos_id = sp.bos_id(), sp.eos_id()
    batch = src.size(0)

    # Rows are laid out sentence-major: sentence b owns rows
    # b * beam_size .. b * beam_size + beam_size - 1.
    src_rep = src.repeat_interleave(beam_size, dim=0)
    seqs = torch.full((batch * beam_size, 1), bos_id, dtype=torch.long, device=device)

    # Only the first hypothesis of each sentence is alive at the start,
    # otherwise the first step would select beam_size identical tokens.
    scores = torch.full((batch, beam_size), float("-inf"), device=device)
    scores[:, 0] = 0.0
    lengths = torch.ones(batch, beam_size, device=device)
    done = torch.zeros(batch, beam_size, dtype=torch.bool, device=device)
    row_offset = torch.arange(batch, device=device).unsqueeze(1) * beam_size

    for _ in range(max_len):
        logp = torch.log_softmax(model(src_rep, seqs)[:, -1], dim=-1)
        vocab = logp.size(-1)
        logp = logp.view(batch, beam_size, vocab)

        # Frozen hypotheses: EOS is free, everything else is impossible.
        frozen = torch.full_like(logp, float("-inf"))
        frozen[..., eos_id] = 0.0
        logp = torch.where(done.unsqueeze(-1), frozen, logp)

        cand_scores = scores.unsqueeze(-1) + logp
        # Finished hypotheses stop growing, so their normalization is stable.
        cand_len = (lengths + (~done).float()).unsqueeze(-1)
        ranked = (cand_scores / cand_len.pow(length_penalty)).view(batch, -1)

        top = ranked.topk(beam_size, dim=-1).indices
        parent = torch.div(top, vocab, rounding_mode="floor")
        token = top % vocab

        scores = cand_scores.view(batch, -1).gather(1, top)
        lengths = cand_len.squeeze(-1).gather(1, parent)
        done = done.gather(1, parent) | (token == eos_id)

        rows = (parent + row_offset).view(-1)
        seqs = torch.cat([seqs[rows], token.view(-1, 1)], dim=1)

        if done.all():
            break

    best = (scores / lengths.pow(length_penalty)).argmax(dim=-1)
    return seqs[torch.arange(batch, device=device) * beam_size + best]

def decode_ids(ids):
    """Decode a list of ids to text, skipping BOS and stopping at the first EOS."""
    bos_id, eos_id = sp.bos_id(), sp.eos_id()
    kept = []
    for tok in ids:
        if tok == bos_id:
            continue
        if tok == eos_id:
            break
        kept.append(tok)
    return sp.decode(kept)

# -----------------------
# 5) Metrics + eval_boosted
# -----------------------
def compute_metrics(preds, refs):
    """Return ``(bleu, chrf, final)`` for aligned prediction/reference strings.

    ``bleu`` and ``chrf`` are scaled by 100; ``final`` is their geometric mean.
    """
    # BLEU on whitespace tokens, one reference per hypothesis.
    hyp_tokens = [hyp.split() for hyp in preds]
    ref_tokens = [[ref.split()] for ref in refs]
    smoother = SmoothingFunction().method4
    bleu = 100 * corpus_bleu(ref_tokens, hyp_tokens, smoothing_function=smoother)

    # NOTE(review): reported as chrF++ by the callers; confirm that NLTK's
    # corpus_chrf with beta=2 matches the metric you intend to track.
    chrf = 100 * corpus_chrf(refs, preds, beta=2)

    final = (bleu * chrf) ** 0.5
    return bleu, chrf, final

@torch.no_grad()
def eval_boosted(max_batches=20, beam=5, lp=0.8):
    """Beam-decode the first ``max_batches`` of ``val_loader`` and print the metrics.

    Predictions go through ``decode_ids`` and ``lexicon_postprocess``.
    References are read from ``train["tgt"]`` through ``val_idx`` in loader
    order, so ``val_loader`` must iterate ``val_idx`` unshuffled.

    Returns:
        ``(bleu, chrf, final)`` as produced by ``compute_metrics``.
    """
    preds, refs = [], []
    seen = 0  # validation examples consumed so far

    for batch_no, (src, _tgt) in enumerate(val_loader):
        if batch_no >= max_batches:
            break
        out_ids = beam_decode_batch(src, beam_size=beam, length_penalty=lp)
        texts = [lexicon_postprocess(decode_ids(row.tolist())) for row in out_ids]
        preds.extend(texts)

        stop = seen + len(texts)
        refs.extend(train["tgt"].iloc[val_idx[pos]] for pos in range(seen, stop))
        seen = stop

    bleu, chrf, final = compute_metrics(preds, refs)
    print(f"[EVAL] beam={beam} lp={lp} | BLEU={bleu:.2f} chrF++={chrf:.2f} FINAL={final:.2f}")
    print("Sample pred:", preds[0][:200])
    print("Sample ref :", refs[0][:200])
    return bleu, chrf, final

# -----------------------
# 6) RUN NOW
# -----------------------
eval_boosted(max_batches=20, beam=5, lp=0.8)


In [None]:
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

# Safety: required variables should exist
assert "train" in globals(), "train missing (reload train)."
assert "src_ids" in globals() and "tgt_ids" in globals(), "src_ids/tgt_ids missing (run bootstrap encoding)."
assert "PAD_ID" in globals(), "PAD_ID missing."
assert len(src_ids) == len(tgt_ids) == len(train), "Mismatch in lengths!"

# Split
idx = np.arange(len(train))
train_idx, val_idx = train_test_split(idx, test_size=0.1, random_state=42)

class MTDataset(Dataset):
    """Paired source/target id sequences restricted to the rows in ``idxs``."""

    def __init__(self, src, tgt, idxs):
        # Materialize the selected rows once, in the order given by idxs.
        self.src = [src[pos] for pos in idxs]
        self.tgt = [tgt[pos] for pos in idxs]

    def __len__(self):
        return len(self.src)

    def __getitem__(self, i):
        pair = (self.src[i], self.tgt[i])
        return pair

def collate_fn(batch, max_src=512, max_tgt=256):
    """Turn a list of ``(src_ids, tgt_ids)`` pairs into two padded long tensors.

    Sequences are clipped to ``max_src`` / ``max_tgt`` tokens and padded on
    the right with ``PAD_ID`` up to the longest clipped sequence in the batch.
    """

    def _pad(seqs, limit):
        clipped = [seq[:limit] for seq in seqs]
        width = max(len(seq) for seq in clipped)
        padded = torch.full((len(clipped), width), PAD_ID, dtype=torch.long)
        for row, seq in enumerate(clipped):
            padded[row, :len(seq)] = torch.tensor(seq)
        return padded

    src, tgt = zip(*batch)
    return _pad(src, max_src), _pad(tgt, max_tgt)


# Only the training loader shuffles. Validation order must stay aligned with val_idx.
_loader_kwargs = dict(batch_size=32, collate_fn=collate_fn)
train_loader = DataLoader(MTDataset(src_ids, tgt_ids, train_idx), shuffle=True, **_loader_kwargs)
val_loader = DataLoader(MTDataset(src_ids, tgt_ids, val_idx), shuffle=False, **_loader_kwargs)

print(" loaders ready!")
print("train batches:", len(train_loader))
print("val batches  :", len(val_loader))
print("train samples:", len(train_idx), "val samples:", len(val_idx))


In [None]:
# ===============================
# MASTER BOOTSTRAP (RUN ONCE)
# ===============================

import os, re, math
import pandas as pd
import numpy as np
import torch
import sentencepiece as spm
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

# ---------- 1. Load data ----------
train = pd.read_csv("/kaggle/input/deep-past-initiative-machine-translation/train.csv")

# Unicode subscript digits -> ASCII digits.
sub_map = str.maketrans("₀₁₂₃₄₅₆₇₈₉", "0123456789")


def clean_transliteration(t):
    """Normalize one transliteration value.

    Missing values (``None`` or a float NaN) become ``""`` instead of the
    literal text ``"None"`` / ``"nan"``. Otherwise subscript digits are mapped
    to ASCII, every ``(d)`` is rewritten to ``D_``, and whitespace runs are
    collapsed to single spaces with the ends stripped.
    """
    # NaN is the only float that is not equal to itself.
    if t is None or (isinstance(t, float) and t != t):
        return ""
    t = str(t).translate(sub_map)
    t = re.sub(r"\(d\)", "D_", t)
    return re.sub(r"\s+", " ", t).strip()

def clean_translation(t):
    """Normalize one translation value.

    Missing values (``None`` or a float NaN) become ``""`` instead of the
    literal text ``"None"`` / ``"nan"``. Otherwise curly double quotes and the
    curly apostrophe are replaced by their ASCII forms, and whitespace runs
    are collapsed to single spaces with the ends stripped.
    """
    # NaN is the only float that is not equal to itself.
    if t is None or (isinstance(t, float) and t != t):
        return ""
    t = str(t).replace("“", '"').replace("”", '"').replace("’", "'")
    return re.sub(r"\s+", " ", t).strip()

train["src"] = train["transliteration"].apply(clean_transliteration)
train["tgt"] = train["translation"].apply(clean_translation)

# ---------- 2. SentencePiece ----------
os.makedirs("/kaggle/working/spm", exist_ok=True)

# The corpora contain non-ASCII characters, so the files are written as UTF-8
# explicitly. The joint corpus is assembled in memory, which avoids re-reading
# the files through handles that are never closed.
src_text = "\n".join(train["src"])
tgt_text = "\n".join(train["tgt"])

for path, text in (
    ("/kaggle/working/spm/src.txt", src_text),
    ("/kaggle/working/spm/tgt.txt", tgt_text),
    ("/kaggle/working/spm/joint.txt", src_text + "\n" + tgt_text),
):
    with open(path, "w", encoding="utf-8") as f:
        f.write(text)

spm.SentencePieceTrainer.train(
    input="/kaggle/working/spm/joint.txt",
    model_prefix="/kaggle/working/spm/oa",
    vocab_size=5000,
    model_type="unigram",
    character_coverage=1.0,
    hard_vocab_limit=False
)

sp = spm.SentencePieceProcessor()
sp.load("/kaggle/working/spm/oa.model")

# ---------- 3. Encode ----------
def encode_with_sp(text):
    """Tokenize ``str(text)`` with ``sp`` and wrap the ids in BOS ... EOS."""
    ids = sp.encode(str(text), out_type=int)
    return [sp.bos_id()] + ids + [sp.eos_id()]

src_ids = [encode_with_sp(s) for s in train["src"]]
tgt_ids = [encode_with_sp(s) for s in train["tgt"]]

# NOTE(review): padding reuses the EOS id, so any loss that ignores PAD_ID by
# target value also ignores genuine EOS targets.
PAD_ID = sp.eos_id()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ---------- 4. Loaders ----------
idx = np.arange(len(train))
train_idx, val_idx = train_test_split(idx, test_size=0.1, random_state=42)

class MTDataset(Dataset):
    """Paired source/target id sequences restricted to the rows in ``idxs``."""

    def __init__(self, src, tgt, idxs):
        # Materialize the selected rows once, in the order given by idxs.
        self.src = [src[pos] for pos in idxs]
        self.tgt = [tgt[pos] for pos in idxs]

    def __len__(self):
        return len(self.src)

    def __getitem__(self, i):
        pair = (self.src[i], self.tgt[i])
        return pair

def collate_fn(batch, max_src=512, max_tgt=256):
    """Turn a list of ``(src_ids, tgt_ids)`` pairs into two padded long tensors.

    Sequences are clipped to ``max_src`` / ``max_tgt`` tokens and padded on
    the right with ``PAD_ID`` up to the longest clipped sequence in the batch.
    """

    def _pad(seqs, limit):
        clipped = [seq[:limit] for seq in seqs]
        width = max(len(seq) for seq in clipped)
        padded = torch.full((len(clipped), width), PAD_ID, dtype=torch.long)
        for row, seq in enumerate(clipped):
            padded[row, :len(seq)] = torch.tensor(seq)
        return padded

    src, tgt = zip(*batch)
    return _pad(src, max_src), _pad(tgt, max_tgt)


# Only the training loader shuffles. Validation order must stay aligned with val_idx.
_loader_kwargs = dict(batch_size=32, collate_fn=collate_fn)
train_loader = DataLoader(MTDataset(src_ids, tgt_ids, train_idx), shuffle=True, **_loader_kwargs)
val_loader = DataLoader(MTDataset(src_ids, tgt_ids, val_idx), shuffle=False, **_loader_kwargs)

print(" BOOTSTRAP COMPLETE")
print("train:", train.shape)
print("vocab:", sp.get_piece_size())
print("encoded:", len(src_ids))
print("train_loader:", len(train_loader), "val_loader:", len(val_loader))


In [None]:
import torch, torch.nn as nn, torch.nn.functional as F
import math
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
from nltk.translate.chrf_score import corpus_chrf

# -----------------------
# 1) Model
# -----------------------
VOCAB = sp.get_piece_size()

class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position signals to ``x``, then apply dropout.

    The table is stored as the buffer ``pe`` with shape
    ``(1, max_len, d_model)`` and is sliced along dim 1 to ``x.size(1)``.
    """

    def __init__(self, d_model, max_len=4096, dropout=0.1):
        super().__init__()
        self.dropout = nn.Dropout(dropout)

        positions = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        freqs = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = positions * freqs

        # Even feature columns carry the sine, odd columns the cosine.
        table = torch.zeros(max_len, d_model)
        table[:, 0::2] = torch.sin(angles)
        table[:, 1::2] = torch.cos(angles)
        self.register_buffer("pe", table.unsqueeze(0))

    def forward(self, x):
        seq_len = x.size(1)
        return self.dropout(x + self.pe[:, :seq_len])

class TinyTransformerMT(nn.Module):
    """Small encoder-decoder Transformer with one embedding shared by both sides.

    ``forward`` returns logits with one ``vocab_size`` vector per target
    position. Only a causal target mask is passed to ``nn.Transformer``;
    no padding masks are supplied.
    """

    def __init__(self, vocab_size, d_model=256, nhead=4, layers=3, dim_ff=1024, dropout=0.1):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, d_model)
        self.pos = PositionalEncoding(d_model, dropout=dropout)
        transformer_cfg = dict(
            d_model=d_model,
            nhead=nhead,
            num_encoder_layers=layers,
            num_decoder_layers=layers,
            dim_feedforward=dim_ff,
            dropout=dropout,
            batch_first=True,
        )
        self.tf = nn.Transformer(**transformer_cfg)
        self.out = nn.Linear(d_model, vocab_size)

    def forward(self, src_ids, tgt_in_ids):
        encoder_in = self.pos(self.emb(src_ids))
        decoder_in = self.pos(self.emb(tgt_in_ids))

        steps = tgt_in_ids.size(1)
        # True strictly above the diagonal: a position may not attend to later ones.
        causal = torch.ones(steps, steps, device=encoder_in.device).triu(diagonal=1).bool()

        hidden = self.tf(encoder_in, decoder_in, tgt_mask=causal)
        return self.out(hidden)


model = TinyTransformerMT(VOCAB).to(device)

# -----------------------
# 2) Loss + optimizer
# -----------------------
opt = torch.optim.AdamW(model.parameters(), lr=3e-4)


def loss_label_smoothing(logits, target, ignore_index=PAD_ID, eps=0.1):
    """Label-smoothed cross entropy averaged over the non-ignored targets.

    Args:
        logits: ``(B, T, V)`` scores.
        target: ``(B, T)`` gold ids.
        ignore_index: target value excluded from the loss.
        eps: weight of the uniform-smoothing term.

    NOTE(review): ``PAD_ID`` is set to the EOS id elsewhere in this file, so
    with the default ``ignore_index`` EOS targets are excluded as well.
    """
    vocab = logits.size(-1)
    flat_target = target.reshape(-1)
    keep = flat_target != ignore_index

    log_probs = F.log_softmax(logits.reshape(-1, vocab)[keep], dim=-1)
    gold = flat_target[keep].unsqueeze(1)

    nll = -log_probs.gather(1, gold).squeeze(1)
    smooth = -log_probs.mean(dim=-1)  # cross entropy against a uniform target
    return ((1 - eps) * nll + eps * smooth).mean()

def run_epoch(loader, train_mode=True, eps=0.1):
    """Run one pass over ``loader`` and return the mean per-batch loss.

    Uses the module-level ``model``, ``opt``, ``sp``, ``device`` and
    ``PAD_ID``.

    Args:
        loader: iterable of ``(src, tgt)`` padded id tensors.
        train_mode: when true the weights are updated; otherwise the model is
            put in eval mode and no gradients are tracked.
        eps: label-smoothing weight passed to ``loss_label_smoothing``.
    """
    model.train(train_mode)
    eos_id = sp.eos_id()
    total = 0.0

    # Gradients are only needed when the weights are being updated.
    with torch.set_grad_enabled(train_mode):
        for src, tgt in loader:
            src, tgt = src.to(device), tgt.to(device)
            tgt_in, tgt_out = tgt[:, :-1], tgt[:, 1:]
            logits = model(src, tgt_in)

            # Padding reuses the EOS id in this notebook, so masking on the
            # target value alone would also drop the real end-of-sequence
            # target and the model would never learn to stop. Keep every
            # non-pad target plus the first EOS of each row (the one whose
            # input token is not itself EOS).
            keep = (tgt_out != PAD_ID) | ((tgt_out == eos_id) & (tgt_in != eos_id))
            # The selection is already done, so disable the value-based mask.
            loss = loss_label_smoothing(logits[keep], tgt_out[keep], ignore_index=-1, eps=eps)

            if train_mode:
                opt.zero_grad()
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                opt.step()

            total += loss.item()
    return total / len(loader)

# -----------------------
# 3) Train 2 epochs
# -----------------------
for ep in range(2):
    tr = run_epoch(train_loader, True, eps=0.1)
    va = run_epoch(val_loader, False, eps=0.1)
    print(f"Epoch {ep+1}: train_loss={tr:.4f} val_loss={va:.4f}")

print(" model trained")

# -----------------------
# 4) Beam decode + eval
# -----------------------
@torch.no_grad()
def beam_decode_batch(src, beam_size=5, max_len=160, length_penalty=0.8):
    """Beam search that keeps ``beam_size`` hypotheses for each source row.

    Every sentence has its own beam: candidates are ranked per sentence, not
    by a score averaged over the batch. A hypothesis that has emitted EOS is
    frozen, so it can only be extended with further EOS tokens at no cost.
    Uses the module-level ``model``, ``sp`` and ``device``.

    Args:
        src: 2-D tensor of source token ids, one row per sentence.
        beam_size: hypotheses kept per sentence.
        max_len: maximum number of tokens generated after BOS.
        length_penalty: exponent ``a`` in the ranking score
            ``log_prob / length ** a`` (length counts BOS, as before).

    Returns:
        Long tensor with one row per source row holding the best hypothesis,
        starting with BOS. Rows that finished early are filled with EOS.
    """
    model.eval()
    src = src.to(device)
    bos_id, eos_id = sp.bos_id(), sp.eos_id()
    batch = src.size(0)

    # Rows are laid out sentence-major: sentence b owns rows
    # b * beam_size .. b * beam_size + beam_size - 1.
    src_rep = src.repeat_interleave(beam_size, dim=0)
    seqs = torch.full((batch * beam_size, 1), bos_id, dtype=torch.long, device=device)

    # Only the first hypothesis of each sentence is alive at the start,
    # otherwise the first step would select beam_size identical tokens.
    scores = torch.full((batch, beam_size), float("-inf"), device=device)
    scores[:, 0] = 0.0
    lengths = torch.ones(batch, beam_size, device=device)
    done = torch.zeros(batch, beam_size, dtype=torch.bool, device=device)
    row_offset = torch.arange(batch, device=device).unsqueeze(1) * beam_size

    for _ in range(max_len):
        logp = torch.log_softmax(model(src_rep, seqs)[:, -1], dim=-1)
        vocab = logp.size(-1)
        logp = logp.view(batch, beam_size, vocab)

        # Frozen hypotheses: EOS is free, everything else is impossible.
        frozen = torch.full_like(logp, float("-inf"))
        frozen[..., eos_id] = 0.0
        logp = torch.where(done.unsqueeze(-1), frozen, logp)

        cand_scores = scores.unsqueeze(-1) + logp
        # Finished hypotheses stop growing, so their normalization is stable.
        cand_len = (lengths + (~done).float()).unsqueeze(-1)
        ranked = (cand_scores / cand_len.pow(length_penalty)).view(batch, -1)

        top = ranked.topk(beam_size, dim=-1).indices
        parent = torch.div(top, vocab, rounding_mode="floor")
        token = top % vocab

        scores = cand_scores.view(batch, -1).gather(1, top)
        lengths = cand_len.squeeze(-1).gather(1, parent)
        done = done.gather(1, parent) | (token == eos_id)

        rows = (parent + row_offset).view(-1)
        seqs = torch.cat([seqs[rows], token.view(-1, 1)], dim=1)

        if done.all():
            break

    best = (scores / lengths.pow(length_penalty)).argmax(dim=-1)
    return seqs[torch.arange(batch, device=device) * beam_size + best]

def decode_ids(ids, tokenizer=None):
    """Turn a sequence of token ids into text.

    BOS ids are dropped and everything from the first EOS onwards is
    discarded before the remaining ids are handed to the tokenizer.

    Args:
        ids: iterable of token ids; objects exposing ``tolist()`` (tensors,
            arrays) are converted first.
        tokenizer: object providing ``bos_id()``, ``eos_id()`` and
            ``decode(ids)``. Defaults to the notebook-level ``sp``.

    Returns:
        Whatever ``tokenizer.decode`` returns for the kept ids.
    """
    tok = sp if tokenizer is None else tokenizer
    # Look the special ids up once instead of once per token.
    bos, eos = tok.bos_id(), tok.eos_id()

    if hasattr(ids, "tolist"):
        ids = ids.tolist()

    kept = []
    for token in ids:
        if token == eos:
            break
        if token != bos:
            kept.append(token)
    return tok.decode(kept)

def compute_metrics(preds, refs):
    """Score predictions against references with BLEU, chrF and their
    geometric mean.

    Args:
        preds: predicted sentences as strings.
        refs: reference sentences as strings, aligned with ``preds``.

    Returns:
        ``(bleu, chrf, final)``, each scaled by 100; ``final`` is
        ``sqrt(bleu * chrf)``.

    NOTE(review): ``corpus_bleu`` / ``SmoothingFunction`` / ``corpus_chrf``
    look like the NLTK implementations — confirm against the imports.
    """
    smoother = SmoothingFunction().method4

    # BLEU works on whitespace tokens, one reference per hypothesis.
    hypotheses = [hyp.split() for hyp in preds]
    references = [[ref.split()] for ref in refs]
    bleu = 100 * corpus_bleu(references, hypotheses, smoothing_function=smoother)

    # chrF is computed on the raw strings.
    chrf = 100 * corpus_chrf(refs, preds, beta=2)

    return bleu, chrf, (bleu * chrf) ** 0.5

@torch.no_grad()
def quick_eval(max_batches=10, beam=5, lp=0.8):
    """Beam-decode the first validation batches and print/return the metrics.

    Args:
        max_batches: number of batches taken from ``val_loader``.
        beam: beam size passed to ``beam_decode_batch``.
        lp: length penalty passed to ``beam_decode_batch``.

    Returns:
        ``(bleu, chrf, final)`` as produced by ``compute_metrics``.

    Raises:
        ValueError: if nothing was decoded (empty loader or
            ``max_batches <= 0``), since no metric can be computed.

    NOTE(review): references are read positionally through ``val_idx``, which
    assumes ``val_loader`` yields samples in ``val_idx`` order (i.e. it is
    not shuffled) — confirm.
    NOTE(review): the printed label says "chrF++"; whether ``corpus_chrf``
    actually includes word n-grams depends on its implementation — confirm.
    """
    preds, refs = [], []
    seen = 0
    for b, (src, _tgt) in enumerate(val_loader):
        if b >= max_batches:
            break
        out_ids = beam_decode_batch(src, beam_size=beam, length_penalty=lp)
        for row in out_ids.tolist():
            preds.append(decode_ids(row))
            refs.append(train["tgt"].iloc[val_idx[seen]])
            seen += 1

    if not preds:
        raise ValueError(
            "quick_eval decoded no samples: val_loader is empty or max_batches <= 0"
        )

    bleu, chrf, final = compute_metrics(preds, refs)
    print(f"[EVAL] beam={beam} lp={lp} | BLEU={bleu:.2f} chrF++={chrf:.2f} FINAL={final:.2f}")
    print("pred:", preds[0][:160])
    print("ref :", refs[0][:160])
    return bleu, chrf, final

# Sanity-check the model right after training on at most 10 validation
# batches, with beam size 5 and length penalty 0.8.
quick_eval(max_batches=10, beam=5, lp=0.8)


In [None]:
# -----------------------
# Build submission.csv
# -----------------------
import pandas as pd
import torch

# 1) Load test
# Re-read the competition test set and derive the model input column "src"
# by cleaning the raw transliteration.
# NOTE(review): presumably the same cleaning was applied to the training
# sources — confirm, otherwise train and test inputs are inconsistent.
test = pd.read_csv("/kaggle/input/deep-past-initiative-machine-translation/test.csv")
test["src"] = test["transliteration"].apply(clean_transliteration)

# 2) Helper: predict in batches using beam
@torch.no_grad()
def predict_texts_beam(texts, batch_size=32, beam=5, lp=0.8):
    """Translate a list of cleaned source strings in fixed-size chunks.

    Each chunk is tokenised with ``encode_with_sp``, right-padded with
    ``PAD_ID`` to the longest sequence in the chunk, beam-decoded and turned
    back into text.

    Args:
        texts: sliceable sequence of source strings.
        batch_size: number of strings decoded together.
        beam: beam size passed to ``beam_decode_batch``.
        lp: length penalty passed to ``beam_decode_batch``.

    Returns:
        List of decoded strings, one per input, in input order.
    """
    translations = []
    for start in range(0, len(texts), batch_size):
        encoded = [encode_with_sp(text) for text in texts[start:start + batch_size]]
        width = max(map(len, encoded))

        # Start from an all-PAD matrix and overwrite the real prefix per row.
        src = torch.full((len(encoded), width), PAD_ID, dtype=torch.long)
        for row, token_ids in zip(src, encoded):
            row[:len(token_ids)] = torch.tensor(token_ids)

        decoded = beam_decode_batch(src, beam_size=beam, length_penalty=lp)
        translations.extend(decode_ids(hyp.tolist()) for hyp in decoded)
    return translations

# Translate every cleaned test source with beam size 5 and length penalty 0.8.
preds = predict_texts_beam(test["src"].tolist(), batch_size=32, beam=5, lp=0.8)

# 3) Save submission
# NOTE(review): predictions are assigned positionally, so this assumes
# sample_submission.csv lists its rows in the same order (and number) as
# test.csv — confirm; a length mismatch makes the assignment raise.
sub = pd.read_csv("/kaggle/input/deep-past-initiative-machine-translation/sample_submission.csv")
sub["translation"] = preds
sub.to_csv("submission.csv", index=False)

print(" submission.csv created:", sub.shape)
# Last expression of the cell: shows the first rows in the notebook output.
sub.head()
