In [1]:
# Cell 1 — Paths & file discovery
from pathlib import Path

# Root of the Kaggle input dataset and the folder holding the pre-processed splits.
DATA_ROOT = Path("/kaggle/input/data-vlsp")
PROC_DIR = DATA_ROOT / "processed"

# Fail fast when the dataset is not attached to the notebook.
assert PROC_DIR.exists(), f"Không thấy folder: {PROC_DIR}"

splits = ["train", "valid", "test"]
langs = ["en", "vi"]

# (split, lang) -> expected file path, e.g. ("train", "en") -> .../train.en
files = {
    (sp, lg): PROC_DIR / f"{sp}.{lg}"
    for sp in splits
    for lg in langs
}

# Collect every absent file so that all of them are reported in one error.
missing = [str(path) for path in files.values() if not path.exists()]
if missing:
    raise FileNotFoundError("Thiếu file:\n" + "\n".join(missing))

print("OK: Found all processed files:")
for sp in splits:
    en_file, vi_file = files[(sp, "en")], files[(sp, "vi")]
    print(f"- {sp}: {en_file}  |  {vi_file}")

OK: Found all processed files:
- train: /kaggle/input/data-vlsp/processed/train.en  |  /kaggle/input/data-vlsp/processed/train.vi
- valid: /kaggle/input/data-vlsp/processed/valid.en  |  /kaggle/input/data-vlsp/processed/valid.vi
- test: /kaggle/input/data-vlsp/processed/test.en  |  /kaggle/input/data-vlsp/processed/test.vi


In [2]:
# Cell 2 — Fast line counter
def count_lines(path: Path) -> int:
    """Return the number of lines in ``path``.

    The file is read in binary mode and iterated line by line, so nothing is
    decoded and the whole file is never held in memory. A final line without
    a trailing newline still counts as one line.
    """
    with path.open("rb") as handle:
        return sum(1 for _ in handle)

# (split, lang) -> number of lines in the corresponding processed file.
counts = {
    (sp, lg): count_lines(files[(sp, lg)])
    for sp in splits
    for lg in langs
}

print("Line counts:")
for sp in splits:
    # Both sides of a parallel split must have the same number of lines.
    en_n, vi_n = counts[(sp, "en")], counts[(sp, "vi")]
    print(f"{sp:>5}: en={en_n:,}  vi={vi_n:,}  match={en_n==vi_n}")

Line counts:
train: en=490,000  vi=490,000  match=True
valid: en=10,000  vi=10,000  match=True
 test: en=3,000  vi=3,000  match=True


In [3]:
# Cell 3 — Parallel scan per split: detect mismatches, empty lines, bad chars, length stats
from itertools import zip_longest
import unicodedata
import heapq

def analyze_parallel(src_path: Path, tgt_path: Path, max_report=20, nfc_check_limit=50000, top_k=20):
    """Scan two line-aligned text files in lockstep and collect quality statistics.

    Both files are decoded as UTF-8 with ``errors="replace"``, so undecodable
    bytes show up as U+FFFD and are counted instead of aborting the scan.

    Args:
        src_path: Source-side file, one segment per line.
        tgt_path: Target-side file, expected to have the same number of lines.
        max_report: Maximum number of unpaired-line examples to keep.
        nfc_check_limit: Only the first ``nfc_check_limit`` paired target lines
            are tested for NFC stability.
        top_k: Number of longest lines to keep per side.

    Returns:
        A dict with:
        - ``n_src`` / ``n_tgt`` / ``n_pairs``: line counts per side and the
          number of lines present on both sides.
        - ``mismatch_lines`` / ``mismatch_examples``: unpaired lines and up to
          ``max_report`` ``(line_no, src_preview, tgt_preview)`` tuples.
        - ``empty_*``: whitespace-only line counters.
        - ``badchar_*``: lines containing U+FFFD.
        - ``maxlen_*`` / ``sumlen_*`` / ``meanlen_*``: character-length stats
          computed over paired lines.
        - ``top_long_*``: up to ``top_k`` ``(length, line_no, preview)`` tuples,
          longest first.
        - ``nfc_checked_tgt`` / ``nfc_changed_tgt``: NFC check counters.
    """
    # The character errors="replace" substitutes for undecodable bytes.
    replacement_char = "\ufffd"

    report = {
        "n_src": 0,
        "n_tgt": 0,
        "n_pairs": 0,
        "mismatch_lines": 0,
        "mismatch_examples": [],
        "empty_src": 0,
        "empty_tgt": 0,
        "empty_both": 0,
        "empty_only_src": 0,
        "empty_only_tgt": 0,
        "badchar_src": 0,   # lines containing U+FFFD
        "badchar_tgt": 0,
        "maxlen_src": 0,
        "maxlen_tgt": 0,
        "sumlen_src": 0,
        "sumlen_tgt": 0,
        "top_long_src": [],  # (len, idx, preview)
        "top_long_tgt": [],
        "nfc_changed_tgt": 0,
        "nfc_checked_tgt": 0,
    }

    def _keep_longest(heap, entry):
        # Bounded min-heap: once it holds top_k entries, pushing a new one
        # evicts the smallest, so only the longest lines survive.
        if len(heap) < top_k:
            heapq.heappush(heap, entry)
        else:
            heapq.heappushpop(heap, entry)

    # open as text with replace to surface decode issues via U+FFFD
    with src_path.open("r", encoding="utf-8", errors="replace") as fs, \
         tgt_path.open("r", encoding="utf-8", errors="replace") as ft:

        # zip_longest pads the shorter file with None, which is how a
        # line-count mismatch is detected.
        for idx, (s, t) in enumerate(zip_longest(fs, ft, fillvalue=None), start=1):
            if s is None:
                # target has a line with no source counterpart
                report["n_tgt"] += 1
                report["mismatch_lines"] += 1
                if len(report["mismatch_examples"]) < max_report:
                    report["mismatch_examples"].append((idx, None, t[:200] if t else None))
                continue
            if t is None:
                # source has a line with no target counterpart
                report["n_src"] += 1
                report["mismatch_lines"] += 1
                if len(report["mismatch_examples"]) < max_report:
                    report["mismatch_examples"].append((idx, s[:200] if s else None, None))
                continue

            report["n_pairs"] += 1
            report["n_src"] += 1
            report["n_tgt"] += 1

            s = s.rstrip("\n\r")
            t = t.rstrip("\n\r")

            # empty-line checks (whitespace-only counts as empty)
            s_empty = not s.strip()
            t_empty = not t.strip()
            if s_empty:
                report["empty_src"] += 1
            if t_empty:
                report["empty_tgt"] += 1
            if s_empty and t_empty:
                report["empty_both"] += 1
            if s_empty and not t_empty:
                report["empty_only_src"] += 1
            if t_empty and not s_empty:
                report["empty_only_tgt"] += 1

            # replacement char checks
            if replacement_char in s:
                report["badchar_src"] += 1
            if replacement_char in t:
                report["badchar_tgt"] += 1

            # length stats (in characters, after stripping the line terminator)
            ls = len(s)
            lt = len(t)
            report["sumlen_src"] += ls
            report["sumlen_tgt"] += lt
            report["maxlen_src"] = max(report["maxlen_src"], ls)
            report["maxlen_tgt"] = max(report["maxlen_tgt"], lt)

            # keep the top_k longest lines per side; idx makes every tuple unique
            _keep_longest(report["top_long_src"], (ls, idx, s[:200]))
            _keep_longest(report["top_long_tgt"], (lt, idx, t[:200]))

            # NFC normalization check on target (Vietnamese usually)
            if report["nfc_checked_tgt"] < nfc_check_limit:
                report["nfc_checked_tgt"] += 1
                if unicodedata.normalize("NFC", t) != t:
                    report["nfc_changed_tgt"] += 1

    # finalize
    # Lengths are accumulated for paired lines only, so the mean is taken over
    # n_pairs; dividing by n_src / n_tgt would dilute it whenever one file
    # has extra lines.
    n_paired = max(report["n_pairs"], 1)
    report["meanlen_src"] = report["sumlen_src"] / n_paired
    report["meanlen_tgt"] = report["sumlen_tgt"] / n_paired

    report["top_long_src"] = sorted(report["top_long_src"], reverse=True)
    report["top_long_tgt"] = sorted(report["top_long_tgt"], reverse=True)
    return report

# One quality report per split (EN is treated as source, VI as target).
all_reports = {
    sp: analyze_parallel(files[(sp, "en")], files[(sp, "vi")])
    for sp in splits
}

for sp in splits:
    rep = all_reports[sp]
    print(f"\n=== {sp.upper()} ===")
    print(f"Pairs: {rep['n_pairs']:,} | src lines: {rep['n_src']:,} | tgt lines: {rep['n_tgt']:,}")
    print(f"Mismatched length lines: {rep['mismatch_lines']:,}")
    print(f"Empty: src={rep['empty_src']:,}, tgt={rep['empty_tgt']:,}, both={rep['empty_both']:,}, only_src={rep['empty_only_src']:,}, only_tgt={rep['empty_only_tgt']:,}")
    print(f"Decode replacement char '�': src={rep['badchar_src']:,}, tgt={rep['badchar_tgt']:,}")
    print(f"Length: mean_src={rep['meanlen_src']:.1f}, max_src={rep['maxlen_src']:,} | mean_tgt={rep['meanlen_tgt']:.1f}, max_tgt={rep['maxlen_tgt']:,}")
    print(f"NFC changed (tgt check sample {rep['nfc_checked_tgt']:,}): {rep['nfc_changed_tgt']:,}")

    # Show at most five unpaired lines; a missing side is printed as "None".
    unpaired = rep["mismatch_examples"]
    if unpaired:
        print("First mismatches (idx, src_preview, tgt_preview):")
        for line_no, src_preview, tgt_preview in unpaired[:5]:
            print(" -", line_no, "|", src_preview or "None", "|", tgt_preview or "None")


=== TRAIN ===
Pairs: 490,000 | src lines: 490,000 | tgt lines: 490,000
Mismatched length lines: 0
Empty: src=0, tgt=0, both=0, only_src=0, only_tgt=0
Decode replacement char '�': src=1, tgt=0
Length: mean_src=141.4, max_src=3,216 | mean_tgt=138.2, max_tgt=2,385
NFC changed (tgt check sample 50,000): 0

=== VALID ===
Pairs: 10,000 | src lines: 10,000 | tgt lines: 10,000
Mismatched length lines: 0
Empty: src=0, tgt=0, both=0, only_src=0, only_tgt=0
Decode replacement char '�': src=0, tgt=0
Length: mean_src=140.6, max_src=2,218 | mean_tgt=137.4, max_tgt=2,190
NFC changed (tgt check sample 10,000): 0

=== TEST ===
Pairs: 3,000 | src lines: 3,000 | tgt lines: 3,000
Mismatched length lines: 0
Empty: src=0, tgt=0, both=0, only_src=0, only_tgt=0
Decode replacement char '�': src=1, tgt=0
Length: mean_src=140.8, max_src=835 | mean_tgt=135.3, max_tgt=877
NFC changed (tgt check sample 3,000): 0


In [4]:
# Cell 4 — Show a few random aligned samples per split
import random

def random_samples(src_path: Path, tgt_path: Path, k=5, seed=42):
    """Draw up to ``k`` aligned line pairs from two files in a single pass.

    Uses reservoir sampling: the first ``k`` pairs fill the reservoir, and each
    later pair at 1-based position ``i`` replaces a random slot with
    probability ``k / i``. Iteration stops at the end of the shorter file.

    Returns:
        A list of ``(line_no, src_line, tgt_line)`` tuples with line
        terminators stripped; ``line_no`` is 1-based.
    """
    rng = random.Random(seed)
    reservoir = []
    with src_path.open("r", encoding="utf-8", errors="replace") as src_f, \
         tgt_path.open("r", encoding="utf-8", errors="replace") as tgt_f:
        for line_no, (src_line, tgt_line) in enumerate(zip(src_f, tgt_f), start=1):
            if len(reservoir) < k:
                # still filling the reservoir: keep every pair
                reservoir.append((line_no, src_line.rstrip("\n\r"), tgt_line.rstrip("\n\r")))
                continue
            # pick a slot in 0..line_no-1; only slots inside the reservoir get replaced
            slot = rng.randint(1, line_no) - 1
            if slot < k:
                reservoir[slot] = (line_no, src_line.rstrip("\n\r"), tgt_line.rstrip("\n\r"))
    return reservoir

# Each split gets its own seed (123, 124, 125, ...) so the draws differ per split.
for split_pos, sp in enumerate(splits):
    print(f"\n--- Random samples: {sp} ---")
    samp = random_samples(files[(sp,"en")], files[(sp,"vi")], k=5, seed=123 + split_pos)
    for line_no, en_text, vi_text in samp:
        # previews are capped at 160 characters per side
        print(f"[{line_no}] EN: {en_text[:160]}")
        print(f"     VI: {vi_text[:160]}")


--- Random samples: train ---
[33048] EN: Treatment of Tetanus Supportive care, particularly respiratory support Wound debridement Tetanus antitoxin Benzodiazepines for muscle spasms Antibiotics Sometim
     VI: Điều trị uốn ván Điều trị hỗ trợ, đặc biệt hỗ trợ hô hấp Mở ổ vết thương Kháng độc tố uốn ván Benzodiazepin cho co thắt cơ Metronidazole hoặc penicillin Đôi khi
[196267] EN: We surveyed all patients diagnosed with vertebral compression fracture at Duc Giang Hospital during the data collection period from 2015 to 2018.
     VI: Chọn mẫu toàn bộ người bệnh được chẩn đoán xẹp thân đốt sống tại Bệnh viện Đức Giang trong thời gian thu thập số liệu từ năm 2015 đến 2018.
[161486] EN: Dental treatment to smooth the edges and improve appearance is elective.
     VI: Can thiệp nha khoa để làm nhẵn các cạnh và cải thiện thẩm mỹ.
[368225] EN: If the cyst is no longer palpable, it is considered benign.
     VI: Nếu nang không còn sờ thấy, nó sẽ được coi là lành tính.
[484156] EN: Conclusio

In [5]:
# Cell 5 — Report top longest lines (useful to decide later whether to filter outliers)
for sp in splits:
    rep = all_reports[sp]
    # Same table twice: first the EN (source) side, then the VI (target) side.
    for heading, report_key in (
        (f"\n--- {sp.upper()} top-5 longest EN lines ---", "top_long_src"),
        (f"--- {sp.upper()} top-5 longest VI lines ---", "top_long_tgt"),
    ):
        print(heading)
        for ln, idx, preview in rep[report_key][:5]:
            print(f"len={ln:>6}  idx={idx:>7}  preview={preview}")


--- TRAIN top-5 longest EN lines ---
len=  3216  idx= 409820  preview=Typically the lesions that can be detected with MRI are those that contain blood products 23. hemorrhagic powder burn lesions appear bright on T1 fat-saturated sequences small solid deep lesions may b
len=  2868  idx= 238050  preview=Patients' cough may also worsen after inhalation of radiopharmaceuticals, further increasing this risk.47, 48 As a result, multiple authors have suggested performing only a planar perfusion scan, perh
len=  2569  idx= 327139  preview=Indications for Percutaneous Cricothyrotomy Apnea, severe respiratory failure, or impending respiratory arrest requiring endotracheal intubation and either Failed attempts at orotracheal or nasotrache
len=  2569  idx=   9170  preview=Indications for Percutaneous Cricothyrotomy Apnea, severe respiratory failure, or impending respiratory arrest requiring endotracheal intubation and either Failed attempts at orotracheal or nasotrache
len=  2343  idx= 142807  p

In [6]:
# Cell 1 — Setup (paths + model choice + HF cache)
from pathlib import Path
import os

# Input location of the processed parallel files.
DATA_ROOT = Path("/kaggle/input/data-vlsp")
PROC_DIR = DATA_ROOT / "processed"
assert PROC_DIR.exists(), f"Missing: {PROC_DIR}"

# Qwen model (best fit for 2xT4 + 490k lines)
MODEL_ID = "Qwen/Qwen2.5-1.5B-Instruct"

# Point the Hugging Face caches at the writable working directory.
# These are set before any Hugging Face import in the cells below.
# NOTE(review): newer transformers releases reportedly deprecate
# TRANSFORMERS_CACHE in favour of HF_HOME — confirm before relying on it.
os.environ.update({
    "HF_HOME": "/kaggle/working/hf",
    "TRANSFORMERS_CACHE": "/kaggle/working/hf/transformers",
    "HF_DATASETS_CACHE": "/kaggle/working/hf/datasets",
})

# Everything produced by this step is written below OUT_DIR.
OUT_DIR = Path("/kaggle/working/vlsp_step2")
OUT_DIR.mkdir(parents=True, exist_ok=True)

print("PROC_DIR:", PROC_DIR)
print("OUT_DIR :", OUT_DIR)
print("MODEL_ID:", MODEL_ID)

PROC_DIR: /kaggle/input/data-vlsp/processed
OUT_DIR : /kaggle/working/vlsp_step2
MODEL_ID: Qwen/Qwen2.5-1.5B-Instruct


In [7]:
# Cell 2 — Load tokenizer + define prompt templates (EN->VI and VI->EN)
from transformers import AutoTokenizer

# Load the tokenizer published with MODEL_ID, requesting the fast implementation.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)
# Fall back to the EOS token for padding when the tokenizer defines no pad token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

def make_prompt_en2vi(src_en: str) -> str:
    """Build the English-to-Vietnamese prompt for one source sentence.

    The prompt ends with the ``### Vietnamese:`` header and no trailing
    whitespace, so the translation is whatever follows it.
    """
    prompt_lines = [
        "You are a professional medical translator.",
        "### Task: Translate English to Vietnamese (medical domain)",
        f"### English: {src_en}",
        "### Vietnamese:",
    ]
    return "\n".join(prompt_lines)

def make_prompt_vi2en(src_vi: str) -> str:
    """Build the Vietnamese-to-English prompt for one source sentence.

    The prompt ends with the ``### English:`` header and no trailing
    whitespace, so the translation is whatever follows it.
    """
    prompt_lines = [
        "You are a professional medical translator.",
        "### Task: Translate Vietnamese to English (medical domain)",
        f"### Vietnamese: {src_vi}",
        "### English:",
    ]
    return "\n".join(prompt_lines)

# Show which special tokens are in effect, then one rendered prompt per direction.
print("Tokenizer OK. eos_token =", repr(tokenizer.eos_token), "| pad_token =", repr(tokenizer.pad_token))
for demo_title, demo_prompt in (
    ("\nExample EN->VI prompt:\n", make_prompt_en2vi("Juvenile idiopathic arthritis (JIA) is uncommon.")),
    ("\nExample VI->EN prompt:\n", make_prompt_vi2en("Bệnh lý viêm khớp thiếu niên tự phát không hay gặp.")),
):
    print(demo_title, demo_prompt)



tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Tokenizer OK. eos_token = '<|im_end|>' | pad_token = '<|endoftext|>'

Example EN->VI prompt:
 You are a professional medical translator.
### Task: Translate English to Vietnamese (medical domain)
### English: Juvenile idiopathic arthritis (JIA) is uncommon.
### Vietnamese:

Example VI->EN prompt:
 You are a professional medical translator.
### Task: Translate Vietnamese to English (medical domain)
### Vietnamese: Bệnh lý viêm khớp thiếu niên tự phát không hay gặp.
### English:


In [8]:
# Cell 3 — Build SFT datasets (2 directions) from processed files; save to disk
from datasets import Dataset, DatasetDict, Features, Value
import unicodedata

def _clean_line(x: str) -> str:
    # minimal cleaning: strip + NFC
    x = x.rstrip("\n\r")
    x = x.strip()
    x = unicodedata.normalize("NFC", x)
    return x

def build_sft_split(split: str, direction: str, filter_badchar: bool = True) -> Dataset:
    """
    Build one SFT split for a translation direction from the processed files.

    direction: 'en2vi' or 'vi2en'
    Produces fields: id, src, tgt, text

    - ``id`` is the 1-based line number in the processed files.
    - ``src`` / ``tgt`` are the cleaned source and target sentences.
    - ``text`` is prompt + " " + target + the tokenizer's EOS token.

    Pairs where either side is empty after cleaning are skipped. When
    ``filter_badchar`` is true, pairs containing U+FFFD on either side are
    skipped too.

    Raises (from the generator, while the dataset is being built):
        ValueError: if the two files do not have the same number of lines,
            because pairing them any further would misalign the corpus.
    """
    # Function-scope import so this definition does not depend on another cell.
    from itertools import zip_longest

    assert direction in ["en2vi", "vi2en"]
    src_lang = "en" if direction == "en2vi" else "vi"
    tgt_lang = "vi" if direction == "en2vi" else "en"
    src_path = PROC_DIR / f"{split}.{src_lang}"
    tgt_path = PROC_DIR / f"{split}.{tgt_lang}"

    # Resolve per-split constants once instead of on every row.
    make_prompt = make_prompt_en2vi if direction == "en2vi" else make_prompt_vi2en
    eos = tokenizer.eos_token or ""
    replacement_char = "\ufffd"  # inserted by errors="replace" for undecodable bytes

    feats = Features({
        "id": Value("int32"),
        "src": Value("string"),
        "tgt": Value("string"),
        "text": Value("string"),
    })

    def gen():
        with src_path.open("r", encoding="utf-8", errors="replace") as fs, \
             tgt_path.open("r", encoding="utf-8", errors="replace") as ft:
            # zip_longest (not zip) so a shorter file is detected rather than
            # silently truncating the longer one.
            for i, (s, t) in enumerate(zip_longest(fs, ft), start=1):
                if s is None or t is None:
                    raise ValueError(
                        f"Line count mismatch between {src_path} and {tgt_path} at line {i}"
                    )

                s = _clean_line(s)
                t = _clean_line(t)

                # no empty lines expected, but keep robust
                if (not s) or (not t):
                    continue

                # optionally drop pairs that contain a decode-replacement char
                if filter_badchar and (replacement_char in s or replacement_char in t):
                    continue

                text = make_prompt(s) + " " + t + eos
                yield {"id": i, "src": s, "tgt": t, "text": text}

    ds = Dataset.from_generator(gen, features=feats)
    ds = ds.with_format("python")
    return ds

def build_and_save(direction: str):
    """Build the train and valid SFT splits for ``direction`` and save them.

    Both splits are built with ``filter_badchar=True``. The resulting
    DatasetDict is written to ``OUT_DIR / f"sft_{direction}"``; that path is
    returned after the split sizes are printed.
    """
    # Only train/valid are materialized here; the test split is not built.
    split_sets = {
        split_name: build_sft_split(split_name, direction, filter_badchar=True)
        for split_name in ("train", "valid")
    }

    save_path = OUT_DIR / f"sft_{direction}"
    DatasetDict(**split_sets).save_to_disk(str(save_path))

    print(f"Saved: {save_path}")
    print(" - train:", len(split_sets["train"]), "valid:", len(split_sets["valid"]))
    return save_path

# Build and persist one dataset per translation direction; keep the save
# paths so later cells can reload the datasets from disk.
path_en2vi = build_and_save("en2vi")
path_vi2en = build_and_save("vi2en")

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/489999 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

Saved: /kaggle/working/vlsp_step2/sft_en2vi
 - train: 489999 valid: 10000


Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/489999 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

Saved: /kaggle/working/vlsp_step2/sft_vi2en
 - train: 489999 valid: 10000


In [9]:
# Cell 4 — Quick sanity: load back + print a few samples (both directions)
from datasets import load_from_disk
import random

# Reload what was just saved, so the checks below run against the on-disk copies.
ds_en2vi = load_from_disk(str(path_en2vi))
ds_vi2en = load_from_disk(str(path_vi2en))

def show_samples(ds, split="train", k=3, seed=42):
    """Print ``k`` randomly chosen examples from ``ds[split]``.

    Indices are drawn with replacement from a generator seeded with ``seed``,
    so the same example may be shown more than once. ``src`` and ``tgt`` are
    cut to 200 characters and ``text`` to 400.
    """
    rng = random.Random(seed)
    rows = ds[split]
    for sample_no in range(1, k + 1):
        row = rows[rng.randrange(0, len(rows))]
        print(f"\nSample {sample_no} | id={row['id']}")
        print("SRC:", row["src"][:200])
        print("TGT:", row["tgt"][:200])
        print("TEXT(prompt+target) preview:\n", row["text"][:400], " ...")

# Per direction: split sizes first, then two random training examples.
for size_label, sft_ds, sample_seed in (
    ("EN2VI sizes:", ds_en2vi, 1),
    ("\nVI2EN sizes:", ds_vi2en, 2),
):
    print(size_label, {name: len(part) for name, part in sft_ds.items()})
    show_samples(sft_ds, "train", k=2, seed=sample_seed)

EN2VI sizes: {'train': 489999, 'valid': 10000}

Sample 1 | id=70446
SRC: In females, who have 2 (or, with sex chromosomal abnormalities, > 2) X chromosomes (except in eggs), all but one of the X chromosomes is inactivated; ie, most of the alleles on that chromosome are not
TGT: Ở nữ, những người có 2 (hoặc, với bất thường nhiễm sắc thể giới tính, > 2) nhiễm sắc thể X (ngoại trừ ở trứng), tất cả trừ một trong các nhiễm sắc thể X bị bất hoạt; tức là, hầu hết các alen trên nhiễ
TEXT(prompt+target) preview:
 You are a professional medical translator.
### Task: Translate English to Vietnamese (medical domain)
### English: In females, who have 2 (or, with sex chromosomal abnormalities, > 2) X chromosomes (except in eggs), all but one of the X chromosomes is inactivated; ie, most of the alleles on that chromosome are not expressed.
### Vietnamese: Ở nữ, những người có 2 (hoặc, với bất thường nhiễm sắc th  ...

Sample 2 | id=298428
SRC: This result contributes to supporting clinicians in the p

In [10]:
# Cell 5 — Token length estimation (to decide max_seq_length later)
import numpy as np

def token_len_stats(ds_split, sample_n=20000, seed=123):
    """Summarize token lengths of the ``text`` field over a random sample.

    Draws ``min(sample_n, len(ds_split))`` distinct rows with a generator
    seeded by ``seed``, tokenizes each ``text`` with the module-level
    ``tokenizer`` (``add_special_tokens=False``) and returns a dict with
    ``n_sample``, ``p50``, ``p90``, ``p95``, ``p99``, ``max`` (ints) and
    ``mean`` (float). Percentiles are truncated to int.
    """
    total = len(ds_split)
    take = min(sample_n, total)
    picked = np.random.default_rng(seed).choice(total, size=take, replace=False)

    # numpy integers are converted to plain int before indexing the split
    lens = np.array([
        len(tokenizer(ds_split[int(row)]["text"], add_special_tokens=False)["input_ids"])
        for row in picked
    ])

    stats = {"n_sample": int(take)}
    for q in (50, 90, 95, 99):
        stats[f"p{q}"] = int(np.percentile(lens, q))
    stats["max"] = int(lens.max())
    stats["mean"] = float(lens.mean())
    return stats

# Train is sampled at 20k rows, valid at 10k; a blank line separates the two directions.
for lead, direction_tag, sft_ds in (
    ("", "EN2VI", ds_en2vi),
    ("\n", "VI2EN", ds_vi2en),
):
    print(f"{lead}{direction_tag} train token stats:", token_len_stats(sft_ds["train"], sample_n=20000))
    print(f"{direction_tag} valid token stats:", token_len_stats(sft_ds["valid"], sample_n=10000))

EN2VI train token stats: {'n_sample': 20000, 'p50': 88, 'p90': 158, 'p95': 193, 'p99': 304, 'max': 1332, 'mean': 101.54205}
EN2VI valid token stats: {'n_sample': 10000, 'p50': 88, 'p90': 156, 'p95': 192, 'p99': 296, 'max': 1192, 'mean': 100.5592}

VI2EN train token stats: {'n_sample': 20000, 'p50': 88, 'p90': 158, 'p95': 193, 'p99': 304, 'max': 1332, 'mean': 101.5537}
VI2EN valid token stats: {'n_sample': 10000, 'p50': 88, 'p90': 156, 'p95': 192, 'p99': 297, 'max': 1192, 'mean': 100.5716}
