In [23]:
from pathlib import Path

# Resolved against the current working directory; the ".." prefix assumes the
# kernel was started from a sub-folder of the project root.
raw_dir = Path("../data/raw")

# Every split of the parallel corpus, English side first.
files = [
    "train.en", "train.vi",
    "valid.en", "valid.vi",
    "test.en", "test.vi",
]

# Report the number of lines in each file.
for fn in files:
    path = raw_dir / fn
    n_lines = 0
    with path.open(encoding="utf-8") as f:
        for line in f:
            n_lines += 1
    print(f"{fn}: {n_lines} lines")

train.en: 794523 lines
train.vi: 794523 lines
valid.en: 1553 lines
valid.vi: 1553 lines
test.en: 1268 lines
test.vi: 1268 lines


In [19]:
# ============================================================
# CELL 0: Imports & paths (chạy đầu tiên)
# ============================================================
from pathlib import Path
import re
import html
import unicodedata

# When the notebook lives in a sub-folder of the project (e.g. /notebook),
# ".." points at the project root.
ROOT = Path("..").resolve()

# All data folders hang off <ROOT>/data.
DATA_DIR = ROOT / "data"
RAW_DIR, OPUS_DIR, PROC_DIR = (
    DATA_DIR / "raw",
    DATA_DIR / "opus",
    DATA_DIR / "processed",
)

# Create the folders up front so later cells can write into them.
for p in (RAW_DIR, OPUS_DIR, PROC_DIR):
    p.mkdir(parents=True, exist_ok=True)

# Echo the resolved locations.
for label, location in (
    ("ROOT   :", ROOT),
    ("RAW    :", RAW_DIR),
    ("OPUS   :", OPUS_DIR),
    ("PROC   :", PROC_DIR),
):
    print(label, location)

def count_lines(path: Path) -> int:
    """Return the number of lines in the UTF-8 text file at ``path``.

    The file is streamed line by line, so it is never fully loaded in memory.
    """
    total = 0
    with path.open(encoding="utf-8") as handle:
        for _ in handle:
            total += 1
    return total

def stats(prefix: str, en_path: Path, vi_path: Path):
    """Print the line counts of an EN/VI file pair, tagged with ``prefix``.

    Counts are obtained via ``count_lines`` (English file first) and printed
    with thousands separators. Nothing is returned.
    """
    n_en = count_lines(en_path)
    n_vi = count_lines(vi_path)
    print(f"[{prefix}] EN={n_en:,}  |  VI={n_vi:,}")


ROOT   : C:\Users\ADMIN\Desktop\nmt-transformer
RAW    : C:\Users\ADMIN\Desktop\nmt-transformer\data\raw
OPUS   : C:\Users\ADMIN\Desktop\nmt-transformer\data\opus
PROC   : C:\Users\ADMIN\Desktop\nmt-transformer\data\processed


In [20]:
# ============================================================
# CELL 1: Verify that the IWSLT + OPUS files are present
# ============================================================
# The six IWSLT files expected in raw/
required_raw = [
    "train.en", "train.vi",
    "valid.en", "valid.vi",
    "test.en",  "test.vi",
]

# The four OPUS files (TED + QED) expected in opus/
required_opus = [
    "TED2020.en-vi.en",
    "TED2020.en-vi.vi",
    "QED.en-vi.en",
    "QED.en-vi.vi",
]

# Check RAW first, then OPUS; stop at the first missing file.
for folder, names, missing_msg in (
    (RAW_DIR, required_raw, "Missing RAW file:"),
    (OPUS_DIR, required_opus, "Missing OPUS file:"),
):
    for name in names:
        p = folder / name
        if p.exists():
            print("OK:", p)
        else:
            raise FileNotFoundError(f"{missing_msg} {p}")

# Line counts for each IWSLT split currently in raw/.
for tag, en_name, vi_name in (
    ("IWSLT train (current RAW)", "train.en", "train.vi"),
    ("IWSLT valid", "valid.en", "valid.vi"),
    ("IWSLT test", "test.en", "test.vi"),
):
    stats(tag, RAW_DIR / en_name, RAW_DIR / vi_name)


OK: C:\Users\ADMIN\Desktop\nmt-transformer\data\raw\train.en
OK: C:\Users\ADMIN\Desktop\nmt-transformer\data\raw\train.vi
OK: C:\Users\ADMIN\Desktop\nmt-transformer\data\raw\valid.en
OK: C:\Users\ADMIN\Desktop\nmt-transformer\data\raw\valid.vi
OK: C:\Users\ADMIN\Desktop\nmt-transformer\data\raw\test.en
OK: C:\Users\ADMIN\Desktop\nmt-transformer\data\raw\test.vi
OK: C:\Users\ADMIN\Desktop\nmt-transformer\data\opus\TED2020.en-vi.en
OK: C:\Users\ADMIN\Desktop\nmt-transformer\data\opus\TED2020.en-vi.vi
OK: C:\Users\ADMIN\Desktop\nmt-transformer\data\opus\QED.en-vi.en
OK: C:\Users\ADMIN\Desktop\nmt-transformer\data\opus\QED.en-vi.vi
[IWSLT train (current RAW)] EN=133,317  |  VI=133,317
[IWSLT valid] EN=1,553  |  VI=1,553
[IWSLT test] EN=1,268  |  VI=1,268


In [21]:
# ============================================================
# CELL 2: Back up the IWSLT train files & define the parallel-concat helper
# ============================================================
# An existing backup is left untouched; otherwise the current train file is
# renamed (moved, not copied) to the backup name.
iwslt_en_bak = RAW_DIR / "train_iwslt.en"
iwslt_vi_bak = RAW_DIR / "train_iwslt.vi"

backup_plan = (
    (RAW_DIR / "train.en", iwslt_en_bak,
     "Backup train.en  -> train_iwslt.en", "Backup EN đã tồn tại:"),
    (RAW_DIR / "train.vi", iwslt_vi_bak,
     "Backup train.vi  -> train_iwslt.vi", "Backup VI đã tồn tại:"),
)

for source, backup, moved_msg, kept_msg in backup_plan:
    if backup.exists():
        print(kept_msg, backup)
        continue
    source.rename(backup)
    print(moved_msg)

def concat_parallel(en_paths, vi_paths, out_en: Path, out_vi: Path):
    """Concatenate several parallel corpora, keeping line-by-line alignment.

    The i-th file of ``en_paths`` is paired with the i-th file of
    ``vi_paths``. Pairs of lines are appended to ``out_en`` / ``out_vi`` in
    order; a pair is skipped only when BOTH sides are blank. Parent folders of
    the outputs are created if needed, and a summary is printed at the end.

    Args:
        en_paths: sequence of ``Path`` objects for the English files.
        vi_paths: sequence of ``Path`` objects for the Vietnamese files,
            same length and order as ``en_paths``.
        out_en: destination of the merged English file (overwritten).
        out_vi: destination of the merged Vietnamese file (overwritten).

    Raises:
        AssertionError: if ``en_paths`` and ``vi_paths`` differ in length.
        ValueError: if an output path is also one of the inputs, or if the two
            files of a pair do not have the same number of lines. In the
            second case the outputs may already be partially written.
    """
    assert len(en_paths) == len(vi_paths)

    # Opening an output with "w" truncates it, so writing onto one of the
    # inputs would destroy that corpus before it is read.
    sources = {p.resolve() for p in list(en_paths) + list(vi_paths)}
    for out_path in (out_en, out_vi):
        if out_path.resolve() in sources:
            raise ValueError(f"Output path is also an input: {out_path}")

    out_en.parent.mkdir(parents=True, exist_ok=True)
    out_vi.parent.mkdir(parents=True, exist_ok=True)

    total_pairs = 0
    with out_en.open("w", encoding="utf-8") as g_en, \
         out_vi.open("w", encoding="utf-8") as g_vi:
        for en_p, vi_p in zip(en_paths, vi_paths):
            print(f">> Adding {en_p.name} + {vi_p.name}")
            with en_p.open(encoding="utf-8") as f_en, \
                 vi_p.open(encoding="utf-8") as f_vi:
                for line_no, en_line in enumerate(f_en, start=1):
                    # readline() returns "" only at EOF (a blank line is "\n"),
                    # so an empty result means the VI file is shorter.
                    vi_line = f_vi.readline()
                    if not vi_line:
                        raise ValueError(
                            f"Line count mismatch: {vi_p} ends before "
                            f"{en_p} (at line {line_no})"
                        )

                    en_line = en_line.rstrip("\n")
                    vi_line = vi_line.rstrip("\n")

                    # Skip the pair only when both sides are blank.
                    if not en_line.strip() and not vi_line.strip():
                        continue

                    g_en.write(en_line + "\n")
                    g_vi.write(vi_line + "\n")
                    total_pairs += 1

                # Anything left on the VI side means the EN file is shorter.
                if f_vi.readline():
                    raise ValueError(
                        f"Line count mismatch: {en_p} ends before {vi_p}"
                    )

    print(f"=> Written {total_pairs:,} parallel pairs")
    print("   EN:", out_en)
    print("   VI:", out_vi)


Backup train.en  -> train_iwslt.en
Backup train.vi  -> train_iwslt.vi


In [22]:
# ============================================================
# CELL 3: MERGE IWSLT + TED2020 + QED -> data/raw/train.en/vi
# ============================================================
ted_en = OPUS_DIR / "TED2020.en-vi.en"
ted_vi = OPUS_DIR / "TED2020.en-vi.vi"
qed_en = OPUS_DIR / "QED.en-vi.en"
qed_vi = OPUS_DIR / "QED.en-vi.vi"

# One (EN, VI) pair per corpus, in the order they are appended.
corpus_pairs = [
    (iwslt_en_bak, iwslt_vi_bak),
    (ted_en, ted_vi),
    (qed_en, qed_vi),
]
en_files = [en_side for en_side, _vi_side in corpus_pairs]
vi_files = [vi_side for _en_side, vi_side in corpus_pairs]

# The merged corpus takes over the train.en / train.vi names in raw/.
merged_en = RAW_DIR / "train.en"
merged_vi = RAW_DIR / "train.vi"

concat_parallel(en_files, vi_files, merged_en, merged_vi)
stats("train (merged raw)", merged_en, merged_vi)


>> Adding train_iwslt.en + train_iwslt.vi
>> Adding TED2020.en-vi.en + TED2020.en-vi.vi
>> Adding QED.en-vi.en + QED.en-vi.vi
=> Written 794,523 parallel pairs
   EN: C:\Users\ADMIN\Desktop\nmt-transformer\data\raw\train.en
   VI: C:\Users\ADMIN\Desktop\nmt-transformer\data\raw\train.vi
[train (merged raw)] EN=794,523  |  VI=794,523
