In [2]:
from pathlib import Path
import numpy as np

# Directory holding the raw corpus files; the relative path is resolved
# against the notebook's current working directory.
RAW_DIR = Path("../data/raw")

def load_and_clean(path):
    """Read a UTF-8 text file and return its non-blank lines, cleaned.

    Each line has leading/trailing whitespace removed; lines that are empty
    after stripping are skipped; the remaining lines are lower-cased.

    Args:
        path: Path (str or path-like) of the text file to read.

    Returns:
        list[str]: the cleaned, non-empty lines in file order.
    """
    with open(path, encoding="utf-8") as handle:
        # Strip first so whitespace-only lines are recognised as blank.
        stripped = (raw.strip() for raw in handle)
        return [text.lower() for text in stripped if text]

# Load both sides of the training corpus. Blank lines are skipped independently
# in each file, so the two lists can end up with different lengths and must not
# be paired by index; below they only feed per-language length statistics.
train_en = load_and_clean(RAW_DIR / "train.en")
train_vi = load_and_clean(RAW_DIR / "train.vi")

# Display the number of non-blank sentences loaded for each language.
len(train_en), len(train_vi)

(133168, 133205)

In [3]:
def sent_lengths(sents):
    """Return the number of whitespace-separated tokens in each sentence.

    Args:
        sents: Iterable of sentence strings.

    Returns:
        list[int]: one token count per input sentence, in input order.
    """
    counts = []
    for sentence in sents:
        tokens = sentence.split()
        counts.append(len(tokens))
    return counts

# Whitespace-token counts per sentence, one list per language.
len_en, len_vi = map(sent_lengths, (train_en, train_vi))

def summarize(name, lengths):
    """Print summary statistics for a collection of lengths.

    Prints a ``== name ==`` header followed by the minimum, mean, median,
    95th percentile, 99th percentile and maximum, one per line.

    Args:
        name: Label shown in the header line.
        lengths: Sequence of numbers (e.g. token counts per sentence).

    Returns:
        None. All output goes to stdout.
    """
    arr = np.array(lengths)
    print(f"== {name} ==")

    if arr.size == 0:
        # NumPy reductions such as min()/max() raise ValueError on a
        # zero-size array, so report the empty case explicitly instead.
        print("(no data)")
        return

    print("min:", arr.min())
    print("mean:", arr.mean())
    print("median:", np.median(arr))
    print("95 percentile:", np.percentile(arr, 95))
    print("99 percentile:", np.percentile(arr, 99))
    print("max:", arr.max())

# Print length statistics (in whitespace tokens) for each language.
summarize("EN", len_en)
summarize("VI", len_vi)


== EN ==
min: 1
mean: 20.322111918779285
median: 16.0
95 percentile: 47.0
99 percentile: 71.0
max: 628
== VI ==
min: 1
mean: 24.86023797905484
median: 20.0
95 percentile: 59.0
99 percentile: 89.0
max: 850


In [None]:
from pathlib import Path

# Input directory with the raw corpus files.
RAW_DIR = Path("../data/raw")
# Output directory for the cleaned/filtered files; created (including any
# missing parents) if it does not exist yet, and left alone if it does.
PROC_DIR = Path("../data/processed")
PROC_DIR.mkdir(parents=True, exist_ok=True)

MAX_LEN_FILTER = 100   # length filter threshold, counted in whitespace-separated tokens
def clean_line(line: str) -> str:
    """Return ``line`` with surrounding whitespace removed and lower-cased.

    Args:
        line: One raw line of text (may include a trailing newline).

    Returns:
        The stripped, lower-cased text; empty if the line was blank.
    """
    # Extension point: Unicode normalization, diacritic handling, etc.
    # could be added here later if needed.
    return line.strip().lower()

def filter_and_save(split):
    """Clean, filter and write one parallel split from RAW_DIR to PROC_DIR.

    Reads ``{split}.en`` and ``{split}.vi`` from ``RAW_DIR`` in lockstep,
    passes every line through ``clean_line`` and writes the surviving pairs
    to files of the same names under ``PROC_DIR``. A pair is dropped when
    either side is empty after cleaning, or when either side has more than
    ``MAX_LEN_FILTER`` whitespace-separated tokens. Both sides of a pair are
    always kept or dropped together, so the two output files stay aligned.

    A one-line ``kept``/``dropped`` summary is printed on success.

    Args:
        split: Base name of the split, e.g. ``"train"``.

    Raises:
        ValueError: if the two raw files do not have the same number of
            lines. Pairs read before the mismatch have already been written
            to the output files when this is raised.
    """
    # Imported locally so the function carries its own dependency.
    from itertools import zip_longest

    src_in = RAW_DIR / f"{split}.en"
    tgt_in = RAW_DIR / f"{split}.vi"

    src_out = PROC_DIR / f"{split}.en"
    tgt_out = PROC_DIR / f"{split}.vi"

    kept = 0
    dropped = 0

    with open(src_in, encoding="utf-8") as f_src, \
         open(tgt_in, encoding="utf-8") as f_tgt, \
         open(src_out, "w", encoding="utf-8") as f_src_out, \
         open(tgt_out, "w", encoding="utf-8") as f_tgt_out:

        # zip_longest instead of zip: plain zip stops at the shorter file and
        # would silently hide a misaligned corpus. Lines read from a file are
        # always str, so None is a safe "file exhausted" marker.
        for line_no, (s_en, s_vi) in enumerate(zip_longest(f_src, f_tgt), start=1):
            if s_en is None or s_vi is None:
                shorter = src_in if s_en is None else tgt_in
                raise ValueError(
                    f"{split}: {shorter} ends before line {line_no} while the "
                    "other side has more lines; the raw files are not line-aligned"
                )

            s_en = clean_line(s_en)
            s_vi = clean_line(s_vi)

            # Drop the pair if either side is blank after cleaning.
            if not s_en or not s_vi:
                dropped += 1
                continue

            # Drop the pair if either side exceeds the token-length limit.
            if len(s_en.split()) > MAX_LEN_FILTER or len(s_vi.split()) > MAX_LEN_FILTER:
                dropped += 1
                continue

            f_src_out.write(s_en + "\n")
            f_tgt_out.write(s_vi + "\n")
            kept += 1

    print(f"{split}: kept = {kept}, dropped = {dropped}")

# Process every split; each call rewrites that split's files under PROC_DIR
# and prints its kept/dropped counts.
for split in ["train", "valid", "test"]:
    filter_and_save(split)

train: kept = 132406, dropped = 911
valid: kept = 1550, dropped = 3
test: kept = 1262, dropped = 6


In [5]:
# Sanity check: count the lines of every processed file and require that the
# two sides of each split have the same number of lines.
for split in ["train", "valid", "test"]:
    line_counts = {}
    for lang in ("en", "vi"):
        with open(PROC_DIR / f"{split}.{lang}", encoding="utf-8") as f:
            line_counts[lang] = sum(1 for _ in f)
        print(split, f"{lang.upper()} lines:", line_counts[lang])
    print("----")
    # Fail loudly instead of relying on someone eyeballing the printed numbers.
    assert line_counts["en"] == line_counts["vi"], (
        f"{split}: EN/VI line counts differ "
        f"({line_counts['en']} vs {line_counts['vi']})"
    )

train EN lines: 132406
train VI lines: 132406
----
valid EN lines: 1550
valid VI lines: 1550
----
test EN lines: 1262
test VI lines: 1262
----
