In [1]:
from pathlib import Path

PROC_DIR = Path("../data/processed")
SPM_DIR = Path("../data/spm")
SPM_DIR.mkdir(parents=True, exist_ok=True)

combined_path = SPM_DIR / "train_combined.txt"

with open(PROC_DIR / "train.en", encoding="utf-8") as f_en, \
     open(PROC_DIR / "train.vi", encoding="utf-8") as f_vi, \
     open(combined_path, "w", encoding="utf-8") as f_out:
    for line in f_en:
        f_out.write(line)
    for line in f_vi:
        f_out.write(line)

print("Done, combined file at:", combined_path)

Done, combined file at: ..\data\spm\train_combined.txt


In [2]:
import sentencepiece as spm

input_file = str(combined_path)
model_prefix = str(SPM_DIR / "spm_unigram")   # sẽ tạo spm_unigram.model & .vocab

spm.SentencePieceTrainer.Train(
    input=input_file,
    model_prefix=model_prefix,
    vocab_size=8000,
    model_type="unigram",          # hoặc "bpe" nếu bạn thích
    character_coverage=0.9995,     # bao gần hết ký tự
    pad_id=0,
    unk_id=1,
    bos_id=2,
    eos_id=3
)

In [5]:
from pathlib import Path
import sys

# Thêm đường dẫn tới thư mục src vào sys.path
ROOT = Path("..").resolve()
SRC_DIR = ROOT / "src"
sys.path.append(str(SRC_DIR))

from tokenizer import SubwordTokenizer

tok = SubwordTokenizer(ROOT / "data/spm/spm_unigram.model")

sample_en = "We need to build a simple Transformer from scratch."
sample_vi = "Chúng ta cần xây dựng một mô hình Transformer đơn giản từ đầu."

print("EN ids:", tok.encode_src(sample_en))
dec_in, dec_out = tok.encode_tgt(sample_vi)
print("VI dec_in:", dec_in)
print("VI dec_out:", dec_out)

print("Decoded EN:", tok.decode(tok.encode_src(sample_en)))
print("Decoded VI:", tok.decode(dec_in))

EN ids: [18, 1, 143, 343, 11, 1054, 13, 1039, 18, 1, 2057, 6, 4300, 204, 136, 4455, 199]
VI dec_in: [2, 18, 1, 266, 2510, 65, 37, 213, 539, 19, 605, 20, 386, 192, 18, 1, 2057, 6, 4300, 204, 490, 637, 98, 121, 199]
VI dec_out: [18, 1, 266, 2510, 65, 37, 213, 539, 19, 605, 20, 386, 192, 18, 1, 2057, 6, 4300, 204, 490, 637, 98, 121, 199, 3]
Decoded EN:  ⁇ e need to build a simple  ⁇ ransformer from scratch.
Decoded VI:  ⁇ húng ta cần xây dựng một mô hình  ⁇ ransformer đơn giản từ đầu.
