Setup & Imports

In [None]:
import sys
import numpy as np
import sentencepiece as spm
import torch
from pathlib import Path
from torch.utils.data import DataLoader

# 1. Path configuration.
# NOTE: the notebook lives in the 'notebook/' directory, so the project root
# is '..' (resolved against the current working directory).
ROOT = Path("..").resolve()
DATA_RAW = ROOT.joinpath("data", "raw")
DATA_PROC = ROOT.joinpath("data", "processed")
DATA_SPM = ROOT.joinpath("data", "spm")
SRC_DIR = ROOT.joinpath("src")

# 2. Create the output directories when they do not exist yet.
for _out_dir in (DATA_PROC, DATA_SPM):
    _out_dir.mkdir(parents=True, exist_ok=True)

# 3. Put the 'src' folder on sys.path so its modules can be imported.
_src_entry = str(SRC_DIR)
if _src_entry not in sys.path:
    sys.path.append(_src_entry)

print(
    f"Project Root: {ROOT}",
    f"Data Raw: {DATA_RAW}",
    f"Data Processed: {DATA_PROC}",
    sep="\n",
)

Check Raw Data

In [None]:
print("=== 1. CHECK RAW DATA ===")
# Every split/language combination expected under data/raw.
files = [
    f"{split_name}.{lang}"
    for split_name in ("train", "valid", "test")
    for lang in ("en", "vi")
]

for fn in files:
    path = DATA_RAW / fn
    if not path.exists():
        # Runtime hint (Vietnamese): "please re-check the data/raw directory".
        print(f"⚠️ {fn:<10}: NOT FOUND (Hãy kiểm tra lại thư mục data/raw)")
        continue

    # Count lines by streaming the file instead of loading it into memory.
    with open(path, encoding="utf-8") as f:
        n_lines = sum(1 for _ in f)
    print(f"{fn:<10}: {n_lines} lines")

Statistics

In [None]:
print("\n=== 2. STATISTICS (RAW DATA) ===")


def load_lines(path):
    """Return the non-blank lines of ``path``, stripped and lower-cased.

    Args:
        path: A ``pathlib.Path`` pointing at a UTF-8 text file.

    Returns:
        A list with one cleaned string per non-blank line, in file order.
        An empty list when ``path`` does not exist.
    """
    if not path.exists():
        return []

    cleaned = []
    with open(path, encoding="utf-8") as handle:
        for raw in handle:
            text = raw.strip()
            # Whitespace-only lines are skipped entirely.
            if text:
                cleaned.append(text.lower())
    return cleaned

# Temporarily load the training split, only for the statistics below.
train_en, train_vi = (
    load_lines(DATA_RAW / f"train.{lang}") for lang in ("en", "vi")
)

def summarize_lengths(name, sents):
    """Print word-count statistics for a list of sentences.

    Sentence length is the number of whitespace-separated words. The report
    contains min, max, mean, median and the 95th/99th percentiles.

    Args:
        name: Label shown in the report header.
        sents: List of sentence strings; nothing is printed when it is empty.
    """
    if not sents:
        return

    # Words are obtained by splitting on whitespace.
    arr = np.array([len(sentence.split()) for sentence in sents])

    report = [
        f"\n--- {name} ---",
        f"Min: {arr.min()}, Max: {arr.max()}",
        f"Mean: {arr.mean():.2f}, Median: {np.median(arr)}",
        f"95th percentile: {np.percentile(arr, 95)}",
        f"99th percentile: {np.percentile(arr, 99)}",
    ]
    print("\n".join(report))

# Report sentence-length statistics for each side of the training split.
summarize_lengths("English", train_en)
summarize_lengths("Vietnamese", train_vi)

# Delete the variables to free RAM (the sentence lists are no longer needed).
del train_en, train_vi

Filter & Process

In [None]:
print("\n=== 3. PREPROCESSING & FILTERING ===")

# Sentence-length cutoff (chosen from the 99th-percentile statistics).
MAX_LEN = 100


def clean_line(line: str) -> str:
    """Return ``line`` lower-cased with surrounding whitespace removed."""
    trimmed = line.strip()
    return trimmed.lower()

def process_split(split_name):
    """Clean and filter one parallel split, writing the result to ``DATA_PROC``.

    Reads ``<split_name>.en`` and ``<split_name>.vi`` from ``DATA_RAW`` in
    lockstep and normalises every line with ``clean_line``. A sentence pair is
    dropped when either side is empty after cleaning, or when either side has
    more than ``MAX_LEN`` whitespace-separated words. Surviving pairs are
    written line-aligned to files of the same names under ``DATA_PROC``.

    If the two input files do not contain the same number of lines, only the
    pairs up to the end of the shorter file are processed and a warning is
    printed, because a length mismatch usually means the corpus is misaligned.

    Args:
        split_name: Split to process, e.g. ``"train"``, ``"valid"``, ``"test"``.

    Returns:
        None. A summary (and possibly a warning) is printed; when either input
        file is missing the split is skipped and nothing is written.
    """
    # Local import so this cell does not depend on the notebook's import cell.
    from itertools import zip_longest

    src_in = DATA_RAW / f"{split_name}.en"
    tgt_in = DATA_RAW / f"{split_name}.vi"

    src_out = DATA_PROC / f"{split_name}.en"
    tgt_out = DATA_PROC / f"{split_name}.vi"

    if not src_in.exists() or not tgt_in.exists():
        print(f"Skipping {split_name} (files not found)")
        return

    kept = 0
    dropped = 0
    longer_side = None  # set to the file name that has unpaired extra lines

    with open(src_in, encoding="utf-8") as f_src, \
         open(tgt_in, encoding="utf-8") as f_tgt, \
         open(src_out, "w", encoding="utf-8") as f_src_out, \
         open(tgt_out, "w", encoding="utf-8") as f_tgt_out:

        # zip_longest (fill value None) exposes a length mismatch that plain
        # zip would silently hide; lines read from a file are never None.
        for s_en, s_vi in zip_longest(f_src, f_tgt):
            if s_en is None or s_vi is None:
                longer_side = tgt_in.name if s_en is None else src_in.name
                break

            s_en = clean_line(s_en)
            s_vi = clean_line(s_vi)

            # 1. Drop pairs with an empty side.
            if not s_en or not s_vi:
                dropped += 1
                continue

            # 2. Drop overly long sentences (outliers).
            if len(s_en.split()) > MAX_LEN or len(s_vi.split()) > MAX_LEN:
                dropped += 1
                continue

            f_src_out.write(s_en + "\n")
            f_tgt_out.write(s_vi + "\n")
            kept += 1

    print(f"Processed {split_name.upper()}: Kept {kept}, Dropped {dropped}")
    if longer_side is not None:
        print(
            f"⚠️ {split_name}: line counts differ; {longer_side} has extra "
            f"unpaired lines that were ignored. Check corpus alignment."
        )

# Run the preprocessing for all three splits.
for split in ("train", "valid", "test"):
    process_split(split)

Tokenizer (SentencePiece)

In [None]:
print("\n=== 4. TRAIN TOKENIZER (SENTENCEPIECE) ===")

# 4.1. Build one combined text file from the processed TRAIN split.
combined_path = DATA_SPM / "train_combined.txt"
path_train_en = DATA_PROC / "train.en"
path_train_vi = DATA_PROC / "train.vi"

if not (path_train_en.exists() and path_train_vi.exists()):
    print("❌ Error: Processed training files not found.")
else:
    with open(path_train_en, encoding="utf-8") as f_en, \
         open(path_train_vi, encoding="utf-8") as f_vi, \
         open(combined_path, "w", encoding="utf-8") as f_out:
        # English lines first, then Vietnamese lines, copied verbatim.
        f_out.writelines(f_en)
        f_out.writelines(f_vi)

    print(f"Created combined file: {combined_path}")

    # 4.2. Train the SentencePiece model.
    model_prefix = str(DATA_SPM / "spm_unigram")

    # Tokenizer configuration.
    spm_options = {
        "input": str(combined_path),
        "model_prefix": model_prefix,
        "vocab_size": 8000,            # size of the shared vocabulary
        "model_type": "unigram",       # Unigram algorithm
        "character_coverage": 0.9995,  # character coverage
        # Explicit ids for the special tokens.
        "pad_id": 0,
        "unk_id": 1,
        "bos_id": 2,
        "eos_id": 3,
    }
    spm.SentencePieceTrainer.Train(**spm_options)
    print(f"✅ Tokenizer trained successfully! Saved to: {model_prefix}.model")

Dataset & DataLoader

In [None]:
# Smoke test of the whole data pipeline: load the trained tokenizer, build the
# training dataset and DataLoader, and print what a single batch contains.
# Any failure is reported as a one-line message instead of a traceback.
print("\n=== 5. VERIFY DATA PIPELINE ===")

try:
    # Import the project's own classes from src/ (not visible in this notebook).
    from tokenizer import SubwordTokenizer
    from dataset import NMTDataset, collate_fn

    # 1. Load Tokenizer
    model_path = DATA_SPM / "spm_unigram.model"
    if not model_path.exists():
        raise FileNotFoundError("Tokenizer model not found! Run Cell 5 first.")
        
    tok = SubwordTokenizer(model_path)
    
    # Encode one sample sentence as a quick check.
    test_str = "hello world"
    print(f"Test Tokenizer ('{test_str}'): {tok.encode_src(test_str)}")

    # 2. Build the Dataset (loaded from the processed folder).
    train_dataset = NMTDataset(
        data_dir=str(DATA_PROC),
        split="train",
        tokenizer=tok,
        max_src_len=70,
        max_tgt_len=70,
    )
    print(f"Dataset Size: {len(train_dataset)}")

    # 3. Build the DataLoader.
    train_loader = DataLoader(
        train_dataset,
        batch_size=4,        # small batch, for testing only
        shuffle=True,
        # Bind the tokenizer's pad id into the project's collate function.
        collate_fn=lambda batch: collate_fn(batch, pad_id=tok.pad_id)
    )

    # 4. Fetch a single batch.
    batch = next(iter(train_loader))
    print("\n✅ Batch Output Shapes:")
    # Assumes the batch is a dict whose values expose .shape/.dtype
    # (presumably tensors) — TODO confirm against src/dataset.py.
    for k, v in batch.items():
        print(f" - {k}: {v.shape} | Type: {v.dtype}")

except ImportError as e:
    print(f"❌ ImportError: {e}")
    # Runtime hint (Vietnamese): "check src/tokenizer.py and src/dataset.py".
    print("Gợi ý: Kiểm tra lại file src/tokenizer.py và src/dataset.py")
except Exception as e:
    # Catch-all for this diagnostic cell: only the message is shown.
    print(f"❌ Error: {e}")