In [1]:
import sys
sys.path.append('..')

import json
import sentencepiece as spm
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import numpy as np
from tqdm import tqdm
import os

## 1. Load Dataset

In [2]:
# Read the sentence pairs that notebook 01 filtered and stored as JSON.
print("Loading filtered data from JSON...")
with open('../data/raw/phomt_filtered.json', 'r', encoding='utf-8') as json_file:
    data = json.load(json_file)

# Sanity check: report how many pairs were read, then show the first one.
print(f"Loaded {len(data)} sentence pairs")

# NOTE(review): assumes `data` is a non-empty list of dicts with 'vi' and
# 'en' keys -- confirm against the output of notebook 01.
first_pair = data[0]
print("\nFirst example:")
print(f"VI: {first_pair['vi']}")
print(f"EN: {first_pair['en']}")

Loading filtered data from JSON...
Loaded 350000 sentence pairs

First example:
VI: Câu chuyện bắt đầu với buổi lễ đếm ngược .
EN: It begins with a countdown .


## 2. Data Cleaning

In [3]:
# The data was already filtered in notebook 01, so no extra cleaning happens
# here; this cell only dumps one sentence per line for SentencePiece training.
# NOTE(review): every pair in `data` is written, including the pairs that are
# later sliced off as val/test -- confirm that training the tokenizers on
# them is intended.

print("Preparing data for tokenizer training...")

# Make sure the output directory exists before writing into it
os.makedirs('../data/processed', exist_ok=True)

vi_train_file = '../data/processed/train_vi.txt'
en_train_file = '../data/processed/train_en.txt'

# One plain-text file per language, one sentence per line
for lang_key, corpus_path in (('vi', vi_train_file), ('en', en_train_file)):
    with open(corpus_path, 'w', encoding='utf-8') as corpus:
        corpus.writelines(pair[lang_key] + '\n' for pair in data)

print(f"‚úÖ Created training files:")
print(f"   {vi_train_file}")
print(f"   {en_train_file}")

Preparing data for tokenizer training...
✅ Created training files:
   ../data/processed/train_vi.txt
   ../data/processed/train_en.txt


## 3. Tokenization & Vocabulary Building

## 3.5. Split Dataset

In [4]:
# Train SentencePiece tokenizers for Vietnamese and English
print("=" * 60)
print("TRAINING SENTENCEPIECE TOKENIZERS")
print("=" * 60)

# Configuration
vocab_size = 32000
model_type = 'bpe'  # or 'unigram'


def train_tokenizer(input_file, model_prefix, character_coverage):
    """Train one SentencePiece model with the notebook's shared settings.

    Both languages go through this single helper so that the special-token
    layout (<pad>=0, <unk>=1, <s>=2, </s>=3, plus the user symbol <mask>)
    cannot drift apart between the two tokenizers; the dataset code below
    uses the same pad/bos/eos ids for source and target.

    Args:
        input_file: Path to a text file with one sentence per line.
        model_prefix: Output prefix; the model is loaded afterwards from
            `<model_prefix>.model`.
        character_coverage: Value passed through to the trainer's
            `character_coverage` option.
    """
    spm.SentencePieceTrainer.train(
        input=input_file,
        model_prefix=model_prefix,
        vocab_size=vocab_size,
        model_type=model_type,
        character_coverage=character_coverage,
        pad_id=0,
        unk_id=1,
        bos_id=2,
        eos_id=3,
        pad_piece='<pad>',
        unk_piece='<unk>',
        bos_piece='<s>',
        eos_piece='</s>',
        user_defined_symbols=['<mask>'],
        max_sentence_length=4096
    )


# Train Vietnamese tokenizer
print("\n[1/2] Training Vietnamese tokenizer...")
vi_model_prefix = '../data/processed/spm_vi'
train_tokenizer(vi_train_file, vi_model_prefix, character_coverage=0.9995)
print(f"‚úÖ Saved to {vi_model_prefix}.model")

# Train English tokenizer
print("\n[2/2] Training English tokenizer...")
en_model_prefix = '../data/processed/spm_en'
train_tokenizer(en_train_file, en_model_prefix, character_coverage=1.0)
print(f"‚úÖ Saved to {en_model_prefix}.model")

# Load tokenizers
sp_vi = spm.SentencePieceProcessor()
sp_vi.load(f'{vi_model_prefix}.model')

sp_en = spm.SentencePieceProcessor()
sp_en.load(f'{en_model_prefix}.model')

print(f"\n‚úÖ Vietnamese vocab size: {sp_vi.get_piece_size()}")
print(f"‚úÖ English vocab size: {sp_en.get_piece_size()}")

# Smoke test: show pieces and ids for one sentence per language
test_vi = "T√¥i ƒëang h·ªçc ti·∫øng Anh ."
test_en = "I am learning English ."
print(f"\nTest tokenization:")
print(f"VI: {test_vi}")
print(f"   Tokens: {sp_vi.encode_as_pieces(test_vi)}")
print(f"   IDs: {sp_vi.encode_as_ids(test_vi)}")
print(f"\nEN: {test_en}")
print(f"   Tokens: {sp_en.encode_as_pieces(test_en)}")
print(f"   IDs: {sp_en.encode_as_ids(test_en)}")

TRAINING SENTENCEPIECE TOKENIZERS

[1/2] Training Vietnamese tokenizer...
✅ Saved to ../data/processed/spm_vi.model

[2/2] Training English tokenizer...
✅ Saved to ../data/processed/spm_en.model

✅ Vietnamese vocab size: 32000
✅ English vocab size: 32000

Test tokenization:
VI: Tôi đang học tiếng Anh .
   Tokens: ['▁Tôi', '▁đang', '▁học', '▁tiếng', '▁Anh', '▁.']
   IDs: [150, 165, 185, 563, 659, 20]

EN: I am learning English .
   Tokens: ['▁I', '▁am', '▁learning', '▁English', '▁.']
   IDs: [42, 477, 1563, 2325, 15]


In [5]:
# Split data: 300K train, 25K val, 25K test
print("Splitting data into train/val/test...")

# Contiguous slices in file order (no shuffling); each boundary is the
# cumulative size of the splits before it.
train_end = 300000
val_end = train_end + 25000
test_end = val_end + 25000

train_data = data[:train_end]
val_data = data[train_end:val_end]
test_data = data[val_end:test_end]

# Labels are padded so the counts line up in the printed report
for label, subset in (("Train:", train_data), ("Val:  ", val_data), ("Test: ", test_data)):
    print(f"{label} {len(subset)} samples")

Splitting data into train/val/test...
Train: 300000 samples
Val:   25000 samples
Test:  25000 samples


## 4. Create PyTorch Dataset

In [6]:
# Create PyTorch Dataset class
class TranslationDataset(Dataset):
    """Map-style dataset that tokenizes Vietnamese/English pairs on access.

    Each item is encoded lazily in `__getitem__` as `<s> + tokens + </s>`
    and truncated to at most `max_length` ids, always ending in `</s>`.
    """

    def __init__(self, data, sp_src, sp_tgt, max_length=128):
        """
        Args:
            data: List of dicts with 'vi' and 'en' keys
            sp_src: SentencePiece processor for source (Vietnamese)
            sp_tgt: SentencePiece processor for target (English)
            max_length: Maximum sequence length (including BOS/EOS)

        Raises:
            ValueError: If max_length is below 2. A sequence needs room for
                both BOS and EOS, and for max_length <= 0 the truncation
                slice would get a negative stop and silently stop truncating.
        """
        if max_length < 2:
            raise ValueError(
                f"max_length must be at least 2 (BOS + EOS), got {max_length}"
            )

        self.data = data
        self.sp_src = sp_src
        self.sp_tgt = sp_tgt
        self.max_length = max_length

        # Special token IDs (must match the ids the tokenizers were trained with)
        self.pad_id = 0
        self.bos_id = 2
        self.eos_id = 3

    def __len__(self):
        """Return the number of sentence pairs."""
        return len(self.data)

    def _encode(self, tokenizer, text):
        """Encode `text` as a LongTensor of `<s> + ids + </s>`, truncated to max_length."""
        ids = [self.bos_id, *tokenizer.encode_as_ids(text), self.eos_id]
        if len(ids) > self.max_length:
            # Keep the first max_length-1 ids and re-append EOS so that a
            # truncated sequence still terminates properly.
            ids = ids[:self.max_length - 1] + [self.eos_id]
        return torch.tensor(ids, dtype=torch.long)

    def __getitem__(self, idx):
        """Return the encoded tensors and raw strings for pair `idx`.

        Returns:
            Dict with 'src' / 'tgt' (1-D torch.long tensors) and
            'src_text' / 'tgt_text' (the untokenized sentences).
        """
        item = self.data[idx]
        return {
            'src': self._encode(self.sp_src, item['vi']),
            'tgt': self._encode(self.sp_tgt, item['en']),
            'src_text': item['vi'],
            'tgt_text': item['en']
        }

# Wrap each split in a TranslationDataset (Vietnamese source, English target)
print("Creating PyTorch datasets...")
train_dataset, val_dataset, test_dataset = (
    TranslationDataset(split_rows, sp_vi, sp_en, max_length=128)
    for split_rows in (train_data, val_data, test_data)
)

print(f"‚úÖ Train dataset: {len(train_dataset)} samples")
print(f"‚úÖ Val dataset:   {len(val_dataset)} samples")
print(f"‚úÖ Test dataset:  {len(test_dataset)} samples")

# Smoke test: encode the first training pair and peek at its first ten ids
sample = train_dataset[0]
src_tensor, tgt_tensor = sample['src'], sample['tgt']
print(f"\nSample from dataset:")
print(f"Source shape: {src_tensor.shape}")
print(f"Target shape: {tgt_tensor.shape}")
print(f"Source IDs: {src_tensor[:10].tolist()}...")
print(f"Target IDs: {tgt_tensor[:10].tolist()}...")

Creating PyTorch datasets...
✅ Train dataset: 300000 samples
✅ Val dataset:   25000 samples
✅ Test dataset:  25000 samples

Sample from dataset:
Source shape: torch.Size([12])
Target shape: torch.Size([9])
Source IDs: [2, 1484, 367, 320, 180, 86, 1005, 1853, 2199, 1214]...
Target IDs: [2, 200, 4012, 110, 7, 581, 7357, 15, 3]...


## 5. Create DataLoaders

In [7]:
# Create collate function for dynamic padding
def collate_fn(batch):
    """
    Pad a list of dataset items to the longest sequence in the batch.

    Source and target are padded independently with id 0, so each side is
    only as wide as its own longest sequence.

    Args:
        batch: list of dicts holding 1-D tensors under 'src' and 'tgt'
    Returns:
        dict with
        src: [batch_size, max_src_len]
        tgt: [batch_size, max_tgt_len]
        src_mask: [batch_size, max_src_len]
        tgt_mask: [batch_size, max_tgt_len]
        (masks are long tensors: 1 for real tokens, 0 for padding)
    """
    padded = {}
    for side in ('src', 'tgt'):
        sequences = [example[side] for example in batch]
        padded[side] = pad_sequence(sequences, batch_first=True, padding_value=0)

    # A position counts as real wherever its id differs from the padding id 0
    return {
        'src': padded['src'],
        'tgt': padded['tgt'],
        'src_mask': padded['src'].ne(0).long(),
        'tgt_mask': padded['tgt'].ne(0).long()
    }

# Create DataLoaders
batch_size = 64  # Adjust based on GPU memory

# Settings shared by all three loaders, kept in one place so they cannot drift.
loader_kwargs = dict(
    batch_size=batch_size,
    collate_fn=collate_fn,
    num_workers=0,  # Set to 0 for Windows, can increase on Linux
    # Pinned host memory only helps host-to-GPU copies; requesting it on a
    # CPU-only machine is useless and makes DataLoader emit a warning.
    pin_memory=torch.cuda.is_available()
)

# Only the training loader shuffles; val/test keep a fixed order.
train_loader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
val_loader = DataLoader(val_dataset, shuffle=False, **loader_kwargs)
test_loader = DataLoader(test_dataset, shuffle=False, **loader_kwargs)

print(f"‚úÖ Train loader: {len(train_loader)} batches")
print(f"‚úÖ Val loader:   {len(val_loader)} batches")
print(f"‚úÖ Test loader:  {len(test_loader)} batches")

# Smoke test: pull one batch and report the padded shapes
print("\nTesting batch loading...")
batch = next(iter(train_loader))
print(f"Batch keys: {batch.keys()}")
print(f"Source shape: {batch['src'].shape}")
print(f"Target shape: {batch['tgt'].shape}")
print(f"Source mask shape: {batch['src_mask'].shape}")
print(f"Target mask shape: {batch['tgt_mask'].shape}")

✅ Train loader: 4688 batches
✅ Val loader:   391 batches
✅ Test loader:  391 batches

Testing batch loading...
Batch keys: dict_keys(['src', 'tgt', 'src_mask', 'tgt_mask'])
Source shape: torch.Size([64, 114])
Target shape: torch.Size([64, 70])
Source mask shape: torch.Size([64, 114])
Target mask shape: torch.Size([64, 70])


## 6. Save Processed Data

In [8]:
# Save processed data and metadata
import pickle

print("Saving processed data...")

# Read the special-token ids from the trained tokenizer instead of re-typing
# them, so the saved metadata cannot disagree with the model files.
special_ids = {
    'pad_id': sp_vi.pad_id(),
    'unk_id': sp_vi.unk_id(),
    'bos_id': sp_vi.bos_id(),
    'eos_id': sp_vi.eos_id()
}
# A single set of ids is stored for both languages, so they must agree.
assert special_ids == {
    'pad_id': sp_en.pad_id(),
    'unk_id': sp_en.unk_id(),
    'bos_id': sp_en.bos_id(),
    'eos_id': sp_en.eos_id()
}, "Vietnamese and English tokenizers disagree on special-token ids"

# Save tokenizer info (paths and max_length come from the objects used above)
tokenizer_info = {
    'vi_model': f'{vi_model_prefix}.model',
    'en_model': f'{en_model_prefix}.model',
    'vi_vocab_size': sp_vi.get_piece_size(),
    'en_vocab_size': sp_en.get_piece_size(),
    'max_length': train_dataset.max_length,
    **special_ids,
    'special_tokens': ['<pad>', '<unk>', '<s>', '</s>', '<mask>']
}

with open('../data/processed/tokenizer_info.json', 'w', encoding='utf-8') as f:
    json.dump(tokenizer_info, f, indent=2)

print("‚úÖ Saved tokenizer_info.json")

# Save split data (optional - for quick loading without re-tokenizing)
splits = {
    'train': train_data,
    'val': val_data,
    'test': test_data
}

# NOTE: this is a pickle file -- unpickling can execute arbitrary code, so
# only load splits.pkl from a location you trust.
with open('../data/processed/splits.pkl', 'wb') as f:
    pickle.dump(splits, f)

print("‚úÖ Saved splits.pkl")

# Save dataset statistics
stats = {
    'total_samples': len(data),
    'train_samples': len(train_data),
    'val_samples': len(val_data),
    'test_samples': len(test_data),
    'batch_size': batch_size,
    'train_batches': len(train_loader),
    'val_batches': len(val_loader),
    'test_batches': len(test_loader)
}

with open('../data/processed/stats.json', 'w', encoding='utf-8') as f:
    json.dump(stats, f, indent=2)

print("‚úÖ Saved stats.json")

print("\n" + "=" * 60)
print("PREPROCESSING COMPLETE!")
print("=" * 60)
print(f"‚úÖ Tokenizers: spm_vi.model, spm_en.model")
print(f"‚úÖ Vocab sizes: VI={sp_vi.get_piece_size()}, EN={sp_en.get_piece_size()}")
print(f"‚úÖ Data splits: train={len(train_data)}, val={len(val_data)}, test={len(test_data)}")
print(f"‚úÖ DataLoaders ready with batch_size={batch_size}")
print(f"\nüìå Next step: Open 03_model_building.ipynb to test the Transformer model")

Saving processed data...
✅ Saved tokenizer_info.json
✅ Saved splits.pkl
✅ Saved stats.json

PREPROCESSING COMPLETE!
✅ Tokenizers: spm_vi.model, spm_en.model
✅ Vocab sizes: VI=32000, EN=32000
✅ Data splits: train=300000, val=25000, test=25000
✅ DataLoaders ready with batch_size=64

📌 Next step: Open 03_model_building.ipynb to test the Transformer model
