# 🚀 Thai Text Classification with Transformer
### Encoder: thai2vec | Decoder: Transformer
---
**Settings:**
- Train/Test/Valid = 80/10/10
- MAX_LEN = 256
- BATCH_SIZE = 128
- EPOCHS = 500
- dim = 300
- depth = 4
- heads = 4
- LR = 2e-4
- threshold = 0.5

In [None]:
# Install third-party dependencies.
# NOTE: the leading "!" is an IPython shell escape, so this line only works
# inside a notebook cell, not in a plain Python script.
!pip install -q transformers scikit-learn pythainlp pandas gensim x-transformers

In [None]:
# üìö Import libraries
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.cuda.amp import autocast, GradScaler
from pythainlp import word_tokenize
from pythainlp import word_vector
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from x_transformers import TransformerWrapper, Decoder
import time
import os
from torch.amp import autocast, GradScaler

In [None]:
# Run configuration (hyperparameters and split ratios).
MAX_LEN = 256            # tokens kept per document (longer inputs are truncated)
BATCH_SIZE = 128         # samples per DataLoader batch
EPOCHS = 500             # upper bound of the training loop
LEARNING_RATE = 2e-4     # AdamW learning rate
EMBED_DIM = 300          # target embedding width (dim=300)
DEPTH = 4                # number of attention layers
HEADS = 4                # attention heads per layer
THRESHOLD = 0.5          # sigmoid probability above which a label is predicted

# Dataset split ratios.
TRAIN_RATIO = 0.80
TEST_RATIO = 0.10
VALID_RATIO = 0.10

CHECKPOINT_EVERY = 10    # write a resumable checkpoint every 10 epochs

# Echo the configuration so every run log records the values it used.
print("‚úÖ Settings loaded:")
for _setting_name, _setting_value in (
    ("MAX_LEN", MAX_LEN),
    ("BATCH_SIZE", BATCH_SIZE),
    ("EPOCHS", EPOCHS),
    ("LEARNING_RATE", LEARNING_RATE),
    ("EMBED_DIM", EMBED_DIM),
    ("DEPTH", DEPTH),
    ("HEADS", HEADS),
    ("THRESHOLD", THRESHOLD),
):
    print(f"   {_setting_name} = {_setting_value}")

_split_summary = "/".join(
    f"{_ratio*100:.0f}%" for _ratio in (TRAIN_RATIO, VALID_RATIO, TEST_RATIO)
)
print(f"   Train/Valid/Test = {_split_summary}")

In [None]:
# Pick the compute device: CUDA when available, otherwise CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

print(f"üü¢ Using device: {device}")

# Report the GPU model and its total memory when running on CUDA.
if device.type == "cuda":
    _gpu_props = torch.cuda.get_device_properties(0)
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"VRAM: {_gpu_props.total_memory / 1e9:.1f} GB")

In [None]:
# Load the dataset from CSV.
# NOTE: adjust DATA_PATH for the environment you run in:
#   Kaggle: "/kaggle/input/prachatai-train/prachatai_train.csv"
#   Colab:  "prachatai_train.csv" (upload the file first)
#   Local:  the correct path on your machine

DATA_PATH = "/kaggle/input/prachatai-train/prachatai_train.csv"  # change as needed

print(f"üìÇ Loading data from: {DATA_PATH}")
df = pd.read_csv(DATA_PATH)

# Target columns; each one is read as a float32 label for multi-label training.
label_cols = [
    "politics",
    "human_rights",
    "quality_of_life",
    "international",
    "social",
    "environment",
    "economics",
    "culture",
    "labor",
    "national_security",
    "ict",
    "education",
]

# Raw article bodies, forced to str so every entry can be tokenized.
texts = df["body_text"].astype(str).tolist()
y = df[label_cols].to_numpy().astype(np.float32)

print(f"üìä Dataset size: {len(texts)}")
print(f"üìã Labels: {label_cols}")

In [None]:
# Load the pretrained thai2fit word vectors through PyThaiNLP.
print("‡∏Å‡∏≥‡∏•‡∏±‡∏á‡πÇ‡∏´‡∏•‡∏î thai2fit_wv...")
model_wv = word_vector.WordVector(model_name="thai2fit_wv").get_model()
w2v = model_wv
embedding_dim = w2v.vector_size

# EMBED_DIM is only compared for reporting; the pretrained vector size is what
# the rest of the notebook uses (a different target dim would need a projection).
if embedding_dim == EMBED_DIM:
    print(f"‚úÖ Embedding dimension: {embedding_dim} (‡∏ï‡∏£‡∏á‡∏Å‡∏±‡∏ö dim ‡∏ó‡∏µ‡πà‡∏Å‡∏≥‡∏´‡∏ô‡∏î)")
else:
    print(f"‚ö†Ô∏è thai2vec dim ({embedding_dim}) != target dim ({EMBED_DIM})")
    print(f"   ‡∏à‡∏∞‡πÉ‡∏ä‡πâ embedding_dim ‡∏à‡∏≤‡∏Å thai2vec = {embedding_dim}")

# Tokenize every document into words, dropping whitespace tokens.
print("‡∏Å‡∏≥‡∏•‡∏±‡∏á tokenize...")
tokenized_texts = [word_tokenize(doc, keep_whitespace=False) for doc in texts]

# Build the vocabulary: ids 0 and 1 are reserved for padding and unknown
# tokens, and the pretrained words follow in their original order from id 2.
w2v_vocab = list(w2v.key_to_index)
vocab = {"<PAD>": 0, "<UNK>": 1}
vocab.update((token, token_id) for token_id, token in enumerate(w2v_vocab, start=2))

print(f"‚úÖ Vocab size: {len(vocab)}")

In [None]:
# üîß Encode & Pad
def encode_text(tokens, vocab):
    """Map tokens to integer ids, using the ``<UNK>`` id for unknown tokens.

    Args:
        tokens: Iterable of token strings.
        vocab: Mapping from token to integer id; must contain ``"<UNK>"``.

    Returns:
        A list with one id per token, in input order.

    Raises:
        KeyError: If ``vocab`` has no ``"<UNK>"`` entry.
    """
    # Resolve the fallback id once instead of once per token.
    unk_id = vocab["<UNK>"]
    return [vocab.get(token, unk_id) for token in tokens]

def pad_sequences(sequences, max_len=256, pad_value=0):
    """Right-pad (or truncate) id sequences into one fixed-width int64 matrix.

    Args:
        sequences: Sequence of id sequences of arbitrary length.
        max_len: Number of columns in the output; longer inputs are cut off.
        pad_value: Value stored in positions past the end of a sequence.

    Returns:
        A ``(padded, lengths)`` tuple: ``padded`` has shape
        ``(len(sequences), max_len)`` and ``lengths`` holds the number of real
        (non-padding) entries per row, capped at ``max_len``.
    """
    n_rows = len(sequences)
    padded = np.full((n_rows, max_len), pad_value, dtype=np.int64)
    lengths = np.zeros(n_rows, dtype=np.int64)

    for row, seq in enumerate(sequences):
        kept = min(len(seq), max_len)
        padded[row, :kept] = seq[:kept]
        lengths[row] = kept

    return padded, lengths

# Turn every tokenized document into ids, then pad/truncate to MAX_LEN columns.
encoded_texts = [encode_text(doc_tokens, vocab) for doc_tokens in tokenized_texts]
X, lengths = pad_sequences(
    encoded_texts,
    max_len=MAX_LEN,
)

print(f"‚úÖ X shape: {X.shape}")

In [None]:
# Train/validation/test split driven by the configured ratios (80/10/10).
print(f"\nüìä Splitting data: Train={TRAIN_RATIO*100:.0f}%, Valid={VALID_RATIO*100:.0f}%, Test={TEST_RATIO*100:.0f}%")

# Stage 1: hold out the combined validation + test portion from training.
holdout_ratio = TEST_RATIO + VALID_RATIO  # 20% with the default settings
X_train, X_temp, y_train, y_temp, len_train, len_temp = train_test_split(
    X, y, lengths,
    test_size=holdout_ratio,
    random_state=42
)

# Stage 2: divide the hold-out set between validation and test.
# The fraction is derived from the constants instead of a hard-coded 0.5, so
# editing TEST_RATIO / VALID_RATIO really changes the split. With the default
# 10%/10% it evaluates to exactly 0.5.
test_share_of_holdout = TEST_RATIO / holdout_ratio
X_valid, X_test, y_valid, y_test, len_valid, len_test = train_test_split(
    X_temp, y_temp, len_temp,
    test_size=test_share_of_holdout,
    random_state=42
)

print(f"‚úÖ Train size: {len(X_train)} ({len(X_train)/len(X)*100:.1f}%)")
print(f"‚úÖ Valid size: {len(X_valid)} ({len(X_valid)/len(X)*100:.1f}%)")
print(f"‚úÖ Test size: {len(X_test)} ({len(X_test)/len(X)*100:.1f}%)")

In [None]:
# üì¶ Dataset & DataLoader
class ThaiTextDataset(Dataset):
    """In-memory dataset of padded token ids, sequence lengths and labels.

    Every item is a ``(token_ids, length, labels)`` tuple of tensors.

    Args:
        X: Padded token-id rows, stored as a ``torch.long`` tensor.
        lengths: Number of real tokens per row, stored as ``torch.long``.
        y: Label rows, stored as ``torch.float32``.
    """

    def __init__(self, X, lengths, y):
        # Convert once up front so __getitem__ is plain tensor indexing.
        self.X, self.lengths = (
            torch.tensor(values, dtype=torch.long) for values in (X, lengths)
        )
        self.y = torch.tensor(y, dtype=torch.float32)

    def __len__(self):
        # One sample per label row.
        return self.y.shape[0]

    def __getitem__(self, idx):
        sample = (self.X[idx], self.lengths[idx], self.y[idx])
        return sample

# Wrap each split in a Dataset.
train_dataset, valid_dataset, test_dataset = (
    ThaiTextDataset(features, seq_lens, targets)
    for features, seq_lens, targets in (
        (X_train, len_train, y_train),
        (X_valid, len_valid, y_valid),
        (X_test, len_test, y_test),
    )
)

# All loaders share the same batching options; only the training loader shuffles.
_loader_opts = {"batch_size": BATCH_SIZE, "num_workers": 2, "pin_memory": True}
train_loader = DataLoader(train_dataset, shuffle=True, **_loader_opts)
valid_loader = DataLoader(valid_dataset, **_loader_opts)
test_loader = DataLoader(test_dataset, **_loader_opts)

print(f"‚úÖ Train batches per epoch: {len(train_loader)}")
print(f"‚úÖ Valid batches: {len(valid_loader)}")
print(f"‚úÖ Test batches: {len(test_loader)}")

In [None]:
# Build the initial embedding table, one row per vocabulary id.
vocab_size = 1 + max(vocab.values())
embedding_matrix = np.zeros((vocab_size, embedding_dim))

for token, row in vocab.items():
    if token in w2v:
        # Known word: start from its pretrained vector.
        vector = w2v[token]
    elif token == "<PAD>":
        # Padding contributes nothing at initialization.
        vector = np.zeros(embedding_dim)
    else:
        # Any other token without a pretrained vector (e.g. <UNK>) gets a
        # random normal initialization.
        vector = np.random.normal(scale=0.6, size=(embedding_dim,))
    embedding_matrix[row] = vector

print(f"‚úÖ Embedding matrix shape: {embedding_matrix.shape}")

In [None]:
# ü§ñ Transformer Model
class TransformerClassifier(nn.Module):
    """Multi-label text classifier: transformer stack, mean pooling, MLP head.

    Token ids go through an x-transformers ``TransformerWrapper`` built around
    a ``Decoder`` attention stack. The per-position outputs are averaged
    (over real tokens only when ``lengths`` is given) and passed through a
    two-layer head that emits one raw logit per label.

    Args:
        vocab_size: Number of tokens in the embedding table.
        embed_dim: Model width passed to the attention layers.
        max_seq_len: Maximum sequence length given to ``TransformerWrapper``.
        output_dim: Number of output logits.
        depth: Number of attention layers.
        heads: Number of attention heads per layer.
        embedding_matrix: Optional pretrained vectors copied into the token
            embedding; expected to match the embedding weight's shape
            (presumably ``(vocab_size, embed_dim)``).
    """

    def __init__(self, vocab_size, embed_dim, max_seq_len, output_dim,
                 depth=4, heads=4, embedding_matrix=None):
        super().__init__()

        self.transformer = TransformerWrapper(
            num_tokens=vocab_size,
            max_seq_len=max_seq_len,
            attn_layers=Decoder(
                dim=embed_dim,
                depth=depth,
                heads=heads,
                attn_dropout=0.1,
                ff_dropout=0.1
            )
        )

        if embedding_matrix is not None:
            # Initialize from pretrained vectors; the weights stay trainable.
            with torch.no_grad():
                self.transformer.token_emb.emb.weight.copy_(
                    torch.as_tensor(embedding_matrix, dtype=torch.float32)
                )

        self.fc1 = nn.Linear(embed_dim, embed_dim // 2)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.1)
        self.fc2 = nn.Linear(embed_dim // 2, output_dim)

    def forward(self, text, lengths=None):
        """Compute label logits for a batch of padded token-id sequences.

        Args:
            text: Tensor of token ids with shape ``(batch, seq_len)``.
            lengths: Optional tensor of shape ``(batch,)`` with the number of
                real tokens per row. When given, padding positions are
                excluded from the pooled average.

        Returns:
            Raw (pre-sigmoid) logits produced by the classification head.
        """
        transformer_out = self.transformer(text, return_embeddings=True)

        if lengths is not None:
            # True for positions before each row's length, False for padding.
            positions = torch.arange(text.size(1), device=text.device)
            mask = positions.unsqueeze(0) < lengths.unsqueeze(1)
            mask = mask.unsqueeze(-1).float()
            # A row with length 0 has an all-zero mask; clamping the
            # denominator keeps its pooled vector at zero instead of 0/0 = NaN.
            token_count = mask.sum(dim=1).clamp(min=1.0)
            pooled = (transformer_out * mask).sum(dim=1) / token_count
        else:
            pooled = transformer_out.mean(dim=1)

        out = self.fc1(pooled)
        out = self.relu(out)
        out = self.dropout(out)
        out = self.fc2(out)

        return out

In [None]:
# Create the model.
output_dim = len(label_cols)

# Size the token embedding from the matrix that is copied into it. The matrix
# has max(vocab.values()) + 1 rows, which is the value that guarantees every
# token id is in range; len(vocab) would be smaller if a pretrained word ever
# collided with "<PAD>"/"<UNK>", and the weight copy would then fail.
num_embeddings = embedding_matrix.shape[0]

model = TransformerClassifier(
    vocab_size=num_embeddings,
    embed_dim=embedding_dim,  # taken from the pretrained vectors (300)
    max_seq_len=MAX_LEN,
    output_dim=output_dim,
    depth=DEPTH,
    heads=HEADS,
    embedding_matrix=embedding_matrix
).to(device)

print(f"\nü§ñ Model Configuration:")
print(f"   - vocab_size: {num_embeddings}")
print(f"   - embed_dim: {embedding_dim}")
print(f"   - max_seq_len: {MAX_LEN}")
print(f"   - depth: {DEPTH}")
print(f"   - heads: {HEADS}")
print(f"   - output_dim: {output_dim}")
print(f"‚úÖ Model parameters: {sum(p.numel() for p in model.parameters()):,}")

In [None]:
import warnings
warnings.filterwarnings('ignore')

# Training with mixed precision, per-epoch validation and resumable checkpoints.

# Mixed precision only applies on CUDA. On CPU both autocast and the gradient
# scaler are disabled, so the same loop runs in full precision.
use_amp = device.type == 'cuda'

criterion = nn.BCEWithLogitsLoss()
optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=0.01)
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=EPOCHS)
scaler = GradScaler('cuda', enabled=use_amp)

best_val_loss = float('inf')
best_val_f1 = 0.0
start_epoch = 0

# Resume from a checkpoint if one exists.
checkpoint_path = 'checkpoint.pth'
if os.path.exists(checkpoint_path):
    print("üìÇ Loading checkpoint...")
    # map_location lets a checkpoint written on GPU be resumed on CPU and
    # vice versa.
    checkpoint = torch.load(checkpoint_path, map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
    # Older checkpoints have no scaler state, and a disabled scaler saves an
    # empty dict that an enabled scaler refuses to load, so restore it only
    # when it is present and non-empty.
    if use_amp and checkpoint.get('scaler_state_dict'):
        scaler.load_state_dict(checkpoint['scaler_state_dict'])
    start_epoch = checkpoint['epoch'] + 1
    best_val_loss = checkpoint['best_val_loss']
    # Optional key: keeps the final "Best F1" report correct after a resume.
    best_val_f1 = checkpoint.get('best_val_f1', best_val_f1)
    print(f"Resumed from epoch {start_epoch}")

print(f"\nüöÄ Starting training from epoch {start_epoch + 1}...")
print(f"   Settings: BATCH_SIZE={BATCH_SIZE}, LR={LEARNING_RATE}, EPOCHS={EPOCHS}")
print(f"   Threshold: {THRESHOLD}")

total_start_time = time.time()

for epoch in range(start_epoch, EPOCHS):
    epoch_start_time = time.time()

    # ========== Training ==========
    model.train()
    total_train_loss = 0

    for X_batch, lengths_batch, y_batch in train_loader:
        X_batch = X_batch.to(device)
        lengths_batch = lengths_batch.to(device)
        y_batch = y_batch.to(device)

        optimizer.zero_grad()

        # Forward pass (mixed precision when enabled).
        with autocast(device.type, enabled=use_amp):
            outputs = model(X_batch, lengths_batch)
            loss = criterion(outputs, y_batch)

        # Backward pass: unscale before clipping so the norm threshold applies
        # to the true gradients.
        scaler.scale(loss).backward()
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        scaler.step(optimizer)
        scaler.update()

        total_train_loss += loss.item()

    avg_train_loss = total_train_loss / len(train_loader)

    # ========== Validation ==========
    model.eval()
    total_val_loss = 0
    val_y_true, val_y_pred = [], []

    with torch.no_grad():
        for X_batch, lengths_batch, y_batch in valid_loader:
            X_batch, lengths_batch = X_batch.to(device), lengths_batch.to(device)
            y_batch = y_batch.to(device)

            with autocast(device.type, enabled=use_amp):
                outputs = model(X_batch, lengths_batch)
                loss = criterion(outputs, y_batch)

            total_val_loss += loss.item()

            # Binarize per-label probabilities with the configured threshold.
            preds = torch.sigmoid(outputs).cpu().numpy()
            preds = (preds > THRESHOLD).astype(int)
            val_y_true.append(y_batch.cpu().numpy())
            val_y_pred.append(preds)

    avg_val_loss = total_val_loss / len(valid_loader)
    val_y_true = np.vstack(val_y_true)
    val_y_pred = np.vstack(val_y_pred)
    val_f1 = f1_score(val_y_true, val_y_pred, average='macro')

    scheduler.step()
    epoch_time = time.time() - epoch_start_time

    # Save the best model (selected by validation loss; the F1 recorded is the
    # one measured at that same epoch).
    save_msg = ""
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        best_val_f1 = val_f1
        torch.save(model.state_dict(), 'best_transformer_model.pth')
        save_msg = " üíæ"

    # Save a resumable checkpoint.
    if (epoch + 1) % CHECKPOINT_EVERY == 0:
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'scheduler_state_dict': scheduler.state_dict(),
            'scaler_state_dict': scaler.state_dict(),
            'best_val_loss': best_val_loss,
            'best_val_f1': best_val_f1,
        }, checkpoint_path)

    # Estimate remaining time from the average epoch duration of this session.
    elapsed = time.time() - total_start_time
    eta = (elapsed / (epoch - start_epoch + 1)) * (EPOCHS - epoch - 1) / 60

    print(f"Epoch {epoch+1}/{EPOCHS} | Train: {avg_train_loss:.4f} | Val: {avg_val_loss:.4f} | F1: {val_f1:.4f} | {epoch_time:.1f}s | ETA: {eta:.0f}m{save_msg}")

total_time = time.time() - total_start_time
print(f"\n‚úÖ Training complete!")
print(f"üìä Best Val Loss: {best_val_loss:.4f} | Best F1: {best_val_f1:.4f}")
print(f"‚è±Ô∏è Total time: {total_time/3600:.2f} hours")

In [None]:
# Final evaluation on the held-out test set, using the best saved weights.
print("\nüìà Evaluating on Test Set...")
# map_location lets weights saved on GPU be loaded on a CPU-only machine.
model.load_state_dict(torch.load('best_transformer_model.pth', map_location=device))
model.eval()

# Autocast follows the active device and is only enabled on CUDA.
amp_enabled = device.type == 'cuda'

y_true, y_pred = [], []
with torch.no_grad():
    for X_batch, lengths_batch, y_batch in test_loader:
        X_batch, lengths_batch = X_batch.to(device), lengths_batch.to(device)

        with autocast(device.type, enabled=amp_enabled):
            outputs = model(X_batch, lengths_batch)

        # Binarize per-label probabilities with the configured threshold.
        preds = torch.sigmoid(outputs).cpu().numpy()
        preds = (preds > THRESHOLD).astype(int)
        y_true.append(y_batch.numpy())
        y_pred.append(preds)

y_true = np.vstack(y_true)
y_pred = np.vstack(y_pred)

print(f"üìä Test F1-score (macro): {f1_score(y_true, y_pred, average='macro'):.4f}")
print(f"üìä Test F1-score (micro): {f1_score(y_true, y_pred, average='micro'):.4f}")
# Per-label F1, one line per label column.
for i, label in enumerate(label_cols):
    print(label, f1_score(y_true[:, i], y_pred[:, i]))

In [None]:
# üîÆ Prediction Function
def predict(text):
    """Predict labels for a single raw text.

    The text is tokenized, encoded with ``vocab``, truncated/padded to
    ``MAX_LEN`` and run through ``model`` in eval mode.

    Args:
        text: Raw input string.

    Returns:
        A list of ``(label, probability)`` tuples for every label whose
        sigmoid probability exceeds ``THRESHOLD``. If no label passes the
        threshold, the single highest-probability label is returned instead,
        so the list is never empty.
    """
    model.eval()
    tokens = word_tokenize(text, keep_whitespace=False)
    ids = encode_text(tokens, vocab)[:MAX_LEN]

    # Pad with the vocabulary's padding id rather than a literal 0.
    pad_id = vocab["<PAD>"]
    padded_ids = ids + [pad_id] * (MAX_LEN - len(ids))

    # An empty or whitespace-only text produces no tokens. Report a length of
    # at least 1 so the model's length-masked average stays well-defined
    # instead of yielding NaN probabilities.
    n_tokens = max(len(ids), 1)

    lengths = torch.tensor([n_tokens], dtype=torch.long).to(device)
    padded = torch.tensor([padded_ids], dtype=torch.long).to(device)

    with torch.no_grad():
        # Autocast follows the active device and is only enabled on CUDA.
        with autocast(device.type, enabled=device.type == 'cuda'):
            output = model(padded, lengths)
        probs = torch.sigmoid(output).cpu().numpy()[0]

    # Multi-label results: keep every label above the threshold.
    results = [(label, float(prob))
               for label, prob in zip(label_cols, probs) if prob > THRESHOLD]

    # Fall back to the most likely label when nothing passes the threshold.
    if not results:
        best_idx = int(np.argmax(probs))
        results = [(label_cols[best_idx], float(probs[best_idx]))]

    return results

# Smoke-test the prediction function on two sample sentences.
print("\nüîÆ Test Predictions:")
for _sample_text in (
    "‡∏£‡∏±‡∏ê‡∏ö‡∏≤‡∏•‡πÑ‡∏ó‡∏¢‡∏õ‡∏£‡∏∞‡∏Å‡∏≤‡∏®‡∏ô‡πÇ‡∏¢‡∏ö‡∏≤‡∏¢‡∏î‡πâ‡∏≤‡∏ô‡∏™‡∏¥‡πà‡∏á‡πÅ‡∏ß‡∏î‡∏•‡πâ‡∏≠‡∏°‡πÉ‡∏´‡∏°‡πà",
    "‡πÅ‡∏£‡∏á‡∏á‡∏≤‡∏ô‡∏õ‡∏£‡∏∞‡∏ó‡πâ‡∏ß‡∏á‡πÄ‡∏û‡∏∑‡πà‡∏≠‡∏™‡∏¥‡∏ó‡∏ò‡∏¥‡πÉ‡∏ô‡∏Å‡∏≤‡∏£‡∏ó‡∏≥‡∏á‡∏≤‡∏ô",
):
    print(predict(_sample_text))

In [None]:
# üíæ Download model (for Colab)
# from google.colab import files
# files.download('best_transformer_model.pth')