In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import math
import time
from torch.utils.data import Dataset, DataLoader
from torch.cuda.amp import autocast, GradScaler
from pythainlp import word_tokenize, word_vector
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report

In [2]:
# =====================================================
# HYPERPARAMETERS (updated to the requested values)
# =====================================================
MAX_LEN = 256          # tokens kept per document; longer texts are truncated, shorter ones padded
BATCH_SIZE = 128       # mini-batch size for the train / validation / test loaders
EPOCHS = 500           # upper bound on epochs; early stopping normally ends training sooner
LEARNING_RATE = 2e-4   # fixed learning rate - no scheduler lowers it during training
EMBED_DIM = 300        # expected width of the pretrained word vectors
DEPTH = 4              # number of transformer encoder layers
HEADS = 4              # attention heads per encoder layer
THRESHOLD = 0.5        # sigmoid probability above which a label counts as predicted
PATIENCE = 25  # Early stopping patience (epochs without validation-loss improvement)

# Train/Test/Valid Split Ratios
TRAIN_RATIO = 0.8   # 80%
TEST_RATIO = 0.1    # 10%
VALID_RATIO = 0.1   # 10%

# Fail fast on inconsistent settings instead of deep inside the split or the model.
assert abs(TRAIN_RATIO + TEST_RATIO + VALID_RATIO - 1.0) < 1e-9, "split ratios must sum to 1"
assert EMBED_DIM % HEADS == 0, "EMBED_DIM must be divisible by HEADS for multi-head attention"

# Reproducibility: the notebook draws random numbers for the out-of-vocabulary
# embedding rows, DataLoader shuffling, weight initialisation and dropout, so
# seed NumPy and PyTorch once, before any of that happens.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

In [3]:
# DEVICE SETUP
# Run on the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"üü¢ Using device: {device}")

üü¢ Using device: cuda


In [4]:
# LOAD DATA
# The twelve binary topic columns that form the multi-label target.
label_cols = [
    "politics", "human_rights", "quality_of_life", "international",
    "social", "environment", "economics", "culture", "labor",
    "national_security", "ict", "education"
]

# NOTE(review): absolute, machine-specific path - the notebook only runs where this file exists.
df = pd.read_csv(r"D:\year4\‡∏™‡∏´‡∏Å‡∏¥‡∏à\prachatai_train.csv")

# Article bodies as plain Python strings (every value is cast to str).
body_series = df["body_text"].astype(str)
texts = body_series.tolist()

# Label matrix as float32, the dtype later handed to the loss function.
label_frame = df[label_cols]
y = label_frame.values.astype(np.float32)

In [5]:
# LOAD THAI2FIT WORD VECTORS
print("üì• Loading Thai2Fit Word Vectors...")
w2v = word_vector.WordVector(model_name="thai2fit_wv").get_model()
embedding_dim = w2v.vector_size  # Should be 300
vocab_list = list(w2v.key_to_index)  # iterating the mapping yields its keys (the words)
print(f"‚úÖ Loaded! Embedding dim: {embedding_dim}")

# Build vocab: ids 0 and 1 are reserved for padding / unknown tokens,
# every pretrained word gets the next free id starting at 2.
vocab = {"<PAD>": 0, "<UNK>": 1}
vocab.update((word, idx) for idx, word in enumerate(vocab_list, start=2))

üì• Loading Thai2Fit Word Vectors...
‚úÖ Loaded! Embedding dim: 300


In [None]:
# TOKENIZE & ENCODE
print("üî§ Tokenizing texts...")
# Split every document into word tokens, discarding whitespace tokens.
tokenized_texts = []
for document in texts:
    tokenized_texts.append(word_tokenize(document, keep_whitespace=False))

def encode_text(tokens, vocab):
    """Map a token sequence to vocabulary ids.

    Args:
        tokens: iterable of token strings.
        vocab: mapping from token to integer id; must contain "<UNK>"
            whenever `tokens` is non-empty.

    Returns:
        A list with one id per token; tokens missing from `vocab` receive
        the id stored under "<UNK>".
    """
    ids = []
    for token in tokens:
        ids.append(vocab.get(token, vocab["<UNK>"]))
    return ids

# Convert every tokenised document into its list of vocabulary ids.
encoded_texts = list(map(lambda doc_tokens: encode_text(doc_tokens, vocab), tokenized_texts))
print("‚úÖ Tokenization complete!")

üî§ Tokenizing texts...


In [None]:
# PAD SEQUENCES
def pad_sequences(sequences, max_len=256, pad_value=0):
    """Right-pad / truncate id sequences into one fixed-width int64 matrix.

    Args:
        sequences: list of id lists, one per document.
        max_len: number of columns in the result; longer sequences are cut.
        pad_value: value written into unused trailing positions.

    Returns:
        (padded, lengths): `padded` has shape (len(sequences), max_len);
        `lengths` holds, per row, how many leading entries come from the
        original sequence (capped at `max_len`).
    """
    n_rows = len(sequences)
    lengths = np.zeros(n_rows, dtype=np.int64)
    padded = np.full((n_rows, max_len), pad_value, dtype=np.int64)
    for row, seq in enumerate(sequences):
        keep = min(len(seq), max_len)
        lengths[row] = keep
        padded[row, :keep] = seq[:keep]
    return padded, lengths

# Fixed-width id matrix plus the number of real (non-pad) tokens per document.
X, lengths = pad_sequences(encoded_texts, MAX_LEN)

In [None]:
# SPLIT DATA: sizes come from TRAIN_RATIO / TEST_RATIO / VALID_RATIO
# (80% / 10% / 10% with the values set in the hyperparameter cell).
print(f"üìä Splitting data: {TRAIN_RATIO:.0%} train, {TEST_RATIO:.0%} test, {VALID_RATIO:.0%} valid...")

# First split: training set vs. a temporary hold-out pool (test + valid).
holdout_ratio = TEST_RATIO + VALID_RATIO
X_train, X_temp, y_train, y_temp, len_train, len_temp = train_test_split(
    X, y, lengths, test_size=holdout_ratio, random_state=42
)

# Second split: divide the hold-out pool into test and validation parts.
# `test_size` is the share that lands in the second output of each pair,
# i.e. the validation set, expressed as a fraction of the pool.
X_test, X_val, y_test, y_val, len_test, len_val = train_test_split(
    X_temp, y_temp, len_temp, test_size=VALID_RATIO / holdout_ratio, random_state=42
)

# The three parts must partition the data set exactly.
assert len(X_train) + len(X_test) + len(X_val) == len(X), "split lost or duplicated samples"

print(f"  Train: {len(X_train)} samples")
print(f"  Test:  {len(X_test)} samples")
print(f"  Valid: {len(X_val)} samples")

In [None]:
# DATASET & DATALOADER
class ThaiTextDataset(Dataset):
    """In-memory dataset of padded token ids, sequence lengths and label vectors.

    The three inputs are converted to tensors once, at construction time;
    indexing returns the matching (ids, length, labels) triple.
    """

    def __init__(self, X, lengths, y):
        # Token ids and lengths are integer data, labels are float targets.
        self.X = torch.tensor(X, dtype=torch.long)
        self.lengths = torch.tensor(lengths, dtype=torch.long)
        self.y = torch.tensor(y, dtype=torch.float32)

    def __len__(self):
        """Number of samples, taken from the label tensor."""
        return len(self.y)

    def __getitem__(self, idx):
        """Return (token_ids, length, labels) for sample `idx`."""
        sample = (self.X[idx], self.lengths[idx], self.y[idx])
        return sample

# Wrap each split in a dataset, then in a loader; only the training data is shuffled.
train_dataset = ThaiTextDataset(X_train, len_train, y_train)
test_dataset = ThaiTextDataset(X_test, len_test, y_test)
val_dataset = ThaiTextDataset(X_val, len_val, y_val)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

In [None]:
# BUILD EMBEDDING MATRIX
# Row i holds the vector for the word whose vocabulary id is i.
vocab_size = len(vocab)
# float32 matches the dtype the model converts to anyway and halves the memory
# compared with NumPy's default float64.
embedding_matrix = np.zeros((vocab_size, embedding_dim), dtype=np.float32)
for word, idx in vocab.items():
    if word == "<PAD>":
        # Keep the padding row at zero. The model copies this matrix into an
        # nn.Embedding created with padding_idx=0; a random vector here would
        # overwrite that layer's zero padding vector.
        continue
    if word in w2v:
        embedding_matrix[idx] = w2v[word]
    else:
        # No pretrained vector (e.g. "<UNK>"): start from random noise and let
        # fine-tuning adjust it.
        embedding_matrix[idx] = np.random.normal(scale=0.6, size=(embedding_dim,))

In [None]:
# TRANSFORMER MODEL
class TransformerClassifier(nn.Module):
    """Transformer-encoder classifier producing one logit per output label.

    Pipeline: token embedding (optionally initialised from a pretrained
    matrix) scaled by sqrt(embed_dim) -> learned positional parameters ->
    stacked encoder layers -> mean over the non-padding positions ->
    dropout -> linear layer.

    Token id 0 is treated as padding throughout.

    Args:
        vocab_size: number of rows in the embedding table.
        embed_dim: embedding / model width; must be divisible by `heads`.
        max_seq_len: longest sequence the positional parameters cover.
        output_dim: number of logits returned per sample.
        depth: number of encoder layers.
        heads: attention heads per encoder layer.
        embedding_matrix: optional (vocab_size, embed_dim) array-like with
            initial embedding weights; the weights stay trainable.
    """

    def __init__(self, vocab_size, embed_dim, max_seq_len, output_dim, depth=4, heads=4, embedding_matrix=None):
        super(TransformerClassifier, self).__init__()
        self.embed_dim = embed_dim

        # 1. Embedding
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        if embedding_matrix is not None:
            with torch.no_grad():
                self.embedding.weight.copy_(torch.as_tensor(embedding_matrix, dtype=torch.float32))
                # The copy overwrites the zero vector that padding_idx set up;
                # restore it so padding never carries information.
                self.embedding.weight[0].zero_()
            self.embedding.weight.requires_grad = True  # Fine-tune embedding

        # 2. Positional Encoding (learned, one vector per position)
        self.pos_encoder = nn.Parameter(torch.randn(1, max_seq_len, embed_dim))

        # 3. Transformer Encoder
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=embed_dim,
            nhead=heads,
            dim_feedforward=embed_dim * 4,
            dropout=0.1,
            batch_first=True
        )
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=depth)

        # 4. Output
        self.fc = nn.Linear(embed_dim, output_dim)
        self.dropout = nn.Dropout(0.1)

    def forward(self, x, lengths):
        """Compute label logits for a batch of padded id sequences.

        Args:
            x: (batch, seq) integer tensor of token ids, 0 meaning padding.
            lengths: accepted for interface compatibility; the padding mask
                is derived from `x`, so this argument is not read.

        Returns:
            (batch, output_dim) tensor of raw logits (no sigmoid applied).
        """
        # True where the token is padding.
        pad_mask = (x == 0)

        # A row consisting only of padding would mask every attention key and
        # turn the softmax into NaN; leave its first position visible so the
        # encoder stays finite. Rows with real tokens are unaffected.
        attn_pad_mask = pad_mask.clone()
        attn_pad_mask[:, 0] = attn_pad_mask[:, 0] & ~pad_mask.all(dim=1)

        # Embed + Pos Encode
        x = self.embedding(x) * math.sqrt(self.embed_dim)
        x = x + self.pos_encoder[:, :x.size(1), :]

        # Transformer Pass
        x = self.transformer_encoder(x, src_key_padding_mask=attn_pad_mask)

        # Masked mean pooling: average only over real tokens so the result does
        # not depend on how much padding a sequence received. The slice guards
        # against an encoder output shorter than the input.
        keep = (~pad_mask)[:, :x.size(1)].unsqueeze(-1).to(x.dtype)
        x = (x * keep).sum(dim=1) / keep.sum(dim=1).clamp(min=1.0)

        x = self.dropout(x)
        return self.fc(x)

In [None]:
# CREATE MODEL
# Collect the constructor arguments first so the configuration reads as one table.
model_config = dict(
    vocab_size=vocab_size,
    embed_dim=embedding_dim,            # 300
    max_seq_len=MAX_LEN,                # 256
    output_dim=len(label_cols),         # one logit per label column
    depth=DEPTH,                        # 4
    heads=HEADS,                        # 4
    embedding_matrix=embedding_matrix,  # pretrained initial weights
)
model = TransformerClassifier(**model_config)
model = model.to(device)

print(f"\nüì¶ Model created with:")
print(f"  - embed_dim: {embedding_dim}")
print(f"  - depth: {DEPTH}")
print(f"  - heads: {HEADS}")
print(f"  - max_seq_len: {MAX_LEN}")

In [None]:
# TRAINING (Fixed LR - No Scheduler)
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=1e-4)

# Mixed precision only helps on CUDA; on CPU the CUDA autocast / GradScaler
# would just disable themselves with a warning, so switch them off explicitly.
use_amp = device.type == "cuda"
scaler = GradScaler(enabled=use_amp)

# No scheduler is used - the learning rate stays fixed at 2e-4

print(f"üöÄ Start Training... (Max Epochs: {EPOCHS})")
print(f"  - Learning Rate: {LEARNING_RATE} (Fixed)")
print(f"  - Batch Size: {BATCH_SIZE}")
print(f"  - Patience: {PATIENCE}")

best_val_loss = float('inf')
patience_counter = 0

for epoch in range(EPOCHS):
    start_time = time.time()
    model.train()
    total_loss = 0.0

    for X_batch, lengths_batch, y_batch in train_loader:
        X_batch, lengths_batch, y_batch = X_batch.to(device), lengths_batch.to(device), y_batch.to(device)

        optimizer.zero_grad()

        with autocast(enabled=use_amp):  # Mixed Precision
            outputs = model(X_batch, lengths_batch)
            loss = criterion(outputs, y_batch)

        # Order matters: scale -> backward -> unscale -> clip -> step -> update,
        # so clipping sees the true (unscaled) gradient norm.
        scaler.scale(loss).backward()
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        scaler.step(optimizer)
        scaler.update()

        # `loss` is a batch mean; weight it by the batch size so a smaller
        # final batch does not distort the epoch average.
        total_loss += loss.item() * y_batch.size(0)

    avg_train_loss = total_loss / len(train_loader.dataset)

    # Validation
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for X_batch, lengths_batch, y_batch in val_loader:
            X_batch, lengths_batch, y_batch = X_batch.to(device), lengths_batch.to(device), y_batch.to(device)
            outputs = model(X_batch, lengths_batch)
            loss = criterion(outputs, y_batch)
            val_loss += loss.item() * y_batch.size(0)

    avg_val_loss = val_loss / len(val_loader.dataset)
    val_time = time.time() - start_time  # covers training and validation for this epoch

    print(f"Epoch {epoch+1}/{EPOCHS} | Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f} | LR: {LEARNING_RATE} | Time: {val_time:.1f}s")

    # Early Stopping based on Validation Loss: checkpoint every improvement,
    # stop after PATIENCE consecutive epochs without one.
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        patience_counter = 0
        torch.save(model.state_dict(), 'best_model_thai2vec_transformer_v2.pth')
    else:
        patience_counter += 1
        if patience_counter >= PATIENCE:
            print(f"üõë Early stopping triggered at epoch {epoch+1}")
            break

print("‚úÖ Training Complete!")

In [None]:
# EVALUATION ON TEST SET
# Restore the checkpoint with the lowest validation loss. `map_location` lets a
# checkpoint written on a GPU be loaded when only a CPU is available.
best_state = torch.load('best_model_thai2vec_transformer_v2.pth', map_location=device)
model.load_state_dict(best_state)
model.eval()

y_true, y_pred = [], []
with torch.no_grad():
    for X_batch, lengths_batch, y_batch in test_loader:
        X_batch, lengths_batch = X_batch.to(device), lengths_batch.to(device)
        outputs = model(X_batch, lengths_batch)
        # Logits -> independent per-label probabilities -> 0/1 decisions.
        preds = torch.sigmoid(outputs).cpu().numpy()
        preds = (preds > THRESHOLD).astype(int)
        y_true.append(y_batch.numpy())  # labels were never moved off the CPU
        y_pred.append(preds)

# Stack the per-batch arrays into (n_samples, n_labels) matrices.
y_true = np.vstack(y_true)
y_pred = np.vstack(y_pred)

print("\n" + "="*35)
print("üèÜ TEST SET RESULTS")
print("="*35)
print(f"F1-Macro: {f1_score(y_true, y_pred, average='macro'):.4f}")
print(f"F1-Micro: {f1_score(y_true, y_pred, average='micro'):.4f}")
print("-" * 35)
print(classification_report(y_true, y_pred, target_names=label_cols, zero_division=0))

In [None]:
# PREDICT FUNCTION
def predict(text, show_all=True, threshold=THRESHOLD):
    """Score one raw text against every label with the trained model.

    Args:
        text: raw input string; it is tokenised with `word_tokenize`.
        show_all: if True, return the score of every category, sorted from
            highest to lowest. If False, return only the categories whose
            score exceeds `threshold`.
        threshold: minimum probability for a category to count as predicted
            (only used when `show_all` is False).

    Returns:
        A list of (label_name, probability) tuples sorted by descending
        probability. With `show_all=False` the list is never empty: when no
        category passes the threshold, the single best one is returned.
    """
    model.eval()
    tokens = word_tokenize(text, keep_whitespace=False)

    # Reuse the training-time helpers so inference is encoded and padded
    # exactly like the training data.
    ids = encode_text(tokens, vocab)
    if not ids:
        # Empty / whitespace-only input would become an all-padding sequence,
        # which leaves attention with nothing to attend to; stand in a single
        # unknown token instead.
        ids = [vocab["<UNK>"]]
    padded, seq_lengths = pad_sequences([ids], max_len=MAX_LEN)

    tensor_in = torch.tensor(padded, dtype=torch.long).to(device)
    len_in = torch.tensor(seq_lengths, dtype=torch.long).to(device)

    with torch.no_grad():
        output = model(tensor_in, len_in)
        probs = torch.sigmoid(output).cpu().numpy()[0]

    all_results = [(label_cols[i], float(prob)) for i, prob in enumerate(probs)]
    all_results.sort(key=lambda x: x[1], reverse=True)

    if show_all:
        return all_results

    filtered = [(name, prob) for name, prob in all_results if prob > threshold]
    if not filtered:
        # Nothing cleared the threshold: fall back to the top-scoring label.
        filtered = [all_results[0]]
    return filtered

In [None]:
# Test prediction: run the classifier on one hand-written example and show
# only the labels that clear the threshold.
sample = "‡∏£‡∏±‡∏ê‡∏ö‡∏≤‡∏•‡∏õ‡∏£‡∏∞‡∏Å‡∏≤‡∏®‡∏ô‡πÇ‡∏¢‡∏ö‡∏≤‡∏¢‡∏Å‡∏≤‡∏£‡∏®‡∏∂‡∏Å‡∏©‡∏≤‡πÉ‡∏´‡∏°‡πà ‡πÄ‡∏û‡∏∑‡πà‡∏≠‡∏û‡∏±‡∏í‡∏ô‡∏≤‡πÄ‡∏¢‡∏≤‡∏ß‡∏ä‡∏ô"
print("üîÆ Sample Prediction:")
print(f"Input: {sample}")
sample_prediction = predict(sample, show_all=False)
print(f"Output: {sample_prediction}")