In [1]:
# Install libraries (‡∏ñ‡πâ‡∏≤‡∏¢‡∏±‡∏á‡πÑ‡∏°‡πà‡∏•‡∏á)
# !pip install transformers scikit-learn pythainlp pandas gensim

In [2]:
%pip install x-transformers

Note: you may need to restart the kernel to use updated packages.


In [3]:
import pandas as pd
import numpy as np
import math
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.cuda.amp import autocast, GradScaler
from pythainlp import word_tokenize
from pythainlp import word_vector
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from x_transformers import TransformerWrapper, Encoder

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"üü¢ Using device: {device}")


üü¢ Using device: cuda


In [5]:
MAX_LEN = 256        # ‡∏•‡∏î‡∏•‡∏á‡∏°‡∏≤‡πÄ‡∏û‡∏∑‡πà‡∏≠‡πÉ‡∏´‡πâ‡∏£‡∏±‡∏ô‡πÑ‡∏ß‡∏Ç‡∏∂‡πâ‡∏ô ‡πÅ‡∏•‡∏∞‡πÄ‡∏´‡∏°‡∏≤‡∏∞‡∏Å‡∏±‡∏ö‡∏ö‡∏£‡∏¥‡∏ö‡∏ó‡∏Ç‡πà‡∏≤‡∏ß
BATCH_SIZE = 128     # Batch ‡πÉ‡∏´‡∏ç‡πà‡∏Ç‡∏∂‡πâ‡∏ô‡πÄ‡∏û‡∏£‡∏≤‡∏∞ Sequence ‡∏™‡∏±‡πâ‡∏ô‡∏•‡∏á
EPOCHS = 500         # ‡∏ï‡∏±‡πâ‡∏á‡πÑ‡∏ß‡πâ‡πÄ‡∏¢‡∏≠‡∏∞‡πÜ ‡πÅ‡∏ï‡πà‡πÄ‡∏î‡∏µ‡πã‡∏¢‡∏ß‡πÄ‡∏£‡∏≤‡∏°‡∏µ Early Stopping ‡∏ä‡πà‡∏ß‡∏¢‡∏ï‡∏±‡∏î‡∏à‡∏ö
PATIENCE = 20        # ‡∏ñ‡πâ‡∏≤ Loss ‡πÑ‡∏°‡πà‡∏•‡∏î‡∏•‡∏á 20 ‡∏£‡∏≠‡∏ö‡∏ï‡∏¥‡∏î‡∏Å‡∏±‡∏ô ‡πÉ‡∏´‡πâ‡∏´‡∏¢‡∏∏‡∏î

In [6]:
df = pd.read_csv(r"D:\year4\‡∏™‡∏´‡∏Å‡∏¥‡∏à\prachatai_train.csv")
texts = df["body_text"].astype(str).tolist()
label_cols = [
    "politics", "human_rights", "quality_of_life", "international",
    "social", "environment", "economics", "culture", "labor",
    "national_security", "ict", "education"
]
y = df[label_cols].values.astype(np.float32)

In [7]:
w2v = word_vector.WordVector(model_name="thai2fit_wv").get_model()
embedding_dim = w2v.vector_size

tokenized_texts = [word_tokenize(t, keep_whitespace=False) for t in texts]
thai2vec_vocab = list(w2v.key_to_index.keys())
vocab = {"<PAD>": 0, "<UNK>": 1}
for i, word in enumerate(thai2vec_vocab, start=2):
    vocab[word] = i

In [8]:
def encode_text(tokens, vocab):
    return [vocab.get(w, vocab["<UNK>"]) for w in tokens]

encoded_texts = [encode_text(tokens, vocab) for tokens in tokenized_texts]

def pad_sequences(sequences, max_len=None, pad_value=0):
    if max_len is None:
        max_len = 256
    
    padded = np.full((len(sequences), max_len), pad_value, dtype=np.int64)
    lengths = np.array([len(seq) for seq in sequences], dtype=np.int64)
    
    for i, seq in enumerate(sequences):
        end = min(len(seq), max_len)
        padded[i, :end] = seq[:end]
        
    return padded, lengths

X, lengths = pad_sequences(encoded_texts, max_len=256)
X_train, X_test, y_train, y_test, len_train, len_test = train_test_split(
    X, y, lengths, test_size=0.1, random_state=42
)

# ‡πÄ‡∏ä‡πá‡∏Ñ‡∏Ç‡∏ô‡∏≤‡∏î‡∏Ç‡πâ‡∏≠‡∏°‡∏π‡∏•‡πÄ‡∏û‡∏∑‡πà‡∏≠‡∏Ñ‡∏ß‡∏≤‡∏°‡∏ä‡∏±‡∏ß‡∏£‡πå
print(f"Train shape: {X_train.shape}")
print(f"Test shape: {X_test.shape}")

Train shape: (48941, 256)
Test shape: (5438, 256)


In [9]:
class ThaiTextDataset(Dataset):
    def __init__(self, X, lengths, y):
        self.X = torch.tensor(X, dtype=torch.long)
        self.lengths = torch.tensor(lengths, dtype=torch.long)
        self.y = torch.tensor(y, dtype=torch.float32)

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return self.X[idx], self.lengths[idx], self.y[idx]

train_dataset = ThaiTextDataset(X_train, len_train, y_train)
test_dataset = ThaiTextDataset(X_test, len_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

In [10]:
vocab_size = max(vocab.values()) + 1
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, idx in vocab.items():
    if word in w2v:
        embedding_matrix[idx] = w2v[word]
    elif word == "<PAD>":
        embedding_matrix[idx] = np.zeros(embedding_dim)
    else:
        embedding_matrix[idx] = np.random.normal(scale=0.6, size=(embedding_dim,))

In [11]:
class XTransformerClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, depth, heads, output_dim, max_len, embedding_matrix=None):
        super().__init__()
        
        # ‡πÉ‡∏ä‡πâ Wrapper ‡∏Ç‡∏≠‡∏á x-transformers
        self.model = TransformerWrapper(
            num_tokens = vocab_size,
            max_seq_len = max_len,
            attn_layers = Encoder(
                dim = embed_dim,
                depth = depth,
                heads = heads,
                layer_dropout = 0.1,
                attn_dropout = 0.1
            )
        )

        # Load Pretrained Thai2Fit weights
        if embedding_matrix is not None:
            weights = torch.tensor(embedding_matrix, dtype=torch.float32)
            try:
                self.model.token_emb.emb.weight.data.copy_(weights)
                self.model.token_emb.emb.weight.requires_grad = True
            except AttributeError:
                try:
                    self.model.token_emb.weight.data.copy_(weights)
                    self.model.token_emb.weight.requires_grad = True
                except:
                    print("‚ö†Ô∏è Warning: Could not load pretrained embeddings.")

        self.fc = nn.Linear(embed_dim, output_dim)

    def forward(self, text, lengths=None):
        # üü¢ ‡∏à‡∏∏‡∏î‡πÅ‡∏Å‡πâ‡∏™‡∏≥‡∏Ñ‡∏±‡∏ç: return_embeddings=True 
        # ‡∏ö‡∏≠‡∏Å‡πÇ‡∏°‡πÄ‡∏î‡∏•‡∏ß‡πà‡∏≤ "‡πÑ‡∏°‡πà‡∏ï‡πâ‡∏≠‡∏á‡∏ó‡∏≤‡∏¢‡∏®‡∏±‡∏û‡∏ó‡πå‡∏ô‡∏∞" (‡∏õ‡∏¥‡∏î Logits) -> ‡∏õ‡∏£‡∏∞‡∏´‡∏¢‡∏±‡∏î‡πÅ‡∏£‡∏°‡πÑ‡∏õ 90%
        x = self.model(text, return_embeddings=True)
        
        # Mean Pooling
        x = x.mean(dim=1)
        return self.fc(x)

In [12]:
embed_dim = embedding_dim
heads = 8 if embed_dim % 8 == 0 else (5 if embed_dim % 5 == 0 else 4)
depth = 6 

print(f"Settings: Embed={embed_dim}, Heads={heads}, Depth={depth}")

model = XTransformerClassifier(
    vocab_size=vocab_size, 
    embed_dim=embed_dim, 
    depth=depth, 
    heads=heads, 
    output_dim=len(label_cols), 
    max_len=MAX_LEN, 
    embedding_matrix=embedding_matrix
).to(device)

criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001)
scaler = GradScaler() # üëà ‡∏ï‡∏±‡∏ß‡∏ä‡πà‡∏ß‡∏¢ Mixed Precision

Settings: Embed=300, Heads=5, Depth=6


In [14]:
import time

print(f"üöÄ Starting Training (Batch={BATCH_SIZE})...")

best_loss = float('inf')
patience_counter = 0

for epoch in range(EPOCHS):
    start_time = time.time()
    model.train()
    total_loss = 0
    
    for X_batch, lengths_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        
        optimizer.zero_grad()
        
        # Mixed Precision Context
        with autocast():
            outputs = model(X_batch) 
            loss = criterion(outputs, y_batch)
        
        # Backward ‡πÅ‡∏ö‡∏ö Scaled
        scaler.scale(loss).backward()
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        scaler.step(optimizer)
        scaler.update()
            
        total_loss += loss.item()
    
    avg_loss = total_loss / len(train_loader)
    epoch_time = time.time() - start_time
    
    print(f"Epoch {epoch+1}/{EPOCHS} | Loss: {avg_loss:.4f} | Time: {epoch_time:.1f}s")
    
    # --- Early Stopping ---
    if avg_loss < best_loss:
        best_loss = avg_loss
        patience_counter = 0
        torch.save(model.state_dict(), "best_x_transformer_model.pth")
    else:
        patience_counter += 1
        if patience_counter >= PATIENCE:
            print(f"üõë Early stopping at epoch {epoch+1}")
            break

print("‚úÖ Training Complete!")

üöÄ Starting Training (Batch=128)...
Epoch 1/500 | Loss: 0.2592 | Time: 1096.7s


KeyboardInterrupt: 

In [None]:
model.load_state_dict(torch.load("best_x_transformer_model.pth"))
model.eval()
y_true, y_pred = [], []

with torch.no_grad():
    for X_batch, lengths_batch, y_batch in test_loader:
        X_batch, lengths_batch = X_batch.to(device), lengths_batch.to(device)
        outputs = model(X_batch, lengths_batch)
        preds = torch.sigmoid(outputs).cpu().numpy()
        preds = (preds > 0.5).astype(int)
        y_true.append(y_batch.numpy())
        y_pred.append(preds)

y_true = np.vstack(y_true)
y_pred = np.vstack(y_pred)

print("-" * 30)
print("F1-score (macro):", f1_score(y_true, y_pred, average="macro"))
print("F1-score (micro):", f1_score(y_true, y_pred, average="micro"))

  output = torch._nested_tensor_from_mask(output, src_key_padding_mask.logical_not(), mask_check=False)


------------------------------
F1-score (macro): 0.6319927595062403
F1-score (micro): 0.6944149974892596


In [None]:
def predict(text):
    model.eval()
    tokens = word_tokenize(text, keep_whitespace=False)
    ids = encode_text(tokens, vocab)
    
    # ‡∏ï‡∏±‡∏î‡∏´‡∏£‡∏∑‡∏≠ Padding ‡πÉ‡∏´‡πâ‡πÄ‡∏ó‡πà‡∏≤‡∏Å‡∏±‡∏ö MAX_LEN ‡πÄ‡∏™‡∏°‡∏≠
    if len(ids) > MAX_LEN:
        ids = ids[:MAX_LEN]
    elif len(ids) < MAX_LEN:
        ids = ids + [0] * (MAX_LEN - len(ids))

    padded = torch.tensor([ids], dtype=torch.long).to(device)
    
    with torch.no_grad():
        output = model(padded) 
        probs = torch.sigmoid(output).cpu().numpy()[0]
        
    results = []
    for i, prob in enumerate(probs):
        if prob > 0.5:
            results.append((label_cols[i], float(prob)))
            
    if not results:
        best_idx = np.argmax(probs)
        results.append((label_cols[best_idx], float(probs[best_idx])))
        
    return results

print(predict("‡∏£‡∏±‡∏ê‡∏ö‡∏≤‡∏•‡πÑ‡∏ó‡∏¢‡∏õ‡∏£‡∏∞‡∏Å‡∏≤‡∏®‡∏ô‡πÇ‡∏¢‡∏ö‡∏≤‡∏¢‡∏î‡πâ‡∏≤‡∏ô‡∏™‡∏¥‡πà‡∏á‡πÅ‡∏ß‡∏î‡∏•‡πâ‡∏≠‡∏°‡πÉ‡∏´‡∏°‡πà"))

------------------------------
Prediction:
1. [('quality_of_life', 0.971625804901123), ('international', 0.7255356311798096), ('environment', 0.6329044699668884), ('economics', 0.9968211650848389)]
2. [('politics', 0.022722477093338966)]
