In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from torch.nn.utils.rnn import pad_sequence
from transformers import AutoTokenizer

In [None]:
# Prefer the GPU when CUDA is usable; otherwise run everything on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Using device:", device)

Using device: cuda


In [None]:
# Hub identifier of the checkpoint whose tokenizer is loaded below.
MODEL_NAME = "airesearch/wangchanberta-base-att-spm-uncased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

MAX_LEN = 128                     # upper bound on tokens kept per text
PAD_ID = tokenizer.pad_token_id   # id used when padding batches

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/282 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/546 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/905k [00:00<?, ?B/s]

In [None]:
N_ROWS = 100  # number of leading CSV rows to load

# nrows stops parsing after N_ROWS records instead of reading the whole file
# and then discarding everything past the first N_ROWS.
df = pd.read_csv("prachatai_train.csv", nrows=N_ROWS)

# Missing bodies become "" rather than the literal string "nan" that
# astype(str) would otherwise produce for NaN cells.
texts = df["body_text"].fillna("").astype(str).tolist()

# Target columns; their order fixes the order of the model's outputs.
label_cols = [
    "politics", "human_rights", "quality_of_life", "international",
    "social", "environment", "economics", "culture", "labor",
    "national_security", "ict", "education"]

# Label matrix as float32, one column per entry of label_cols.
y = df[label_cols].values.astype(np.float32)

In [None]:
def encode_texts(texts):
    """Tokenize ``texts`` and return the ``input_ids`` entry of the result.

    Truncation is enabled with a limit of ``MAX_LEN`` tokens; padding and
    tensor conversion are both switched off, so batching is left to the caller.
    """
    tokenizer_options = dict(
        truncation=True,
        max_length=MAX_LEN,
        padding=False,
        return_tensors=None,
    )
    return tokenizer(texts, **tokenizer_options)["input_ids"]

encoded_texts = encode_texts(texts)
# len(tokenizer) also counts added special tokens, whereas
# tokenizer.vocab_size covers only the base vocabulary. Sizing the embedding
# table from the full length keeps every id the tokenizer can emit in range.
vocab_size = len(tokenizer)

In [None]:
# Hold out 10% of the examples for evaluation; the fixed random_state makes
# the partition repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    encoded_texts,
    y,
    test_size=0.1,
    random_state=42,
)

In [None]:
class ThaiTextDataset(Dataset):
    """Pairs token-id sequences with their label vectors.

    Args:
        X: Indexable collection of token-id sequences, one per example.
        y: Array-like of labels with one row per example; stored as a
            float32 tensor.

    Raises:
        ValueError: If ``X`` and ``y`` do not hold the same number of examples.
    """

    def __init__(self, X, y):
        self.X = X
        self.y = torch.tensor(y, dtype=torch.float32)
        # Fail fast on misaligned inputs instead of pairing sequences with
        # the wrong labels or hitting an IndexError in the middle of an epoch.
        if len(self.X) != len(self.y):
            raise ValueError(
                f"X and y must have the same length, got {len(self.X)} and {len(self.y)}"
            )

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        # The sequence is converted on access and returned at its own length;
        # no padding happens here.
        seq = torch.tensor(self.X[idx], dtype=torch.long)
        return seq, self.y[idx]

def collate_fn(batch):
    """Turn a list of ``(sequence, label)`` pairs into two batched tensors.

    Sequences are padded with ``PAD_ID`` to a common length (batch dimension
    first) and labels are stacked; both results are moved to ``device``.
    """
    sequences, targets = zip(*batch)
    padded_ids = pad_sequence(sequences, batch_first=True, padding_value=PAD_ID)
    stacked_targets = torch.stack(targets)
    return padded_ids.to(device), stacked_targets.to(device)

# Options shared by the training and evaluation loaders.
loader_kwargs = dict(batch_size=16, collate_fn=collate_fn)

# Only the training loader shuffles; the test loader keeps dataset order.
train_loader = DataLoader(ThaiTextDataset(X_train, y_train), shuffle=True, **loader_kwargs)
test_loader = DataLoader(ThaiTextDataset(X_test, y_test), **loader_kwargs)

In [None]:
class MLP(nn.Module):
    """Mean-pooled embedding classifier with per-label sigmoid outputs.

    Token embeddings are averaged over the non-padding positions of each
    sequence, then passed through two hidden ReLU layers and a sigmoid
    output layer.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Size of each token embedding.
        hidden_dim: Width of both hidden layers.
        output_dim: Number of output units.
        pad_idx: Token id treated as padding. When ``None`` (the default)
            the module-level ``PAD_ID`` is used, which matches the previous
            behaviour of this class.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim, pad_idx=None):
        super().__init__()
        # Resolve the padding id once so forward() does not depend on a
        # global that may change after the model has been built.
        self.pad_idx = PAD_ID if pad_idx is None else pad_idx
        self.embedding = nn.Embedding(
            vocab_size,
            embed_dim,
            padding_idx=self.pad_idx
        )

        self.fc1 = nn.Linear(embed_dim, hidden_dim)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.relu2 = nn.ReLU()
        self.fc3 = nn.Linear(hidden_dim, output_dim)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Map a batch of token ids to sigmoid scores, one per output unit.

        The indexing below (pooling over ``dim=1``) treats ``x`` as
        ``(batch, seq_len)``.
        """
        emb = self.embedding(x)
        # True at real tokens, False at padding; the trailing axis lets the
        # mask broadcast across the embedding dimension.
        mask = (x != self.pad_idx).unsqueeze(-1)
        emb = emb * mask
        # Mean over real tokens only; clamp avoids dividing by zero for a
        # row that consists entirely of padding.
        pooled = emb.sum(dim=1) / mask.sum(dim=1).clamp(min=1)
        out = self.fc1(pooled)
        out = self.relu1(out)
        out = self.fc2(out)
        out = self.relu2(out)
        out = self.fc3(out)
        return self.sigmoid(out)

In [None]:
# One output unit per label column; the network is then moved to `device`.
model = MLP(vocab_size=vocab_size, embed_dim=100, hidden_dim=128, output_dim=len(label_cols))
model = model.to(device)

In [None]:
# Binary cross-entropy on probabilities: the model's forward() already ends
# in a sigmoid, so its outputs are passed to the loss directly.
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

epochs = 500
for epoch in range(epochs):
    model.train()
    # Running sum of the per-batch losses for this epoch. The value printed
    # below is therefore a sum over batches, not an average.
    total_loss = 0
    for X_batch, y_batch in train_loader:
        # Clear gradients left over from the previous step before backprop.
        optimizer.zero_grad()
        preds = model(X_batch)
        loss = criterion(preds, y_batch)
        loss.backward()
        optimizer.step()
        # .item() extracts a plain Python number from the loss tensor.
        total_loss += loss.item()

    print(f"Epoch {epoch+1}/{epochs} | Loss: {total_loss:.4f}")

Epoch 1/500 | Loss: 4.0870
Epoch 2/500 | Loss: 3.8064
Epoch 3/500 | Loss: 3.3549
Epoch 4/500 | Loss: 2.6884
Epoch 5/500 | Loss: 2.2138
Epoch 6/500 | Loss: 2.1518
Epoch 7/500 | Loss: 2.0771
Epoch 8/500 | Loss: 1.9988
Epoch 9/500 | Loss: 1.9872
Epoch 10/500 | Loss: 1.9742
Epoch 11/500 | Loss: 1.9673
Epoch 12/500 | Loss: 1.9254
Epoch 13/500 | Loss: 1.9002
Epoch 14/500 | Loss: 1.8477
Epoch 15/500 | Loss: 1.8718
Epoch 16/500 | Loss: 1.8506
Epoch 17/500 | Loss: 1.8545
Epoch 18/500 | Loss: 1.8290
Epoch 19/500 | Loss: 1.7662
Epoch 20/500 | Loss: 1.7489
Epoch 21/500 | Loss: 1.7038
Epoch 22/500 | Loss: 1.6515
Epoch 23/500 | Loss: 1.6291
Epoch 24/500 | Loss: 1.6267
Epoch 25/500 | Loss: 1.6137
Epoch 26/500 | Loss: 1.5576
Epoch 27/500 | Loss: 1.5195
Epoch 28/500 | Loss: 1.4807
Epoch 29/500 | Loss: 1.4456
Epoch 30/500 | Loss: 1.3899
Epoch 31/500 | Loss: 1.3587
Epoch 32/500 | Loss: 1.3336
Epoch 33/500 | Loss: 1.2962
Epoch 34/500 | Loss: 1.2295
Epoch 35/500 | Loss: 1.1859
Epoch 36/500 | Loss: 1.1578
E

In [None]:
# Score the held-out split with the model in eval mode and autograd disabled.
model.eval()
y_true, y_pred = [], []

with torch.no_grad():
    for X_batch, y_batch in test_loader:
        preds = model(X_batch)
        # Each label is decided independently with a 0.5 cut-off.
        preds = (preds > 0.5).int()
        y_true.append(y_batch.cpu().numpy())
        y_pred.append(preds.cpu().numpy())

# Stack the per-batch arrays into single (examples, labels) matrices.
y_true = np.vstack(y_true)
y_pred = np.vstack(y_pred)

# Labels with no true and no predicted positives have an undefined F1.
# zero_division=0 scores them as 0, the same value the default "warn"
# setting uses, but without emitting the undefined-metric warning.
print("F1 macro:", f1_score(y_true, y_pred, average="macro", zero_division=0))
print("F1 micro:", f1_score(y_true, y_pred, average="micro", zero_division=0))

F1 macro: 0.07878787878787878
F1 micro: 0.38461538461538464


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
def predict(text):
    """Return the highest-scoring label for ``text`` together with its score.

    Args:
        text: Raw input passed to the tokenizer (truncated to ``MAX_LEN`` tokens).

    Returns:
        A ``(label, score)`` tuple: ``label`` is the entry of ``label_cols``
        with the largest model output and ``score`` is that output as a
        Python float. The model scores each label independently, so the top
        score is not guaranteed to exceed 0.5.
    """
    enc = tokenizer(
        text,
        truncation=True,
        max_length=MAX_LEN,
        return_tensors="pt"
    )
    seq = enc["input_ids"].to(device)

    # Run inference in eval mode no matter which cell executed last, then
    # restore whatever mode the model was in so callers are unaffected.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            # [0] selects the single example in this batch of one.
            probs = model(seq)[0].cpu().numpy()
    finally:
        model.train(was_training)

    idx = int(np.argmax(probs))
    return label_cols[idx], float(probs[idx])

# Try the predictor on two short Thai sentences.
for sample_text in (
    "รัฐบาลไทยประกาศนโยบายด้านสิ่งแวดล้อมใหม่",
    "แรงงานเรียกร้องสิทธิ์การทำงาน",
):
    print(predict(sample_text))

('quality_of_life', 0.9999926090240479)
('politics', 0.016911478713154793)
