In [24]:
!pip install transformers scikit-learn pythainlp pandas gensim



In [25]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from pythainlp import word_tokenize
from pythainlp import word_vector
from gensim.models import KeyedVectors
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

In [26]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"üü¢ Using device: {device}")

üü¢ Using device: cuda


In [27]:
# Load the training CSV. The path is written as a raw string so the Windows
# backslashes are taken literally; in a normal string literal the sequences
# backslash-y and backslash-p are invalid escapes and raise a SyntaxWarning.
df = pd.read_csv(r"D:\year4\‡∏™‡∏´‡∏Å‡∏¥‡∏à\prachatai_train.csv")

# Article bodies as plain Python strings (every value is cast with str()).
texts = df["body_text"].astype(str).tolist()

# One column per topic; each row may carry several of them (multi-label).
label_cols = [
    "politics", "human_rights", "quality_of_life", "international",
    "social", "environment", "economics", "culture", "labor",
    "national_security", "ict", "education"
]

# Label matrix with one row per article and one float32 column per topic.
y = df[label_cols].values.astype(np.float32)

  df = pd.read_csv("D:\year4\‡∏™‡∏´‡∏Å‡∏¥‡∏à\prachatai_train.csv")


In [28]:
# Load the pretrained "thai2fit_wv" word vectors through PyThaiNLP.
w2v = word_vector.WordVector(model_name="thai2fit_wv").get_model()
embedding_dim = w2v.vector_size

# Tokenise every document, dropping whitespace tokens.
tokenized_texts = []
for document in texts:
    tokenized_texts.append(word_tokenize(document, keep_whitespace=False))

# Ids 0 and 1 are reserved for padding and unknown tokens; the pretrained
# words follow from id 2, in the iteration order of `w2v.key_to_index`.
thai2vec_vocab = list(w2v.key_to_index)
vocab = {"<PAD>": 0, "<UNK>": 1}
vocab.update(zip(thai2vec_vocab, range(2, len(thai2vec_vocab) + 2)))

KeyboardInterrupt: 

In [None]:
def encode_text(tokens, vocab):
    """Translate a token sequence into a list of vocabulary ids.

    Args:
        tokens: Iterable of token strings.
        vocab: Mapping from token to integer id; must contain "<UNK>".

    Returns:
        A list with one id per token. Tokens missing from ``vocab`` are
        mapped to the id stored under "<UNK>".
    """
    ids = []
    for token in tokens:
        unknown_id = vocab["<UNK>"]
        ids.append(vocab.get(token, unknown_id))
    return ids

# Convert every tokenised document into its list of vocabulary ids.
encoded_texts = []
for doc_tokens in tokenized_texts:
    encoded_texts.append(encode_text(doc_tokens, vocab))

def pad_sequences(sequences, max_len=None, pad_value=0):
    """Right-pad (and, if needed, truncate) id sequences into one matrix.

    Args:
        sequences: Iterable of integer id lists.
        max_len: Width of the output matrix. When ``None`` (or 0) the length
            of the longest sequence is used. Longer sequences are truncated.
        pad_value: Value written into the unused trailing positions.

    Returns:
        A tuple ``(padded, lengths)``: ``padded`` is an int64 array of shape
        ``(len(sequences), max_len)`` and ``lengths`` is an int64 array with
        the number of real (non-padding) entries in each row. Lengths are
        clipped to ``max_len`` so they never exceed the row width.
    """
    sequences = list(sequences)
    raw_lengths = [len(seq) for seq in sequences]
    if not max_len:
        # default=0 keeps an empty input from crashing inside max().
        max_len = max(raw_lengths, default=0)

    # Report the length after truncation; otherwise a truncated row would
    # claim more valid positions than the matrix actually holds.
    kept_lengths = [min(n, max_len) for n in raw_lengths]

    padded = np.full((len(sequences), max_len), pad_value, dtype=np.int64)
    for row, (seq, n) in enumerate(zip(sequences, kept_lengths)):
        if n:
            padded[row, :n] = seq[:n]
    lengths = np.array(kept_lengths, dtype=np.int64)
    return padded, lengths

In [None]:
# Pad every document to the longest one, then hold out 10% as the test split.
# X, y and lengths are split in one call so their rows stay aligned.
X, lengths = pad_sequences(encoded_texts)
(
    X_train, X_test,
    y_train, y_test,
    len_train, len_test,
) = train_test_split(X, y, lengths, random_state=42, test_size=0.1)

In [None]:
class ThaiTextDataset(Dataset):
    """Dataset of padded token-id rows, their lengths and their label vectors."""

    def __init__(self, X, lengths, y):
        """Convert the three inputs to tensors once, up front.

        Args:
            X: Padded token ids; stored as a ``torch.long`` tensor.
            lengths: Per-row sequence lengths; stored as ``torch.long``.
            y: Label vectors; stored as ``torch.float32``.
        """
        self.X = torch.tensor(X, dtype=torch.long)
        self.lengths = torch.tensor(lengths, dtype=torch.long)
        self.y = torch.tensor(y, dtype=torch.float32)

    def __len__(self):
        """Number of examples, taken from the label tensor."""
        return self.y.shape[0]

    def __getitem__(self, idx):
        """Return the ``(token_ids, length, labels)`` triple at ``idx``."""
        token_ids = self.X[idx]
        length = self.lengths[idx]
        labels = self.y[idx]
        return token_ids, length, labels

# Wrap each split in a dataset and batch it 32 examples at a time.
# Only the training batches are shuffled; the test order stays fixed.
train_dataset = ThaiTextDataset(X=X_train, lengths=len_train, y=y_train)
test_dataset = ThaiTextDataset(X=X_test, lengths=len_test, y=y_test)

train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=32, shuffle=False)

In [None]:
# One embedding row per vocabulary id; the matrix starts out all zeros.
vocab_size = 1 + max(vocab.values())
embedding_matrix = np.zeros((vocab_size, embedding_dim))

for token, row in vocab.items():
    if token in w2v:
        # Token has a pretrained vector: copy it in.
        vector = w2v[token]
    elif token == "<PAD>":
        # Padding keeps an all-zero vector.
        vector = np.zeros(embedding_dim)
    else:
        # Any other token without a pretrained vector gets a random one.
        vector = np.random.normal(scale=0.6, size=(embedding_dim,))
    embedding_matrix[row] = vector

In [None]:
class RNN(nn.Module):
    """LSTM text classifier: embedding -> LSTM -> two linear layers.

    The forward pass returns raw logits, one per output label.
    """

    def __init__(self, vocab_size, embed_dim, rnn_hidden_size, fc_hidden_size, output_dim, embedding_matrix=None):
        """Build the layers and optionally load pretrained embeddings.

        Args:
            vocab_size: Number of rows in the embedding table.
            embed_dim: Size of each embedding vector.
            rnn_hidden_size: Hidden size of the LSTM.
            fc_hidden_size: Width of the intermediate linear layer.
            output_dim: Number of output logits.
            embedding_matrix: Optional ``(vocab_size, embed_dim)`` array-like
                copied into the embedding weights.
        """
        super().__init__()
        # Id 0 is the padding index.
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        if embedding_matrix is not None:
            pretrained = torch.tensor(embedding_matrix, dtype=torch.float32)
            with torch.no_grad():
                self.embedding.weight.copy_(pretrained)
        self.rnn = nn.LSTM(embed_dim, rnn_hidden_size, batch_first=True)
        self.fc1 = nn.Linear(rnn_hidden_size, fc_hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(fc_hidden_size, output_dim)

    def forward(self, text, lengths):
        """Compute logits for a batch of padded id sequences.

        Args:
            text: Batch-first tensor of token ids.
            lengths: Tensor with the true length of each row in ``text``.

        Returns:
            Tensor of logits produced by the final linear layer.
        """
        vectors = self.embedding(text)
        # Packing lets the LSTM skip the padded positions; the lengths are
        # moved to the CPU for the packing call, and rows need not be sorted.
        packed_input = nn.utils.rnn.pack_padded_sequence(
            vectors, lengths.cpu(), batch_first=True, enforce_sorted=False
        )
        _, (hidden, _) = self.rnn(packed_input)
        # Final hidden state of the last LSTM layer for every sequence.
        last_hidden = hidden[-1]
        return self.fc2(self.relu(self.fc1(last_hidden)))

# Model sizes: the embedding width follows the pretrained vectors and the
# output width is one logit per label column.
embed_dim = embedding_dim
rnn_hidden_size, fc_hidden_size = 128, 64
output_dim = len(label_cols)

In [None]:
# Build the classifier on the selected device, initialised from the
# pretrained embedding matrix.
model = RNN(
    vocab_size, embed_dim, rnn_hidden_size, fc_hidden_size, output_dim, embedding_matrix
)
model = model.to(device)

# Each label gets its own sigmoid/binary cross-entropy term, computed on logits.
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

epochs = 10
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in train_loader:
        X_batch, lengths_batch, y_batch = (part.to(device) for part in batch)
        optimizer.zero_grad()
        loss = criterion(model(X_batch, lengths_batch), y_batch)
        loss.backward()
        optimizer.step()
        # The reported figure is the sum of the batch losses over the epoch.
        total_loss += loss.item()
    print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss:.4f}")

In [None]:
# Score the held-out split: a label counts as predicted when its sigmoid
# probability is above 0.5.
model.eval()
y_true, y_pred = [], []
with torch.no_grad():
    for X_batch, lengths_batch, y_batch in test_loader:
        logits = model(X_batch.to(device), lengths_batch.to(device))
        probabilities = torch.sigmoid(logits).cpu().numpy()
        y_pred.append((probabilities > 0.5).astype(int))
        y_true.append(y_batch.numpy())

# Stack the per-batch arrays into one matrix each.
y_true = np.vstack(y_true)
y_pred = np.vstack(y_pred)

macro_f1 = f1_score(y_true, y_pred, average="macro")
micro_f1 = f1_score(y_true, y_pred, average="micro")
print("F1-score (macro):", macro_f1)
print("F1-score (micro):", micro_f1)

In [None]:
def predict(text):
    """Return the most probable label for ``text`` and its probability.

    The text is tokenised with ``word_tokenize`` (whitespace dropped), mapped
    to ids with ``encode_text`` and fed to the module-level ``model`` as a
    batch of one.

    Args:
        text: Raw input string.

    Returns:
        A ``(label, probability)`` tuple: the entry of ``label_cols`` with the
        highest sigmoid probability, and that probability as a Python float.
        Only the top label is returned, even though each label is scored
        independently.
    """
    model.eval()
    tokens = word_tokenize(text, keep_whitespace=False)
    ids = encode_text(tokens, vocab)
    if not ids:
        # Empty or whitespace-only text yields no tokens, and the model's
        # pack_padded_sequence call rejects zero-length sequences. Fall back
        # to a single unknown token so the call still returns a result.
        ids = [vocab["<UNK>"]]
    lengths = torch.tensor([len(ids)], dtype=torch.long).to(device)
    padded = torch.tensor([ids], dtype=torch.long).to(device)
    with torch.no_grad():
        output = model(padded, lengths)
        probs = torch.sigmoid(output).cpu().numpy()[0]
        best_idx = int(np.argmax(probs))
        return label_cols[best_idx], float(probs[best_idx])

# Run the classifier on two sample sentences and print each result.
for sample_text in (
    "‡∏£‡∏±‡∏ê‡∏ö‡∏≤‡∏•‡πÑ‡∏ó‡∏¢‡∏õ‡∏£‡∏∞‡∏Å‡∏≤‡∏®‡∏ô‡πÇ‡∏¢‡∏ö‡∏≤‡∏¢‡∏î‡πâ‡∏≤‡∏ô‡∏™‡∏¥‡πà‡∏á‡πÅ‡∏ß‡∏î‡∏•‡πâ‡∏≠‡∏°‡πÉ‡∏´‡∏°‡πà",
    "‡πÅ‡∏£‡∏á‡∏á‡∏≤‡∏ô‡∏õ‡∏£‡∏∞‡∏ó‡πâ‡∏ß‡∏á‡πÄ‡∏û‡∏∑‡πà‡∏≠‡∏™‡∏¥‡∏ó‡∏ò‡∏¥‡πÉ‡∏ô‡∏Å‡∏≤‡∏£‡∏ó‡∏≥‡∏á‡∏≤‡∏ô",
):
    print(predict(sample_text))