In [13]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import random
import copy
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report
from pythainlp.tokenize import word_tokenize
from pythainlp.corpus import thai_stopwords
from pythainlp.util import normalize
from gensim.models import Word2Vec 

# ==========================================
# 0. Configuration & Seeding
# ==========================================
def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    print(f"Random Seed set to {seed}")

# set_seed(42) 

CSV_PATH = r"d:\year4\‡∏™‡∏´‡∏Å‡∏¥‡∏à\prachatai_train.csv" # ‡∏´‡∏£‡∏∑‡∏≠‡πÅ‡∏Å‡πâ path ‡πÉ‡∏´‡πâ‡∏ï‡∏£‡∏á‡∏Å‡∏±‡∏ö‡πÄ‡∏Ñ‡∏£‡∏∑‡πà‡∏≠‡∏á‡∏ó‡∏µ‡πà‡∏£‡∏±‡∏ô
W2V_PATH = "custom_word2vec.model"
NUM_EPOCHS = 1000
BATCH_SIZE = 64
LEARNING_RATE = 0.0005 #‡∏¢‡∏¥‡πà‡∏á‡∏•‡∏î‡∏¢‡∏¥‡πà‡∏á‡πÅ‡∏°‡πà‡∏ô‡∏¢‡∏≥‡∏Ç‡∏∂‡πâ‡∏ô
THRESHOLD = 0.5 
HIDDEN_DIM1 = 256
HIDDEN_DIM2 = 128

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

Using device: cuda


In [2]:
# ==========================================
# 1. Loading Data & Resources
# ==========================================
print("--- Step 1: Loading Data & Resources ---")
try:
    w2v_model = Word2Vec.load(W2V_PATH) 
    print(f"-> Loaded {W2V_PATH} successfully.")
except:
    print(f"Error: Could not load {W2V_PATH}")
    # ‡πÉ‡∏ô Jupyter ‡πÑ‡∏°‡πà‡∏Ñ‡∏ß‡∏£‡πÉ‡∏ä‡πâ exit() ‡πÄ‡∏û‡∏£‡∏≤‡∏∞‡∏à‡∏∞‡∏õ‡∏¥‡∏î Kernel ‡πÉ‡∏´‡πâ‡πÉ‡∏ä‡πâ raise Error ‡πÅ‡∏ó‡∏ô ‡∏´‡∏£‡∏∑‡∏≠‡πÅ‡∏Ñ‡πà print
    raise FileNotFoundError(f"Model not found: {W2V_PATH}")

try:
    df = pd.read_csv(CSV_PATH)
except FileNotFoundError:
    raise FileNotFoundError(f"CSV not found: {CSV_PATH}")

# Prepare Labels
label_cols = ['politics', 'human_rights', 'quality_of_life', 'international', 
              'social', 'environment', 'economics', 'culture', 'labor', 
              'national_security', 'ict', 'education']
y_numpy = df[label_cols].values 
num_classes = len(label_cols)
print(f"Number of Classes: {num_classes}")

# Prepare Features
stop_words = set(thai_stopwords())

def get_avg_vector(text):
    text = normalize(str(text))
    tokens = word_tokenize(str(text), engine='newmm')
    vecs = []
    for word in tokens:
        if word not in stop_words and word.strip() != '':
            if word in w2v_model.wv.key_to_index:
                vecs.append(w2v_model.wv[word])
    if len(vecs) == 0:
        return np.zeros(300) 
    return np.mean(vecs, axis=0)

print("Converting text to vectors...")
X_numpy = np.vstack(df['body_text'].apply(get_avg_vector).values)

# Split & DataLoader
X_tensor = torch.tensor(X_numpy, dtype=torch.float32).to(device)
y_tensor = torch.tensor(y_numpy, dtype=torch.float32).to(device)

X_train, X_test, y_train, y_test = train_test_split(X_tensor, y_tensor, test_size=0.2, random_state=42)

train_dataset = TensorDataset(X_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
print("Data Preparation Complete!")

--- Step 1: Loading Data & Resources ---
-> Loaded custom_word2vec.model successfully.
Number of Classes: 12
Converting text to vectors...
Data Preparation Complete!


In [3]:
# ==========================================
# 2. Model Definition
# ==========================================
class MultiLabelMLP(nn.Module):
    def __init__(self, input_dim, hidden_dim1, hidden_dim2, output_dim):
        super(MultiLabelMLP, self).__init__()
        # Layer 1
        self.layer1 = nn.Linear(input_dim, hidden_dim1)
        self.relu1 = nn.ReLU()
        self.dropout1 = nn.Dropout(0.5) 
        # Layer 2
        self.layer2 = nn.Linear(hidden_dim1, hidden_dim2)
        self.relu2 = nn.ReLU()
        self.dropout2 = nn.Dropout(0.5)
        # Layer 3 (Output)
        self.layer3 = nn.Linear(hidden_dim2, output_dim)
    
    def forward(self, x):
        x = self.layer1(x)
        x = self.relu1(x)
        x = self.dropout1(x)
        x = self.layer2(x)
        x = self.relu2(x)
        x = self.dropout2(x)
        x = self.layer3(x)
        return x

model = MultiLabelMLP(300, HIDDEN_DIM1, HIDDEN_DIM2, num_classes).to(device)
print("\n--- Model Architecture ---")
print(model)


--- Model Architecture ---
MultiLabelMLP(
  (layer1): Linear(in_features=300, out_features=256, bias=True)
  (relu1): ReLU()
  (dropout1): Dropout(p=0.5, inplace=False)
  (layer2): Linear(in_features=256, out_features=128, bias=True)
  (relu2): ReLU()
  (dropout2): Dropout(p=0.5, inplace=False)
  (layer3): Linear(in_features=128, out_features=12, bias=True)
)


In [14]:
# ==========================================
# 3. Training Process
# ==========================================
criterion = nn.BCEWithLogitsLoss() 
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

print(f"\n--- Step 2: Training ({NUM_EPOCHS} Epochs) ---")

best_f1 = 0.0
best_epoch = 0
best_model_wts = copy.deepcopy(model.state_dict())

for epoch in range(NUM_EPOCHS):
    model.train()
    total_loss = 0
    
    for batch_X, batch_y in train_loader:
        batch_X, batch_y = batch_X.to(device), batch_y.to(device)
        optimizer.zero_grad()
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # Validation & Save Best
    if (epoch+1) % 10 == 0:
        model.eval()
        with torch.no_grad():
            test_outputs = model(X_test)
            probs = torch.sigmoid(test_outputs)
            predicted = (probs > THRESHOLD).float()
            current_f1 = f1_score(y_test.cpu().numpy(), predicted.cpu().numpy(), average='micro')
            
            if current_f1 > best_f1:
                best_f1 = current_f1
                best_epoch = epoch + 1
                best_model_wts = copy.deepcopy(model.state_dict())
                torch.save(best_model_wts, 'best_mlp_model.pth')

    if (epoch+1) % 100 == 0:
        print(f"Epoch [{epoch+1}/{NUM_EPOCHS}], Loss: {total_loss/len(train_loader):.4f} | Best F1: {best_f1*100:.2f}% (Ep {best_epoch})")

# Load Best Model
print(f"\nTraining Complete. Loading best model from Epoch {best_epoch} (F1: {best_f1*100:.2f}%)")
model.load_state_dict(best_model_wts)


--- Step 2: Training (1000 Epochs) ---
Epoch [100/1000], Loss: 0.1466 | Best F1: 70.84% (Ep 70)
Epoch [200/1000], Loss: 0.1439 | Best F1: 70.84% (Ep 70)
Epoch [300/1000], Loss: 0.1424 | Best F1: 70.84% (Ep 70)
Epoch [400/1000], Loss: 0.1410 | Best F1: 70.84% (Ep 70)
Epoch [500/1000], Loss: 0.1383 | Best F1: 70.84% (Ep 70)
Epoch [600/1000], Loss: 0.1381 | Best F1: 70.84% (Ep 70)
Epoch [700/1000], Loss: 0.1367 | Best F1: 70.84% (Ep 70)
Epoch [800/1000], Loss: 0.1356 | Best F1: 70.84% (Ep 70)
Epoch [900/1000], Loss: 0.1353 | Best F1: 70.84% (Ep 70)
Epoch [1000/1000], Loss: 0.1343 | Best F1: 70.84% (Ep 70)

Training Complete. Loading best model from Epoch 70 (F1: 70.84%)


<All keys matched successfully>

In [15]:
# ==========================================
# 4. Evaluation
# ==========================================
print("\n--- Step 3: Evaluation (Best Model) ---")
model.eval()
with torch.no_grad():
    test_outputs = model(X_test)
    probs = torch.sigmoid(test_outputs)
    predicted = (probs > THRESHOLD).float()
    
    y_true = y_test.cpu().numpy()
    y_pred = predicted.cpu().numpy()
    
    f1_micro = f1_score(y_true, y_pred, average='micro')
    f1_samples = f1_score(y_true, y_pred, average='samples')
    
    print(f"F1 Score (Micro): {f1_micro*100:.2f}%")
    print(f"F1 Score (Samples): {f1_samples*100:.2f}%")
    print("\n--- Classification Report ---")
    print(classification_report(y_true, y_pred, target_names=label_cols, zero_division=0))


--- Step 3: Evaluation (Best Model) ---
F1 Score (Micro): 70.84%
F1 Score (Samples): 69.53%

--- Classification Report ---
                   precision    recall  f1-score   support

         politics       0.80      0.86      0.83      6369
     human_rights       0.74      0.61      0.67      2408
  quality_of_life       0.71      0.58      0.64      1819
    international       0.81      0.73      0.77      1241
           social       0.68      0.12      0.21      1276
      environment       0.72      0.65      0.68      1213
        economics       0.69      0.57      0.62       831
          culture       0.66      0.48      0.55       626
            labor       0.80      0.80      0.80       567
national_security       0.67      0.37      0.48       546
              ict       0.71      0.70      0.71       492
        education       0.70      0.40      0.51       411

        micro avg       0.76      0.66      0.71     17799
        macro avg       0.72      0.57      0.62

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [6]:
# ==========================================
# 5. Functions for Testing
# ==========================================
def predict_custom_news(text):
    model.eval()
    vec = get_avg_vector(text)
    tensor = torch.tensor(vec, dtype=torch.float32).unsqueeze(0).to(device)
    
    with torch.no_grad():
        logits = model(tensor)
        probs = torch.sigmoid(logits).squeeze().cpu().numpy()
    
    print(f"\n--- Testing Custom News ---")
    print(f"Snippet: {text[:50]}...")
    found_labels = []
    for i, col in enumerate(label_cols):
        if probs[i] > THRESHOLD:
            print(f"[/] {col}: {probs[i]*100:.2f}% (YES)")
            found_labels.append(col)
        elif probs[i] > 0.1: 
            print(f"[ ] {col}: {probs[i]*100:.2f}%")
    print(f">> Result: {found_labels}")

In [18]:
# ==========================================
# 6. Interactive Testing
# ==========================================
print("Type 'exit' to stop.")
while True:
    try:
        user_input = input("\nüìù Enter news text: ").strip()
        
        if user_input.lower() in ['exit', 'quit', 'q']:
            print("Goodbye!")
            break
        if not user_input: continue
            
        predict_custom_news(user_input)
        
    except KeyboardInterrupt:
        break

Type 'exit' to stop.

--- Testing Custom News ---
Snippet: ‡∏ß‡∏á‡πÄ‡∏™‡∏ß‡∏ô‡∏≤‡πÄ‡∏û‡∏∑‡πà‡∏≠‡∏°‡∏∏‡∏ó‡∏¥‡∏ï‡∏≤‡∏à‡∏¥‡∏ï ‚Äò‡∏ô‡∏•‡∏¥‡∏ô‡∏µ ‡∏ï‡∏±‡∏ô‡∏ò‡∏∏‡∏ß‡∏ô‡∏¥‡∏ï‡∏¢‡πå‚Äô ‡∏´‡∏±‡∏ß‡∏Ç‡πâ‡∏≠ ‚Äú...
[ ] politics: 15.76%
[ ] human_rights: 22.68%
[ ] quality_of_life: 39.46%
[/] social: 51.13% (YES)
[/] culture: 66.69% (YES)
[ ] education: 38.73%
>> Result: ['social', 'culture']
Goodbye!
