In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import random
import copy
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report
from pythainlp.tokenize import word_tokenize
from pythainlp.corpus import thai_stopwords
from gensim.models import Word2Vec 
import sys

# ==========================================
# 0. Configuration
# ==========================================
def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    print(f"Random Seed set to {seed}")

set_seed(42)

# ‡πÄ‡∏ä‡πá‡∏Ñ Path ‡πÉ‡∏´‡πâ‡∏ñ‡∏π‡∏Å‡∏ï‡πâ‡∏≠‡∏á‡∏Å‡πà‡∏≠‡∏ô‡∏£‡∏±‡∏ô‡∏ô‡∏∞‡∏Ñ‡∏£‡∏±‡∏ö
CSV_PATH = r"d:\year4\‡∏™‡∏´‡∏Å‡∏¥‡∏à\prachatai_test.csv" 
W2V_PATH = "custom_word2vec.model"

# Hyperparameters
NUM_EPOCHS = 50       
BATCH_SIZE = 64
LEARNING_RATE = 0.001
THRESHOLD = 0.5 

# LSTM Specific Configs
MAX_LEN = 200         
EMBED_DIM = 300       
HIDDEN_DIM = 256      
NUM_LAYERS = 2        
BIDIRECTIONAL = True  

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

Random Seed set to 42
Using device: cuda


In [2]:
# ==========================================
# 1. Loading Data & Build Vocab
# ==========================================
print("--- Step 1: Loading Data & Building Vocabulary ---")

try:
    df = pd.read_csv(CSV_PATH)
    w2v_model = Word2Vec.load(W2V_PATH)
    print("-> Data resources loaded.")
except:
    raise FileNotFoundError(f"Check your file paths! \nCSV: {CSV_PATH}\nW2V: {W2V_PATH}")

# Build Embedding Matrix
vocab = w2v_model.wv.key_to_index
word_vectors = w2v_model.wv.vectors
pad_vector = np.zeros((1, EMBED_DIM))  # ID 0
unk_vector = np.random.normal(scale=0.6, size=(1, EMBED_DIM)) # ID 1
final_embeddings = np.concatenate([pad_vector, unk_vector, word_vectors], axis=0)
embedding_tensor = torch.FloatTensor(final_embeddings)

print(f"Vocab Size: {len(vocab) + 2}")

# Preprocessing Function
stop_words = set(thai_stopwords())

def text_to_indices(text, max_len=MAX_LEN):
    tokens = word_tokenize(str(text), engine='newmm')
    indices = []
    for word in tokens:
        if word.strip() == '' or word in stop_words: continue
        if word in vocab:
            indices.append(vocab[word] + 2) 
        else:
            indices.append(1) 
            
    if len(indices) < max_len:
        indices += [0] * (max_len - len(indices))
    else:
        indices = indices[:max_len]
    return indices

print("Converting text to Sequence IDs...")
X_list = df['body_text'].apply(text_to_indices).tolist()
X_numpy = np.array(X_list)
X_tensor = torch.LongTensor(X_numpy).to(device)

label_cols = ['politics', 'human_rights', 'quality_of_life', 'international', 
              'social', 'environment', 'economics', 'culture', 'labor', 
              'national_security', 'ict', 'education']
y_numpy = df[label_cols].values
y_tensor = torch.FloatTensor(y_numpy).to(device)
num_classes = len(label_cols)

X_train, X_test, y_train, y_test = train_test_split(X_tensor, y_tensor, test_size=0.2, random_state=42)
train_dataset = TensorDataset(X_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
print("Data Ready for LSTM!")

--- Step 1: Loading Data & Building Vocabulary ---
-> Data resources loaded.
Vocab Size: 84551
Converting text to Sequence IDs...
Data Ready for LSTM!


In [3]:
# ==========================================
# 2. TextLSTM Model Definition
# ==========================================
class TextLSTM(nn.Module):
    def __init__(self, embedding_matrix, hidden_dim, output_dim, n_layers, bidirectional, dropout):
        super(TextLSTM, self).__init__()
        
        # 1. Embedding
        num_vocab, embed_dim = embedding_matrix.shape
        self.embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze=False)
        
        # 2. LSTM Layer
        self.lstm = nn.LSTM(embed_dim, 
                            hidden_dim, 
                            num_layers=n_layers, 
                            bidirectional=bidirectional, 
                            dropout=dropout,
                            batch_first=True)
        
        # 3. Fully Connected
        self.fc = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x):
        # x: [batch, seq_len]
        embedded = self.dropout(self.embedding(x))
        
        # output, (hidden, cell)
        output, (hidden, cell) = self.lstm(embedded)
        
        # Concat hidden states from both directions
        if self.lstm.bidirectional:
            hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1))
        else:
            hidden = self.dropout(hidden[-1,:,:])
            
        return self.fc(hidden)

model = TextLSTM(embedding_tensor, HIDDEN_DIM, num_classes, NUM_LAYERS, BIDIRECTIONAL, 0.5).to(device)
print(model)

TextLSTM(
  (embedding): Embedding(84551, 300)
  (lstm): LSTM(300, 256, num_layers=2, batch_first=True, dropout=0.5, bidirectional=True)
  (fc): Linear(in_features=512, out_features=12, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)


In [4]:
# ==========================================
# 3. Training
# ==========================================
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

print(f"--- Training TextLSTM ({NUM_EPOCHS} Epochs) ---")
best_f1 = 0.0
best_epoch = 0
best_model_wts = copy.deepcopy(model.state_dict())

for epoch in range(NUM_EPOCHS):
    model.train()
    total_loss = 0
    
    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        
    # Validation
    if (epoch+1) % 5 == 0:
        model.eval()
        with torch.no_grad():
            test_outputs = model(X_test)
            probs = torch.sigmoid(test_outputs)
            predicted = (probs > THRESHOLD).float()
            current_f1 = f1_score(y_test.cpu().numpy(), predicted.cpu().numpy(), average='micro')
            
            if current_f1 > best_f1:
                best_f1 = current_f1
                best_epoch = epoch + 1
                best_model_wts = copy.deepcopy(model.state_dict())
                torch.save(best_model_wts, 'best_lstm_model.pth')
                
        print(f"Epoch [{epoch+1}/{NUM_EPOCHS}] Loss: {total_loss/len(train_loader):.4f} | F1: {current_f1*100:.2f}% (Best: {best_f1*100:.2f}%)")

print(f"Loading Best Model from Epoch {best_epoch}...")
model.load_state_dict(best_model_wts)

  from .autonotebook import tqdm as notebook_tqdm


--- Training TextLSTM (50 Epochs) ---
Epoch [5/50] Loss: 0.2090 | F1: 62.12% (Best: 62.12%)
Epoch [10/50] Loss: 0.1512 | F1: 65.98% (Best: 65.98%)
Epoch [15/50] Loss: 0.1099 | F1: 66.12% (Best: 66.12%)
Epoch [20/50] Loss: 0.0792 | F1: 66.62% (Best: 66.62%)
Epoch [25/50] Loss: 0.0570 | F1: 66.03% (Best: 66.62%)
Epoch [30/50] Loss: 0.0501 | F1: 66.45% (Best: 66.62%)
Epoch [35/50] Loss: 0.0368 | F1: 65.88% (Best: 66.62%)
Epoch [40/50] Loss: 0.0304 | F1: 65.60% (Best: 66.62%)
Epoch [45/50] Loss: 0.0238 | F1: 66.88% (Best: 66.88%)
Epoch [50/50] Loss: 0.0204 | F1: 66.56% (Best: 66.88%)
Loading Best Model from Epoch 45...


<All keys matched successfully>

In [5]:
# ==========================================
# 4. Evaluation
# ==========================================
model.eval()
with torch.no_grad():
    test_outputs = model(X_test)
    probs = torch.sigmoid(test_outputs)
    predicted = (probs > THRESHOLD).float()
    
    y_true = y_test.cpu().numpy()
    y_pred = predicted.cpu().numpy()
    
    print("\n--- Classification Report (TextLSTM) ---")
    print(classification_report(y_true, y_pred, target_names=label_cols, zero_division=0))


--- Classification Report (TextLSTM) ---
                   precision    recall  f1-score   support

         politics       0.76      0.83      0.79       766
     human_rights       0.58      0.65      0.61       306
  quality_of_life       0.61      0.63      0.62       211
    international       0.73      0.78      0.76       158
           social       0.37      0.23      0.29       171
      environment       0.70      0.67      0.68       150
        economics       0.58      0.64      0.61        94
          culture       0.50      0.56      0.53        94
            labor       0.81      0.81      0.81        69
national_security       0.44      0.43      0.44        60
              ict       0.64      0.75      0.69        65
        education       0.50      0.43      0.46        56

        micro avg       0.66      0.68      0.67      2200
        macro avg       0.60      0.62      0.61      2200
     weighted avg       0.65      0.68      0.66      2200
      sample

In [6]:
# ==========================================
# 5. Interactive Mode
# ==========================================
def predict_lstm(text):
    model.eval()
    indices = text_to_indices(text) 
    tensor = torch.LongTensor([indices]).to(device)
    
    with torch.no_grad():
        logits = model(tensor)
        probs = torch.sigmoid(logits).squeeze().cpu().numpy()
        
    print(f"\nSnippet: {text[:50]}...")
    found = False
    for i, col in enumerate(label_cols):
        if probs[i] > THRESHOLD:
            print(f"[/] {col}: {probs[i]*100:.2f}% (YES)")
            found = True
        elif probs[i] > 0.15:
            print(f"[ ] {col}: {probs[i]*100:.2f}%")
    if not found: print(">> No category detected.")

print("Type 'exit' to stop.")
while True:
    try:
        user_input = input("\nüìù Enter news (TextLSTM): ").strip()
        if user_input.lower() in ['exit', 'quit', 'q']: break
        if not user_input: continue
        predict_lstm(user_input)
    except KeyboardInterrupt: break

Type 'exit' to stop.

Snippet: 17 ‡∏û.‡∏¢. 2558 Blognone [1] ‡∏£‡∏≤‡∏¢‡∏á‡∏≤‡∏ô‡∏ß‡πà‡∏≤ ‡∏Å‡∏•‡∏∏‡πà‡∏°‡πÅ‡∏Æ‡∏Ñ‡πÄ‡∏Å‡∏≠‡∏£‡πå ...
[/] international: 99.89% (YES)
[/] ict: 99.87% (YES)

Snippet: ‡∏Å‡∏≠.‡∏£‡∏°‡∏ô.‡∏û‡∏£‡∏∞‡∏ô‡∏Ñ‡∏£‡∏®‡∏£‡∏µ‡∏≠‡∏¢‡∏∏‡∏ò‡∏¢‡∏≤ ‡∏™‡∏£‡πâ‡∏≤‡∏á‡πÄ‡∏Ñ‡∏£‡∏∑‡∏≠‡∏Ç‡πà‡∏≤‡∏¢‡∏Ç‡πà‡∏≤‡∏ß‡∏†‡∏≤‡∏Ñ‡∏õ‡∏£‡∏∞‡∏ä‡∏≤‡∏ä...
[/] politics: 98.62% (YES)
[ ] economics: 15.22%
[/] national_security: 96.72% (YES)
[/] ict: 98.39% (YES)

Snippet: ‡πÜ...
[/] quality_of_life: 94.63% (YES)
[ ] social: 21.18%
[/] environment: 97.98% (YES)
