In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import random
import copy
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report
from pythainlp.tokenize import word_tokenize
from pythainlp.corpus import thai_stopwords
from gensim.models import Word2Vec 

# ==========================================
# 0. Configuration
# ==========================================
def set_seed(seed=42):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global RNG, and PyTorch (via
    ``torch.manual_seed`` and ``torch.cuda.manual_seed``), then puts cuDNN in
    deterministic mode with benchmark autotuning switched off.

    Args:
        seed: Integer seed handed to each generator. Defaults to 42.
    """
    seeders = (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed)
    for apply_seed in seeders:
        apply_seed(seed)

    # Favor repeatable cuDNN kernels over autotuned ones.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    print(f"Random Seed set to {seed}")

# Seed all RNGs up front so every later stochastic step in the notebook is repeatable.
set_seed(42)

# NOTE(review): absolute, machine-specific path; the non-ASCII folder name looks
# mis-encoded in this copy of the notebook -- confirm it resolves on disk.
CSV_PATH = r"d:\year4\‡∏™‡∏´‡∏Å‡∏¥‡∏à\prachatai_test.csv"
W2V_PATH = "custom_word2vec.model"
NUM_EPOCHS = 50 # The CNN learns much faster than the MLP; 50-100 epochs is enough
BATCH_SIZE = 64
LEARNING_RATE = 0.001
THRESHOLD = 0.5  # Sigmoid probability above which a label counts as predicted

# CNN Configs
MAX_LEN = 200         # Maximum sequence length (sequences are cut at 200 tokens)
EMBED_DIM = 300       # Dimensionality of the existing Word2Vec vectors
NUM_FILTERS = 100     # Number of pattern detectors (filters) per kernel size
FILTER_SIZES = [2, 3, 4] # Kernel widths: patterns of 2, 3 and 4 consecutive tokens

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

In [None]:
# ==========================================
# 1. Loading Data & Build Vocab
# ==========================================
print("--- Step 1: Loading Data & Building Vocabulary ---")

# 1.1 Load Raw Data
# Only a genuinely missing file is reported as a path problem. Every other
# failure (malformed CSV, incompatible model file, Ctrl-C, ...) propagates
# unchanged so that its real cause stays visible in the traceback.
try:
    df = pd.read_csv(CSV_PATH)
    w2v_model = Word2Vec.load(W2V_PATH)
    print("-> Data resources loaded.")
except FileNotFoundError as exc:
    raise FileNotFoundError(f"Check your file paths! ({exc})") from exc

# 1.2 Build Embedding Matrix (turn Word2Vec into a lookup table for the CNN)
# Row 0 is <PAD>, row 1 is <UNK>, and the remaining rows are the Word2Vec
# vectors in their original order.
vocab = w2v_model.wv.key_to_index
word_vectors = w2v_model.wv.vectors

# Fail early with a readable message instead of an opaque concatenate error.
if word_vectors.shape[1] != EMBED_DIM:
    raise ValueError(
        f"Word2Vec vectors have dimension {word_vectors.shape[1]}, "
        f"but EMBED_DIM is {EMBED_DIM}"
    )

# Special rows prepended to the pretrained vectors.
pad_vector = np.zeros((1, EMBED_DIM))  # ID 0: all-zero filler used for padding
unk_vector = np.random.normal(scale=0.6, size=(1, EMBED_DIM)) # ID 1: out-of-vocabulary tokens

# Stack: [PAD, UNK, ...Word2Vec...]
final_embeddings = np.concatenate([pad_vector, unk_vector, word_vectors], axis=0)
embedding_tensor = torch.FloatTensor(final_embeddings)

print(f"Vocab Size: {len(vocab) + 2}")
print(f"Embedding Matrix Shape: {embedding_tensor.shape}")

# 1.3 Preprocessing Function (Text -> List of IDs)
# A set gives O(1) membership tests while filtering tokens.
stop_words = set(thai_stopwords())

def text_to_indices(text, max_len=MAX_LEN):
    """Turn raw text into a fixed-length list of embedding-row ids.

    The input is coerced with ``str()`` and tokenized with the ``newmm``
    engine. Whitespace-only tokens and stop words are dropped. Each remaining
    token becomes its ``vocab`` index shifted by 2 (rows 0 and 1 of the
    embedding table are <PAD> and <UNK>), or 1 when it is not in ``vocab``.

    Args:
        text: Text to encode; any object is accepted and passed through ``str()``.
        max_len: Length of the returned list. Defaults to ``MAX_LEN``.

    Returns:
        A list of ints, cut to ``max_len`` or right-padded with 0 up to it.
    """
    kept_tokens = (
        tok
        for tok in word_tokenize(str(text), engine='newmm')
        if tok.strip() != '' and tok not in stop_words
    )

    # Known tokens are offset by 2 to skip the PAD(0) / UNK(1) rows; unknown -> 1.
    ids = [vocab[tok] + 2 if tok in vocab else 1 for tok in kept_tokens]

    # Truncate first, then top up with PAD ids; a non-positive pad count adds nothing.
    ids = ids[:max_len]
    ids.extend([0] * (max_len - len(ids)))
    return ids

# 1.4 Convert All Data
print("Converting text to Sequence IDs...")
X_list = df['body_text'].apply(text_to_indices).tolist()
X_numpy = np.asarray(X_list)
# Token ids must be int64 (long): they are used as embedding lookup indices.
X_tensor = torch.tensor(X_numpy, dtype=torch.long).to(device)

# Labels: one 0/1 column per category (multi-label targets).
label_cols = [
    'politics', 'human_rights', 'quality_of_life', 'international',
    'social', 'environment', 'economics', 'culture', 'labor',
    'national_security', 'ict', 'education',
]
y_numpy = df[label_cols].to_numpy()
# Targets are float32 because BCEWithLogitsLoss expects floating-point labels.
y_tensor = torch.tensor(y_numpy, dtype=torch.float32).to(device)
num_classes = len(label_cols)

# Split Data: 80% train / 20% held out, with a fixed random_state for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X_tensor,
    y_tensor,
    test_size=0.2,
    random_state=42,
)
train_dataset = TensorDataset(X_train, y_train)
train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
)
print("Data Ready for CNN!")

In [None]:
# ==========================================
# 2. TextCNN Model Definition
# ==========================================
class TextCNN(nn.Module):
    """Convolutional text classifier on top of a pretrained embedding table.

    Token ids are embedded, scanned by one ``Conv1d`` per kernel width,
    max-pooled over the time axis, concatenated, passed through dropout and
    projected to ``num_classes`` raw logits (no sigmoid is applied here).

    Args:
        embedding_matrix: 2-D float tensor ``[vocab_size, embed_dim]`` used to
            initialise the embedding layer. The layer stays trainable
            (``freeze=False``) so the vectors can adapt to the task.
        num_classes: Number of output logits.
        filter_sizes: Iterable of kernel widths (tokens per n-gram pattern).
        num_filters: Number of output channels for each kernel width.
        padding_idx: Row of ``embedding_matrix`` that represents padding. Its
            vector receives no gradient, so it keeps its initial value during
            fine-tuning. Pass ``None`` to make every row trainable.
            Defaults to 0.
    """

    def __init__(self, embedding_matrix, num_classes, filter_sizes, num_filters,
                 padding_idx=0):
        super().__init__()

        # 1. Embedding layer initialised from the pretrained vectors.
        embed_dim = embedding_matrix.shape[1]
        self.embedding = nn.Embedding.from_pretrained(
            embedding_matrix, freeze=False, padding_idx=padding_idx
        )

        # 2. One convolution per kernel width; each detects n-gram patterns.
        self.convs = nn.ModuleList([
            nn.Conv1d(in_channels=embed_dim,
                      out_channels=num_filters,
                      kernel_size=fs)
            for fs in filter_sizes
        ])

        # 3. Dropout & linear output over the concatenated pooled features.
        self.dropout = nn.Dropout(0.5)
        self.fc = nn.Linear(len(filter_sizes) * num_filters, num_classes)

    def forward(self, x):
        """Compute class logits for a batch of token-id sequences.

        Args:
            x: Long tensor of token ids, shape ``[batch, seq_len]``.

        Returns:
            Float tensor of raw logits, shape ``[batch, num_classes]``.
        """
        x = self.embedding(x)
        # x shape: [batch, seq_len, embed_dim]

        x = x.permute(0, 2, 1)
        # x shape: [batch, embed_dim, seq_len] (Conv1d convolves over the last axis)

        # Conv1d raises when the sequence is shorter than its kernel, so
        # right-pad with zero vectors up to the widest kernel.
        widest_kernel = max(conv.kernel_size[0] for conv in self.convs)
        shortfall = widest_kernel - x.shape[2]
        if shortfall > 0:
            x = F.pad(x, (0, shortfall))

        # Convolution + ReLU for every kernel width.
        conved = [F.relu(conv(x)) for conv in self.convs]

        # Max pooling over time keeps the strongest response of each filter.
        pooled = [F.max_pool1d(fmap, fmap.shape[2]).squeeze(2) for fmap in conved]

        # Concatenate the features of all kernel widths.
        cat = torch.cat(pooled, dim=1)

        out = self.dropout(cat)
        return self.fc(out)

# Build the classifier from the pretrained embedding table, then move it to the active device.
model = TextCNN(
    embedding_matrix=embedding_tensor,
    num_classes=num_classes,
    filter_sizes=FILTER_SIZES,
    num_filters=NUM_FILTERS,
)
model = model.to(device)
print(model)

In [None]:
# ==========================================
# 3. Training
# ==========================================
# BCEWithLogitsLoss applies the sigmoid internally, so the model outputs raw logits.
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

# Validate every EVAL_EVERY epochs, and always after the final epoch so that a
# run whose length is not a multiple of EVAL_EVERY still gets a checkpoint.
EVAL_EVERY = 5

print(f"--- Training TextCNN ({NUM_EPOCHS} Epochs) ---")
best_f1 = 0.0
best_epoch = 0  # 0 means "no checkpoint has been validated yet"
best_model_wts = copy.deepcopy(model.state_dict())

for epoch in range(NUM_EPOCHS):
    model.train()
    total_loss = 0

    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # Validation
    # NOTE(review): X_test/y_test drive checkpoint selection here and are also
    # used for the final report, so that report is optimistic. Consider a
    # separate validation split.
    is_last_epoch = (epoch + 1) == NUM_EPOCHS
    if (epoch + 1) % EVAL_EVERY == 0 or is_last_epoch:
        model.eval()
        with torch.no_grad():
            test_outputs = model(X_test)
            probs = torch.sigmoid(test_outputs)
            predicted = (probs > THRESHOLD).float()
            current_f1 = f1_score(y_test.cpu().numpy(), predicted.cpu().numpy(), average='micro')

            # Always keep the first validated checkpoint (even when its F1 is 0),
            # otherwise the untrained initial weights would be restored below.
            if best_epoch == 0 or current_f1 > best_f1:
                best_f1 = current_f1
                best_epoch = epoch + 1
                best_model_wts = copy.deepcopy(model.state_dict())
                torch.save(best_model_wts, 'best_cnn_model.pth')

        print(f"Epoch [{epoch+1}/{NUM_EPOCHS}] Loss: {total_loss/len(train_loader):.4f} | F1: {current_f1*100:.2f}% (Best: {best_f1*100:.2f}%)")

print(f"Loading Best Model from Epoch {best_epoch}...")
model.load_state_dict(best_model_wts)

In [None]:
# ==========================================
# 4. Evaluation
# ==========================================
# Inference only: eval mode disables dropout and no_grad skips autograd bookkeeping.
model.eval()
with torch.no_grad():
    test_outputs = model(X_test)
    probs = test_outputs.sigmoid()
    predicted = probs.gt(THRESHOLD).float()

# scikit-learn works on host-side NumPy arrays.
y_true = y_test.cpu().numpy()
y_pred = predicted.cpu().numpy()

print("\n--- Classification Report (TextCNN) ---")
print(
    classification_report(
        y_true,
        y_pred,
        target_names=label_cols,
        zero_division=0,
    )
)

In [None]:
# ==========================================
# 5. Interactive Mode
# ==========================================
def predict_cnn(text, show_above=0.15):
    """Classify one piece of text and print the per-category probabilities.

    Categories whose probability exceeds ``THRESHOLD`` are printed as hits.
    Those below the threshold but above ``show_above`` are printed as
    near-misses. If no category is a hit, a "no category" line is printed.

    Args:
        text: Text to classify. Non-string input is coerced with ``str()`` for
            the printed snippet, matching what ``text_to_indices`` does.
        show_above: Lower bound for printing near-miss categories.
            Defaults to 0.15.

    Returns:
        None. Results are printed.
    """
    model.eval()
    # Encode the text as a fixed-length id sequence.
    indices = text_to_indices(text)
    tensor = torch.LongTensor([indices]).to(device)  # wrap in a list to add the batch axis

    with torch.no_grad():
        logits = model(tensor)
        # squeeze(0) removes only the batch axis, so a single-label model
        # still yields a 1-D array that can be indexed per category.
        probs = torch.sigmoid(logits).squeeze(0).cpu().numpy()

    print(f"\nSnippet: {str(text)[:50]}...")
    found = False
    for i, col in enumerate(label_cols):
        if probs[i] > THRESHOLD:
            print(f"[/] {col}: {probs[i]*100:.2f}% (YES)")
            found = True
        elif probs[i] > show_above:
            print(f"[ ] {col}: {probs[i]*100:.2f}%")
    if not found:
        print(">> No category detected.")

# Read-eval-print loop: classify each entered text until the user quits.
# NOTE(review): this blocks on stdin, so the cell never finishes under an
# unattended "Run All". Keep it as the last cell.
print("Type 'exit' to stop.")
while True:
    try:
        user_input = input("\nüìù Enter news (TextCNN): ").strip()
        if user_input.lower() in ['exit', 'quit', 'q']:
            break
        if not user_input:
            continue  # ignore blank lines
        predict_cnn(user_input)
    except (KeyboardInterrupt, EOFError):
        # Ctrl-C, or a closed/exhausted stdin, ends the session instead of
        # crashing the cell with a traceback.
        break