In [1]:
import pandas as pd
import numpy as np
from pythainlp.tokenize import word_tokenize
from pythainlp.corpus import thai_stopwords
from pythainlp import word_vector
import torch
from IPython.display import display

# Pick the compute device: CUDA when PyTorch can see a GPU, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"‚úÖ PyTorch is running on: {device}")

‚úÖ PyTorch is running on: cuda


In [2]:
# 1. Load Thai2Vec
# The embedding model is required by every later cell, so a loading failure is
# reported and then re-raised instead of leaving `wv` undefined.
print("‚è≥ Loading Thai2Vec model (thai2fit_wv)...")
try:
    wv_wrapper = word_vector.WordVector(model_name="thai2fit_wv")
    wv = wv_wrapper.get_model()
except Exception as e:
    print(f"‚ùå Error loading model: {e}")
    raise
else:
    print("‚úÖ Thai2Vec Loaded successfully.")

# 2. Load CSV
# NOTE(review): this is an absolute, machine-specific path; consider moving it
# to a configurable data directory so the notebook runs on other machines.
file_name = r"d:\year4\‡∏™‡∏´‡∏Å‡∏¥‡∏à\prachatai_test.csv"
try:
    df = pd.read_csv(file_name)
except FileNotFoundError:
    # Print the friendly message, then stop: without `df` nothing below can run.
    print(f"‚ùå ‡πÑ‡∏°‡πà‡∏û‡∏ö‡πÑ‡∏ü‡∏•‡πå‡∏ó‡∏µ‡πà: {file_name}")
    raise
else:
    print(f"‚úÖ CSV Loaded: {len(df)} records")

‚è≥ Loading Thai2Vec model (thai2fit_wv)...
‚úÖ Thai2Vec Loaded successfully.
‚úÖ CSV Loaded: 6789 records


In [3]:
# Stopwords setup: PyThaiNLP's built-in Thai stopword list, extended with
# extra tokens that should never reach the embedding lookup.
my_custom_stops = (
    {' ', '\n', '\t'}        # whitespace tokens
    | {'‚Äú', '‚Äù'}          # quotation-mark tokens (presumably curly quotes)
    | set('()[]-.,')         # brackets and basic punctuation
    | set('0123456789')      # single-digit tokens only; multi-digit numbers are not matched
)
stop_words = set(thai_stopwords()) | my_custom_stops

# --- Convert a text into a matrix of word vectors, and also return the tokens that were used ---
def get_word_vectors_and_tokens(text, model=None, stopwords=None, tokenizer=None):
    """Tokenize ``text`` and look up one word vector per kept token.

    A token is dropped when it is in the stopword collection, is blank after
    stripping whitespace, or the model's ``get_vector`` raises ``KeyError`` for
    it (out-of-vocabulary). Any other exception is propagated.

    Args:
        text: Input text; it is passed through ``str()`` before tokenizing.
        model: Object exposing ``get_vector(word)`` and ``vector_size``.
            Defaults to the notebook-level ``wv`` (presumably a gensim
            ``KeyedVectors`` -- confirm against the loading cell).
        stopwords: Collection of tokens to skip. Defaults to the
            notebook-level ``stop_words``.
        tokenizer: Callable taking a string and returning an iterable of
            tokens. Defaults to ``word_tokenize`` with ``engine='newmm'``.

    Returns:
        A ``(matrix, tokens)`` pair. ``matrix`` is ``np.array`` of the kept
        vectors (one row per token) and ``tokens`` is the list of kept tokens
        in the same order. When no token survives, ``matrix`` is an all-zero
        array of shape ``(0, model.vector_size)`` and ``tokens`` is ``[]``.
    """
    # Resolve defaults lazily so the notebook globals are only needed when the
    # caller does not supply explicit replacements.
    if model is None:
        model = wv
    if stopwords is None:
        stopwords = stop_words
    if tokenizer is None:
        tokens = word_tokenize(str(text), engine='newmm')
    else:
        tokens = tokenizer(str(text))

    vecs = []
    kept_tokens = []

    for word in tokens:
        if word in stopwords or word.strip() == '':
            continue
        try:
            # Try to fetch the vector for this token from the embedding model.
            vec = model.get_vector(word)
        except KeyError:
            # Token is not in the model's vocabulary: skip it.
            continue
        vecs.append(vec)
        kept_tokens.append(word)  # keep the token aligned with its vector row

    # Nothing could be converted: return an empty (0, vector_size) matrix and an empty list.
    if not vecs:
        return np.zeros((0, model.vector_size)), []

    # Return both the vector matrix and the matching list of tokens.
    return np.array(vecs), kept_tokens

print("‚úÖ Function ready.")

‚úÖ Function ready.


In [4]:
# Smoke test on the first 5 rows only.
df_small = df.iloc[:5].copy()

print("üîÑ Converting text to Word Matrices and Tokens...")

# Run the helper once per article; each element of `results` is the
# (matrix, tokens) pair it returns.
results = df_small['body_text'].apply(get_word_vectors_and_tokens)

# Unpack the pairs into two separate columns.
df_small['word_matrices'] = results.map(lambda pair: pair[0])  # word-vector matrix (numbers)
df_small['final_tokens'] = results.map(lambda pair: pair[1])   # kept tokens (text)

print("‚úÖ Processing Done.")
display(df_small.loc[:, ['title', 'final_tokens']].iloc[:3])  # show a sample of the result

üîÑ Converting text to Word Matrices and Tokens...
‚úÖ Processing Done.


Unnamed: 0,title,final_tokens
0,‡πÅ‡∏Æ‡∏Ñ‡πÄ‡∏Å‡∏≠‡∏£‡πå Anonymous ‡∏•‡∏±‡πà‡∏ô‡∏ó‡∏≥‡∏™‡∏á‡∏Ñ‡∏£‡∏≤‡∏°‡πÑ‡∏ã‡πÄ‡∏ö‡∏≠‡∏£‡πå‡∏Ñ‡∏£‡∏±‡πâ‡∏á‡πÉ‡∏´‡∏ç...,"[17, ‡∏û, ‡∏¢, 2558, ‡∏£‡∏≤‡∏¢‡∏á‡∏≤‡∏ô, ‡∏õ‡∏£‡∏∞‡∏Å‡∏≤‡∏®‡∏™‡∏á‡∏Ñ‡∏£‡∏≤‡∏°, ‡∏´‡∏±‡∏ß‡∏£‡∏∏‡∏ô‡πÅ..."
1,‡∏™‡∏ï‡∏π‡∏î‡∏¥‡πÇ‡∏≠‡∏à‡∏¥‡∏ö‡∏•‡∏¥‡∏ï‡πâ‡∏≤‡∏ô‡∏™‡∏á‡∏Ñ‡∏£‡∏≤‡∏° ‡∏ß‡∏¥‡∏à‡∏≤‡∏£‡∏ì‡πå‡∏Å‡∏≤‡∏£‡πÅ‡∏Å‡πâ‡∏£‡∏±‡∏ê‡∏ò‡∏£‡∏£‡∏°‡∏ô‡∏π‡∏ç...,"[‡∏™.‡∏™., ‡∏ç‡∏µ‡πà‡∏õ‡∏∏‡πà‡∏ô, ‡∏ï‡πâ‡∏≠‡∏á‡∏Å‡∏≤‡∏£, ‡πÅ‡∏Å‡πâ, ‡∏£‡∏±‡∏ê‡∏ò‡∏£‡∏£‡∏°‡∏ô‡∏π‡∏ç, ‡∏Å‡∏≠‡∏á‡∏Å..."
2,We need Safety Zone ‡∏™‡∏°‡∏≤‡∏Ñ‡∏°‡πÄ‡∏û‡∏∑‡πà‡∏≠‡∏™‡∏±‡∏ô‡∏ï‡∏¥‡∏†‡∏≤‡∏û‡∏ô‡∏≥‡πÄ‡∏î‡∏¥‡∏ô‡∏£‡∏ì...,"[‡∏™‡∏°‡∏≤‡∏Ñ‡∏°, ‡∏™‡∏±‡∏ô‡∏ï‡∏¥‡∏†‡∏≤‡∏û, ‡πÄ‡∏Ñ‡∏£‡∏∑‡∏≠‡∏Ç‡πà‡∏≤‡∏¢, ‡πÄ‡∏î‡πá‡∏Å, ‡πÄ‡∏¢‡∏≤‡∏ß‡∏ä‡∏ô, ‡∏õ‡∏£‡∏∞..."


In [5]:
# ===========================
# Choose which news article to inspect (edit the number here!)
idx = 0
# ===========================

# Only non-negative positions inside df_small are accepted; anything else goes
# to the "index out of range" message instead of raising or wrapping around.
if 0 <= idx < len(df_small):
    matrix_news = df_small['word_matrices'].iloc[idx]
    tokens_news = df_small['final_tokens'].iloc[idx]

    print(f"--- üì∞ ‡∏Ç‡πà‡∏≤‡∏ß‡∏ó‡∏µ‡πà {idx+1} ---")
    print(f"‡∏à‡∏≥‡∏ô‡∏ß‡∏ô‡∏Ñ‡∏≥‡∏ó‡∏µ‡πà‡πÄ‡∏´‡∏•‡∏∑‡∏≠‡∏´‡∏•‡∏±‡∏á‡∏ï‡∏±‡∏î Stopwords: {len(tokens_news)} ‡∏Ñ‡∏≥")
    print(f"‡∏Ç‡∏ô‡∏≤‡∏î‡∏Ç‡∏≠‡∏á Matrix (‡∏Ñ‡∏≥ x 300): {matrix_news.shape}")

    print("\n--- üìù ‡∏£‡∏≤‡∏¢‡∏ä‡∏∑‡πà‡∏≠‡∏Ñ‡∏≥ (Tokens) ---")
    print(tokens_news[:50])  # show the first 50 tokens

    print("\n--- üî¢ ‡∏ï‡∏±‡∏ß‡∏≠‡∏¢‡πà‡∏≤‡∏á Vector ---")
    if len(matrix_news) > 0:
        print(f"‡∏Ñ‡∏≥‡∏ß‡πà‡∏≤: '{tokens_news[0]}'")
        print(f"Vector (20 ‡∏ï‡∏±‡∏ß‡πÅ‡∏£‡∏Å): {matrix_news[0][:20]}")
        print("...")
    # The second sample is guarded separately: an article that keeps only one
    # token has no row 1, and indexing it would raise IndexError.
    if len(matrix_news) > 1:
        print(f"\n‡∏Ñ‡∏≥‡∏ß‡πà‡∏≤: '{tokens_news[1]}'")
        print(f"Vector (20 ‡∏ï‡∏±‡∏ß‡πÅ‡∏£‡∏Å): {matrix_news[1][:20]}")
else:
    print("‚ùå Index ‡πÄ‡∏Å‡∏¥‡∏ô‡∏à‡∏≥‡∏ô‡∏ß‡∏ô‡∏Ç‡πâ‡∏≠‡∏°‡∏π‡∏•‡∏ó‡∏µ‡πà‡∏°‡∏µ")

--- üì∞ ‡∏Ç‡πà‡∏≤‡∏ß‡∏ó‡∏µ‡πà 1 ---
‡∏à‡∏≥‡∏ô‡∏ß‡∏ô‡∏Ñ‡∏≥‡∏ó‡∏µ‡πà‡πÄ‡∏´‡∏•‡∏∑‡∏≠‡∏´‡∏•‡∏±‡∏á‡∏ï‡∏±‡∏î Stopwords: 59 ‡∏Ñ‡∏≥
‡∏Ç‡∏ô‡∏≤‡∏î‡∏Ç‡∏≠‡∏á Matrix (‡∏Ñ‡∏≥ x 300): (59, 300)

--- üìù ‡∏£‡∏≤‡∏¢‡∏ä‡∏∑‡πà‡∏≠‡∏Ñ‡∏≥ (Tokens) ---
['17', '‡∏û', '‡∏¢', '2558', '‡∏£‡∏≤‡∏¢‡∏á‡∏≤‡∏ô', '‡∏õ‡∏£‡∏∞‡∏Å‡∏≤‡∏®‡∏™‡∏á‡∏Ñ‡∏£‡∏≤‡∏°', '‡∏´‡∏±‡∏ß‡∏£‡∏∏‡∏ô‡πÅ‡∏£‡∏á', '‡∏≠‡∏≠‡∏Å‡∏°‡∏≤', '‡∏õ‡∏£‡∏∞‡∏Å‡∏≤‡∏®', '‡∏ú‡∏π‡πâ‡∏≠‡∏¢‡∏π‡πà‡πÄ‡∏ö‡∏∑‡πâ‡∏≠‡∏á‡∏´‡∏•‡∏±‡∏á', '‡πÇ‡∏à‡∏°‡∏ï‡∏µ', '‡∏Å‡∏£‡∏∏‡∏á', '‡∏õ‡∏≤‡∏£‡∏µ‡∏™', '‡∏Ñ‡∏∑‡∏ô', '‡∏®‡∏∏‡∏Å‡∏£‡πå', '‡∏ó‡∏µ‡πà‡∏ú‡πà‡∏≤‡∏ô‡∏°‡∏≤', '‡∏†‡∏≤‡∏û', '‡∏Ñ‡∏•‡∏¥‡∏õ', '‡πÇ‡∏Ü‡∏©‡∏Å', '‡∏™‡∏ß‡∏°‡∏´‡∏ô‡πâ‡∏≤‡∏Å‡∏≤‡∏Å', '‡∏™‡∏±‡∏ç‡∏•‡∏±‡∏Å‡∏©‡∏ì‡πå', '‡∏≠‡∏≠‡∏Å‡∏°‡∏≤', '‡∏≠‡πà‡∏≤‡∏ô', '‡πÅ‡∏ñ‡∏•‡∏á', '‡∏†‡∏≤‡∏©‡∏≤', '‡∏ù‡∏£‡∏±‡πà‡∏á‡πÄ‡∏®‡∏™', '‡πÉ‡∏à‡∏Ñ‡∏ß‡∏≤‡∏°', '‡πÇ‡∏à‡∏°‡∏ï‡∏µ', '‡∏Å‡∏£‡∏∏‡∏á', '‡∏õ‡∏≤‡∏£‡∏µ‡∏™', '‡∏ó‡∏±‡πà‡∏ß‡πÇ‡∏•‡∏Å', '‡∏ï‡∏≤‡∏°‡∏•‡πà‡∏≤', '‡πÄ‡∏´‡∏°‡∏∑‡∏≠‡∏ô', '‡∏ó‡∏≥', '‡∏ï‡∏≠‡∏ô‡∏ó‡∏µ‡πà', '‡πÇ‡∏à‡∏°‡∏ï‡∏µ', '‡∏™‡∏≥‡∏ô‡∏±‡∏Å‡∏û‡∏¥‡∏°‡∏û‡πå',