# In [None]:
### **0. Install Required Libraries**
# !pip install numpy==1.23.5
# !pip install --upgrade gensim
# !pip install --upgrade pythainlp
# !pip install emoji
# !pip install fasttext-wheel  # สำหรับ fastText embeddings

### **1. Import Libraries**
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import time
import re
import string
import emoji
import warnings
warnings.filterwarnings('ignore')

# Thai NLP - เครื่องมือประมวลผลภาษาไทย
from pythainlp.tokenize import word_tokenize
from pythainlp.corpus import thai_negations
from pythainlp.word_vector import WordVector

# Machine Learning - เครื่องมือแมชชีนเลิร์นนิง
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from scipy.sparse import hstack, csr_matrix
from collections import Counter

# FastText - เครื่องมือสำหรับ word embedding
import fasttext
import fasttext.util

# NLTK สำหรับภาษาอังกฤษ
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize as nltk_tokenize
from nltk.stem import WordNetLemmatizer

# ดาวน์โหลด NLTK data (รันครั้งแรกเท่านั้น)
# Probe for each NLTK resource the English pipeline needs; the first missing
# one raises LookupError, which triggers a download of all three packages.
try:
    for _nltk_resource in ('tokenizers/punkt', 'corpora/stopwords', 'corpora/wordnet'):
        nltk.data.find(_nltk_resource)
except LookupError:
    for _nltk_package in ('punkt', 'stopwords', 'wordnet'):
        nltk.download(_nltk_package)

## **2. Load and Explore Data**
# กำหนดการแสดงผลภาษาไทย
import matplotlib

# One fixed font family for every figure drawn later in the notebook.
matplotlib.rcParams['font.family'] = 'DejaVu Sans'

# --- Thai dataset (dormitory reviews) --------------------------------------
df_thai = pd.read_csv("Data/dorm_reviews.csv")
# Identifier columns are removed only when 'dormitory_id' is present;
# 'user_id' is removed along with it if the file has one.
if 'dormitory_id' in df_thai.columns:
    df_thai = df_thai.drop(
        columns=[col for col in ('dormitory_id', 'user_id') if col in df_thai.columns]
    )
# Accept either 'Review'/'Rating' or 'text'/'rating' headers, then keep only
# the two columns the rest of the pipeline reads.
df_thai = (
    df_thai
    .rename(columns={'Review': 'text', 'Rating': 'rating'})
    [['text', 'rating']]
    .copy()
)
df_thai['language'] = 'thai'

# --- English dataset (hotel reviews) ---------------------------------------
df_english = pd.read_csv("Data/tripadvisor_hotel_reviews.csv")
if not {'Review', 'Rating'}.issubset(df_english.columns):
    # Headers differ from the expected ones: fall back to the first column
    # whose lower-cased name looks like a text / rating column.
    possible_text = [c for c in df_english.columns
                     if c.lower() in ['review', 'text', 'comment', 'content']]
    possible_rating = [c for c in df_english.columns
                       if c.lower() in ['rating', 'score', 'stars']]
    df_english = df_english[[possible_text[0], possible_rating[0]]].copy()
else:
    df_english = df_english[['Review', 'Rating']].copy()
df_english.columns = ['text', 'rating']
df_english['language'] = 'english'

# Stack the Thai and English frames into the single frame used from here on.
df = pd.concat([df_thai, df_english], ignore_index=True)

# Normalise ratings to integers in 1..5.
# errors='coerce' turns unparseable ratings into NaN, and casting NaN to int
# raises, so rows without a usable rating are dropped before the cast.
df['rating'] = pd.to_numeric(df['rating'], errors='coerce')
df = df.dropna(subset=['rating']).reset_index(drop=True)
df['rating'] = df['rating'].clip(1, 5).astype(int)

### **3. โหลด Word Embedding Models**
# โหลด Thai2Vec model สำหรับภาษาไทย
print("‡∏Å‡∏≥‡∏•‡∏±‡∏á‡πÇ‡∏´‡∏•‡∏î Thai2Vec model...")
thai2fit_model = WordVector(model_name="thai2fit_wv").get_model()

# Download (only if missing) and load the English fastText model.
# if_exists='ignore' makes download_model return the existing file name
# without re-downloading when cc.en.300.bin is already in the working
# directory; otherwise the (multi-gigabyte) model is fetched first, instead of
# load_model failing on a missing file.
print("‡∏Å‡∏≥‡∏•‡∏±‡∏á‡πÇ‡∏´‡∏•‡∏î fastText model...")
fasttext_model_path = fasttext.util.download_model('en', if_exists='ignore')
fasttext_model = fasttext.load_model(fasttext_model_path)
print("‡πÇ‡∏´‡∏•‡∏î FastText model ‡∏™‡∏≥‡πÄ‡∏£‡πá‡∏à")

def enhanced_sentence_vectorizer(text, dim=300):
    """Embed a sentence as a position-weighted average of its token vectors.

    Text containing at least one character with code point 3584-3711 (the
    Thai block) is tokenised with PyThaiNLP's ``newmm`` engine and looked up
    in ``thai2fit_model``; any other text is lower-cased, tokenised with NLTK
    and looked up in ``fasttext_model``.  Each token vector is multiplied by a
    weight that grows linearly with token position from 1.0 towards 1.5, the
    weighted vectors are summed, and the sum is divided by the number of
    tokens that produced a vector (not by the sum of the weights).

    Args:
        text: Sentence to embed; a non-string value is treated as "".
        dim: Length of the accumulator and of the returned vector.  It is
            expected to equal the dimensionality of both embedding models.

    Returns:
        A ``float32`` array of length ``dim``; all zeros when no token
        yields a vector.
    """
    sentence = text if isinstance(text, str) else ""
    is_thai = any(3584 <= ord(ch) <= 3711 for ch in sentence)
    if is_thai:
        tokens = word_tokenize(sentence, engine="newmm")
    else:
        tokens = nltk_tokenize(sentence.lower())

    denominator = max(len(tokens), 1)
    accumulator = np.zeros(dim, dtype=np.float32)
    embedded = 0

    for position, token in enumerate(tokens):
        if not token.strip():
            continue  # whitespace-only tokens carry no meaning
        weight = 1.0 + (position / denominator) * 0.5

        if is_thai:
            if token in thai2fit_model:
                token_vector = thai2fit_model[token] * weight
            else:
                # Out-of-vocabulary word: average the vectors of whichever of
                # its characters the model knows, if any.
                known_chars = [ch for ch in token if ch in thai2fit_model]
                if not known_chars:
                    continue
                char_sum = np.zeros(dim, dtype=np.float32)
                for ch in known_chars:
                    char_sum += thai2fit_model[ch]
                token_vector = (char_sum / len(known_chars)) * weight
        else:
            try:
                token_vector = fasttext_model.get_word_vector(token) * weight
            except Exception:
                continue  # best-effort: skip tokens fastText cannot embed

        accumulator += token_vector
        embedded += 1

    return accumulator / max(embedded, 1)

### **4. ฟังก์ชันทำความสะอาดข้อความและการสกัดคุณลักษณะ**
# Hand-picked words that must survive stop-word removal even when PyThaiNLP
# lists them as stop words.
_THAI_IMPORTANT_WORDS = frozenset([
    "‡πÑ‡∏°‡πà","‡πÑ‡∏°‡πà‡∏°‡∏µ","‡πÑ‡∏°‡πà‡πÑ‡∏î‡πâ","‡πÑ‡∏°‡πà‡∏Ñ‡πà‡∏≠‡∏¢","‡πÑ‡∏°‡πà‡πÄ‡∏Ñ‡∏¢","‡∏´‡πâ‡∏≤‡∏°","‡∏¢‡∏±‡∏á‡πÑ‡∏°‡πà","‡πÑ‡∏°‡πà‡∏¢‡∏≠‡∏°",
    "‡∏î‡∏µ","‡∏î‡∏µ‡∏°‡∏≤‡∏Å","‡∏™‡∏∞‡∏≠‡∏≤‡∏î","‡πÄ‡∏¢‡πá‡∏ô","‡∏Å‡∏ß‡πâ‡∏≤‡∏á","‡πÉ‡∏´‡∏°‡πà","‡∏™‡∏ß‡∏¢","‡∏ô‡πà‡∏≤‡∏≠‡∏¢‡∏π‡πà","‡∏™‡∏ö‡∏≤‡∏¢",
    "‡∏ä‡∏≠‡∏ö","‡∏õ‡∏£‡∏∞‡∏ó‡∏±‡∏ö‡πÉ‡∏à","‡∏™‡∏∞‡∏î‡∏ß‡∏Å","‡∏õ‡∏•‡∏≠‡∏î‡∏†‡∏±‡∏¢","‡∏Ñ‡∏∏‡πâ‡∏°","‡∏Ñ‡∏∏‡πâ‡∏°‡∏Ñ‡πà‡∏≤","‡πÄ‡∏á‡∏µ‡∏¢‡∏ö","‡∏Ñ‡∏£‡∏ö",
    "‡∏û‡∏≠‡πÉ‡∏à","‡πÄ‡∏£‡πá‡∏ß","‡πÇ‡∏≠‡πÄ‡∏Ñ","‡πÇ‡∏≠‡πÄ‡∏Ñ‡πÄ‡∏•‡∏¢","‡πÄ‡∏¢‡∏µ‡πà‡∏¢‡∏°","‡∏ñ‡∏π‡∏Å‡πÉ‡∏à","‡∏ó‡∏≥‡πÄ‡∏•‡∏î‡∏µ","‡πÉ‡∏Å‡∏•‡πâ","‡∏Ñ‡∏£‡∏ö‡∏Ñ‡∏£‡∏±‡∏ô",
    "‡πÅ‡∏¢‡πà","‡πÑ‡∏°‡πà‡∏î‡∏µ","‡πÄ‡∏´‡∏°‡πá‡∏ô","‡∏£‡πâ‡∏≠‡∏ô","‡πÅ‡∏Ñ‡∏ö","‡πÄ‡∏Å‡πà‡∏≤","‡∏™‡∏Å‡∏õ‡∏£‡∏Å","‡∏û‡∏±‡∏á","‡πÄ‡∏™‡∏µ‡∏¢‡∏á‡∏î‡∏±‡∏á",
    "‡πÅ‡∏û‡∏á","‡πÑ‡∏°‡πà‡∏ä‡∏≠‡∏ö","‡πÅ‡∏≠‡∏≠‡∏±‡∏î","‡∏£‡∏Å","‡∏≠‡∏±‡∏ô‡∏ï‡∏£‡∏≤‡∏¢","‡∏ä‡πâ‡∏≤","‡∏ú‡∏¥‡∏î‡∏´‡∏ß‡∏±‡∏á","‡∏´‡πà‡∏ß‡∏¢","‡πÄ‡∏ü‡∏•",
    "‡∏Å‡∏≤‡∏Å","‡πÑ‡∏°‡πà‡∏Ñ‡∏∏‡πâ‡∏°","‡πÑ‡∏Å‡∏•","‡∏£‡∏±‡πà‡∏ß","‡∏ó‡∏£‡∏∏‡∏î","‡∏ó‡∏£‡∏∏‡∏î‡πÇ‡∏ó‡∏£‡∏°","‡πÄ‡∏™‡∏∑‡πà‡∏≠‡∏°",
    "‡∏°‡∏≤‡∏Å","‡∏™‡∏∏‡∏î‡πÜ","‡πÄ‡∏¢‡∏≠‡∏∞","‡∏ô‡πâ‡∏≠‡∏¢","‡∏ó‡∏µ‡πà‡∏™‡∏∏‡∏î","‡∏´‡∏•‡∏≤‡∏¢","‡∏ó‡∏∏‡∏Å","‡πÄ‡∏Å‡∏¥‡∏ô","‡∏à‡∏±‡∏î","‡πÇ‡∏Ñ‡∏ï‡∏£",
    "‡∏°‡∏≤‡∏Å‡πÜ","‡∏™‡∏∏‡∏î‡∏¢‡∏≠‡∏î","‡∏ò‡∏£‡∏£‡∏°‡∏î‡∏≤","‡∏û‡∏≠‡πÉ‡∏ä‡πâ","‡∏ö‡πà‡∏≠‡∏¢","‡∏ï‡∏•‡∏≠‡∏î","‡πÄ‡∏ß‡∏≠‡∏£‡πå",
    "‡πÅ‡∏≠‡∏£‡πå","‡∏ô‡πâ‡∏≥","‡πÑ‡∏ü","‡∏´‡πâ‡∏≠‡∏á‡∏ô‡πâ‡∏≥","‡πÄ‡∏ï‡∏µ‡∏¢‡∏á","‡∏ù‡∏±‡∏Å‡∏ö‡∏±‡∏ß","‡πÄ‡∏ô‡πá‡∏ï","‡πÑ‡∏ß‡πÑ‡∏ü","‡πÑ‡∏ü‡∏ü‡πâ‡∏≤",
    "‡∏õ‡∏£‡∏∞‡∏õ‡∏≤","‡πÄ‡∏ü‡∏≠‡∏£‡πå","‡∏•‡∏¥‡∏ü‡∏ï‡πå","‡∏ó‡∏µ‡πà‡∏à‡∏≠‡∏î","‡∏à‡∏≠‡∏î‡∏£‡∏ñ","‡∏ã‡∏±‡∏Å‡∏ú‡πâ‡∏≤","‡∏ï‡∏π‡πâ‡πÄ‡∏¢‡πá‡∏ô","‡∏ó‡∏µ‡∏ß‡∏µ",
    "‡∏à‡∏≤‡∏ô","‡πÑ‡∏°‡πÇ‡∏Ñ‡∏£‡πÄ‡∏ß‡∏ü","‡πÄ‡∏ï‡∏≤","‡∏ô‡πâ‡∏≥‡∏≠‡∏∏‡πà‡∏ô","‡∏ú‡πâ‡∏≤‡∏õ‡∏π","‡πÇ‡∏ï‡πä‡∏∞","‡πÄ‡∏Å‡πâ‡∏≤‡∏≠‡∏µ‡πâ","‡∏ï‡∏π‡πâ","‡∏ä‡∏±‡πâ‡∏ô‡∏ß‡∏≤‡∏á",
    "‡∏õ‡∏•‡∏±‡πä‡∏Å","‡∏™‡∏±‡∏ç‡∏ç‡∏≤‡∏ì",
    "‡πÄ‡∏™‡∏µ‡∏¢‡∏á","‡∏°‡∏î","‡πÅ‡∏°‡∏•‡∏á","‡πÅ‡∏°‡∏•‡∏á‡∏™‡∏≤‡∏ö","‡∏´‡∏ô‡∏π","‡∏¢‡∏∏‡∏á","‡∏ù‡∏∏‡πà‡∏ô","‡∏Å‡∏•‡∏¥‡πà‡∏ô","‡πÄ‡∏û‡∏∑‡πà‡∏≠‡∏ô‡∏ö‡πâ‡∏≤‡∏ô",
    "‡∏Ç‡πâ‡∏≤‡∏á‡∏´‡πâ‡∏≠‡∏á","‡∏Ç‡πâ‡∏≤‡∏á‡∏ô‡∏≠‡∏Å","‡∏ñ‡∏ô‡∏ô","‡∏ó‡∏≤‡∏á‡πÄ‡∏î‡∏¥‡∏ô","‡∏•‡∏≤‡∏ô‡∏à‡∏≠‡∏î","‡∏ä‡∏±‡πâ‡∏ô‡∏ö‡∏ô","‡∏ö‡∏±‡∏ô‡πÑ‡∏î","‡∏Å‡∏≥‡πÅ‡∏û‡∏á",
    "‡∏î‡∏π‡πÅ‡∏•","‡∏ö‡∏£‡∏¥‡∏Å‡∏≤‡∏£","‡∏ã‡πà‡∏≠‡∏°","‡πÅ‡∏Å‡πâ‡πÑ‡∏Ç","‡∏à‡∏±‡∏î‡∏Å‡∏≤‡∏£","‡∏û‡∏ô‡∏±‡∏Å‡∏á‡∏≤‡∏ô","‡πÅ‡∏°‡πà‡∏ö‡πâ‡∏≤‡∏ô","‡∏£‡∏õ‡∏†",
    "‡πÄ‡∏à‡πâ‡∏≤‡∏Ç‡∏≠‡∏á","‡∏ô‡∏¥‡∏ï‡∏¥","‡∏Å‡∏é","‡∏£‡∏∞‡πÄ‡∏ö‡∏µ‡∏¢‡∏ö","‡∏Ñ‡πà‡∏≤‡πÄ‡∏ä‡πà‡∏≤","‡∏Ñ‡πà‡∏≤‡πÑ‡∏ü","‡∏Ñ‡πà‡∏≤‡∏ô‡πâ‡∏≥","‡∏Ñ‡πà‡∏≤‡∏™‡πà‡∏ß‡∏ô‡∏Å‡∏•‡∏≤‡∏á",
    "‡∏°‡∏±‡∏î‡∏à‡∏≥","‡∏õ‡∏£‡∏∞‡∏Å‡∏±‡∏ô","‡∏™‡∏±‡∏ç‡∏ç‡∏≤","‡∏ù‡∏≤‡∏Å‡∏Ç‡∏≠‡∏á","‡∏£‡∏±‡∏ö‡∏û‡∏±‡∏™‡∏î‡∏∏","‡∏Ñ‡∏µ‡∏¢‡πå‡∏Å‡∏≤‡∏£‡πå‡∏î","‡∏•‡πá‡∏≠‡∏Ñ","‡∏£‡∏≠‡∏ô‡∏≤‡∏ô",
    "‡πÑ‡∏°‡πà‡∏°‡∏≤‡∏î‡∏π","‡πÑ‡∏°‡πà‡∏ã‡πà‡∏≠‡∏°",
    "‡πÅ‡∏ï‡πà","‡πÅ‡∏ï‡πà‡∏ß‡πà‡∏≤","‡∏ñ‡∏∂‡∏á‡πÅ‡∏°‡πâ","‡∏≠‡∏¢‡πà‡∏≤‡∏á‡πÑ‡∏£‡∏Å‡πá‡∏ï‡∏≤‡∏°","‡πÄ‡∏û‡∏£‡∏≤‡∏∞","‡πÄ‡∏û‡∏£‡∏≤‡∏∞‡∏ß‡πà‡∏≤","‡πÄ‡∏ô‡∏∑‡πà‡∏≠‡∏á‡∏à‡∏≤‡∏Å",
    "‡∏Ñ‡∏∑‡∏≠","‡∏Å‡πá‡∏Ñ‡∏∑‡∏≠","‡∏™‡πà‡∏ß‡∏ô","‡∏ô‡∏≠‡∏Å‡∏à‡∏≤‡∏Å‡∏ô‡∏µ‡πâ","‡∏ó‡∏µ‡πà‡∏à‡∏£‡∏¥‡∏á","‡∏à‡∏£‡∏¥‡∏á‡πÜ","‡∏Å‡πá","‡πÅ‡∏°‡πâ","‡∏ó‡∏µ‡πà",
    "‡∏ï‡∏≠‡∏ô‡πÅ‡∏£‡∏Å","‡∏û‡∏≠‡∏î‡∏µ","‡πÅ‡∏•‡πâ‡∏ß‡∏Å‡πá"
])

# Filled on first use by _thai_custom_stop_words().
_THAI_CUSTOM_STOP_WORDS = None


def _thai_custom_stop_words():
    """Return PyThaiNLP's stop words minus the protected words, built once."""
    global _THAI_CUSTOM_STOP_WORDS
    if _THAI_CUSTOM_STOP_WORDS is None:
        from pythainlp.corpus import thai_stopwords
        _THAI_CUSTOM_STOP_WORDS = frozenset(thai_stopwords()) - _THAI_IMPORTANT_WORDS
    return _THAI_CUSTOM_STOP_WORDS


def clean_text_thai(text):
    """Normalise a Thai review and strip unimportant stop words.

    Steps: remove emoji, replace every ASCII punctuation mark except ``!``,
    ``?`` and ``.`` with a space, collapse runs of three or more identical
    characters to two, squeeze whitespace, lower-case, tokenise with the
    ``newmm`` engine and drop stop words that are not in the protected list.

    The stop-word set is built once and reused; previously it was rebuilt as
    a list on every call and every token was tested against that list.

    Args:
        text: Raw review text.

    Returns:
        The remaining tokens joined by single spaces, or "" when ``text`` is
        not a string.
    """
    if not isinstance(text, str):
        return ""
    stop_words = _thai_custom_stop_words()
    punct = string.punctuation.replace('!', '').replace('?', '').replace('.', '')
    text = emoji.replace_emoji(text, replace="")
    text = ''.join(ch if ch not in punct else ' ' for ch in text)
    # Collapse elongated spellings: 3+ repeats of one character become 2.
    text = re.sub(r'([‡∏Å-‡πôa-zA-Z])\1{2,}', r'\1\1', text)
    text = re.sub(r'\s+', " ", text).strip().lower()
    words = word_tokenize(text, engine='newmm')
    return ' '.join(w for w in words if w not in stop_words)

# Words kept even though NLTK may list them as stop words: negations,
# sentiment-bearing adjectives/verbs and intensifiers.
_ENGLISH_IMPORTANT_WORDS = frozenset({
    'not', 'no', 'never', 'nothing', 'nowhere', 'neither', 'nobody', 'none',
    'good', 'great', 'excellent', 'amazing', 'wonderful', 'perfect', 'love',
    'best', 'nice', 'clean', 'comfortable', 'convenient', 'recommend',
    'bad', 'terrible', 'awful', 'horrible', 'worst', 'hate', 'dirty',
    'uncomfortable', 'expensive', 'cheap', 'noisy', 'small',
    'very', 'really', 'extremely', 'quite', 'pretty', 'too', 'so', 'absolutely',
})

# Filled on first use by _english_cleaning_resources().
_ENGLISH_CLEANING_RESOURCES = None


def _english_cleaning_resources():
    """Return ``(stop_word_set, lemmatizer)``, created once and then reused."""
    global _ENGLISH_CLEANING_RESOURCES
    if _ENGLISH_CLEANING_RESOURCES is None:
        stop_words = frozenset(stopwords.words('english')) - _ENGLISH_IMPORTANT_WORDS
        _ENGLISH_CLEANING_RESOURCES = (stop_words, WordNetLemmatizer())
    return _ENGLISH_CLEANING_RESOURCES


def clean_text_english(text):
    """Normalise an English review, drop stop words and lemmatise.

    Steps: remove emoji, replace every ASCII punctuation mark except ``!``,
    ``?`` and ``.`` with a space, collapse runs of three or more identical
    letters to two, squeeze whitespace, lower-case, tokenise with NLTK, then
    keep only purely alphabetic tokens that are not unprotected stop words
    and lemmatise them.

    The stop-word set and the lemmatizer are created once and reused;
    previously both were re-created on every call.

    Args:
        text: Raw review text.

    Returns:
        The lemmatised tokens joined by single spaces, or "" when ``text`` is
        not a string.
    """
    if not isinstance(text, str):
        return ""
    stop_words, lemmatizer = _english_cleaning_resources()
    punct = string.punctuation.replace('!', '').replace('?', '').replace('.', '')
    text = emoji.replace_emoji(text, replace="")
    text = ''.join(ch if ch not in punct else ' ' for ch in text)
    # Collapse elongated spellings: 3+ repeats of one letter become 2.
    text = re.sub(r'([a-zA-Z])\1{2,}', r'\1\1', text)
    text = re.sub(r'\s+', " ", text).strip().lower()
    words = nltk_tokenize(text)
    return ' '.join(
        lemmatizer.lemmatize(w) for w in words
        if w not in stop_words and w.isalpha()
    )

def clean_text(text):
    """Clean a review with the Thai or the English cleaner.

    The Thai cleaner is used as soon as the string form of ``text`` contains
    a character with code point 3584-3711; otherwise the English cleaner is
    used.  ``text`` itself (not its string form) is handed to the cleaner.
    """
    for ch in str(text):
        if 3584 <= ord(ch) <= 3711:
            return clean_text_thai(text)
    return clean_text_english(text)

def extract_features(text):
    """Compute hand-crafted surface statistics for one review.

    Args:
        text: Review text.  A non-string value is treated as "" so the
            function no longer fails on ``text.lower()`` / ``text.count()``
            for such input, matching how the cleaning and embedding
            functions handle it.

    Returns:
        dict with the keys ``exclamation_count``, ``question_count``,
        ``sentence_count`` (number of '.' plus one), ``word_count``,
        ``avg_word_length``, ``text_length``, ``repeated_words_ratio``
        (distinct tokens occurring more than once / token count),
        ``negation_count``, ``punctuation_ratio``, ``words_per_sentence`` and
        ``english_ratio`` (share of ASCII alphabetic tokens).
    """
    if not isinstance(text, str):
        text = ""
    is_thai = any(3584 <= ord(c) <= 3711 for c in text)
    words = word_tokenize(text, engine='newmm') if is_thai else nltk_tokenize(text.lower())
    word_count = len(words)
    # max(..., 1) below guards every ratio against division by zero.
    features = {
        'exclamation_count': text.count('!'),
        'question_count': text.count('?'),
        'sentence_count': text.count('.') + 1,
        'word_count': word_count,
        'avg_word_length': sum(len(w) for w in words) / max(word_count, 1),
        'text_length': len(text)
    }
    word_counts = Counter(words)
    repeated_words = sum(1 for cnt in word_counts.values() if cnt > 1)
    features['repeated_words_ratio'] = repeated_words / max(word_count, 1)
    if is_thai:
        negation_words = set(thai_negations())
    else:
        negation_words = {'not', 'no', 'never', 'nothing', 'nobody', 'none', 'neither', 'nowhere'}
    features['negation_count'] = sum(1 for w in words if w in negation_words)
    features['punctuation_ratio'] = sum(1 for c in text if c in string.punctuation) / max(len(text), 1)
    features['words_per_sentence'] = word_count / max(features['sentence_count'], 1)
    english_words = sum(1 for w in words if w.isascii() and w.isalpha())
    features['english_ratio'] = english_words / max(word_count, 1)
    return features

### **5. ทำความสะอาดและเตรียมข้อมูล**
print("‡∏Å‡∏≥‡∏•‡∏±‡∏á‡∏ó‡∏≥‡∏Ñ‡∏ß‡∏≤‡∏°‡∏™‡∏∞‡∏≠‡∏≤‡∏î‡∏Ç‡πâ‡∏≠‡∏°‡∏π‡∏•...")
# Every review is cast to str first, then cleaned element by element.
_raw_reviews = df['text'].astype(str)
df['cleaned_review'] = _raw_reviews.map(clean_text)

def get_word_count(text):
    """Return the number of tokens in ``text``.

    Text whose string form contains a character with code point 3584-3711 is
    tokenised with PyThaiNLP's ``newmm`` engine; anything else goes through
    the NLTK tokenizer unchanged (no lower-casing).
    """
    has_thai = any(3584 <= ord(ch) <= 3711 for ch in str(text))
    if has_thai:
        tokens = word_tokenize(text, engine='newmm')
    else:
        tokens = nltk_tokenize(text)
    return len(tokens)

# Keep reviews with more than three tokens after cleaning, then drop reviews
# whose cleaned text is an exact duplicate.
df = df[df['cleaned_review'].apply(get_word_count) > 3]
df = df.drop_duplicates(subset=['cleaned_review'])

print("‡∏Å‡∏≥‡∏•‡∏±‡∏á‡∏™‡∏Å‡∏±‡∏î‡∏Ñ‡∏∏‡∏ì‡∏•‡∏±‡∏Å‡∏©‡∏ì‡∏∞‡∏à‡∏≤‡∏Å‡∏Ç‡πâ‡∏≠‡∏Ñ‡∏ß‡∏≤‡∏°...")
feature_columns = ['cleaned_review']
feature_names = ['exclamation_count','question_count','sentence_count','word_count',
                 'avg_word_length','repeated_words_ratio','negation_count',
                 'punctuation_ratio','text_length','words_per_sentence','english_ratio']

# Run extract_features once per review and fan the resulting dicts out into
# columns.  Calling it once per feature name would tokenise every review
# len(feature_names) times for the same values.
_feature_dicts = df['cleaned_review'].apply(extract_features)
for f in feature_names:
    df[f] = _feature_dicts.apply(lambda feats, key=f: feats[key])

feature_columns.extend(feature_names)

print(f"‡∏à‡∏≥‡∏ô‡∏ß‡∏ô‡∏Ç‡πâ‡∏≠‡∏°‡∏π‡∏•‡∏´‡∏•‡∏±‡∏á‡∏ó‡∏≥‡∏Ñ‡∏ß‡∏≤‡∏°‡∏™‡∏∞‡∏≠‡∏≤‡∏î: {len(df)}")
print("‡∏Å‡∏≤‡∏£‡∏Å‡∏£‡∏∞‡∏à‡∏≤‡∏¢‡∏Ç‡∏≠‡∏á‡∏Ñ‡∏∞‡πÅ‡∏ô‡∏ô‡∏´‡∏•‡∏±‡∏á‡∏ó‡∏≥‡∏Ñ‡∏ß‡∏≤‡∏°‡∏™‡∏∞‡∏≠‡∏≤‡∏î:")
print(df['rating'].value_counts().sort_index())
print("\n‡∏Å‡∏≤‡∏£‡∏Å‡∏£‡∏∞‡∏à‡∏≤‡∏¢‡∏ï‡∏≤‡∏°‡∏†‡∏≤‡∏©‡∏≤‡∏´‡∏•‡∏±‡∏á‡∏ó‡∏≥‡∏Ñ‡∏ß‡∏≤‡∏°‡∏™‡∏∞‡∏≠‡∏≤‡∏î:")
print(df['language'].value_counts())

### **6. แบ่งข้อมูล**
# Stratify on the rating only when more than one rating value is present.
_stratify_on = df['rating'] if df['rating'].nunique() > 1 else None

# 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    df[feature_columns],
    df['rating'],
    test_size=0.2,
    random_state=42,
    stratify=_stratify_on,
)
print(f"‡∏à‡∏≥‡∏ô‡∏ß‡∏ô‡∏Ç‡πâ‡∏≠‡∏°‡∏π‡∏•‡∏ù‡∏∂‡∏Å‡∏ù‡∏ô: {X_train.shape[0]}")
print(f"‡∏à‡∏≥‡∏ô‡∏ß‡∏ô‡∏Ç‡πâ‡∏≠‡∏°‡∏π‡∏•‡∏ó‡∏î‡∏™‡∏≠‡∏ö: {X_test.shape[0]}")

### **7. สร้าง Feature Vectors**
def custom_tokenizer_mixed(text):
    """Tokenise ``text`` for the TF-IDF / count vectorisers.

    Text whose string form contains a character with code point 3584-3711 is
    split with PyThaiNLP's ``newmm`` engine; anything else is lower-cased and
    split with the NLTK tokenizer.
    """
    contains_thai = any(3584 <= ord(ch) <= 3711 for ch in str(text))
    if contains_thai:
        return word_tokenize(text, engine='newmm')
    return nltk_tokenize(text.lower())

# --- Sentence embeddings: one row per review -------------------------------
print("‡∏Å‡∏≥‡∏•‡∏±‡∏á‡∏™‡∏£‡πâ‡∏≤‡∏á word embeddings...")
X_train_vectors = np.array(
    list(map(enhanced_sentence_vectorizer, X_train['cleaned_review'])),
    dtype=np.float32,
)
X_test_vectors = np.array(
    list(map(enhanced_sentence_vectorizer, X_test['cleaned_review'])),
    dtype=np.float32,
)

# --- Bag-of-n-grams vectorisers --------------------------------------------
print("‡∏Å‡∏≥‡∏•‡∏±‡∏á‡∏™‡∏£‡πâ‡∏≤‡∏á TF-IDF features...")
tfidf_vectorizer = TfidfVectorizer(
    tokenizer=custom_tokenizer_mixed,
    ngram_range=(1, 3),
    max_features=8000,
    min_df=2,
    max_df=0.85,
    sublinear_tf=True,
    use_idf=True,
    smooth_idf=True,
)

print("‡∏Å‡∏≥‡∏•‡∏±‡∏á‡∏™‡∏£‡πâ‡∏≤‡∏á Count vectors...")
count_vectorizer = CountVectorizer(
    tokenizer=custom_tokenizer_mixed,
    ngram_range=(1, 2),
    max_features=2000,
    min_df=3,
    max_df=0.85,
)

# Vocabularies are learned from the training reviews only; the test reviews
# are transformed with the fitted vectorisers.
_train_reviews = X_train['cleaned_review']
_test_reviews = X_test['cleaned_review']
X_train_tfidf = tfidf_vectorizer.fit_transform(_train_reviews)
X_test_tfidf = tfidf_vectorizer.transform(_test_reviews)
X_train_count = count_vectorizer.fit_transform(_train_reviews)
X_test_count = count_vectorizer.transform(_test_reviews)

# --- Scale the embeddings (variance only, no centring) ---------------------
print("‡∏Å‡∏≥‡∏•‡∏±‡∏á scale vectors (embeddings)...")
scaler = StandardScaler(with_mean=False)
X_train_vectors_sparse = csr_matrix(X_train_vectors)
X_test_vectors_sparse = csr_matrix(X_test_vectors)
X_train_vectors_scaled = scaler.fit_transform(X_train_vectors_sparse)
X_test_vectors_scaled = scaler.transform(X_test_vectors_sparse)

# --- Standardise the hand-crafted numeric features -------------------------
numerical_features = list(filter(lambda col: col != 'cleaned_review', feature_columns))
X_train_additional = X_train[numerical_features].to_numpy()
X_test_additional = X_test[numerical_features].to_numpy()

features_scaler = StandardScaler()
X_train_additional_scaled = features_scaler.fit_transform(X_train_additional)
X_test_additional_scaled = features_scaler.transform(X_test_additional)

# Report the width of each feature family.
for _label, _matrix in (
    ("TF-IDF features", X_train_tfidf),
    ("Count features", X_train_count),
    ("Word embedding features", X_train_vectors),
    ("Additional features", X_train_additional),
):
    print(f"{_label}: {_matrix.shape[1]}")

### **8. รวม Features ทั้งหมด**
def _stack_feature_blocks(tfidf_block, count_block, embedding_block, extra_block):
    """Concatenate the four feature families column-wise into one CSR matrix."""
    return hstack(
        [tfidf_block, count_block, embedding_block, csr_matrix(extra_block)],
        format='csr',
    )


print("‡∏Å‡∏≥‡∏•‡∏±‡∏á‡∏£‡∏ß‡∏° features ‡∏ó‡∏±‡πâ‡∏á‡∏´‡∏°‡∏î...")
# Column order: TF-IDF | counts | scaled embeddings | scaled numeric features.
X_train_combined = _stack_feature_blocks(
    X_train_tfidf, X_train_count, X_train_vectors_scaled, X_train_additional_scaled
)
X_test_combined = _stack_feature_blocks(
    X_test_tfidf, X_test_count, X_test_vectors_scaled, X_test_additional_scaled
)

print(f"‡∏Ç‡∏ô‡∏≤‡∏î‡∏Ç‡∏≠‡∏á features ‡∏£‡∏ß‡∏° - train: {X_train_combined.shape}, test: {X_test_combined.shape}")

# Dense copies of the combined matrices for the models that are fed dense
# input further down.
print("‡πÅ‡∏õ‡∏•‡∏á dense ‡∏™‡∏≥‡∏´‡∏£‡∏±‡∏ö‡∏ö‡∏≤‡∏á‡πÇ‡∏°‡πÄ‡∏î‡∏•...")
X_train_combined_dense = X_train_combined.toarray()
X_test_combined_dense  = X_test_combined.toarray()

# Multinomial Naive Bayes needs non-negative input, so take absolute values.
# abs() on a CSR matrix returns another sparse matrix, and MultinomialNB
# accepts sparse input, so the NB matrices stay sparse instead of allocating
# two more full dense copies of the feature matrix.
X_train_nb = abs(X_train_combined)
X_test_nb  = abs(X_test_combined)

### **9. กำหนด Models**
# Candidate classifiers, keyed by the display name used in every report below.
models = {
    # multi_class is intentionally not passed: the parameter is deprecated in
    # scikit-learn 1.5+, and with the 'saga' solver the default already fits a
    # multinomial model whenever there are three or more classes.
    # NOTE(review): with exactly two rating classes the default resolves to a
    # one-vs-rest formulation instead — confirm that case cannot occur here.
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000, C=0.2,
                                              class_weight='balanced', solver='saga'),
    'Random Forest': RandomForestClassifier(random_state=42, n_jobs=-1,
                                            n_estimators=200, max_depth=15,
                                            min_samples_split=5, min_samples_leaf=2,
                                            class_weight='balanced'),
    'Linear SVM': LinearSVC(random_state=42, max_iter=2000, C=0.5,
                            class_weight='balanced', dual=False),
    'Multinomial Naive Bayes': MultinomialNB(alpha=1.0, fit_prior=True)
}

### **10. ฝึกฝนและประเมินผล Models**
# Per-model outputs, all keyed by the model's display name.
results = {}
best_models = {}
training_times = {}

print("\n" + "="*50)
print("‡∏Å‡∏≥‡∏•‡∏±‡∏á‡∏ù‡∏∂‡∏Å‡∏ù‡∏ô‡πÅ‡∏•‡∏∞‡∏õ‡∏£‡∏∞‡πÄ‡∏°‡∏¥‡∏ô‡∏ú‡∏• MODELS")
print("="*50)

for model_name, model in models.items():
    print(f"\nüîß ‡∏Å‡∏≥‡∏•‡∏±‡∏á‡∏ù‡∏∂‡∏Å‡∏ù‡∏ô {model_name}...")
    start_time = time.time()

    # Pick the feature matrices for this model: the non-negative variant for
    # Naive Bayes, the dense copy for Random Forest, the sparse combined
    # matrix for everything else.
    if model_name == 'Multinomial Naive Bayes':
        Xtr, Xte = X_train_nb, X_test_nb
    elif model_name == 'Random Forest':
        Xtr, Xte = X_train_combined_dense, X_test_combined_dense
    else:
        Xtr, Xte = X_train_combined, X_test_combined

    model.fit(Xtr, y_train)
    best_models[model_name] = model
    # Only the fit is timed; prediction and cross-validation are excluded.
    training_time = time.time() - start_time
    training_times[model_name] = training_time

    y_pred = model.predict(Xte)
    accuracy = accuracy_score(y_test, y_pred)

    # 5-fold cross-validation on the same training features.  If it fails the
    # test accuracy is still used as a stand-in (so the tables and plots keep
    # working), but the failure is now reported rather than hidden.
    try:
        cv_scores = cross_val_score(model, Xtr, y_train, cv=5, scoring='accuracy')
    except Exception as cv_error:
        print(f"   [warning] cross-validation failed for {model_name}: {cv_error!r}; "
              "reporting test accuracy in place of the CV score")
        cv_scores = np.array([accuracy])

    results[model_name] = {
        'accuracy': accuracy,
        'cv_mean': float(np.mean(cv_scores)),
        'cv_std': float(np.std(cv_scores)),
        'predictions': y_pred,
        'training_time': training_time
    }

    print(f"‚úÖ {model_name} ‡πÄ‡∏™‡∏£‡πá‡∏à‡∏™‡∏¥‡πâ‡∏ô!")
    print(f"   ‡∏Ñ‡∏ß‡∏≤‡∏°‡πÅ‡∏°‡πà‡∏ô‡∏¢‡∏≥‡∏Å‡∏≤‡∏£‡∏ó‡∏î‡∏™‡∏≠‡∏ö: {accuracy:.4f}")
    print(f"   ‡∏Ñ‡∏ß‡∏≤‡∏°‡πÅ‡∏°‡πà‡∏ô‡∏¢‡∏≥ CV: {np.mean(cv_scores):.4f} (¬±{np.std(cv_scores):.4f})")
    print(f"   ‡πÄ‡∏ß‡∏•‡∏≤‡∏ù‡∏∂‡∏Å‡∏ù‡∏ô: {training_time:.2f} ‡∏ß‡∏¥‡∏ô‡∏≤‡∏ó‡∏µ")

### **11. เปรียบเทียบผลลัพธ์และการแสดงผล**
# One row per trained model, in training order.
results_df = pd.DataFrame(
    [
        {
            'Model': name,
            'Test Accuracy': metrics['accuracy'],
            'CV Mean': metrics['cv_mean'],
            'CV Std': metrics['cv_std'],
            'Training Time (s)': metrics['training_time'],
        }
        for name, metrics in results.items()
    ],
    columns=['Model', 'Test Accuracy', 'CV Mean', 'CV Std', 'Training Time (s)'],
)

_banner = "=" * 60
print("\n" + _banner)
print("‡∏ú‡∏•‡∏Å‡∏≤‡∏£‡πÄ‡∏õ‡∏£‡∏µ‡∏¢‡∏ö‡πÄ‡∏ó‡∏µ‡∏¢‡∏ö MODELS")
print(_banner)
print(results_df.round(4))

# The winner is the model with the highest accuracy on the held-out test set.
_best_row = results_df['Test Accuracy'].idxmax()
best_model_name = results_df.at[_best_row, 'Model']
print(f"\nüèÜ ‡πÇ‡∏°‡πÄ‡∏î‡∏•‡∏ó‡∏µ‡πà‡∏°‡∏µ‡∏õ‡∏£‡∏∞‡∏™‡∏¥‡∏ó‡∏ò‡∏¥‡∏†‡∏≤‡∏û‡∏î‡∏µ‡∏ó‡∏µ‡πà‡∏™‡∏∏‡∏î: {best_model_name}")
print(f"   ‡∏Ñ‡∏ß‡∏≤‡∏°‡πÅ‡∏°‡πà‡∏ô‡∏¢‡∏≥‡∏Å‡∏≤‡∏£‡∏ó‡∏î‡∏™‡∏≠‡∏ö: {results[best_model_name]['accuracy']:.4f}")

### **12. การแสดงผลแบบกราฟ**
# Draws the model-comparison figures: a 2x2 grid (test accuracy, CV accuracy,
# training time; the fourth cell is removed) plus a separate polar "radar"
# figure.
# NOTE(review): 'DejaVu Sans' may not contain Thai glyphs — verify that the
# Thai titles and labels render correctly.
plt.rcParams['font.family'] = ['DejaVu Sans']
fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# Top-left: bar chart of test accuracy, with the value printed above each bar.
axes[0, 0].bar(results_df['Model'], results_df['Test Accuracy'])
axes[0, 0].set_title('‡∏Å‡∏≤‡∏£‡πÄ‡∏õ‡∏£‡∏µ‡∏¢‡∏ö‡πÄ‡∏ó‡∏µ‡∏¢‡∏ö‡∏Ñ‡∏ß‡∏≤‡∏°‡πÅ‡∏°‡πà‡∏ô‡∏¢‡∏≥‡∏Å‡∏≤‡∏£‡∏ó‡∏î‡∏™‡∏≠‡∏ö', fontsize=14, fontweight='bold')
axes[0, 0].set_ylabel('‡∏Ñ‡∏ß‡∏≤‡∏°‡πÅ‡∏°‡πà‡∏ô‡∏¢‡∏≥'); axes[0, 0].tick_params(axis='x', rotation=45); axes[0, 0].grid(True, alpha=0.3)
for i, v in enumerate(results_df['Test Accuracy']):
    axes[0, 0].text(i, v + 0.005, f'{v:.3f}', ha='center', fontweight='bold')

# Top-right: mean cross-validation accuracy with the standard deviation as
# error bars.
axes[0, 1].errorbar(range(len(results_df)), results_df['CV Mean'],
                    yerr=results_df['CV Std'], fmt='o-', capsize=5, linewidth=2)
axes[0, 1].set_title('‡∏Ñ‡∏ß‡∏≤‡∏°‡πÅ‡∏°‡πà‡∏ô‡∏¢‡∏≥ Cross-Validation (‡πÄ‡∏â‡∏•‡∏µ‡πà‡∏¢ ¬± ‡∏™‡πà‡∏ß‡∏ô‡πÄ‡∏ö‡∏µ‡πà‡∏¢‡∏á‡πÄ‡∏ö‡∏ô‡∏°‡∏≤‡∏ï‡∏£‡∏ê‡∏≤‡∏ô)', fontsize=14, fontweight='bold')
axes[0, 1].set_ylabel('‡∏Ñ‡∏ß‡∏≤‡∏°‡πÅ‡∏°‡πà‡∏ô‡∏¢‡∏≥ CV'); axes[0, 1].set_xticks(range(len(results_df)))
axes[0, 1].set_xticklabels(results_df['Model'], rotation=45); axes[0, 1].grid(True, alpha=0.3)

# Bottom-left: bar chart of training time, labelled in seconds above each bar.
bars = axes[1, 0].bar(results_df['Model'], results_df['Training Time (s)'])
axes[1, 0].set_title('‡∏Å‡∏≤‡∏£‡πÄ‡∏õ‡∏£‡∏µ‡∏¢‡∏ö‡πÄ‡∏ó‡∏µ‡∏¢‡∏ö‡πÄ‡∏ß‡∏•‡∏≤‡∏ù‡∏∂‡∏Å‡∏ù‡∏ô', fontsize=14, fontweight='bold')
axes[1, 0].set_ylabel('‡πÄ‡∏ß‡∏•‡∏≤ (‡∏ß‡∏¥‡∏ô‡∏≤‡∏ó‡∏µ)'); axes[1, 0].tick_params(axis='x', rotation=45); axes[1, 0].grid(True, alpha=0.3)
for bar, time_val in zip(bars, results_df['Training Time (s)']):
    axes[1, 0].text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.1, f'{time_val:.1f}s', ha='center', fontweight='bold')

# Radar chart in its own figure (fig2) with three axes: test accuracy,
# CV accuracy and speed.
categories = ['‡∏Ñ‡∏ß‡∏≤‡∏°‡πÅ‡∏°‡πà‡∏ô‡∏¢‡∏≥‡∏ó‡∏î‡∏™‡∏≠‡∏ö', '‡∏Ñ‡∏ß‡∏≤‡∏°‡πÅ‡∏°‡πà‡∏ô‡∏¢‡∏≥ CV', '‡∏Ñ‡∏ß‡∏≤‡∏°‡πÄ‡∏£‡πá‡∏ß (‡∏Å‡∏•‡∏±‡∏ö‡∏´‡∏±‡∏ß)']
fig2, ax = plt.subplots(figsize=(8, 8), subplot_kw=dict(projection='polar'))
# Each metric is divided by its maximum over all models; the 1e-9 floor
# avoids division by zero when that maximum is 0.
normalized_accuracy = results_df['Test Accuracy'] / max(1e-9, results_df['Test Accuracy'].max())
normalized_cv = results_df['CV Mean'] / max(1e-9, results_df['CV Mean'].max())
# Speed is the inverse of training time; the +0.1 keeps near-zero times from
# dominating the scale.
normalized_speed = (1 / (results_df['Training Time (s)'] + 0.1))
normalized_speed = normalized_speed / max(1e-9, normalized_speed.max())
# One angle per category; the first angle is appended again to close the polygon.
angles = np.linspace(0, 2*np.pi, len(categories), endpoint=False).tolist(); angles += angles[:1]
base_colors = ['C0','C1','C2','C3','C4','C5','C6','C7','C8','C9']
for i, model in enumerate(results_df['Model']):
    # Likewise repeat the first value so the outline closes.
    values = [normalized_accuracy.iloc[i], normalized_cv.iloc[i], normalized_speed.iloc[i]]; values += values[:1]
    ax.plot(angles, values, 'o-', linewidth=2, label=model, color=base_colors[i % len(base_colors)])
    ax.fill(angles, values, alpha=0.25, color=base_colors[i % len(base_colors)])
ax.set_xticks(angles[:-1]); ax.set_xticklabels(categories); ax.set_ylim(0, 1)
ax.set_title('‡∏Å‡∏≤‡∏£‡πÄ‡∏õ‡∏£‡∏µ‡∏¢‡∏ö‡πÄ‡∏ó‡∏µ‡∏¢‡∏ö‡∏õ‡∏£‡∏∞‡∏™‡∏¥‡∏ó‡∏ò‡∏¥‡∏†‡∏≤‡∏û‡πÇ‡∏°‡πÄ‡∏î‡∏• (‡∏õ‡∏£‡∏±‡∏ö‡∏°‡∏≤‡∏ï‡∏£‡∏ê‡∏≤‡∏ô)', fontweight='bold', size=14)
ax.legend(loc='upper right', bbox_to_anchor=(1.2, 1.0)); ax.grid(True)
# The fourth cell of the 2x2 grid is unused, so it is removed.
axes[1, 1].remove()
# NOTE(review): pyplot's tight_layout() acts on the current figure, which at
# this point is presumably fig2 — confirm the 2x2 grid is laid out as intended.
plt.tight_layout(); plt.show()

### **13. รายงานการจำแนกประเภทโดยละเอียด**
_report_rule = "=" * 60
print("\n" + _report_rule)
print("‡∏£‡∏≤‡∏¢‡∏á‡∏≤‡∏ô‡∏Å‡∏≤‡∏£‡∏à‡∏≥‡πÅ‡∏ô‡∏Å‡∏õ‡∏£‡∏∞‡πÄ‡∏†‡∏ó‡πÇ‡∏î‡∏¢‡∏•‡∏∞‡πÄ‡∏≠‡∏µ‡∏¢‡∏î")
print(_report_rule)

# Labels come from the whole data set, so a rating missing from y_test still
# gets a row in every report.
unique_labels = sorted(set(df['rating'].tolist()))
target_names = [f'‡∏Ñ‡∏∞‡πÅ‡∏ô‡∏ô {label}' for label in unique_labels]

# Hyper-parameters worth echoing, with a short description for each.
params_show = ['C','class_weight','solver','multi_class','n_estimators',
               'max_depth','min_samples_split','min_samples_leaf','dual',
               'alpha','fit_prior']
param_desc = {
    'C':'‡∏Ñ‡πà‡∏≤ regularization parameter','class_weight':'‡∏Å‡∏≤‡∏£‡∏õ‡∏£‡∏±‡∏ö‡∏ô‡πâ‡∏≥‡∏´‡∏ô‡∏±‡∏Å‡∏Ñ‡∏•‡∏≤‡∏™',
    'solver':'‡∏≠‡∏±‡∏•‡∏Å‡∏≠‡∏£‡∏¥‡∏ó‡∏∂‡∏°‡∏Å‡∏≤‡∏£‡∏´‡∏≤‡∏Ñ‡πà‡∏≤‡∏ó‡∏µ‡πà‡πÄ‡∏´‡∏°‡∏≤‡∏∞‡∏™‡∏°','multi_class':'‡∏ß‡∏¥‡∏ò‡∏µ‡∏Å‡∏≤‡∏£‡∏à‡∏±‡∏î‡∏Å‡∏≤‡∏£‡∏´‡∏•‡∏≤‡∏¢‡∏Ñ‡∏•‡∏≤‡∏™',
    'n_estimators':'‡∏à‡∏≥‡∏ô‡∏ß‡∏ô decision trees','max_depth':'‡∏Ñ‡∏ß‡∏≤‡∏°‡∏•‡∏∂‡∏Å‡∏™‡∏π‡∏á‡∏™‡∏∏‡∏î‡∏Ç‡∏≠‡∏á‡∏ï‡πâ‡∏ô‡πÑ‡∏°‡πâ',
    'min_samples_split':'‡∏à‡∏≥‡∏ô‡∏ß‡∏ô‡∏ï‡∏±‡∏ß‡∏≠‡∏¢‡πà‡∏≤‡∏á‡∏Ç‡∏±‡πâ‡∏ô‡∏ï‡πà‡∏≥‡πÉ‡∏ô‡∏Å‡∏≤‡∏£‡πÅ‡∏¢‡∏Å node',
    'min_samples_leaf':'‡∏à‡∏≥‡∏ô‡∏ß‡∏ô‡∏ï‡∏±‡∏ß‡∏≠‡∏¢‡πà‡∏≤‡∏á‡∏Ç‡∏±‡πâ‡∏ô‡∏ï‡πà‡∏≥‡πÉ‡∏ô leaf node',
    'dual':'‡∏Å‡∏≤‡∏£‡πÉ‡∏ä‡πâ dual formulation','alpha':'‡∏Ñ‡πà‡∏≤ smoothing parameter',
    'fit_prior':'‡∏Å‡∏≤‡∏£‡πÉ‡∏ä‡πâ prior probability'
}

for model_name in results:
    print(f"\nüìä {model_name}")
    print("-" * 40)
    model = models[model_name]
    # Echo the selected hyper-parameters in the order get_params() yields them.
    for p, v in model.get_params().items():
        if p not in params_show:
            continue
        print(f"  {p}: {v} # {param_desc.get(p,'')}")
    print("\n‡∏£‡∏≤‡∏¢‡∏á‡∏≤‡∏ô‡∏Å‡∏≤‡∏£‡∏à‡∏≥‡πÅ‡∏ô‡∏Å‡∏õ‡∏£‡∏∞‡πÄ‡∏†‡∏ó:")
    # labels and target_names are passed together so they stay aligned.
    _model_predictions = results[model_name]['predictions']
    print(classification_report(y_test, _model_predictions,
                                labels=unique_labels, target_names=target_names,
                                zero_division=0))

### **14. Confusion Matrices**
# Draws one confusion-matrix heatmap per trained model on a grid two columns
# wide, saves the figure as a PNG and shows it.
# With an odd number of models (more than one) the last grid cell stays empty.
rows = int(np.ceil(len(results)/2))
cols = 2 if len(results) > 1 else 1
fig, axes = plt.subplots(rows, cols, figsize=(15, 6*rows))
# plt.subplots may return a single Axes or an array of them; flatten to 1-D
# so axes[i] works in every case.
axes = np.array(axes).reshape(-1)
for i, (model_name, result) in enumerate(results.items()):
    # labels=unique_labels fixes the row/column order to match the tick labels.
    cm = confusion_matrix(y_test, result['predictions'], labels=unique_labels)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=[str(l) for l in unique_labels],
                yticklabels=[str(l) for l in unique_labels],
                ax=axes[i])
    # x axis carries the predicted rating, y axis the true rating.
    axes[i].set_xlabel("‡∏Ñ‡∏∞‡πÅ‡∏ô‡∏ô‡∏ó‡∏µ‡πà‡∏ó‡∏≥‡∏ô‡∏≤‡∏¢"); axes[i].set_ylabel("‡∏Ñ‡∏∞‡πÅ‡∏ô‡∏ô‡∏à‡∏£‡∏¥‡∏á")
    axes[i].set_title(f"{model_name}\n‡∏Ñ‡∏ß‡∏≤‡∏°‡πÅ‡∏°‡πà‡∏ô‡∏¢‡∏≥: {result['accuracy']:.3f}")
# The figure is written to the working directory at 300 dpi before being shown.
plt.tight_layout(); plt.savefig("confusion_matrices_comparison.png", dpi=300, bbox_inches='tight'); plt.show()

### **15. ฟังก์ชันทำนายผลที่ปรับปรุง**
def predict_review_enhanced(text, model_name='best'):
    """Predict the rating of one raw review with a trained model.

    The review goes through the same steps as the training data: cleaning,
    hand-crafted features, scaled sentence embedding, TF-IDF and count
    vectors, all stacked in the training column order.

    Args:
        text: Raw review text (Thai or English).
        model_name: Key into ``best_models``; ``'best'`` selects
            ``best_model_name``.

    Returns:
        ``(predicted_rating, probabilities)``.  ``probabilities`` comes from
        ``predict_proba`` when the model has it; otherwise it is a softmax
        over the model's decision scores, which is a rough confidence
        indicator, not a calibrated probability.  The entries follow the
        order of ``model.classes_``.

    Raises:
        KeyError: if ``model_name`` is not a key of ``best_models``.
    """
    if model_name == 'best':
        model_name = best_model_name
    model = best_models[model_name]

    cleaned_text = clean_text(text)
    features = extract_features(cleaned_text)

    review_vector = enhanced_sentence_vectorizer(cleaned_text).reshape(1, -1)
    review_vector_scaled = scaler.transform(csr_matrix(review_vector))

    review_tfidf = tfidf_vectorizer.transform([cleaned_text])
    review_count = count_vectorizer.transform([cleaned_text])

    # Must match the column order of the numeric features used for training.
    feature_order = (
        'exclamation_count', 'question_count', 'sentence_count', 'word_count',
        'avg_word_length', 'repeated_words_ratio', 'negation_count',
        'punctuation_ratio', 'text_length', 'words_per_sentence', 'english_ratio',
    )
    additional_features = np.array(
        [[features[name] for name in feature_order]], dtype=np.float32
    )
    additional_features_scaled = features_scaler.transform(additional_features)

    # Build the combined row once; only its final representation depends on
    # the model.
    review_combined = hstack([
        review_tfidf, review_count, review_vector_scaled,
        csr_matrix(additional_features_scaled)
    ], format='csr')
    if model_name in ('Multinomial Naive Bayes', 'Random Forest'):
        # These two models were trained on dense input.
        review_combined = review_combined.toarray()
        if model_name == 'Multinomial Naive Bayes':
            # Naive Bayes was trained on absolute values (non-negative input).
            review_combined = np.abs(review_combined)

    pred = model.predict(review_combined)[0]
    if hasattr(model, 'predict_proba'):
        probs = model.predict_proba(review_combined)[0]
    else:
        # e.g. LinearSVC: no predict_proba, so squash the decision scores.
        scores = np.atleast_1d(model.decision_function(review_combined)[0])
        if scores.size == 1:
            # Binary problem: decision_function gives a single signed score
            # for classes_[1]; expand it to one score per class so the result
            # is a two-element vector rather than a lone 1.0.
            scores = np.array([-scores[0], scores[0]])
        exps = np.exp(scores - np.max(scores))
        probs = exps / np.sum(exps)
    return int(pred), probs

### **16. ทดสอบกับรีวิวจริง**
def print_prediction_results_enhanced(text, actual_rating, model_name='best'):
    """Predict the rating of one review, print a report, and return the outcome.

    Args:
        text: Review text. It is forwarded to ``predict_review_enhanced``;
            only its first 100 characters are echoed in the report, always
            followed by ``...``.
        actual_rating: Ground-truth rating; must be convertible with ``int``.
        model_name: Forwarded unchanged to ``predict_review_enhanced``.

    Returns:
        A ``(predicted, actual)`` tuple: the class returned by
        ``predict_review_enhanced`` and ``int(actual_rating)``.
    """
    predicted, class_scores = predict_review_enhanced(text, model_name)
    is_match = int(predicted) == int(actual_rating)

    # Verdict label shown on the "result" line (Thai for "correct" / "wrong").
    if is_match:
        verdict = '‚úÖ ‡∏ñ‡∏π‡∏Å‡∏ï‡πâ‡∏≠‡∏á'
    else:
        verdict = '‚ùå ‡∏ú‡∏¥‡∏î‡∏û‡∏•‡∏≤‡∏î'

    # Report lines, in order: review excerpt, model, actual score,
    # predicted score, result, then the "confidence" header.
    print(f"\n‡∏£‡∏µ‡∏ß‡∏¥‡∏ß: {text[:100]}...")
    print(f"‡πÇ‡∏°‡πÄ‡∏î‡∏•: {model_name}")
    print(f"‡∏Ñ‡∏∞‡πÅ‡∏ô‡∏ô‡∏à‡∏£‡∏¥‡∏á: {actual_rating}/5")
    print(f"‡∏Ñ‡∏∞‡πÅ‡∏ô‡∏ô‡∏ó‡∏µ‡πà‡∏ó‡∏≥‡∏ô‡∏≤‡∏¢: {predicted}/5")
    print(f"‡∏ú‡∏•‡∏•‡∏±‡∏û‡∏ò‡πå: {verdict}")
    print("‡∏Ñ‡∏ß‡∏≤‡∏°‡∏°‡∏±‡πà‡∏ô‡πÉ‡∏à:")

    # Scores are labelled 1, 2, 3, ... purely by position.
    # NOTE(review): this assumes the model's class order is 1..5 - confirm
    # against the fitted model's classes.
    for star, score in enumerate(class_scores, start=1):
        print(f"  ‚≠ê {star}: {score * 100:.1f}%")

    return predicted, int(actual_rating)

test_reviews = [
    ["‡πÑ‡∏°‡πà‡πÅ‡∏ô‡∏∞‡∏ô‡∏≥‡πÄ‡∏•‡∏¢‡∏Ñ‡πà‡∏∞ ‡∏´‡∏≠‡∏ô‡∏µ‡πâ ‡∏´‡∏•‡∏≠‡∏Å‡πÄ‡∏≠‡∏≤‡πÄ‡∏á‡∏¥‡∏ô‡∏ä‡∏±‡∏î‡πÜ ‡πÉ‡∏ô‡∏£‡∏π‡∏õ‡∏™‡∏ß‡∏¢‡∏°‡∏≤‡∏Å ‡πÅ‡∏ï‡πà‡∏û‡∏≠‡πÄ‡∏Ç‡πâ‡∏≤‡πÑ‡∏õ‡∏≠‡∏¢‡∏π‡πà‡∏à‡∏£‡∏¥‡∏á‡∏™‡∏†‡∏≤‡∏û‡∏´‡πâ‡∏≠‡∏á‡∏ó‡∏£‡∏∏‡∏î‡πÇ‡∏ó‡∏£‡∏°‡∏°‡∏≤‡∏Å ‡∏ï‡∏π‡πâ‡πÄ‡∏™‡∏∑‡πâ‡∏≠‡∏ú‡πâ‡∏≤‡∏û‡∏±‡∏á‡∏ï‡∏±‡πâ‡∏á‡πÅ‡∏ï‡πà‡∏ß‡∏±‡∏ô‡πÅ‡∏£‡∏Å‡∏ó‡∏µ‡πà‡∏¢‡πâ‡∏≤‡∏¢‡πÄ‡∏Ç‡πâ‡∏≤ ‡πÄ‡∏ï‡∏µ‡∏¢‡∏á‡∏Å‡πá‡πÄ‡∏Å‡πà‡∏≤‡∏°‡∏≤‡∏Å‡∏ô‡∏≠‡∏ô‡πÅ‡∏•‡πâ‡∏ß‡∏õ‡∏ß‡∏î‡∏´‡∏•‡∏±‡∏á ‡∏ù‡∏±‡∏Å‡∏ö‡∏±‡∏ß‡∏ô‡πâ‡∏≥‡∏Å‡πá‡πÑ‡∏´‡∏•‡πÅ‡∏Ñ‡πà‡∏ã‡∏¥‡∏Å‡πÜ ‡πÑ‡∏°‡πà‡πÄ‡∏Ñ‡∏¢‡∏°‡∏≤‡∏ã‡πà‡∏≠‡∏°‡πÉ‡∏´‡πâ‡∏™‡∏±‡∏Å‡∏ó‡∏µ ‡∏Ç‡∏≠‡∏¢‡πâ‡∏≤‡∏¢‡∏≠‡∏≠‡∏Å‡∏Å‡πá‡πÑ‡∏°‡πà‡∏Ñ‡∏∑‡∏ô‡πÄ‡∏á‡∏¥‡∏ô‡∏°‡∏±‡∏î‡∏à‡∏≥ ‡πÄ‡∏™‡∏µ‡∏¢‡∏Ñ‡∏ß‡∏≤‡∏°‡∏£‡∏π‡πâ‡∏™‡∏∂‡∏Å‡∏°‡∏≤‡∏Å‡∏Ñ‡πà‡∏∞", 1],
    ["‡∏´‡∏≠‡∏û‡∏±‡∏Å‡∏£‡∏≤‡∏Ñ‡∏≤‡∏Å‡πá‡πÇ‡∏≠‡πÄ‡∏Ñ‡∏ô‡∏∞ ‡πÑ‡∏°‡πà‡πÅ‡∏û‡∏á‡∏°‡∏≤‡∏Å ‡πÅ‡∏ï‡πà‡∏°‡∏µ‡∏Ç‡πâ‡∏≠‡πÄ‡∏™‡∏µ‡∏¢‡πÄ‡∏¢‡∏≠‡∏∞‡πÑ‡∏õ‡∏´‡∏ô‡πà‡∏≠‡∏¢ ‡∏´‡πâ‡∏≠‡∏á‡πÄ‡∏•‡πá‡∏Å‡πÄ‡∏Å‡∏¥‡∏ô‡πÑ‡∏õ ‡πÅ‡∏≠‡∏£‡πå‡πÄ‡∏™‡∏µ‡∏¢‡∏á‡∏î‡∏±‡∏á‡∏£‡∏ö‡∏Å‡∏ß‡∏ô‡πÄ‡∏ß‡∏•‡∏≤‡∏ô‡∏≠‡∏ô ‡∏õ‡∏£‡∏∞‡∏ï‡∏π‡∏´‡πâ‡∏≠‡∏á‡∏ô‡πâ‡∏≥‡∏õ‡∏¥‡∏î‡πÑ‡∏°‡πà‡∏™‡∏ô‡∏¥‡∏ó ‡πÅ‡∏•‡πâ‡∏ß‡∏Å‡πá‡∏°‡∏µ‡∏°‡∏î‡πÄ‡∏¢‡∏≠‡∏∞‡∏°‡∏≤‡∏Å ‡∏Ç‡πâ‡∏≠‡∏î‡∏µ‡∏Ñ‡∏∑‡∏≠‡πÉ‡∏Å‡∏•‡πâ‡∏ï‡∏•‡∏≤‡∏î ‡πÄ‡∏î‡∏¥‡∏ô‡πÑ‡∏õ‡∏ã‡∏∑‡πâ‡∏≠‡∏Ç‡∏≠‡∏á‡∏Å‡∏¥‡∏ô‡πÑ‡∏î‡πâ‡∏™‡∏∞‡∏î‡∏ß‡∏Å ‡πÅ‡∏ï‡πà‡∏†‡∏≤‡∏û‡∏£‡∏ß‡∏°‡∏¢‡∏±‡∏á‡πÑ‡∏°‡πà‡∏Ñ‡∏∏‡πâ‡∏°‡∏Ñ‡πà‡∏≤‡πÄ‡∏ó‡πà‡∏≤‡πÑ‡∏´‡∏£‡πà ‡∏ñ‡πâ‡∏≤‡∏°‡∏µ‡∏ó‡∏≤‡∏á‡πÄ‡∏•‡∏∑‡∏≠‡∏Å‡∏≠‡∏∑‡πà‡∏ô‡∏Å‡πá‡∏ô‡πà‡∏≤‡∏à‡∏∞‡∏î‡∏µ‡∏Å‡∏ß‡πà‡∏≤‡∏ô‡∏∞", 2],
    ["‡∏ä‡∏≠‡∏ö‡∏´‡∏≠‡∏ô‡∏µ‡πâ‡∏°‡∏≤‡∏Å‡∏Ñ‡πà‡∏∞ ‡∏´‡πâ‡∏≠‡∏á‡∏Å‡∏ß‡πâ‡∏≤‡∏á‡∏™‡∏∞‡∏≠‡∏≤‡∏î ‡πÄ‡∏ü‡∏≠‡∏£‡πå‡∏ô‡∏¥‡πÄ‡∏à‡∏≠‡∏£‡πå‡∏Ñ‡∏£‡∏ö‡∏Ñ‡∏£‡∏±‡∏ô ‡πÅ‡∏≠‡∏£‡πå‡πÄ‡∏¢‡πá‡∏ô‡∏â‡πà‡∏≥ ‡∏°‡∏µ‡πÇ‡∏ï‡πä‡∏∞‡πÄ‡∏Ñ‡∏£‡∏∑‡πà‡∏≠‡∏á‡πÅ‡∏õ‡πâ‡∏á‡∏î‡πâ‡∏ß‡∏¢ ‡∏™‡∏∞‡∏î‡∏ß‡∏Å‡∏°‡∏≤‡∏Å ‡∏≠‡∏¥‡∏ô‡πÄ‡∏ó‡∏≠‡∏£‡πå‡πÄ‡∏ô‡πá‡∏ï‡πÄ‡∏£‡πá‡∏ß ‡πÄ‡∏•‡πà‡∏ô‡πÄ‡∏Å‡∏°‡∏™‡∏ö‡∏≤‡∏¢ ‡πÄ‡∏à‡πâ‡∏≤‡∏Ç‡∏≠‡∏á‡∏´‡∏≠‡πÉ‡∏à‡∏î‡∏µ ‡∏°‡∏µ‡∏≠‡∏∞‡πÑ‡∏£‡πÅ‡∏à‡πâ‡∏á‡∏õ‡∏∏‡πä‡∏ö‡∏°‡∏≤‡∏î‡∏π‡∏õ‡∏±‡πä‡∏ö ‡∏Ç‡πâ‡∏≠‡πÄ‡∏™‡∏µ‡∏¢‡πÄ‡∏•‡πá‡∏Å‡πÜ‡∏Ñ‡∏∑‡∏≠‡∏Ñ‡πà‡∏≤‡πÑ‡∏ü‡∏Ñ‡πà‡∏≠‡∏ô‡∏Ç‡πâ‡∏≤‡∏á‡πÅ‡∏û‡∏á ‡πÅ‡∏•‡πâ‡∏ß‡∏Å‡πá‡∏ã‡∏±‡∏Å‡∏ú‡πâ‡∏≤‡∏ï‡πâ‡∏≠‡∏á‡∏•‡∏á‡πÑ‡∏õ‡∏ä‡∏±‡πâ‡∏ô‡∏•‡πà‡∏≤‡∏á ‡∏≠‡∏¢‡∏≤‡∏Å‡πÉ‡∏´‡πâ‡∏°‡∏µ‡πÄ‡∏Ñ‡∏£‡∏∑‡πà‡∏≠‡∏á‡∏ã‡∏±‡∏Å‡∏ú‡πâ‡∏≤‡∏ó‡∏∏‡∏Å‡∏ä‡∏±‡πâ‡∏ô ‡πÅ‡∏ï‡πà‡πÇ‡∏î‡∏¢‡∏£‡∏ß‡∏°‡∏û‡∏≠‡πÉ‡∏à‡∏°‡∏≤‡∏Å‡∏Ñ‡πà‡∏∞ ‡πÅ‡∏ô‡∏∞‡∏ô‡∏≥‡πÄ‡∏•‡∏¢", 4],
    ["Terrible hotel experience! The room was dirty, smelly, and completely different from the photos. Staff was rude and unhelpful. AC didn't work, hot water was cold, and WiFi was extremely slow. The bed was uncomfortable and the bathroom was disgusting. Would never stay here again. Complete waste of money!", 1],
    ["The hotel is okay for the price. Room was decent size but could be cleaner. Staff was friendly but service was slow. Location is good, close to attractions. Some facilities need maintenance. WiFi worked well. Overall, it's an average hotel - nothing special but acceptable for a short stay.", 3],
    ["Amazing hotel! Absolutely loved our stay here. The room was spacious, clean, and beautifully decorated. Staff was incredibly friendly and helpful. The location is perfect - walking distance to everything. Breakfast was delicious with great variety. Pool and gym facilities were excellent. Highly recommend this place!", 5],
    ["This hotel is really good! ‡∏´‡πâ‡∏≠‡∏á‡∏™‡∏∞‡∏≠‡∏≤‡∏î‡∏°‡∏≤‡∏Å ‡πÅ‡∏≠‡∏£‡πå‡πÄ‡∏¢‡πá‡∏ô wifi super fast ‡πÅ‡∏ï‡πà‡∏£‡∏≤‡∏Ñ‡∏≤‡πÅ‡∏û‡∏á‡πÑ‡∏õ‡∏´‡∏ô‡πà‡∏≠‡∏¢ but overall worth it ‡∏ô‡∏∞ staff friendly ‡∏°‡∏≤‡∏Å highly recommended! üëç", 4],
    ["‡∏´‡∏≠‡∏ô‡∏µ‡πâ‡∏î‡∏µ‡∏ó‡∏µ‡πà‡∏™‡∏∏‡∏î‡πÉ‡∏ô‡∏¢‡πà‡∏≤‡∏ô‡∏ô‡∏µ‡πâ‡πÅ‡∏•‡πâ‡∏ß‡∏ß ‡∏≠‡∏¢‡∏π‡πà‡∏°‡∏≤ 3 ‡∏õ‡∏µ‡πÑ‡∏°‡πà‡πÄ‡∏Ñ‡∏¢‡∏°‡∏µ‡∏õ‡∏±‡∏ç‡∏´‡∏≤‡πÄ‡∏•‡∏¢ ‡∏´‡πâ‡∏≠‡∏á‡∏Å‡∏ß‡πâ‡∏≤‡∏á ‡∏™‡∏∞‡∏≠‡∏≤‡∏î ‡∏ï‡∏Å‡πÅ‡∏ï‡πà‡∏á‡∏™‡∏ß‡∏¢ ‡∏°‡∏µ‡πÄ‡∏ü‡∏≠‡∏£‡πå‡∏Ñ‡∏£‡∏ö ‡πÄ‡∏´‡∏°‡∏∑‡∏≠‡∏ô‡∏≠‡∏¢‡∏π‡πà‡∏Ñ‡∏≠‡∏ô‡πÇ‡∏î ‡πÄ‡∏ô‡πá‡∏ï‡πÑ‡∏ß‡∏°‡∏≤‡∏Å 100 Mbps ‡πÄ‡∏•‡πà‡∏ô‡πÄ‡∏Å‡∏°‡πÑ‡∏°‡πà‡∏°‡∏µ‡∏™‡∏∞‡∏î‡∏∏‡∏î! ‡∏£‡∏∞‡∏ö‡∏ö‡∏£‡∏±‡∏Å‡∏©‡∏≤‡∏Ñ‡∏ß‡∏≤‡∏°‡∏õ‡∏•‡∏≠‡∏î‡∏†‡∏±‡∏¢‡πÅ‡∏ô‡πà‡∏ô‡∏°‡∏≤‡∏Å ‡∏°‡∏µ‡∏Å‡∏•‡πâ‡∏≠‡∏á‡∏ß‡∏á‡∏à‡∏£‡∏õ‡∏¥‡∏î ‡∏Ñ‡∏µ‡∏¢‡πå‡∏Å‡∏≤‡∏£‡πå‡∏î‡∏ó‡∏∏‡∏Å‡∏ä‡∏±‡πâ‡∏ô ‡πÅ‡∏•‡∏∞‡∏°‡∏µ ‡∏£‡∏õ‡∏†. 24 ‡∏ä‡∏°. ‡∏ó‡∏µ‡πÄ‡∏î‡πá‡∏î‡∏™‡∏∏‡∏î‡∏Ñ‡∏∑‡∏≠‡∏°‡∏µ‡∏ü‡∏¥‡∏ï‡πÄ‡∏ô‡∏™‡πÅ‡∏•‡∏∞‡∏™‡∏£‡∏∞‡∏ß‡πà‡∏≤‡∏¢‡∏ô‡πâ‡∏≥‡πÉ‡∏´‡πâ‡πÉ‡∏ä‡πâ‡∏ü‡∏£‡∏µ ‡∏Ñ‡∏∏‡πâ‡∏°‡∏°‡∏≤‡∏Å‡∏Å‡∏Å‡∏Å ‡πÅ‡∏ô‡∏∞‡∏ô‡∏≥‡∏™‡∏∏‡∏î‡πÜ ‡∏ñ‡πâ‡∏≤‡πÑ‡∏î‡πâ‡∏´‡πâ‡∏≠‡∏á‡∏Å‡πá‡∏à‡∏≠‡∏á‡πÄ‡∏•‡∏¢‡∏≠‡∏¢‡πà‡∏≤‡∏£‡∏≠!", 5]
]

# Banner ("Testing with real reviews") followed by the name of the best model.
print("\n" + "=" * 60)
print("‡∏Å‡∏≤‡∏£‡∏ó‡∏î‡∏™‡∏≠‡∏ö‡∏Å‡∏±‡∏ö‡∏£‡∏µ‡∏ß‡∏¥‡∏ß‡∏à‡∏£‡∏¥‡∏á")
print("=" * 60)
print(f"üèÜ ‡∏ó‡∏î‡∏™‡∏≠‡∏ö‡∏Å‡∏±‡∏ö‡πÇ‡∏°‡πÄ‡∏î‡∏•‡∏ó‡∏µ‡πà‡∏î‡∏µ‡∏ó‡∏µ‡πà‡∏™‡∏∏‡∏î: {best_model_name}")

# Every call prints a per-review report and yields (predicted, actual).
test_results = [
    print_prediction_results_enhanced(review, actual_rating, best_model_name)
    for review, actual_rating in test_reviews
]

# A hit is a pair whose two members are equal; True counts as 1 when summed.
correct_predictions = sum(pred == actual for pred, actual in test_results)
test_accuracy = correct_predictions / len(test_results)

# Summary lines: heading with the best model, number predicted correctly,
# number predicted wrongly, and the overall accuracy.
print(f"\nüìà ‡∏ú‡∏•‡∏™‡∏£‡∏∏‡∏õ‡∏Å‡∏≤‡∏£‡∏ó‡∏î‡∏™‡∏≠‡∏ö‡∏£‡∏µ‡∏ß‡∏¥‡∏ß (‡πÇ‡∏°‡πÄ‡∏î‡∏•‡∏ó‡∏µ‡πà‡∏î‡∏µ‡∏ó‡∏µ‡πà‡∏™‡∏∏‡∏î: {best_model_name}):")
print(f"‡∏ó‡∏≥‡∏ô‡∏≤‡∏¢‡∏ñ‡∏π‡∏Å‡∏ï‡πâ‡∏≠‡∏á: {correct_predictions} ‡∏£‡∏µ‡∏ß‡∏¥‡∏ß")
print(f"‡∏ó‡∏≥‡∏ô‡∏≤‡∏¢‡∏ú‡∏¥‡∏î‡∏û‡∏•‡∏≤‡∏î: {len(test_results) - correct_predictions} ‡∏£‡∏µ‡∏ß‡∏¥‡∏ß")
print(f"‡∏Ñ‡∏ß‡∏≤‡∏°‡πÅ‡∏°‡πà‡∏ô‡∏¢‡∏≥‡∏£‡∏ß‡∏°: {test_accuracy:.2f} ({correct_predictions}/{len(test_results)})")

### **17. Compare models on the test reviews**
# Banner: "Compare every model on the test reviews".
print("\n" + "=" * 60)
print("‡πÄ‡∏õ‡∏£‡∏µ‡∏¢‡∏ö‡πÄ‡∏ó‡∏µ‡∏¢‡∏ö‡∏ó‡∏∏‡∏Å‡πÇ‡∏°‡πÄ‡∏î‡∏•‡∏Å‡∏±‡∏ö‡∏£‡∏µ‡∏ß‡∏¥‡∏ß‡∏ó‡∏î‡∏™‡∏≠‡∏ö")
print("=" * 60)

# Maps each model name to its accuracy on the small review sample below.
model_test_results = {}
for model_name in results:
    print(f"\nüîç ‡∏ó‡∏î‡∏™‡∏≠‡∏ö {model_name}:")
    model_results = []
    # Only the first five entries of test_reviews are scored here.
    for review_no, (review, actual_rating) in enumerate(test_reviews[:5], start=1):
        predicted_rating, _ = predict_review_enhanced(review, model_name)
        model_results.append(predicted_rating == actual_rating)
        # One line per review: its 1-based number and a hit/miss mark.
        print(f"  ‡∏£‡∏µ‡∏ß‡∏¥‡∏ß‡∏ó‡∏µ‡πà {review_no}: {'‚úÖ' if model_results[-1] else '‚ùå'}")
    # Fraction of hits; the booleans count as 0/1 when summed.
    accuracy_small = sum(model_results) / len(model_results)
    model_test_results[model_name] = accuracy_small
    print(f"  ‡∏Ñ‡∏ß‡∏≤‡∏°‡πÅ‡∏°‡πà‡∏ô‡∏¢‡∏≥‡πÉ‡∏ô‡∏ï‡∏±‡∏ß‡∏≠‡∏¢‡πà‡∏≤‡∏á: {accuracy_small:.2f}")

### **18. Feature Importance analysis (for tree-based models)**
if 'Random Forest' in best_models:
    # Message: "Feature importance analysis (Random Forest):"
    print(f"\nüå≥ ‡∏Å‡∏≤‡∏£‡∏ß‡∏¥‡πÄ‡∏Ñ‡∏£‡∏≤‡∏∞‡∏´‡πå‡∏Ñ‡∏ß‡∏≤‡∏°‡∏™‡∏≥‡∏Ñ‡∏±‡∏ç‡∏Ç‡∏≠‡∏á Features (Random Forest):")
    rf_model = best_models['Random Forest']

    # Placeholder names, one per column of each feature group, concatenated
    # as TF-IDF | counts | embeddings | hand-crafted features.
    # NOTE(review): presumably this mirrors the column order of the matrix the
    # forest was trained on - confirm against the training code.
    tfidf_features = [f"tfidf_{col}" for col in range(X_train_tfidf.shape[1])]
    count_features = [f"count_{col}" for col in range(X_train_count.shape[1])]
    embedding_features = [f"embed_{col}" for col in range(X_train_vectors.shape[1])]
    all_feature_names = tfidf_features + count_features + embedding_features + feature_names

    feature_importance = rf_model.feature_importances_
    top_k = min(20, feature_importance.shape[0])
    # argsort is ascending, so the trailing top_k indices are the largest scores.
    top_indices = np.argsort(feature_importance)[-top_k:]

    # Horizontal bar chart of the selected features, saved to disk and shown.
    plt.figure(figsize=(10, 8))
    plt.barh(range(top_k), feature_importance[top_indices])
    plt.yticks(range(top_k), [all_feature_names[pos] for pos in top_indices])
    plt.xlabel('‡∏Ñ‡∏ß‡∏≤‡∏°‡∏™‡∏≥‡∏Ñ‡∏±‡∏ç‡∏Ç‡∏≠‡∏á Feature')
    plt.title(f'{top_k} Features ‡∏ó‡∏µ‡πà‡∏™‡∏≥‡∏Ñ‡∏±‡∏ç‡∏ó‡∏µ‡πà‡∏™‡∏∏‡∏î (Random Forest)')
    plt.tight_layout()
    plt.savefig("feature_importance.png", dpi=300, bbox_inches='tight')
    plt.show()

### **19. Performance summary**
# Banner: "Final performance summary".
print("\n" + "="*60)
print("‡∏™‡∏£‡∏∏‡∏õ‡∏õ‡∏£‡∏∞‡∏™‡∏¥‡∏ó‡∏ò‡∏¥‡∏†‡∏≤‡∏û‡∏™‡∏∏‡∏î‡∏ó‡πâ‡∏≤‡∏¢")
print("="*60)

# One row per model taken from results_df, sorted with the best test accuracy first.
summary_df = pd.DataFrame({
    'Model': results_df['Model'],
    'Test Accuracy': results_df['Test Accuracy'],
    'CV Accuracy': results_df['CV Mean'],
    'Std Dev': results_df['CV Std'],
    'Training Time': results_df['Training Time (s)'],
    # method='min' gives tied models the same whole-number rank (1, 1, 1, 4, ...).
    # The default 'average' produces fractional ranks for ties, which
    # astype(int) would silently truncate (three models tied for first would
    # all be shown as rank 2).
    'Rank': results_df['Test Accuracy'].rank(method='min', ascending=False).astype(int)
}).sort_values('Test Accuracy', ascending=False)
print(summary_df.round(4))

# Closing "key insights" report. The messages are Thai text; approximate
# English glosses, in order:
#   heading: "Key insights:"
#   1. best model and its accuracy (read from results[best_model_name]['accuracy'])
#   2. using FastText + Thai2Vec embeddings helps handle mixed-language text better
#   3. the model can now handle English words inside Thai reviews
#   4. total number of features (column count of X_train_combined)
#   5. both Thai and English reviews are supported
# Only the first and fourth bullets interpolate runtime values; the others are
# fixed statements that are not computed from any result.
print(f"\nüéØ ‡∏Ç‡πâ‡∏≠‡∏°‡∏π‡∏•‡πÄ‡∏ä‡∏¥‡∏á‡∏•‡∏∂‡∏Å‡∏™‡∏≥‡∏Ñ‡∏±‡∏ç:")
print(f"‚Ä¢ ‡πÇ‡∏°‡πÄ‡∏î‡∏•‡∏ó‡∏µ‡πà‡∏î‡∏µ‡∏ó‡∏µ‡πà‡∏™‡∏∏‡∏î: {best_model_name} (‡∏Ñ‡∏ß‡∏≤‡∏°‡πÅ‡∏°‡πà‡∏ô‡∏¢‡∏≥: {results[best_model_name]['accuracy']:.4f})")
print(f"‚Ä¢ ‡∏Å‡∏≤‡∏£‡πÉ‡∏ä‡πâ FastText + Thai2Vec embeddings ‡∏ä‡πà‡∏ß‡∏¢‡∏à‡∏±‡∏î‡∏Å‡∏≤‡∏£‡∏†‡∏≤‡∏©‡∏≤‡∏ú‡∏™‡∏°‡πÑ‡∏î‡πâ‡∏î‡∏µ‡∏Ç‡∏∂‡πâ‡∏ô")
print(f"‚Ä¢ ‡πÇ‡∏°‡πÄ‡∏î‡∏•‡∏™‡∏≤‡∏°‡∏≤‡∏£‡∏ñ‡∏à‡∏±‡∏î‡∏Å‡∏≤‡∏£‡∏Ñ‡∏≥‡∏†‡∏≤‡∏©‡∏≤‡∏≠‡∏±‡∏á‡∏Å‡∏§‡∏©‡πÉ‡∏ô‡∏£‡∏µ‡∏ß‡∏¥‡∏ß‡πÑ‡∏ó‡∏¢‡πÑ‡∏î‡πâ‡πÅ‡∏•‡πâ‡∏ß")
print(f"‚Ä¢ Features ‡∏ó‡∏±‡πâ‡∏á‡∏´‡∏°‡∏î: {X_train_combined.shape[1]:,}")
print(f"‚Ä¢ ‡∏£‡∏≠‡∏á‡∏£‡∏±‡∏ö‡∏ó‡∏±‡πâ‡∏á‡∏£‡∏µ‡∏ß‡∏¥‡∏ß‡∏†‡∏≤‡∏©‡∏≤‡πÑ‡∏ó‡∏¢‡πÅ‡∏•‡∏∞‡∏≠‡∏±‡∏á‡∏Å‡∏§‡∏©")


[nltk_data] Downloading package punkt to C:\Users\Phutawan
[nltk_data]     Chonsakorn\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Phutawan
[nltk_data]     Chonsakorn\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Phutawan
[nltk_data]     Chonsakorn\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


‡∏Å‡∏≥‡∏•‡∏±‡∏á‡πÇ‡∏´‡∏•‡∏î Thai2Vec model...
‡∏Å‡∏≥‡∏•‡∏±‡∏á‡πÇ‡∏´‡∏•‡∏î fastText model...
‡πÇ‡∏´‡∏•‡∏î FastText model ‡∏™‡∏≥‡πÄ‡∏£‡πá‡∏à
‡∏Å‡∏≥‡∏•‡∏±‡∏á‡∏ó‡∏≥‡∏Ñ‡∏ß‡∏≤‡∏°‡∏™‡∏∞‡∏≠‡∏≤‡∏î‡∏Ç‡πâ‡∏≠‡∏°‡∏π‡∏•...
‡∏Å‡∏≥‡∏•‡∏±‡∏á‡∏™‡∏Å‡∏±‡∏î‡∏Ñ‡∏∏‡∏ì‡∏•‡∏±‡∏Å‡∏©‡∏ì‡∏∞‡∏à‡∏≤‡∏Å‡∏Ç‡πâ‡∏≠‡∏Ñ‡∏ß‡∏≤‡∏°...
‡∏à‡∏≥‡∏ô‡∏ß‡∏ô‡∏Ç‡πâ‡∏≠‡∏°‡∏π‡∏•‡∏´‡∏•‡∏±‡∏á‡∏ó‡∏≥‡∏Ñ‡∏ß‡∏≤‡∏°‡∏™‡∏∞‡∏≠‡∏≤‡∏î: 22493
‡∏Å‡∏≤‡∏£‡∏Å‡∏£‡∏∞‡∏à‡∏≤‡∏¢‡∏Ç‡∏≠‡∏á‡∏Ñ‡∏∞‡πÅ‡∏ô‡∏ô‡∏´‡∏•‡∏±‡∏á‡∏ó‡∏≥‡∏Ñ‡∏ß‡∏≤‡∏°‡∏™‡∏∞‡∏≠‡∏≤‡∏î:
rating
1    1821
2    2191
3    2584
4    6448
5    9449
Name: count, dtype: int64

‡∏Å‡∏≤‡∏£‡∏Å‡∏£‡∏∞‡∏à‡∏≤‡∏¢‡∏ï‡∏≤‡∏°‡∏†‡∏≤‡∏©‡∏≤‡∏´‡∏•‡∏±‡∏á‡∏ó‡∏≥‡∏Ñ‡∏ß‡∏≤‡∏°‡∏™‡∏∞‡∏≠‡∏≤‡∏î:
language
english    20491
thai        2002
Name: count, dtype: int64
‡∏à‡∏≥‡∏ô‡∏ß‡∏ô‡∏Ç‡πâ‡∏≠‡∏°‡∏π‡∏•‡∏ù‡∏∂‡∏Å‡∏ù‡∏ô: 17994
‡∏à‡∏≥‡∏ô‡∏ß‡∏ô‡∏Ç‡πâ‡∏≠‡∏°‡∏π‡∏•‡∏ó‡∏î‡∏™‡∏≠‡∏ö: 4499
‡∏Å‡∏≥‡∏•‡∏±‡∏á‡∏™‡∏£‡πâ‡∏≤‡∏á word embeddings...
‡∏Å‡∏≥‡∏•‡∏±‡∏á‡∏™‡∏£‡πâ‡∏≤‡∏á TF-IDF features...
‡∏Å‡∏≥‡∏•‡∏±‡∏á‡∏™‡∏£‡πâ‡∏≤‡∏á Count vectors...
‡∏Å‡∏≥‡∏•‡∏±