In [1]:
import pandas as pd

data = pd.read_csv("../../data/cleaned_data.csv")
data.head()

Unnamed: 0,text,label
0,នាយិកា មជ្ឈមណ្ឌល សិទ្ធិ មនុស្ស កម្ពុជា អ្នកស្រ...,neutral
1,ការឃុំ កញ្ញា សេង ធារី កាន់តែ យូរ រដ្ឋាភិបាល ហ៊...,positive
2,ប្រភព បង្ហើប បន្ទប់ ខ្ទង់ ចំណាយ ជាង ១០ម៉ឺន ដុល...,neutral
3,1956 បាន បង្ហាញ ផូស្វ័រ បាន ផ្ទេរ ដើម បែក អារ ...,neutral
4,ដរាបណា មិន បាន តាំងចិត្ត ខិតខំ ប្រឹង រៀន ប្រឹង...,negative


In [2]:
from sklearn.model_selection import train_test_split

# Features and labels
X = data['text']  # cleaned text
y = data['label']      # labels

# Split dataset: 80% train, 20% test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y  # stratify keeps class distribution
)


In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier


# 2. Vectorize text
vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# 3. Define models
models = {
    "MultinomialNB": MultinomialNB(),
    "LogisticRegression": LogisticRegression(max_iter=500, class_weight='balanced'),
    "LinearSVC": LinearSVC(class_weight='balanced', max_iter=1000),
    "RandomForest": RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42),
    "DecisionTree": DecisionTreeClassifier(class_weight='balanced', random_state=42),
    "GradientBoosting": GradientBoostingClassifier(n_estimators=100, random_state=42)
}

# 4. Train & evaluate
for name, model in models.items():
    model.fit(X_train_vec, y_train)
    y_pred = model.predict(X_test_vec)
    acc = accuracy_score(y_test, y_pred)
    print(f"=== {name} ===")
    print("Accuracy:", acc)
    print(classification_report(y_test, y_pred))
    print("-"*50)


=== MultinomialNB ===
Accuracy: 0.616191904047976
              precision    recall  f1-score   support

    negative       0.66      0.16      0.26       651
     neutral       0.00      0.00      0.00       184
    positive       0.61      0.97      0.75      1166

    accuracy                           0.62      2001
   macro avg       0.43      0.38      0.34      2001
weighted avg       0.57      0.62      0.52      2001

--------------------------------------------------
=== LogisticRegression ===
Accuracy: 0.5812093953023488
              precision    recall  f1-score   support

    negative       0.54      0.56      0.55       651
     neutral       0.21      0.44      0.29       184
    positive       0.76      0.62      0.68      1166

    accuracy                           0.58      2001
   macro avg       0.50      0.54      0.51      2001
weighted avg       0.64      0.58      0.60      2001

--------------------------------------------------
=== LinearSVC ===
Accuracy: 0.

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier


# 2. Vectorize text (TF-IDF + n-gram)
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),   # (n-gram) unigram + bigram
    min_df=2,             # remove rare words
    max_df=0.9            # remove too frequent words
)

X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)


# 3. Define models
models = {
    "MultinomialNB": MultinomialNB(),
    "LogisticRegression": LogisticRegression(max_iter=500, class_weight='balanced'),
    "LinearSVC": LinearSVC(class_weight='balanced', max_iter=1000),
    "RandomForest": RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42),
    "DecisionTree": DecisionTreeClassifier(class_weight='balanced', random_state=42),
    "GradientBoosting": GradientBoostingClassifier(n_estimators=100, random_state=42)
}

# 4. Train & evaluate
for name, model in models.items():
    model.fit(X_train_vec, y_train)
    y_pred = model.predict(X_test_vec)

    acc = accuracy_score(y_test, y_pred)
    print(f"=== {name} ===")
    print("Accuracy:", acc)
    print(classification_report(y_test, y_pred))
    print("-" * 50)


=== MultinomialNB ===
Accuracy: 0.6291854072963519
              precision    recall  f1-score   support

    negative       0.75      0.18      0.30       651
     neutral       0.00      0.00      0.00       184
    positive       0.62      0.98      0.76      1166

    accuracy                           0.63      2001
   macro avg       0.46      0.39      0.35      2001
weighted avg       0.60      0.63      0.54      2001

--------------------------------------------------


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


=== LogisticRegression ===
Accuracy: 0.6226886556721639
              precision    recall  f1-score   support

    negative       0.55      0.59      0.57       651
     neutral       0.27      0.41      0.33       184
    positive       0.77      0.67      0.72      1166

    accuracy                           0.62      2001
   macro avg       0.53      0.56      0.54      2001
weighted avg       0.65      0.62      0.63      2001

--------------------------------------------------
=== LinearSVC ===
Accuracy: 0.6301849075462269
              precision    recall  f1-score   support

    negative       0.54      0.51      0.53       651
     neutral       0.30      0.27      0.28       184
    positive       0.72      0.75      0.74      1166

    accuracy                           0.63      2001
   macro avg       0.52      0.51      0.52      2001
weighted avg       0.62      0.63      0.63      2001

--------------------------------------------------
=== RandomForest ===
Accuracy: 0.