In [1]:
import pandas as pd

# Load the cleaned corpus. The preview of this file shows a leading
# "Unnamed: 0" column next to `text` and `label` -- presumably a row index that
# was written out together with the data (TODO confirm against the script that
# produced the CSV). index_col=0 reads that first column back as the DataFrame
# index instead of keeping it as a meaningless extra column.
data = pd.read_csv("../../data/cleaned_data.csv", index_col=0)
data.head()

Unnamed: 0,text,label
0,នាយិកា មជ្ឈមណ្ឌល សិទ្ធិ មនុស្ស កម្ពុជា អ្នកស្រ...,neutral
1,ការឃុំ កញ្ញា សេង ធារី កាន់តែ យូរ រដ្ឋាភិបាល ហ៊...,positive
2,ប្រភព បង្ហើប បន្ទប់ ខ្ទង់ ចំណាយ ជាង ១០ម៉ឺន ដុល...,neutral
3,1956 បាន បង្ហាញ ផូស្វ័រ បាន ផ្ទេរ ដើម បែក អារ ...,neutral
4,ដរាបណា មិន បាន តាំងចិត្ត ខិតខំ ប្រឹង រៀន ប្រឹង...,negative


In [2]:
from sklearn.model_selection import train_test_split

# Model input is the cleaned `text` column; the prediction target is `label`.
X, y = data['text'], data['label']

# Hold out 20% of the rows for testing and train on the remaining 80%.
# Stratifying on y keeps the class proportions the same in both partitions,
# and the fixed random_state makes the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)


In [3]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier


# 2. Bag-of-words features with default CountVectorizer settings. The
#    vocabulary is learned from the training split only and then reused,
#    unchanged, to encode the test split.
vectorizer = CountVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# 3. Candidate classifiers, keyed by the name printed in the report below.
#    MultinomialNB and GradientBoosting are built without class_weight; the
#    other four use class_weight='balanced'. The tree-based models are seeded
#    with random_state=42.
models = dict(
    MultinomialNB=MultinomialNB(),
    LogisticRegression=LogisticRegression(class_weight='balanced', max_iter=500),
    LinearSVC=LinearSVC(max_iter=1000, class_weight='balanced'),
    RandomForest=RandomForestClassifier(
        n_estimators=100,
        random_state=42,
        class_weight='balanced',
    ),
    DecisionTree=DecisionTreeClassifier(random_state=42, class_weight='balanced'),
    GradientBoosting=GradientBoostingClassifier(random_state=42, n_estimators=100),
)

# 4. Fit every candidate on the training matrix, predict the held-out split,
#    and print accuracy plus the per-class precision/recall/F1 report.
for name, model in models.items():
    # fit() returns the estimator itself, so training and prediction chain.
    y_pred = model.fit(X_train_vec, y_train).predict(X_test_vec)
    acc = accuracy_score(y_test, y_pred)

    print(f"=== {name} ===")
    print("Accuracy:", acc)
    print(classification_report(y_test, y_pred))
    print("-" * 50)


=== MultinomialNB ===
Accuracy: 0.6431784107946027
              precision    recall  f1-score   support

    negative       0.56      0.49      0.52       651
     neutral       0.46      0.06      0.11       184
    positive       0.68      0.82      0.74      1166

    accuracy                           0.64      2001
   macro avg       0.57      0.46      0.46      2001
weighted avg       0.62      0.64      0.61      2001

--------------------------------------------------
=== LogisticRegression ===
Accuracy: 0.567216391804098
              precision    recall  f1-score   support

    negative       0.52      0.54      0.53       651
     neutral       0.22      0.47      0.30       184
    positive       0.76      0.60      0.67      1166

    accuracy                           0.57      2001
   macro avg       0.50      0.54      0.50      2001
weighted avg       0.63      0.57      0.59      2001

--------------------------------------------------
=== LinearSVC ===
Accuracy: 0.

In [4]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, classification_report

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier


# 2. Bag-of-words features over unigrams and bigrams (ngram_range=(1, 2)).
#    min_df=2 discards terms that occur in fewer than two training documents
#    (very rare terms); max_df=0.9 discards terms that occur in more than 90%
#    of them (very common terms). The vocabulary is learned from the training
#    split only and reused to encode the test split.
vectorizer = CountVectorizer(ngram_range=(1, 2), min_df=2, max_df=0.9)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# 3. Candidate classifiers, keyed by the name printed in the report below.
#    MultinomialNB and GradientBoosting are built without class_weight; the
#    other four use class_weight='balanced'. The tree-based models are seeded
#    with random_state=42.
models = dict(
    MultinomialNB=MultinomialNB(),
    LogisticRegression=LogisticRegression(class_weight='balanced', max_iter=500),
    LinearSVC=LinearSVC(max_iter=1000, class_weight='balanced'),
    RandomForest=RandomForestClassifier(
        n_estimators=100,
        random_state=42,
        class_weight='balanced',
    ),
    DecisionTree=DecisionTreeClassifier(random_state=42, class_weight='balanced'),
    GradientBoosting=GradientBoostingClassifier(random_state=42, n_estimators=100),
)

# 4. Fit every candidate on the n-gram training matrix, predict the held-out
#    split, and print accuracy plus the per-class precision/recall/F1 report.
for name, model in models.items():
    # fit() returns the estimator itself, so training and prediction chain.
    y_pred = model.fit(X_train_vec, y_train).predict(X_test_vec)
    acc = accuracy_score(y_test, y_pred)

    print(f"=== {name} ===")
    print("Accuracy:", acc)
    print(classification_report(y_test, y_pred))
    print("-" * 50)


=== MultinomialNB ===
Accuracy: 0.656671664167916
              precision    recall  f1-score   support

    negative       0.58      0.55      0.56       651
     neutral       0.56      0.05      0.10       184
    positive       0.69      0.81      0.75      1166

    accuracy                           0.66      2001
   macro avg       0.61      0.47      0.47      2001
weighted avg       0.64      0.66      0.63      2001

--------------------------------------------------
=== LogisticRegression ===
Accuracy: 0.5987006496751625
              precision    recall  f1-score   support

    negative       0.52      0.55      0.53       651
     neutral       0.26      0.39      0.31       184
    positive       0.74      0.66      0.70      1166

    accuracy                           0.60      2001
   macro avg       0.51      0.53      0.51      2001
weighted avg       0.63      0.60      0.61      2001

--------------------------------------------------
=== LinearSVC ===
Accuracy: 0.