In [1]:
import numpy as np

from gensim.models import Word2Vec

from imblearn.over_sampling import RandomOverSampler

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier


In [2]:
import pandas as pd

data = pd.read_csv("../../data/cleaned_data.csv")
data.head()

Unnamed: 0,text,label
0,នាយិកា មជ្ឈមណ្ឌល សិទ្ធិ មនុស្ស កម្ពុជា អ្នកស្រ...,neutral
1,ការឃុំ កញ្ញា សេង ធារី កាន់តែ យូរ រដ្ឋាភិបាល ហ៊...,positive
2,ប្រភព បង្ហើប បន្ទប់ ខ្ទង់ ចំណាយ ជាង ១០ម៉ឺន ដុល...,neutral
3,1956 បាន បង្ហាញ ផូស្វ័រ បាន ផ្ទេរ ដើម បែក អារ ...,neutral
4,ដរាបណា មិន បាន តាំងចិត្ត ខិតខំ ប្រឹង រៀន ប្រឹង...,negative


In [3]:
from sklearn.model_selection import train_test_split

# Features and labels
X = data['text']  # cleaned text
y = data['label']      # labels

# Split dataset: 80% train, 20% test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y  # stratify keeps class distribution
)


In [4]:
X_tokens = [sentence.split() for sentence in X_train]

In [5]:
cbow_model = Word2Vec(
    sentences=X_tokens,
    vector_size=100,     # embedding dimension
    window=5,            # context window size
    min_count=2,
    workers=4,
    sg=0                 # sg=0 = CBoW
)


In [6]:
def sentence_to_vector(sentence, model, vector_size):
    words = sentence.split()
    vectors = [model.wv[word] for word in words if word in model.wv]

    if len(vectors) == 0:
        return np.zeros(vector_size)

    return np.mean(vectors, axis=0)


In [7]:
X_vec = np.array([
    sentence_to_vector(sentence, cbow_model, 100)
    for sentence in X_train
])


In [8]:
ros = RandomOverSampler(random_state=42)
X_res, y_res = ros.fit_resample(X_vec, y_train)


In [9]:
X_train, X_test, y_train, y_test = train_test_split(
    X_res,
    y_res,
    test_size=0.2,
    random_state=42,
    stratify=y_res
)


In [10]:
models = {
    "LogisticRegression": LogisticRegression(max_iter=500, class_weight='balanced'),
    "LinearSVC": LinearSVC(class_weight='balanced', max_iter=1000),
    "RandomForest": RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42),
    "DecisionTree": DecisionTreeClassifier(class_weight='balanced', random_state=42),
    "GradientBoosting": GradientBoostingClassifier(n_estimators=100, random_state=42)
}


In [11]:
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)

    print(f"=== {name} ===")
    print("Accuracy:", acc)
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))
    print("Confusion Matrix:")
    print(cm)
    print("-" * 50)


=== LogisticRegression ===
Accuracy: 0.5082201572551823

Classification Report:
              precision    recall  f1-score   support

    negative       0.53      0.52      0.52       932
     neutral       0.51      0.54      0.53       933
    positive       0.48      0.46      0.47       933

    accuracy                           0.51      2798
   macro avg       0.51      0.51      0.51      2798
weighted avg       0.51      0.51      0.51      2798

Confusion Matrix:
[[487 226 219]
 [187 502 244]
 [252 248 433]]
--------------------------------------------------
=== LinearSVC ===
Accuracy: 0.547891350964975

Classification Report:
              precision    recall  f1-score   support

    negative       0.57      0.55      0.56       932
     neutral       0.56      0.61      0.58       933
    positive       0.52      0.48      0.50       933

    accuracy                           0.55      2798
   macro avg       0.55      0.55      0.55      2798
weighted avg       0.55     

In [12]:
from sklearn.metrics import accuracy_score, f1_score
import pandas as pd

results = []

for name, model in models.items():
    y_pred = model.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    macro_f1 = f1_score(y_test, y_pred, average="macro")

    results.append({
        "model": name,
        "accuracy": acc,
        "macro_f1": macro_f1
    })

df_results = pd.DataFrame(results)
print(df_results.sort_values("macro_f1", ascending=False))


                model  accuracy  macro_f1
2        RandomForest  0.865618  0.862508
3        DecisionTree  0.811294  0.801333
4    GradientBoosting  0.614725  0.608151
1           LinearSVC  0.547891  0.546881
0  LogisticRegression  0.508220  0.507877


In [4]:
import numpy as np
from gensim.models import Word2Vec
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, GRU, Dense, Dropout
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

# -------------------------
# 1. Tokenize sentences
# -------------------------
X_tokens = [sentence.split() for sentence in X_train]  # Khmer sentences tokenized

# -------------------------
# 2. Train CBoW Word2Vec
# -------------------------
vector_size = 100
cbow_model = Word2Vec(sentences=X_tokens, vector_size=vector_size, window=5, min_count=2, workers=4, sg=0)

# -------------------------
# 3. Convert sentences to sequence of embeddings
# -------------------------
def sentence_to_seq(sentence, model):
    return [model.wv[word] for word in sentence.split() if word in model.wv]

X_seq = [sentence_to_seq(sent, cbow_model) for sent in X_train]

# Find max sentence length
max_len = max(len(seq) for seq in X_seq)

# Pad sequences with zeros
X_padded = pad_sequences(
    X_seq, maxlen=max_len, dtype='float32', padding='post', value=0.0
)

# -------------------------
# 4. Encode labels
# -------------------------
le = LabelEncoder()
y_encoded = le.fit_transform(y_train)
y_categorical = to_categorical(y_encoded)

# -------------------------
# 5. Oversample (optional)
# -------------------------
# Flatten for oversampling
X_flat = X_padded.reshape(len(X_padded), -1)
ros = RandomOverSampler(random_state=42)
X_res, y_res = ros.fit_resample(X_flat, y_encoded)

# Reshape back to 3D
X_res = X_res.reshape(len(X_res), max_len, vector_size)
y_res_cat = to_categorical(y_res)

# -------------------------
# 6. Train-test split
# -------------------------
X_train_dl, X_test_dl, y_train_dl, y_test_dl = train_test_split(
    X_res, y_res_cat, test_size=0.2, random_state=42, stratify=y_res
)

# -------------------------
# 7. Build LSTM / GRU model
# -------------------------
model = Sequential()
model.add(LSTM(128, input_shape=(max_len, vector_size), return_sequences=False))
model.add(Dropout(0.3))
model.add(Dense(y_train_dl.shape[1], activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

# -------------------------
# 8. Train model
# -------------------------
history = model.fit(
    X_train_dl, y_train_dl,
    validation_split=0.1,
    epochs=10,
    batch_size=32
)

# -------------------------
# 9. Evaluate
# -------------------------
y_pred_prob = model.predict(X_test_dl)
y_pred = np.argmax(y_pred_prob, axis=1)
y_true = np.argmax(y_test_dl, axis=1)

print("Classification Report:")
print(classification_report(y_true, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_true, y_pred))


  super().__init__(**kwargs)


Epoch 1/10
[1m315/315[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 59ms/step - accuracy: 0.3347 - loss: 1.0988 - val_accuracy: 0.3384 - val_loss: 1.0987
Epoch 2/10
[1m315/315[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 45ms/step - accuracy: 0.3309 - loss: 1.0990 - val_accuracy: 0.3384 - val_loss: 1.0982
Epoch 3/10
[1m315/315[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 39ms/step - accuracy: 0.3371 - loss: 1.0984 - val_accuracy: 0.3607 - val_loss: 1.0978
Epoch 4/10
[1m315/315[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 39ms/step - accuracy: 0.3584 - loss: 1.1007 - val_accuracy: 0.3330 - val_loss: 1.1308
Epoch 5/10
[1m315/315[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 42ms/step - accuracy: 0.3551 - loss: 1.1004 - val_accuracy: 0.3902 - val_loss: 1.0809
Epoch 6/10
[1m315/315[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 39ms/step - accuracy: 0.3952 - loss: 1.0804 - val_accuracy: 0.4589 - val_loss: 1.0359
Epoch 7/10
[1m3