In [6]:
# ======= Imports =======
import numpy as np
from khmernltk import word_tokenize
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd

# ======= 1. Prepare sentences =======
# data['clean_text'] should already exist
data = pd.read_csv("../../data/cleaned_data.csv")
# sentences = data['text'].apply(word_tokenize).tolist()
# Remove tokens that are just spaces
sentences = [
    [word for word in word_tokenize(text) if word.strip() != ""]
    for text in data['text']
]


labels = data['label'].values

print(f"Total sentences: {len(sentences)}")
print(f"Example tokenized sentence: {sentences[0]}")

# ======= 2. Train Word2Vec (CBoW) =======
w2v_model = Word2Vec(
    sentences=sentences,
    vector_size=1000,  # embedding dimension
    window=4,
    min_count=2,
    workers=4,
    sg=0  # 0 = CBoW
)

print("Word2Vec (CBoW) training complete.")

# ======= 3. Convert sentence to vector =======
def sentence_vector(tokens, model):
    """Average word vectors for a sentence"""
    vectors = [model.wv[word] for word in tokens if word in model.wv]
    if len(vectors) == 0:
        return np.zeros(model.vector_size)
    return np.mean(vectors, axis=0)

X = np.array([sentence_vector(s, w2v_model) for s in sentences])

print(f"Shape of sentence vectors: {X.shape}")

# ======= 4. Train/test split =======
X_train, X_test, y_train, y_test = train_test_split(
    X, labels, test_size=0.2, random_state=42, stratify=labels
)

print(f"Train size: {X_train.shape[0]}, Test size: {X_test.shape[0]}")

# ======= 5. Logistic Regression =======
lr = LogisticRegression(max_iter=500, class_weight='balanced')
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)

print("\n=== Word2Vec (CBoW) + Logistic Regression ===")
print("Accuracy:", accuracy_score(y_test, y_pred_lr))
print(classification_report(y_test, y_pred_lr))

# ======= 6. SVM =======
svm = SVC(kernel='linear', class_weight='balanced')
svm.fit(X_train, y_train)
y_pred_svm = svm.predict(X_test)

print("\n=== Word2Vec (CBoW) + SVM ===")
print("Accuracy:", accuracy_score(y_test, y_pred_svm))
print(classification_report(y_test, y_pred_svm))


Total sentences: 10004
Example tokenized sentence: ['នាយិកា', 'មជ្ឈមណ្ឌល', 'សិទ្ធិ', 'មនុស្ស', 'កម្ពុជា', 'អ្នកស្រី', 'ចក់', 'សុភាព', 'បង្ហាញ', 'តុលាការ', 'ក្រុង', 'ភ្នំពេញ', 'នាម', 'ជា', 'សាក្សី', 'សាក្សី', 'ផ្សេង', 'សំណុំរឿង', 'មេដឹកនាំ', 'នយោបាយ', 'ជំទាស់', 'កឹម', 'សុខា', 'ទីក្រុង', 'ភ្នំពេញ', 'ថ្ងៃទី', '៥', 'ខែតុលា', 'ឆ្នាំ', '២០២២']
Word2Vec (CBoW) training complete.
Shape of sentence vectors: (10004, 1000)
Train size: 8003, Test size: 2001

=== Word2Vec (CBoW) + Logistic Regression ===
Accuracy: 0.46776611694152925
              precision    recall  f1-score   support

    negative       0.48      0.51      0.50       651
     neutral       0.16      0.52      0.24       184
    positive       0.72      0.43      0.54      1166

    accuracy                           0.47      2001
   macro avg       0.45      0.49      0.43      2001
weighted avg       0.59      0.47      0.50      2001


=== Word2Vec (CBoW) + SVM ===
Accuracy: 0.45327336331834084
              precision    reca

In [8]:
# ======= Imports =======
import numpy as np
import pandas as pd
from khmernltk import word_tokenize
from gensim.models import Word2Vec
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, GRU, Dense, Dropout, Bidirectional

# ======= 1. Load & prepare data =======
data = pd.read_csv("../../data/cleaned_data.csv")

# Tokenize and remove empty tokens
sentences = [
    [word for word in word_tokenize(text) if word.strip() != ""]
    for text in data['text']
]

texts = [" ".join(s) for s in sentences]  # Join tokens for Keras tokenizer
labels = data['label'].values

# Encode labels to integers
le = LabelEncoder()
y = le.fit_transform(labels)

num_classes = len(le.classes_)
print("Number of classes:", num_classes)
print("Classes:", le.classes_)

# ======= 2. Tokenizer + sequences =======
MAX_NUM_WORDS = 20000
MAX_SEQUENCE_LENGTH = 100  # adjust based on average sentence length

tokenizer = Tokenizer(num_words=MAX_NUM_WORDS, oov_token="<OOV>")
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
X = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH, padding='post')

word_index = tokenizer.word_index
print("Vocabulary size:", len(word_index))

# ======= 3. Train Word2Vec (CBoW) =======
w2v_model = Word2Vec(
    sentences=sentences,
    vector_size=100,
    window=5,
    min_count=2,
    workers=4,
    sg=0  # CBoW
)

# ======= 4. Prepare embedding matrix =======
embedding_dim = 100
num_words = min(MAX_NUM_WORDS, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, embedding_dim))

for word, i in word_index.items():
    if i >= num_words:
        continue
    if word in w2v_model.wv:
        embedding_matrix[i] = w2v_model.wv[word]

# ======= 5. Train/test split =======
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# ======= 6. LSTM Model =======
lstm_model = Sequential()
lstm_model.add(Embedding(input_dim=num_words,
                         output_dim=embedding_dim,
                         weights=[embedding_matrix],
                         input_length=MAX_SEQUENCE_LENGTH,
                         trainable=False))  # freeze Word2Vec embeddings
lstm_model.add(Bidirectional(LSTM(128, dropout=0.2, recurrent_dropout=0.2)))
lstm_model.add(Dense(64, activation='relu'))
lstm_model.add(Dropout(0.2))
lstm_model.add(Dense(num_classes, activation='softmax'))

lstm_model.compile(loss='sparse_categorical_crossentropy',
                   optimizer='adam',
                   metrics=['accuracy'])

lstm_model.summary()

history_lstm = lstm_model.fit(
    X_train, y_train,
    validation_split=0.1,
    epochs=10,
    batch_size=64
)

# ======= 7. Evaluate LSTM =======
y_pred_lstm = np.argmax(lstm_model.predict(X_test), axis=1)
print("\n=== LSTM Model ===")
print("Accuracy:", accuracy_score(y_test, y_pred_lstm))
print(classification_report(y_test, y_pred_lstm, target_names=le.classes_))

# ======= 8. GRU Model =======
gru_model = Sequential()
gru_model.add(Embedding(input_dim=num_words,
                        output_dim=embedding_dim,
                        weights=[embedding_matrix],
                        input_length=MAX_SEQUENCE_LENGTH,
                        trainable=False))
gru_model.add(Bidirectional(GRU(128, dropout=0.2, recurrent_dropout=0.2)))
gru_model.add(Dense(64, activation='relu'))
gru_model.add(Dropout(0.2))
gru_model.add(Dense(num_classes, activation='softmax'))

gru_model.compile(loss='sparse_categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

gru_model.summary()

history_gru = gru_model.fit(
    X_train, y_train,
    validation_split=0.1,
    epochs=10,
    batch_size=64
)

# ======= 9. Evaluate GRU =======
y_pred_gru = np.argmax(gru_model.predict(X_test), axis=1)
print("\n=== GRU Model ===")
print("Accuracy:", accuracy_score(y_test, y_pred_gru))
print(classification_report(y_test, y_pred_gru, target_names=le.classes_))


  if not hasattr(np, "object"):


Number of classes: 3
Classes: ['negative' 'neutral' 'positive']
Vocabulary size: 13823




Epoch 1/10
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 286ms/step - accuracy: 0.5685 - loss: 0.8925 - val_accuracy: 0.5805 - val_loss: 0.8800
Epoch 2/10
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 272ms/step - accuracy: 0.5816 - loss: 0.8703 - val_accuracy: 0.5743 - val_loss: 0.8634
Epoch 3/10
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 272ms/step - accuracy: 0.5816 - loss: 0.8551 - val_accuracy: 0.5843 - val_loss: 0.8550
Epoch 4/10
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 280ms/step - accuracy: 0.5796 - loss: 0.8485 - val_accuracy: 0.5893 - val_loss: 0.8481
Epoch 5/10
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 292ms/step - accuracy: 0.5937 - loss: 0.8379 - val_accuracy: 0.5943 - val_loss: 0.8379
Epoch 6/10
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 296ms/step - accuracy: 0.5943 - loss: 0.8380 - val_accuracy: 0.5905 - val_loss: 0.8408
Epoch 7/10



Epoch 1/10
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 222ms/step - accuracy: 0.5761 - loss: 0.8965 - val_accuracy: 0.5793 - val_loss: 0.8719
Epoch 2/10
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 197ms/step - accuracy: 0.5804 - loss: 0.8721 - val_accuracy: 0.5818 - val_loss: 0.8552
Epoch 3/10
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 192ms/step - accuracy: 0.5861 - loss: 0.8533 - val_accuracy: 0.5918 - val_loss: 0.8478
Epoch 4/10
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 190ms/step - accuracy: 0.5879 - loss: 0.8462 - val_accuracy: 0.5893 - val_loss: 0.8470
Epoch 5/10
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 188ms/step - accuracy: 0.5937 - loss: 0.8416 - val_accuracy: 0.5680 - val_loss: 0.8433
Epoch 6/10
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 197ms/step - accuracy: 0.5972 - loss: 0.8305 - val_accuracy: 0.5968 - val_loss: 0.8369
Epoch 7/10