In [1]:
import re
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.pipeline import Pipeline

In [2]:
data = pd.read_csv("Data/GamesDataClean.csv", usecols=["content", "score", "game_name"])
subset = (
    data.groupby("game_name", group_keys=False)
        .apply(lambda x: x.sample(n=min(50, len(x)), random_state=42))
        .reset_index(drop=True)
)
# subset = (
#     data.groupby(["game_name", "score"], group_keys=False)
#         .apply(lambda x: x.sample(n=min(200, len(x)), random_state=42))
#         .reset_index(drop=True)
# )
subset

  .apply(lambda x: x.sample(n=min(50, len(x)), random_state=42))


Unnamed: 0,game_name,content,score
0,8 Ball Pool,suppicious play versus cpu cpu always win prov...,2
1,8 Ball Pool,want say something daily cash reward video wor...,3
2,8 Ball Pool,get 40 pop ups play game good gam3 worth,1
3,8 Ball Pool,seem game rig look get 5 game win streak losin...,2
4,8 Ball Pool,great gamegraphicsonly thing game fix lose mat...,3
...,...,...,...
995,Shadow Fight 2,game suck energy bar cannot play long,1
996,Shadow Fight 2,please make character coustome character nice ...,5
997,Shadow Fight 2,perfect fight game whenever villian drop weapo...,5
998,Shadow Fight 2,game good controles super graphic also reason ...,3


In [3]:
subset.value_counts('score')

score
5    436
1    248
4    147
3     91
2     78
Name: count, dtype: int64

In [4]:
X = subset["content"].values
y = subset["score"].astype(int).values

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [6]:
tfidf = TfidfVectorizer(max_features=7000, ngram_range=(1,2))  # unigram + bigram
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

print("\n===== TF-IDF =====")
print("Shape (train):", X_train_tfidf.shape)
print("Jumlah kata unik/vocab:", len(tfidf.vocabulary_))
print("Beberapa vocab:", list(tfidf.vocabulary_.keys())[:20])


===== TF-IDF =====
Shape (train): (800, 7000)
Jumlah kata unik/vocab: 7000
Beberapa vocab: ['peak', 'twin', 'face', 'hold', 'back', 'tearsface', 'tear', 'peak twin', 'twin face', 'hold back', 'back tearsface', 'tearsface hold', 'play', 'roblox', 'year', 'single', 'problem', 'fun', 'multiple', 'platform']


In [7]:
tfidf_lr = Pipeline([
    ("tfidf", TfidfVectorizer(
        lowercase=True,
        strip_accents="unicode",
        token_pattern=r"[A-Za-z]{2,}",   # words of 2+ letters
        ngram_range=(1,2),               # uni+bi-grams usually best for sentiment
        min_df=5,                        # ignore very rare terms
        max_df=0.9,                      # ignore too-common terms
        stop_words="english",            # keep negations in text; vectorizer removes generic stopwords
    )),
    ("clf", LogisticRegression(
        max_iter=2000,
        multi_class="multinomial",
        class_weight="balanced",
        solver="lbfgs",
        n_jobs=-1
    ))
])

tfidf_lr.fit(X_train, y_train)
pred = tfidf_lr.predict(X_test)

print("TF-IDF + LR accuracy:", accuracy_score(y_test, pred))
print("\nClassification report (TF-IDF + LR):\n", classification_report(y_test, pred, digits=4))
print("Confusion matrix:\n", confusion_matrix(y_test, pred))



TF-IDF + LR accuracy: 0.465

Classification report (TF-IDF + LR):
               precision    recall  f1-score   support

           1     0.6087    0.5600    0.5833        50
           2     0.1250    0.1875    0.1500        16
           3     0.1765    0.1667    0.1714        18
           4     0.1892    0.2414    0.2121        29
           5     0.6842    0.5977    0.6380        87

    accuracy                         0.4650       200
   macro avg     0.3567    0.3506    0.3510       200
weighted avg     0.5031    0.4650    0.4816       200

Confusion matrix:
 [[28  9  6  4  3]
 [ 4  3  3  3  3]
 [ 3  4  3  5  3]
 [ 4  2  1  7 15]
 [ 7  6  4 18 52]]


In [8]:
import os
from gensim.models import FastText
from sklearn.preprocessing import StandardScaler

def tokenize_clean(s: str):
    return s.split()

sentences = [tokenize_clean(t) for t in subset["content"]]
y = subset["score"].astype(int).values

X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(sentences, y, test_size=0.2, stratify=y, random_state=42)

# 3) Train FastText on the training sentences
ft_dim = 300
ft = FastText(
    vector_size=ft_dim,
    window=5,
    min_count=5,          # raise/lower depending on corpus size
    workers=os.cpu_count(),
    sg=1,                 # skip-gram (semantic)
    epochs=10
)
ft.build_vocab(corpus_iterable=X_train_s)
ft.train(corpus_iterable=X_train_s, total_examples=len(X_train_s), epochs=ft.epochs)

# 4) Sentence → vector (mean of word vectors)
def sent_vec(tokens, model, dim):
    if not tokens:
        return np.zeros(dim, dtype=np.float32)
    vecs = [model.wv[w] for w in tokens if w in model.wv]
    return np.mean(vecs, axis=0) if vecs else np.zeros(dim, dtype=np.float32)

Xtr = np.vstack([sent_vec(t, ft, ft_dim) for t in X_train_s])
Xte = np.vstack([sent_vec(t, ft, ft_dim) for t in X_test_s])

# 5) (Optional) scale embeddings; LR often benefits a bit
scaler = StandardScaler()
Xtr_s = scaler.fit_transform(Xtr)
Xte_s = scaler.transform(Xte)

# 6) Classifier
clf_ft = LogisticRegression(
    max_iter=2000, multi_class="multinomial",
    class_weight="balanced", solver="lbfgs", n_jobs=-1
)
clf_ft.fit(Xtr_s, y_train_s)
pred_ft = clf_ft.predict(Xte_s)

print("FastText(avg) + LR accuracy:", round(accuracy_score(y_test_s, pred_ft), 4))
print("\nClassification report (FastText + LR):\n", classification_report(y_test_s, pred_ft, digits=4))
print("Confusion matrix:\n", confusion_matrix(y_test_s, pred_ft))



FastText(avg) + LR accuracy: 0.435

Classification report (FastText + LR):
               precision    recall  f1-score   support

           1     0.5455    0.4800    0.5106        50
           2     0.1379    0.2500    0.1778        16
           3     0.2308    0.3333    0.2727        18
           4     0.1562    0.1724    0.1639        29
           5     0.6957    0.5517    0.6154        87

    accuracy                         0.4350       200
   macro avg     0.3532    0.3575    0.3481       200
weighted avg     0.4934    0.4350    0.4579       200

Confusion matrix:
 [[24  9  5  6  6]
 [ 3  4  2  4  3]
 [ 2  6  6  3  1]
 [ 4  4  5  5 11]
 [11  6  8 14 48]]
