In [4]:
import re
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.pipeline import Pipeline

In [5]:
data = pd.read_csv("TopGamesDataClean.csv", usecols=["content", "score", "game_name"])
subset = (
    data.groupby("game_name", group_keys=False)
        .apply(lambda x: x.sample(n=min(50, len(x)), random_state=42))
        .reset_index(drop=True)
)
# subset = (
#     data.groupby(["game_name", "score"], group_keys=False)
#         .apply(lambda x: x.sample(n=min(200, len(x)), random_state=42))
#         .reset_index(drop=True)
# )
subset

  .apply(lambda x: x.sample(n=min(50, len(x)), random_state=42))


Unnamed: 0,game_name,content,score
0,8 Ball Pool,install game alot bug cheater game first game ...,1
1,8 Ball Pool,lot fun great way pas time,5
2,8 Ball Pool,discussting game pot ball way win jone,1
3,8 Ball Pool,think ball pool fun game play others plus get ...,5
4,8 Ball Pool,great way focus something different thats fun ...,4
...,...,...,...
995,Shadow Fight 2,improve offline playingwhen return map finish ...,5
996,Shadow Fight 2,whenever watch ad get discount reduce time sti...,3
997,Shadow Fight 2,wonderful animation nice game,5
998,Shadow Fight 2,shadow fight 2 amazing gamei use play year ago...,5


In [6]:
subset.value_counts('score')

score
5    457
1    223
4    136
3    112
2     72
Name: count, dtype: int64

In [7]:
X = subset["content"].values
y = subset["score"].astype(int).values

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [9]:
tfidf = TfidfVectorizer(max_features=7000, ngram_range=(1,2))  # unigram + bigram
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

print("\n===== TF-IDF =====")
print("Shape (train):", X_train_tfidf.shape)
print("Jumlah kata unik/vocab:", len(tfidf.vocabulary_))
print("Beberapa vocab:", list(tfidf.vocabulary_.keys())[:20])


===== TF-IDF =====
Shape (train): (800, 7000)
Jumlah kata unik/vocab: 7000
Beberapa vocab: ['awsome', 'hard', 'get', 'server', 'even', 'try', 'join', 'say', 'full', 'really', 'need', 'fix', 'make', 'game', 'hack', 'annoy', 'hard get', 'even try', 'server say', 'server full']


In [10]:
tfidf_lr = Pipeline([
    ("tfidf", TfidfVectorizer(
        lowercase=True,
        strip_accents="unicode",
        token_pattern=r"[A-Za-z]{2,}",   # words of 2+ letters
        ngram_range=(1,2),               # uni+bi-grams usually best for sentiment
        min_df=5,                        # ignore very rare terms
        max_df=0.9,                      # ignore too-common terms
        stop_words="english",            # keep negations in text; vectorizer removes generic stopwords
    )),
    ("clf", LogisticRegression(
        max_iter=2000,
        multi_class="multinomial",
        class_weight="balanced",
        solver="lbfgs",
        n_jobs=-1
    ))
])

tfidf_lr.fit(X_train, y_train)
pred = tfidf_lr.predict(X_test)

print("TF-IDF + LR accuracy:", accuracy_score(y_test, pred))
print("\nClassification report (TF-IDF + LR):\n", classification_report(y_test, pred, digits=4))
print("Confusion matrix:\n", confusion_matrix(y_test, pred))



TF-IDF + LR accuracy: 0.455

Classification report (TF-IDF + LR):
               precision    recall  f1-score   support

           1     0.4783    0.4889    0.4835        45
           2     0.1429    0.2000    0.1667        15
           3     0.1724    0.2273    0.1961        22
           4     0.1515    0.1852    0.1667        27
           5     0.7887    0.6154    0.6914        91

    accuracy                         0.4550       200
   macro avg     0.3468    0.3433    0.3409       200
weighted avg     0.5166    0.4550    0.4799       200

Confusion matrix:
 [[22  8  7  7  1]
 [ 7  3  1  3  1]
 [ 7  1  5  6  3]
 [ 4  2  6  5 10]
 [ 6  7 10 12 56]]


In [27]:
import os
from gensim.models import FastText
from sklearn.preprocessing import StandardScaler

def tokenize_clean(s: str):
    return s.split()

sentences = [tokenize_clean(t) for t in subset["content"]]
y = subset["score"].astype(int).values

X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(sentences, y, test_size=0.2, stratify=y, random_state=42)

# 3) Train FastText on the training sentences
ft_dim = 300
ft = FastText(
    vector_size=ft_dim,
    window=5,
    min_count=5,          # raise/lower depending on corpus size
    workers=os.cpu_count(),
    sg=1,                 # skip-gram (semantic)
    epochs=10
)
ft.build_vocab(corpus_iterable=X_train_s)
ft.train(corpus_iterable=X_train_s, total_examples=len(X_train_s), epochs=ft.epochs)

# 4) Sentence → vector (mean of word vectors)
def sent_vec(tokens, model, dim):
    if not tokens:
        return np.zeros(dim, dtype=np.float32)
    vecs = [model.wv[w] for w in tokens if w in model.wv]
    return np.mean(vecs, axis=0) if vecs else np.zeros(dim, dtype=np.float32)

Xtr = np.vstack([sent_vec(t, ft, ft_dim) for t in X_train_s])
Xte = np.vstack([sent_vec(t, ft, ft_dim) for t in X_test_s])

# 5) (Optional) scale embeddings; LR often benefits a bit
scaler = StandardScaler()
Xtr_s = scaler.fit_transform(Xtr)
Xte_s = scaler.transform(Xte)

# 6) Classifier
clf_ft = LogisticRegression(
    max_iter=2000, multi_class="multinomial",
    class_weight="balanced", solver="lbfgs", n_jobs=-1
)
clf_ft.fit(Xtr_s, y_train_s)
pred_ft = clf_ft.predict(Xte_s)

print("FastText(avg) + LR accuracy:", round(accuracy_score(y_test_s, pred_ft), 4))
print("\nClassification report (FastText + LR):\n", classification_report(y_test_s, pred_ft, digits=4))
print("Confusion matrix:\n", confusion_matrix(y_test_s, pred_ft))



FastText(avg) + LR accuracy: 0.425

Classification report (FastText + LR):
               precision    recall  f1-score   support

           1     0.5676    0.4667    0.5122        45
           2     0.0270    0.0667    0.0385        15
           3     0.0741    0.0909    0.0816        22
           4     0.2069    0.2222    0.2143        27
           5     0.7857    0.6044    0.6832        91

    accuracy                         0.4250       200
   macro avg     0.3323    0.2902    0.3060       200
weighted avg     0.5233    0.4250    0.4669       200

Confusion matrix:
 [[21  9  6  4  5]
 [ 6  1  3  4  1]
 [ 7  6  2  5  2]
 [ 1 10  3  6  7]
 [ 2 11 13 10 55]]
