In [6]:
# pip install -U transformers datasets accelerate evaluate scikit-learn
import pandas as pd
import numpy as np
import evaluate, torch
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding
from transformers import TrainingArguments, Trainer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

# 1) Load data
df = pd.read_csv("TopGamesDataClean.csv", usecols=["content", "score"])
df["score"] = pd.to_numeric(df["score"], errors="coerce")
df = df[df["score"].between(1,5)].dropna(subset=["content"])
df["content"] = df["content"].astype(str).str.strip()
df = df[df["content"] != ""]

# (Optional) Subsample for speed
df = df.groupby("score", group_keys=False).apply(lambda x: x.sample(n=min(50000, len(x)), random_state=42)).reset_index(drop=True)

# labels must be 0..4 for HF models
label_map = {1:0, 2:1, 3:2, 4:3, 5:4}
df["label"] = df["score"].map(label_map).astype(int)

train_df, test_df = train_test_split(
    df[["content", "label"]],
    test_size=0.2, random_state=42, stratify=df["label"]
)

train_ds = Dataset.from_pandas(train_df.reset_index(drop=True))
test_ds  = Dataset.from_pandas(test_df.reset_index(drop=True))

# 2) Tokenizer
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize(batch):
    return tokenizer(batch["content"], truncation=True, max_length=256)

train_tok = train_ds.map(tokenize, batched=True, remove_columns=["content"])
test_tok  = test_ds.map(tokenize,  batched=True, remove_columns=["content"])

# 3) Model
num_labels = 5
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

# 4) Training setup
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
metric_acc = evaluate.load("accuracy")
metric_f1  = evaluate.load("f1")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)
    return {
        "accuracy": metric_acc.compute(predictions=preds, references=labels)["accuracy"],
        "f1_macro": metric_f1.compute(predictions=preds, references=labels, average="macro")["f1"],
        "f1_weighted": metric_f1.compute(predictions=preds, references=labels, average="weighted")["f1"],
    }

args = TrainingArguments(
    output_dir="distilbert-5cls",
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="steps",
    logging_steps=100,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    greater_is_better=True,
    fp16=torch.cuda.is_available(),
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_tok,
    eval_dataset=test_tok,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

# 5) Evaluate
eval_res = trainer.evaluate()
print(eval_res)

# 6) Predict on test set (labels back to 1..5)
preds = np.argmax(trainer.predict(test_tok).predictions, axis=1)
inv_label_map = {v:k for k,v in label_map.items()}
pred_scores = np.vectorize(inv_label_map.get)(preds)

from sklearn.metrics import classification_report, confusion_matrix
print("\nClassification report (DistilBERT):\n",
      classification_report(np.vectorize(inv_label_map.get)(test_df["label"].values),
                            pred_scores, digits=4))
print("Confusion matrix:\n", confusion_matrix(np.vectorize(inv_label_map.get)(test_df["label"].values), pred_scores))


  df = df.groupby("score", group_keys=False).apply(lambda x: x.sample(n=min(50000, len(x)), random_state=42)).reset_index(drop=True)


Map:   0%|          | 0/200000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,F1 Weighted
1,1.1383,1.168814,0.50324,0.498717,0.498717
2,1.1083,1.157901,0.51068,0.502485,0.502485
3,1.0509,1.180473,0.50904,0.505794,0.505794


{'eval_loss': 1.1804733276367188, 'eval_accuracy': 0.50904, 'eval_f1_macro': 0.5057936054958516, 'eval_f1_weighted': 0.5057936054958515, 'eval_runtime': 19.6878, 'eval_samples_per_second': 2539.644, 'eval_steps_per_second': 79.389, 'epoch': 3.0}

Classification report (DistilBERT):
               precision    recall  f1-score   support

           1     0.5957    0.5900    0.5928     10000
           2     0.4180    0.4254    0.4217     10000
           3     0.3928    0.3731    0.3827     10000
           4     0.4818    0.4343    0.4568     10000
           5     0.6334    0.7224    0.6750     10000

    accuracy                         0.5090     50000
   macro avg     0.5043    0.5090    0.5058     50000
weighted avg     0.5043    0.5090    0.5058     50000

Confusion matrix:
 [[5900 2548  939  246  367]
 [2335 4254 2175  770  466]
 [1036 2404 3731 1950  879]
 [ 383  764 2040 4343 2470]
 [ 250  207  613 1706 7224]]


In [7]:
# pip install -U transformers datasets accelerate evaluate scikit-learn
import pandas as pd
import numpy as np
import evaluate, torch
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding
from transformers import TrainingArguments, Trainer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

# 1) Load data
df = pd.read_csv("TopGamesDataClean.csv", usecols=["content", "score"])
df["score"] = pd.to_numeric(df["score"], errors="coerce")
df = df[df["score"].between(1,5)].dropna(subset=["content"])
df["content"] = df["content"].astype(str).str.strip()
df = df[df["content"] != ""]

# (Optional) Subsample for speed
# df = df.groupby("score", group_keys=False).apply(lambda x: x.sample(n=min(50000, len(x)), random_state=42)).reset_index(drop=True)

# labels must be 0..4 for HF models
label_map = {1:0, 2:1, 3:2, 4:3, 5:4}
df["label"] = df["score"].map(label_map).astype(int)

train_df, test_df = train_test_split(
    df[["content", "label"]],
    test_size=0.2, random_state=42, stratify=df["label"]
)

train_ds = Dataset.from_pandas(train_df.reset_index(drop=True))
test_ds  = Dataset.from_pandas(test_df.reset_index(drop=True))

# 2) Tokenizer
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize(batch):
    return tokenizer(batch["content"], truncation=True, max_length=256)

train_tok = train_ds.map(tokenize, batched=True, remove_columns=["content"])
test_tok  = test_ds.map(tokenize,  batched=True, remove_columns=["content"])

# 3) Model
num_labels = 5
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

# 4) Training setup
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
metric_acc = evaluate.load("accuracy")
metric_f1  = evaluate.load("f1")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)
    return {
        "accuracy": metric_acc.compute(predictions=preds, references=labels)["accuracy"],
        "f1_macro": metric_f1.compute(predictions=preds, references=labels, average="macro")["f1"],
        "f1_weighted": metric_f1.compute(predictions=preds, references=labels, average="weighted")["f1"],
    }

args = TrainingArguments(
    output_dir="distilbert-5cls",
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="steps",
    logging_steps=100,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    greater_is_better=True,
    fp16=torch.cuda.is_available(),
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_tok,
    eval_dataset=test_tok,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

# 5) Evaluate
eval_res = trainer.evaluate()
print(eval_res)

# 6) Predict on test set (labels back to 1..5)
preds = np.argmax(trainer.predict(test_tok).predictions, axis=1)
inv_label_map = {v:k for k,v in label_map.items()}
pred_scores = np.vectorize(inv_label_map.get)(preds)

from sklearn.metrics import classification_report, confusion_matrix
print("\nClassification report (DistilBERT):\n",
      classification_report(np.vectorize(inv_label_map.get)(test_df["label"].values),
                            pred_scores, digits=4))
print("Confusion matrix:\n", confusion_matrix(np.vectorize(inv_label_map.get)(test_df["label"].values), pred_scores))


Map:   0%|          | 0/1620599 [00:00<?, ? examples/s]

Map:   0%|          | 0/405150 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,F1 Weighted
1,0.8477,0.847515,0.690295,0.468372,0.647658
2,0.7962,0.843503,0.691351,0.484785,0.660098
3,0.7825,0.855507,0.690406,0.491885,0.661771


{'eval_loss': 0.8555066585540771, 'eval_accuracy': 0.690406022460817, 'eval_f1_macro': 0.49188483010881556, 'eval_f1_weighted': 0.6617705737910411, 'eval_runtime': 161.9609, 'eval_samples_per_second': 2501.53, 'eval_steps_per_second': 78.173, 'epoch': 3.0}

Classification report (DistilBERT):
               precision    recall  f1-score   support

           1     0.6785    0.8321    0.7475     93654
           2     0.4511    0.1036    0.1685     27683
           3     0.3499    0.2924    0.3186     37901
           4     0.4654    0.3266    0.3839     55666
           5     0.7958    0.8918    0.8411    190246

    accuracy                         0.6904    405150
   macro avg     0.5481    0.4893    0.4919    405150
weighted avg     0.6580    0.6904    0.6618    405150

Confusion matrix:
 [[ 77929   1671   5568   1756   6730]
 [ 14762   2867   5222   2001   2831]
 [ 10555   1302  11082   6860   8102]
 [  5006    358   6245  18183  25874]
 [  6610    158   3554  10267 169657]]


In [9]:
# pip install -U transformers datasets accelerate evaluate scikit-learn
import pandas as pd
import numpy as np
import evaluate, torch
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding
from transformers import TrainingArguments, Trainer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

# 1) Load data
df = pd.read_csv("TopGamesDataClean.csv", usecols=["content", "score", 'game_name'])
df["score"] = pd.to_numeric(df["score"], errors="coerce")
df = df[df["score"].between(1,5)].dropna(subset=["content"])
df["content"] = df["content"].astype(str).str.strip()
df = df[df["content"] != ""]
df = df[df["game_name"] == "Among Us"]

# (Optional) Subsample for speed
# df = df.groupby("score", group_keys=False).apply(lambda x: x.sample(n=min(50000, len(x)), random_state=42)).reset_index(drop=True)

# labels must be 0..4 for HF models
label_map = {1:0, 2:1, 3:2, 4:3, 5:4}
df["label"] = df["score"].map(label_map).astype(int)

train_df, test_df = train_test_split(
    df[["content", "label"]],
    test_size=0.2, random_state=42, stratify=df["label"]
)

train_ds = Dataset.from_pandas(train_df.reset_index(drop=True))
test_ds  = Dataset.from_pandas(test_df.reset_index(drop=True))

# 2) Tokenizer
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize(batch):
    return tokenizer(batch["content"], truncation=True, max_length=256)

train_tok = train_ds.map(tokenize, batched=True, remove_columns=["content"])
test_tok  = test_ds.map(tokenize,  batched=True, remove_columns=["content"])

# 3) Model
num_labels = 5
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

# 4) Training setup
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
metric_acc = evaluate.load("accuracy")
metric_f1  = evaluate.load("f1")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)
    return {
        "accuracy": metric_acc.compute(predictions=preds, references=labels)["accuracy"],
        "f1_macro": metric_f1.compute(predictions=preds, references=labels, average="macro")["f1"],
        "f1_weighted": metric_f1.compute(predictions=preds, references=labels, average="weighted")["f1"],
    }

args = TrainingArguments(
    output_dir="distilbert-5cls",
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="steps",
    logging_steps=100,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    greater_is_better=True,
    fp16=torch.cuda.is_available(),
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_tok,
    eval_dataset=test_tok,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

# 5) Evaluate
eval_res = trainer.evaluate()
print(eval_res)

# 6) Predict on test set (labels back to 1..5)
preds = np.argmax(trainer.predict(test_tok).predictions, axis=1)
inv_label_map = {v:k for k,v in label_map.items()}
pred_scores = np.vectorize(inv_label_map.get)(preds)

from sklearn.metrics import classification_report, confusion_matrix
print("\nClassification report (DistilBERT):\n",
      classification_report(np.vectorize(inv_label_map.get)(test_df["label"].values),
                            pred_scores, digits=4))
print("Confusion matrix:\n", confusion_matrix(np.vectorize(inv_label_map.get)(test_df["label"].values), pred_scores))


Map:   0%|          | 0/57487 [00:00<?, ? examples/s]

Map:   0%|          | 0/14372 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,F1 Weighted
1,1.0877,1.058318,0.556986,0.487351,0.528486
2,0.9799,1.046454,0.567492,0.509907,0.548277
3,0.8945,1.074016,0.565057,0.518349,0.554908


{'eval_loss': 1.0740162134170532, 'eval_accuracy': 0.5650570553854718, 'eval_f1_macro': 0.5183490056698038, 'eval_f1_weighted': 0.5549080762312745, 'eval_runtime': 5.8916, 'eval_samples_per_second': 2439.407, 'eval_steps_per_second': 76.38, 'epoch': 3.0}

Classification report (DistilBERT):
               precision    recall  f1-score   support

           1     0.6332    0.7562    0.6893      3331
           2     0.4129    0.1834    0.2540      1821
           3     0.3940    0.4520    0.4210      2427
           4     0.5046    0.5525    0.5275      3247
           5     0.7323    0.6703    0.6999      3546

    accuracy                         0.5651     14372
   macro avg     0.5354    0.5229    0.5183     14372
weighted avg     0.5603    0.5651    0.5549     14372

Confusion matrix:
 [[2519  229  395  145   43]
 [ 757  334  505  190   35]
 [ 444  165 1097  588  133]
 [ 152   66  577 1794  658]
 [ 106   15  210  838 2377]]


In [None]:
# 1) Load data
df = pd.read_csv("TopGamesDataClean.csv", usecols=["content", "score", 'game_name'])
df["score"] = pd.to_numeric(df["score"], errors="coerce")
df = df[df["score"].between(1,5)].dropna(subset=["content"])
df["content"] = df["content"].astype(str).str.strip()
df = df[df["content"] != ""]

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,F1 Weighted
1,1.1966,1.165655,0.513707,0.422588,0.47356
2,1.1141,1.11928,0.53312,0.448753,0.495597
3,1.0292,1.104034,0.541748,0.482531,0.520431
4,0.9962,1.096651,0.542861,0.494399,0.529183
5,0.9618,1.10602,0.544949,0.496125,0.531706
6,0.8993,1.113924,0.544322,0.49344,0.529268


{'eval_loss': 1.1060199737548828, 'eval_accuracy': 0.5449485109935986, 'eval_f1_macro': 0.4961245010818229, 'eval_f1_weighted': 0.5317055859078008, 'eval_runtime': 1.5915, 'eval_samples_per_second': 9030.576, 'eval_steps_per_second': 71.003, 'epoch': 6.0}

Classification report (BiLSTM):
               precision    recall  f1-score   support

           1     0.6070    0.7550    0.6730      3331
           2     0.3837    0.1840    0.2487      1821
           3     0.3856    0.3869    0.3863      2427
           4     0.4891    0.5057    0.4973      3247
           5     0.6737    0.6771    0.6754      3546

    accuracy                         0.5449     14372
   macro avg     0.5078    0.5017    0.4961     14372
weighted avg     0.5312    0.5449    0.5317     14372

Confusion matrix:
 [[2515  239  351  145   81]
 [ 785  335  429  193   79]
 [ 499  204  939  586  199]
 [ 217   69  515 1642  804]
 [ 127   26  201  791 2401]]


In [16]:
# ---- CONTINUES FROM YOUR df ----
# df already cleaned and contains columns: content (str), score (1..5)

# 1) Load data
df = pd.read_csv("TopGamesDataClean.csv", usecols=["content", "score", 'game_name'])
df["score"] = pd.to_numeric(df["score"], errors="coerce")
df = df[df["score"].between(1,5)].dropna(subset=["content"])
df["content"] = df["content"].astype(str).str.strip()
df = df[df["content"] != ""]

import numpy as np, torch, evaluate
from datasets import Dataset
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    DataCollatorWithPadding, TrainingArguments, Trainer
)
from transformers.trainer_utils import IntervalStrategy
from transformers import EarlyStoppingCallback

# 0) Labels 1..5 -> 0..4
label_map = {1:0, 2:1, 3:2, 4:3, 5:4}
df = df[df["score"].between(1,5)]
df["label"] = df["score"].map(label_map).astype(int)

# 1) Tokenizer
model_name = "distilbert-base-uncased"
num_labels = 5
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize(batch):
    return tokenizer(batch["content"], truncation=True, max_length=256)

# === SOLUSI 1: class_encode_column + stratified split, lalu tokenisasi ===
# a) Buat Dataset & ubah 'label' menjadi ClassLabel
full_ds = Dataset.from_pandas(df[["content","label"]].reset_index(drop=True))
full_ds = full_ds.class_encode_column("label")  # penting untuk stratify_by_column

# b) Split dengan stratifikasi
splits    = full_ds.train_test_split(test_size=0.1, stratify_by_column="label", seed=42)
train_all = splits["train"]
val_small = splits["test"]

# c) Tokenisasi SETELAH split
train_all = train_all.map(tokenize, batched=True, remove_columns=["content"])
val_small = val_small.map(tokenize,  batched=True, remove_columns=["content"])

# d) (Untuk pelatihan akhir 100% data) tokenisasi seluruh dataset juga
full_tok = full_ds.map(tokenize, batched=True, remove_columns=["content"])

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
metric_acc = evaluate.load("accuracy")
metric_f1  = evaluate.load("f1")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)
    return {
        "accuracy": metric_acc.compute(predictions=preds, references=labels)["accuracy"],
        "f1_macro": metric_f1.compute(predictions=preds, references=labels, average="macro")["f1"],
        "f1_weighted": metric_f1.compute(predictions=preds, references=labels, average="weighted")["f1"],
    }

# 2) Hyperparameter tuning (Optuna backend)
def model_init():
    return AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

def compute_objective(metrics):
    return metrics["eval_f1_macro"]

def hp_space(trial):
    return {
        "learning_rate": trial.suggest_float("learning_rate", 5e-6, 5e-5, log=True),
        "weight_decay": trial.suggest_float("weight_decay", 0.0, 0.2),
        "warmup_ratio": trial.suggest_float("warmup_ratio", 0.0, 0.2),
        "per_device_train_batch_size": trial.suggest_categorical("per_device_train_batch_size", [8, 16, 32]),
        "per_device_eval_batch_size": trial.suggest_categorical("per_device_eval_batch_size", [16, 32, 64]),
        "num_train_epochs": trial.suggest_int("num_train_epochs", 2, 5),
        "lr_scheduler_type": trial.suggest_categorical("lr_scheduler_type", ["linear", "cosine", "cosine_with_restarts"]),
    }

base_args = TrainingArguments(
    output_dir="distilbert-5cls",
    eval_strategy=IntervalStrategy.EPOCH,
    save_strategy=IntervalStrategy.EPOCH,
    logging_strategy=IntervalStrategy.STEPS,
    logging_steps=100,
    learning_rate=2e-5,                 # overridden by search
    per_device_train_batch_size=16,     # overridden
    per_device_eval_batch_size=32,      # overridden
    num_train_epochs=3,                 # overridden
    weight_decay=0.01,                  # overridden
    warmup_ratio=0.0,                   # overridden
    lr_scheduler_type="linear",         # overridden
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    greater_is_better=True,
    fp16=torch.cuda.is_available(),
    report_to="none",
)

trainer = Trainer(
    model_init=model_init,
    args=base_args,
    train_dataset=train_all,
    eval_dataset=val_small,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2, early_stopping_threshold=0.0)],
)

best = trainer.hyperparameter_search(
    direction="maximize",
    backend="optuna",
    hp_space=hp_space,
    compute_objective=compute_objective,
    n_trials=15,
)
print("Best trial:", best)

# 3) Retrain on 100% of the data with best HPs
best_args = {**base_args.to_dict(), **best.hyperparameters}
final_args = TrainingArguments(
    **best_args,
    output_dir="distilbert-5cls-final",
    load_best_model_at_end=False,      # training on all data; no holdout to pick "best"
    eval_strategy=IntervalStrategy.NO, # disable eval during training
    save_strategy=IntervalStrategy.EPOCH,
)

final_trainer = Trainer(
    model_init=model_init,
    args=final_args,
    train_dataset=full_tok,     # ALL data used here
    eval_dataset=None,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

final_trainer.train()

# 4) Optional: quick reference eval on the small split (not a true test set)
ref_eval = Trainer(
    model=final_trainer.model,
    args=final_args,
    eval_dataset=val_small,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
).evaluate()
print("Reference eval on small holdout:", ref_eval)

# 5) Save
final_trainer.model.save_pretrained("distilbert-5cls-final/model")
tokenizer.save_pretrained("distilbert-5cls-final/model")


Stringifying the column:   0%|          | 0/2025749 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/2025749 [00:00<?, ? examples/s]

Map:   0%|          | 0/1823174 [00:00<?, ? examples/s]

Map:   0%|          | 0/202575 [00:00<?, ? examples/s]

Map:   0%|          | 0/2025749 [00:00<?, ? examples/s]

  trainer = Trainer(
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[I 2025-11-08 10:36:27,225] A new study created in memory with name: no-name-e1e1ffb6-8248-4944-a096-0fe99ee56faa
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,F1 Weighted
1,0.8227,0.853137,0.690006,0.481092,0.654613
2,0.7631,0.847166,0.692242,0.486981,0.660335


[I 2025-11-08 16:36:14,551] Trial 0 finished with value: 0.48698108228640036 and parameters: {'learning_rate': 1.4134313218448877e-05, 'weight_decay': 0.07301787569610085, 'warmup_ratio': 0.03492832681816802, 'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 64, 'num_train_epochs': 2, 'lr_scheduler_type': 'cosine'}. Best is trial 0 with value: 0.48698108228640036.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,F1 Weighted
1,0.8275,0.856723,0.68929,0.478842,0.653014
2,0.7647,0.845698,0.69279,0.485165,0.659493


[I 2025-11-08 23:47:30,678] Trial 1 finished with value: 0.48516503018046403 and parameters: {'learning_rate': 1.7391409469426973e-05, 'weight_decay': 0.08885625355821286, 'warmup_ratio': 0.09255632508281356, 'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 64, 'num_train_epochs': 2, 'lr_scheduler_type': 'cosine'}. Best is trial 0 with value: 0.48698108228640036.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,F1 Weighted
1,0.831,0.849586,0.689231,0.477678,0.651525
2,0.8309,0.841636,0.690702,0.479559,0.657236
3,0.7802,0.842097,0.692188,0.482799,0.657738
4,0.7842,0.854989,0.690579,0.485278,0.658273
5,0.7345,0.867929,0.687074,0.4903,0.659659


[I 2025-11-09 04:50:40,702] Trial 2 finished with value: 0.4902997037046825 and parameters: {'learning_rate': 1.2910933245630052e-05, 'weight_decay': 0.061391765485448224, 'warmup_ratio': 0.017944562881385774, 'per_device_train_batch_size': 32, 'per_device_eval_batch_size': 32, 'num_train_epochs': 5, 'lr_scheduler_type': 'linear'}. Best is trial 2 with value: 0.4902997037046825.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,F1 Weighted
1,0.8548,0.863915,0.684438,0.459417,0.639928
2,0.8524,0.846815,0.689567,0.475092,0.654369
3,0.8078,0.844451,0.691507,0.482349,0.657098
4,0.8366,0.84679,0.690752,0.484192,0.658261


[I 2025-11-09 10:54:19,085] Trial 3 finished with value: 0.4841920814542872 and parameters: {'learning_rate': 5.341368166342143e-06, 'weight_decay': 0.03944224903711979, 'warmup_ratio': 0.1949091135813038, 'per_device_train_batch_size': 16, 'per_device_eval_batch_size': 64, 'num_train_epochs': 4, 'lr_scheduler_type': 'linear'}. Best is trial 2 with value: 0.4902997037046825.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,F1 Weighted
1,0.8551,0.867584,0.68275,0.451493,0.634316
2,0.8495,0.84551,0.689917,0.475682,0.655322
3,0.811,0.843464,0.691635,0.481714,0.656911
4,0.8338,0.84512,0.691996,0.482252,0.656934
5,0.8106,0.854031,0.690544,0.489957,0.660442


[I 2025-11-09 20:03:48,466] Trial 4 finished with value: 0.48995678847554547 and parameters: {'learning_rate': 6.7251172863520765e-06, 'weight_decay': 0.10314131354902362, 'warmup_ratio': 0.19993983250896374, 'per_device_train_batch_size': 16, 'per_device_eval_batch_size': 64, 'num_train_epochs': 5, 'lr_scheduler_type': 'linear'}. Best is trial 2 with value: 0.4902997037046825.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,F1 Weighted
1,0.8455,0.857433,0.685662,0.458566,0.639838
2,0.8422,0.843186,0.690352,0.478604,0.657521
3,0.7914,0.841802,0.692682,0.486372,0.659881
4,0.8059,0.855172,0.690421,0.490363,0.661061


[I 2025-11-10 04:51:39,736] Trial 5 finished with value: 0.4903625701541766 and parameters: {'learning_rate': 1.2045308108537165e-05, 'weight_decay': 0.09092300989431466, 'warmup_ratio': 0.17294505407027128, 'per_device_train_batch_size': 16, 'per_device_eval_batch_size': 64, 'num_train_epochs': 4, 'lr_scheduler_type': 'cosine_with_restarts'}. Best is trial 5 with value: 0.4903625701541766.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,F1 Weighted
1,0.8384,0.864078,0.687054,0.464504,0.644536


[I 2025-11-10 09:32:58,920] Trial 6 pruned. 
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,F1 Weighted
1,0.8434,0.854955,0.686926,0.462427,0.642847
2,0.8351,0.841503,0.69165,0.480074,0.657581
3,0.7848,0.843729,0.692163,0.485661,0.659388


[I 2025-11-10 16:44:35,800] Trial 7 finished with value: 0.4856611075241357 and parameters: {'learning_rate': 1.0653057689011208e-05, 'weight_decay': 0.10257929465774332, 'warmup_ratio': 0.18434172073395952, 'per_device_train_batch_size': 16, 'per_device_eval_batch_size': 32, 'num_train_epochs': 3, 'lr_scheduler_type': 'cosine'}. Best is trial 5 with value: 0.4903625701541766.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,F1 Weighted
1,0.8575,0.868919,0.685218,0.46044,0.641652


[I 2025-11-10 20:39:15,929] Trial 8 pruned. 
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,F1 Weighted
1,0.8312,0.847935,0.690218,0.478544,0.652551
2,0.8278,0.839448,0.692099,0.47948,0.657571


[I 2025-11-10 22:53:39,641] Trial 9 pruned. 
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,F1 Weighted
1,0.8673,0.875796,0.679457,0.434207,0.623365


[I 2025-11-11 00:25:05,778] Trial 10 pruned. 
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,F1 Weighted
1,0.8386,0.857893,0.686778,0.467014,0.644683


[I 2025-11-11 01:25:12,211] Trial 11 pruned. 
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,F1 Weighted
1,0.8321,0.852395,0.688427,0.4747,0.650059


[I 2025-11-11 02:25:25,897] Trial 12 pruned. 
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,F1 Weighted
1,0.8368,0.854825,0.687029,0.464117,0.642498


[I 2025-11-11 03:25:43,294] Trial 13 pruned. 
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,F1 Weighted
1,0.8273,0.84836,0.689636,0.480726,0.65407
2,0.8264,0.841597,0.691048,0.480021,0.657045
3,0.7721,0.84195,0.692193,0.485454,0.658863


[I 2025-11-11 06:26:36,454] Trial 14 finished with value: 0.4854538665412809 and parameters: {'learning_rate': 1.2638607703581072e-05, 'weight_decay': 0.061215082657034826, 'warmup_ratio': 0.007422601170802312, 'per_device_train_batch_size': 32, 'per_device_eval_batch_size': 32, 'num_train_epochs': 3, 'lr_scheduler_type': 'linear'}. Best is trial 5 with value: 0.4903625701541766.


Best trial: BestRun(run_id='5', objective=0.4903625701541766, hyperparameters={'learning_rate': 1.2045308108537165e-05, 'weight_decay': 0.09092300989431466, 'warmup_ratio': 0.17294505407027128, 'per_device_train_batch_size': 16, 'per_device_eval_batch_size': 64, 'num_train_epochs': 4, 'lr_scheduler_type': 'cosine_with_restarts'}, run_summary=None)


TypeError: transformers.training_args.TrainingArguments() got multiple values for keyword argument 'output_dir'

In [17]:
final_trainer.model.save_pretrained("distilbert-5cls-final/model")
tokenizer.save_pretrained("distilbert-5cls-final/model")

('distilbert-5cls-final/model\\tokenizer_config.json',
 'distilbert-5cls-final/model\\special_tokens_map.json',
 'distilbert-5cls-final/model\\vocab.txt',
 'distilbert-5cls-final/model\\added_tokens.json',
 'distilbert-5cls-final/model\\tokenizer.json')

In [18]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer
import numpy as np, evaluate

model_path = "distilbert-5cls-final/model"
model = AutoModelForSequenceClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)


In [19]:
metric_acc = evaluate.load("accuracy")
metric_f1  = evaluate.load("f1")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)
    return {
        "accuracy": metric_acc.compute(predictions=preds, references=labels)["accuracy"],
        "f1_macro": metric_f1.compute(predictions=preds, references=labels, average="macro")["f1"],
        "f1_weighted": metric_f1.compute(predictions=preds, references=labels, average="weighted")["f1"],
    }


In [20]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

ref_trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    eval_dataset=val_small,  # this was your 10% holdout
)

results = ref_trainer.evaluate()
print("Final model performance:")
for k, v in results.items():
    print(f"{k:<20} : {v:.4f}")


  ref_trainer = Trainer(


Final model performance:
eval_loss            : 1.0687
eval_model_preparation_time : 0.0010
eval_accuracy        : 0.6273
eval_f1_macro        : 0.4519
eval_f1_weighted     : 0.6153
eval_runtime         : 151.1461
eval_samples_per_second : 1340.2590
eval_steps_per_second : 167.5330


In [None]:
import pandas as pd, numpy as np, torch, evaluate
from datasets import Dataset
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    DataCollatorWithPadding, TrainingArguments, Trainer,
    EarlyStoppingCallback
)
from transformers.trainer_utils import IntervalStrategy

# ==== 1. Load & clean ====
df = pd.read_csv("TopGamesDataClean.csv", usecols=["content", "score", "game_name"])
df["score"] = pd.to_numeric(df["score"], errors="coerce")
df = df[df["score"].between(1,5)].dropna(subset=["content"])
df["content"] = df["content"].astype(str).str.strip()
df = df[df["content"] != ""]

# 1–5 → 0–4
label_map = {1:0, 2:1, 3:2, 4:3, 5:4}
df["label"] = df["score"].map(label_map).astype(int)

# ==== 2. Model / tokenizer ====
model_name = "microsoft/deberta-v3-base"
num_labels = 5
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize(batch):
    return tokenizer(batch["content"], truncation=True, max_length=256)

# ==== 3. Dataset + stratified split ====
full_ds = Dataset.from_pandas(df[["content","label"]].reset_index(drop=True))
full_ds = full_ds.class_encode_column("label")
splits    = full_ds.train_test_split(test_size=0.1, stratify_by_column="label", seed=42)
train_all = splits["train"]
val_small = splits["test"]

train_tok = train_all.map(tokenize, batched=True, remove_columns=["content"])
val_tok   = val_small.map(tokenize, batched=True, remove_columns=["content"])
full_tok  = full_ds.map(tokenize, batched=True, remove_columns=["content"])

# ==== 4. Data collator & metrics ====
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
metric_acc = evaluate.load("accuracy")
metric_f1  = evaluate.load("f1")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)
    return {
        "accuracy": metric_acc.compute(predictions=preds, references=labels)["accuracy"],
        "f1_macro": metric_f1.compute(predictions=preds, references=labels, average="macro")["f1"],
        "f1_weighted": metric_f1.compute(predictions=preds, references=labels, average="weighted")["f1"],
    }

# ==== 5. Training arguments ====
args = TrainingArguments(
    output_dir="deberta-5cls",
    eval_strategy=IntervalStrategy.EPOCH,
    save_strategy=IntervalStrategy.EPOCH,
    logging_strategy=IntervalStrategy.STEPS,
    logging_steps=100,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=4,
    weight_decay=0.01,
    warmup_ratio=0.06,
    lr_scheduler_type="linear",
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    greater_is_better=True,
    fp16=torch.cuda.is_available(),
    report_to="none",
)

# ==== 6. Trainer ====
trainer = Trainer(
    model_init=lambda: AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels),
    args=args,
    train_dataset=train_tok,
    eval_dataset=val_tok,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

trainer.train()

# ==== 7. Evaluate on validation set ====
eval_results = trainer.evaluate()
print("\nValidation results:")
for k, v in eval_results.items():
    if isinstance(v, float):
        print(f"{k:<20} : {v:.4f}")

# ==== 8. Retrain on full data (optional final model) ====
final_args = TrainingArguments(
    output_dir="deberta-5cls-final",
    per_device_train_batch_size=16,
    num_train_epochs=4,
    learning_rate=2e-5,
    weight_decay=0.01,
    warmup_ratio=0.06,
    lr_scheduler_type="linear",
    fp16=torch.cuda.is_available(),
    evaluation_strategy=IntervalStrategy.NO,
    save_strategy=IntervalStrategy.EPOCH,
    report_to="none",
)

final_trainer = Trainer(
    model_init=lambda: AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels),
    args=final_args,
    train_dataset=full_tok,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

final_trainer.train()

# ==== 9. Save model ====
final_trainer.model.save_pretrained("deberta-5cls-final/model")
tokenizer.save_pretrained("deberta-5cls-final/model")
print("✅ Model saved at deberta-5cls-final/model")



Stringifying the column:   0%|          | 0/2025749 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/2025749 [00:00<?, ? examples/s]

Map:   0%|          | 0/1823174 [00:00<?, ? examples/s]

Map:   0%|          | 0/202575 [00:00<?, ? examples/s]

Map:   0%|          | 0/2025749 [00:00<?, ? examples/s]

  trainer = Trainer(
Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 2, 'bos_token_id': 1}.
Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,F1 Weighted
1,0.8273,0.844383,0.692168,0.470902,0.64664
2,0.8295,0.839747,0.693486,0.492865,0.666408
3,0.786,0.84142,0.694572,0.499555,0.667211
