In [1]:
# Jika CPU-only: ganti baris torch → "torch==2.3.1" + index-url CPU.
!pip -q install --upgrade pip
!pip -q install "torch==2.3.1+cu121" --index-url https://download.pytorch.org/whl/cu121
!pip -q install "transformers==4.44.2" "accelerate==0.33.0" "datasets==2.20.0" "evaluate==0.4.1"
!pip -q install "scikit-learn==1.5.2" "pandas==2.2.2" "pyarrow==16.1.0" "ipykernel"
!pip -q install sentencepiece==0.1.99 "protobuf>=3.20,<5" tokenizers>=0.19.0
# (opsional) progress bar mulus di notebook
!pip -q install ipywidgets==8.1.2 jupyterlab_widgets

In [2]:
import sys, torch, transformers, datasets, accelerate
# Report which interpreter and library versions this kernel is actually using.
print("Python:", sys.executable)
print("torch:", torch.__version__, "| cuda:", torch.version.cuda, "| is_cuda:", torch.cuda.is_available())
print("transformers:", transformers.__version__, "| datasets:", datasets.__version__, "| accelerate:", accelerate.__version__)
# Device string derived once from CUDA availability.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

Python: D:\.Portofolio\Coding\social-sentiment\.venv\Scripts\python.exe
torch: 2.3.1+cu121 | cuda: 12.1 | is_cuda: True
transformers: 4.44.2 | datasets: 2.20.0 | accelerate: 0.33.0


In [3]:
from datasets import load_dataset

# EN: TweetEval sentiment (0=neg, 1=neu, 2=pos)
ds_en = load_dataset("cardiffnlp/tweet_eval", "sentiment")

# ID: IndoNLU SmSA (requires trust_remote_code) → fall back to NusaX-senti if it fails.
# NOTE: trust_remote_code=True executes the dataset repository's loading script locally.
try:
    ds_id = load_dataset("indonlp/indonlu", "smsa", trust_remote_code=True)
    src_id = "indonlu-smsa"
except Exception as e:
    print("⚠️ IndoNLU SmSA gagal, fallback ke NusaX-senti (id). Error:", e)
    # NusaX-senti names its configs with ISO 639-3 codes, so Indonesian is "ind"
    # (not "id"); it is also script-based and needs trust_remote_code as well.
    ds_id = load_dataset("indonlp/NusaX-senti", "ind", trust_remote_code=True)
    src_id = "nusax-senti"

ds_en, ds_id, src_id

(DatasetDict({
     train: Dataset({
         features: ['text', 'label'],
         num_rows: 45615
     })
     test: Dataset({
         features: ['text', 'label'],
         num_rows: 12284
     })
     validation: Dataset({
         features: ['text', 'label'],
         num_rows: 2000
     })
 }),
 DatasetDict({
     train: Dataset({
         features: ['text', 'label'],
         num_rows: 11000
     })
     validation: Dataset({
         features: ['text', 'label'],
         num_rows: 1260
     })
     test: Dataset({
         features: ['text', 'label'],
         num_rows: 500
     })
 }),
 'indonlu-smsa')

In [4]:
import pandas as pd
from datasets import Dataset, DatasetDict

# EN mapping: labels are already integer ids (0/1/2)
def map_en(x):
    """Project one English example onto the shared schema (text, label, lang).

    The label is coerced with ``int`` and the language tag is fixed to "en".
    """
    text = x["text"]
    label_id = int(x["label"])
    return dict(text=text, label=label_id, lang="en")
# Apply map_en to every EN split; the original columns are dropped so only the mapped fields remain.
en_splits = {k: ds_en[k].map(map_en, remove_columns=ds_en[k].column_names) for k in ds_en.keys()}

# ID mapping: labels may be strings or a ClassLabel → normalise to 0/1/2.
# Read the class names from the train split's label feature; if that lookup
# fails for any reason, fall back to None (map_id then works from the raw value).
try:
    id_label_names = ds_id["train"].features["label"].names
except Exception:
    id_label_names = None

def map_id(x):
    """Project one Indonesian example onto the shared schema (text, label, lang).

    The label is resolved to a class name (through the module-level
    ``id_label_names`` when the raw label is an int and names are available,
    otherwise via ``str``), then mapped to 0=negative, 1=neutral, 2=positive.
    If the name is not recognised, the raw label is coerced with ``int``.
    """
    mapping = {"negative": 0, "neutral": 1, "positive": 2}
    raw_label = x["label"]

    has_names = bool(id_label_names)
    if has_names and isinstance(raw_label, int):
        name = id_label_names[raw_label].lower()
    else:
        name = str(raw_label).lower()

    # Unknown name → assume the raw value is already an integer id 0/1/2.
    y = mapping[name] if name in mapping else int(raw_label)
    return {"text": x["text"], "label": y, "lang": "id"}

# Apply map_id to every ID split; the original columns are dropped so only the mapped fields remain.
id_splits = {k: ds_id[k].map(map_id, remove_columns=ds_id[k].column_names) for k in ds_id.keys()}

def concat_if_exists(split):
    """Concatenate the EN and ID versions of ``split`` into one Dataset.

    Whichever of ``en_splits`` / ``id_splits`` contains the split contributes
    its rows (EN first, then ID). The concatenation goes through pandas, so the
    resulting column types are re-inferred from the combined frame.

    Raises:
        ValueError: if neither source provides ``split``.
    """
    parts = [splits[split] for splits in (en_splits, id_splits) if split in splits]
    if not parts:
        # pd.concat([]) would also raise ValueError, but with an opaque message.
        raise ValueError(f"Split {split!r} is missing from both the EN and ID datasets")
    frames = [part.to_pandas() for part in parts]
    return Dataset.from_pandas(pd.concat(frames, ignore_index=True), preserve_index=False)

# Bilingual dataset: for each split, EN rows followed by ID rows.
merged = DatasetDict({
    "train": concat_if_exists("train"),
    "validation": concat_if_exists("validation"),
    "test": concat_if_exists("test"),
})
merged

Map:   0%|          | 0/11000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1260 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'lang'],
        num_rows: 56615
    })
    validation: Dataset({
        features: ['text', 'label', 'lang'],
        num_rows: 3260
    })
    test: Dataset({
        features: ['text', 'label', 'lang'],
        num_rows: 12784
    })
})

In [5]:
# Per-split sanity check: row count, language mix, and label distribution.
for split in merged:
    df = merged[split].to_pandas()
    print(f"=== {split.upper()} === Rows:", len(df))
    print("Lang counts:\n", df["lang"].value_counts())
    print("Label counts (0=neg,1=neu,2=pos):\n", df["label"].value_counts(), "\n")

=== TRAIN === Rows: 56615
Lang counts:
 lang
en    45615
id    11000
Name: count, dtype: int64
Label counts (0=neg,1=neu,2=pos):
 label
2    24265
1    21821
0    10529
Name: count, dtype: int64 

=== VALIDATION === Rows: 3260
Lang counts:
 lang
en    2000
id    1260
Name: count, dtype: int64
Label counts (0=neg,1=neu,2=pos):
 label
2    1554
1    1000
0     706
Name: count, dtype: int64 

=== TEST === Rows: 12784
Lang counts:
 lang
en    12284
id      500
Name: count, dtype: int64
Label counts (0=neg,1=neu,2=pos):
 label
1    6025
0    4176
2    2583
Name: count, dtype: int64 



In [6]:
import re
from datasets import DatasetDict
from transformers import AutoTokenizer

# Checkpoint id used to load both the tokenizer and the sequence-classification model.
MODEL_NAME = "cardiffnlp/twitter-xlm-roberta-base-sentiment"

def clean_text(t: str):
    """Normalise a raw post: mask links and @-mentions, then trim whitespace.

    Links (``http...`` / ``www. ...``) become ``<URL>`` and ``@handle`` tokens
    become ``<USER>``; URL masking runs first.
    """
    substitutions = (
        (r"http\S+|www\.\S+", "<URL>"),
        (r"@\w+", "<USER>"),
    )
    cleaned = t
    for pattern, placeholder in substitutions:
        cleaned = re.sub(pattern, placeholder, cleaned)
    return cleaned.strip()

def add_clean(batch):
    """Batched map function: replace the ``text`` column with its cleaned version."""
    cleaned_texts = list(map(clean_text, batch["text"]))
    return {"text": cleaned_texts}

# Cleaned copy of every split; `merged` itself is left unchanged.
merged_clean = DatasetDict({
    split: merged[split].map(add_clean, batched=True)
    for split in merged.keys()
})

def load_tokenizer_robust(model_name):
    """Load a tokenizer for ``model_name``, degrading gracefully on failure.

    Attempts, in order: the fast tokenizer, the slow tokenizer, and finally the
    slow 'xlm-roberta-base' tokenizer as a last resort. Progress and failures
    are printed; only an error in the last-resort load propagates.
    """
    # (use_fast, start message, print end, success message, failure prefix)
    attempts = (
        (True, "Trying FAST tokenizer...", "\n",
         "Loaded FAST tokenizer.", "FAST tokenizer failed:"),
        (False, "Trying SLOW tokenizer (SentencePiece)...", "",
         "Loaded SLOW tokenizer.", "\nSLOW tokenizer failed:"),
    )
    for use_fast, start_msg, start_end, ok_msg, fail_prefix in attempts:
        try:
            print(start_msg, end=start_end)
            candidate = AutoTokenizer.from_pretrained(model_name, use_fast=use_fast)
            print(ok_msg)
            return candidate
        except Exception as err:
            print(fail_prefix, err)

    # Last resort: the base checkpoint's tokenizer.
    print("FALLBACK to 'xlm-roberta-base' tokenizer...")
    fallback = AutoTokenizer.from_pretrained("xlm-roberta-base", use_fast=False)
    print("Loaded fallback tokenizer.")
    return fallback

# Shared tokenizer used for training-time tokenisation and all later evaluation cells.
tokenizer = load_tokenizer_robust(MODEL_NAME)

Map:   0%|          | 0/56615 [00:00<?, ? examples/s]

Map:   0%|          | 0/3260 [00:00<?, ? examples/s]

Map:   0%|          | 0/12784 [00:00<?, ? examples/s]

Trying FAST tokenizer...




Loaded FAST tokenizer.


In [7]:
from datasets import DatasetDict
from transformers import DataCollatorWithPadding

def tok(batch):
    """Batched map function: tokenise ``batch["text"]``, truncating at 160 tokens.

    No padding is applied here; it is left to the data collator.
    """
    texts = batch["text"]
    return tokenizer(texts, truncation=True, max_length=160)

def build_tokd(ds):
    """Tokenise every split of ``ds`` and return the result as a DatasetDict.

    Only the ``label`` column survives from the input; all other columns
    (e.g. 'text', 'lang') are removed so the collator never tries to pad strings.
    """
    def _tokenise_split(split_ds):
        droppable = [col for col in split_ds.column_names if col not in ["label"]]
        return split_ds.map(tok, batched=True, remove_columns=droppable)

    return DatasetDict({name: _tokenise_split(ds[name]) for name in ds.keys()})

_src = merged_clean  # use the cleaned dataset
tokd = build_tokd(_src)

# Collator shared by the custom training DataLoader and the Trainer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Sanity check: remaining columns and row counts per split.
print({k: tokd[k].column_names for k in tokd})  # → ['label','input_ids','attention_mask',(...)]
print({k: tokd[k].num_rows for k in tokd})

Map:   0%|          | 0/56615 [00:00<?, ? examples/s]

Map:   0%|          | 0/3260 [00:00<?, ? examples/s]

Map:   0%|          | 0/12784 [00:00<?, ? examples/s]

{'train': ['label', 'input_ids', 'attention_mask'], 'validation': ['label', 'input_ids', 'attention_mask'], 'test': ['label', 'input_ids', 'attention_mask']}
{'train': 56615, 'validation': 3260, 'test': 12784}


In [8]:
import numpy as np
from sklearn.metrics import accuracy_score, f1_score
from transformers import AutoModelForSequenceClassification

# Three sentiment classes, matching the 0=neg / 1=neu / 2=pos encoding of the merged dataset.
num_labels = 3
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=num_labels)

def metrics(eval_pred):
    """Compute accuracy and macro-F1 from a (logits, labels) evaluation pair.

    Predictions are the arg-max over axis 1 of the logits. Returns a dict with
    keys ``accuracy`` and ``macro_f1``.
    """
    logits, labels = eval_pred
    predicted = np.argmax(logits, axis=1)
    scores = {}
    scores["accuracy"] = accuracy_score(labels, predicted)
    scores["macro_f1"] = f1_score(labels, predicted, average="macro")
    return scores

In [9]:
import torch
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback
from torch.utils.data import WeightedRandomSampler, DataLoader

# Training configuration. Evaluation and checkpointing both run once per epoch,
# and the checkpoint with the highest validation macro-F1 is reloaded at the end.
# NOTE(review): `evaluation_strategy` may be a deprecated spelling in newer
# transformers releases (`eval_strategy`) — confirm before upgrading the pinned version.
args = TrainingArguments(
    output_dir="artifacts/xlmr-sentiment",
    learning_rate=1e-5,                 # small, because we start from a task-specific checkpoint
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=5,                  # early stopping guards against overfitting
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="macro_f1",
    greater_is_better=True,
    warmup_ratio=0.1,
    logging_steps=100,
    seed=42,
    fp16=torch.cuda.is_available(),
    label_smoothing_factor=0.05
)

# Sampling weights from the label and language distributions, computed on the
# dataset that still has the 'lang' column (i.e. before columns were dropped).
train_df = _src["train"].to_pandas()
label_counts = train_df["label"].value_counts().to_dict()
lang_counts  = train_df["lang"].value_counts().to_dict()

# weight = (1 / label frequency) * (1 / language frequency), vectorised instead
# of a per-row Python loop; the values are the same float64 products as before.
label_weight = 1.0 / train_df["label"].map(label_counts)
lang_weight = 1.0 / train_df["lang"].map(lang_counts)
weights = torch.tensor((label_weight * lang_weight).to_numpy(), dtype=torch.double)

# The sampler indexes into tokd["train"], so the weights must line up row for row.
assert len(weights) == tokd["train"].num_rows, "sampling weights are misaligned with tokd['train']"
sampler = WeightedRandomSampler(weights, num_samples=len(weights), replacement=True)

# Training loader that draws examples through the weighted sampler instead of
# plain shuffling; batches are assembled with the shared data collator.
train_loader = DataLoader(
    tokd["train"],
    batch_size=args.per_device_train_batch_size,
    sampler=sampler,
    collate_fn=data_collator
)

class SamplerTrainer(Trainer):
    """Trainer variant that trains from the pre-built, weighted-sampled ``train_loader``."""

    def get_train_dataloader(self):
        # Returns the module-level loader as-is.
        # NOTE(review): the loader is not routed through any Trainer-side
        # preparation here — confirm this is acceptable for multi-device runs.
        return train_loader

# Trainer wired to the tokenised splits; training stops early after 2
# evaluations without improvement (early_stopping_patience=2).
trainer = SamplerTrainer(
    model=model,
    args=args,
    train_dataset=tokd["train"],
    eval_dataset=tokd["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
)



In [10]:
# Fine-tune, then score the resulting model on the merged validation and test splits.
train_result = trainer.train()

val_metrics = trainer.evaluate(tokd["validation"])
test_metrics = trainer.evaluate(tokd["test"])
print("Val (balanced):", val_metrics)
print("Test (balanced):", test_metrics)

# Persist model weights and tokenizer side by side so the directory is self-contained.
save_dir = "artifacts/xlmr-sentiment-best-balanced"
trainer.save_model(save_dir)
tokenizer.save_pretrained(save_dir)
print("Saved to:", save_dir)

Epoch,Training Loss,Validation Loss,Accuracy,Macro F1
1,0.4462,0.514444,0.829448,0.816721
2,0.3781,0.546475,0.82454,0.812002
3,0.3721,0.555685,0.832209,0.821462
4,0.3433,0.561373,0.833129,0.822511
5,0.317,0.580424,0.833129,0.821009


Val (balanced): {'eval_loss': 0.5613728761672974, 'eval_accuracy': 0.8331288343558282, 'eval_macro_f1': 0.8225106053485268, 'eval_runtime': 4.4935, 'eval_samples_per_second': 725.491, 'eval_steps_per_second': 22.699, 'epoch': 5.0}
Test (balanced): {'eval_loss': 0.8344465494155884, 'eval_accuracy': 0.7103410513141427, 'eval_macro_f1': 0.7130114351314533, 'eval_runtime': 13.4281, 'eval_samples_per_second': 952.033, 'eval_steps_per_second': 29.788, 'epoch': 5.0}
Saved to: artifacts/xlmr-sentiment-best-balanced


In [11]:
import numpy as np, torch
from sklearn.metrics import classification_report

def eval_subset(dataset, lang_code, batch_size=32, max_len=128, use_fp16=True):
    """Print a per-class classification report for one language of ``dataset``.

    Uses the module-level ``model`` and ``tokenizer``. The model is switched to
    eval mode and left there.

    Args:
        dataset: dataset with 'text', 'label' and 'lang' columns.
        lang_code: value of the 'lang' column to keep (e.g. "en", "id").
        batch_size: number of texts per forward pass.
        max_len: truncation length passed to the tokenizer.
        use_fp16: run the forward pass under float16 autocast when the model
            is on a CUDA device.

    Returns:
        None. The report is printed; if the subset is empty a notice is printed
        instead.
    """
    df = dataset.to_pandas()
    sub = df[df["lang"] == lang_code]
    if len(sub) == 0:
        print(f"No samples for lang={lang_code}")
        return
    texts = sub["text"].tolist()
    y_true = sub["label"].to_numpy()
    preds_all = []

    device = model.device
    # Gate CUDA-specific paths on where the model actually lives, not merely on
    # whether a GPU exists on the machine.
    on_cuda = device.type == "cuda"
    model.eval()
    with torch.no_grad():
        for i in range(0, len(texts), batch_size):
            enc = tokenizer(texts[i:i+batch_size], truncation=True, max_length=max_len, padding=True, return_tensors="pt").to(device)
            if use_fp16 and on_cuda:
                with torch.cuda.amp.autocast(dtype=torch.float16):
                    logits = model(**enc).logits
            else:
                logits = model(**enc).logits
            preds = logits.argmax(dim=1).detach().cpu().numpy()
            preds_all.append(preds)
            # Release per-batch tensors eagerly to keep peak GPU memory low.
            del enc, logits
            if on_cuda: torch.cuda.empty_cache()

    y_pred = np.concatenate(preds_all, axis=0)
    print(f"Lang={lang_code}  n={len(texts)}")
    # Pin the label set explicitly: without it the report raises when a subset
    # happens to contain fewer than three classes (target_names has 3 entries).
    print(classification_report(
        y_true, y_pred,
        labels=[0, 1, 2],
        digits=4,
        target_names=["neg","neu","pos"],
        zero_division=0,
    ))

ds_val  = merged_clean["validation"]
ds_test = merged_clean["test"]

# Per-language reports. max_len is 160 to match the truncation used when
# tokenising the training data and when tuning the neutral threshold, so the
# model sees identical inputs in every evaluation.
for split_title, split_ds in (("VALIDATION", ds_val), ("TEST", ds_test)):
    print(f"=== {split_title} per language ===")
    for lang_code in ("en", "id"):
        eval_subset(split_ds, lang_code, batch_size=32, max_len=160, use_fp16=True)

=== VALIDATION per language ===
Lang=en  n=2000
              precision    recall  f1-score   support

         neg     0.6676    0.7596    0.7106       312
         neu     0.7631    0.7043    0.7325       869
         pos     0.8066    0.8303    0.8183       819

    accuracy                         0.7645      2000
   macro avg     0.7458    0.7647    0.7538      2000
weighted avg     0.7660    0.7645    0.7642      2000

Lang=id  n=1260
              precision    recall  f1-score   support

         neg     0.9233    0.9467    0.9348       394
         neu     0.8952    0.8473    0.8706       131
         pos     0.9604    0.9565    0.9584       735

    accuracy                         0.9421      1260
   macro avg     0.9263    0.9168    0.9213      1260
weighted avg     0.9420    0.9421    0.9419      1260

=== TEST per language ===
Lang=en  n=12284
              precision    recall  f1-score   support

         neg     0.6812    0.7842    0.7291      3972
         neu     0.744

In [12]:
import numpy as np, torch
from sklearn.metrics import f1_score, classification_report, accuracy_score

# Evaluate with the model currently held by the trainer.
runner = trainer
model_eval = runner.model
bs = 32

# --- VAL probs ---
# Class probabilities for the cleaned validation texts, computed batch by batch.
val_texts = ds_val["text"]; val_labels = np.array(ds_val["label"])
probs = []
model_eval.eval()
with torch.no_grad():
    for i in range(0, len(val_texts), bs):
        enc = tokenizer(val_texts[i:i+bs], truncation=True, max_length=160, padding=True, return_tensors="pt").to(model_eval.device)
        lg  = model_eval(**enc).logits.detach().cpu().numpy()
        # Softmax in NumPy; subtracting the row max keeps exp() from overflowing.
        ex  = np.exp(lg - lg.max(axis=1, keepdims=True))
        probs.append(ex / ex.sum(axis=1, keepdims=True))
        if torch.cuda.is_available(): torch.cuda.empty_cache()
probs_val = np.vstack(probs)

def predict_with_t_neu(ps, t):
    """Arg-max prediction with a neutral-class override.

    Rows whose neutral probability (column 1) is at least ``t`` are predicted
    as class 1; every other row keeps its arg-max class. ``ps`` is not modified.
    """
    top_class = np.argmax(ps, axis=1)
    neutral_hit = ps[:, 1] >= t
    return np.where(neutral_hit, 1, top_class)

# Grid-search the neutral threshold on validation: 21 evenly spaced values in
# [0.30, 0.70], keeping the first one that achieves the highest macro-F1.
best_t, best_f1 = 0.5, -1
for t in np.linspace(0.30, 0.70, 21):
    f1m = f1_score(val_labels, predict_with_t_neu(probs_val, t), average="macro")
    if f1m > best_f1:
        best_f1, best_t = f1m, t
print(f"Best t_neu on VAL = {best_t:.2f} | macro-F1 = {best_f1:.4f}")

# --- TEST apply ---
# Same probability computation as for validation, now on the cleaned test texts.
test_texts = ds_test["text"]; test_labels = np.array(ds_test["label"])
probs = []
with torch.no_grad():
    for i in range(0, len(test_texts), bs):
        enc = tokenizer(test_texts[i:i+bs], truncation=True, max_length=160, padding=True, return_tensors="pt").to(model_eval.device)
        lg  = model_eval(**enc).logits.detach().cpu().numpy()
        # Softmax in NumPy; subtracting the row max keeps exp() from overflowing.
        ex  = np.exp(lg - lg.max(axis=1, keepdims=True))
        probs.append(ex / ex.sum(axis=1, keepdims=True))
        if torch.cuda.is_available(): torch.cuda.empty_cache()
probs_test = np.vstack(probs)

# Apply the threshold chosen on validation (never tuned on test).
pred_test = predict_with_t_neu(probs_test, best_t)
print("TEST (thresholded) macro-F1:", f1_score(test_labels, pred_test, average="macro"))
print("TEST (thresholded) accuracy:", accuracy_score(test_labels, pred_test))
print(classification_report(test_labels, pred_test, digits=4, target_names=["neg","neu","pos"]))

Best t_neu on VAL = 0.34 | macro-F1 = 0.8286
TEST (thresholded) macro-F1: 0.7208214535440476
TEST (thresholded) accuracy: 0.7193366708385481
              precision    recall  f1-score   support

         neg     0.7130    0.7646    0.7379      4176
         neu     0.7308    0.6825    0.7058      6025
         pos     0.7059    0.7321    0.7187      2583

    accuracy                         0.7193     12784
   macro avg     0.7166    0.7264    0.7208     12784
weighted avg     0.7199    0.7193    0.7189     12784



In [13]:
import os, json
# Save model and tokenizer again, plus the tuned neutral threshold, so the
# directory holds everything needed to reproduce the thresholded predictions.
best_dir = "artifacts/xlmr-sentiment-best-balanced"
trainer.save_model(best_dir)
tokenizer.save_pretrained(best_dir)
with open(os.path.join(best_dir, "inference_config.json"), "w") as f:
    # float(...) converts the NumPy scalar into a JSON-serialisable Python float.
    json.dump({"t_neu": float(best_t)}, f, indent=2)
print("Saved to:", best_dir, "| t_neu =", best_t)

Saved to: artifacts/xlmr-sentiment-best-balanced | t_neu = 0.33999999999999997
