In [24]:
import os
import re
from pathlib import Path

import numpy as np
import optuna
import pandas as pd
import torch
from datasets import Dataset as HFDataset
from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

In [25]:
# Directory holding the competition CSVs. It defaults to the original local
# location, but can be redirected without editing the notebook by setting the
# DDSC25_DATA_DIR environment variable before starting the kernel.
DATA_DIR = Path(
    os.environ.get(
        "DDSC25_DATA_DIR",
        r"E:\Lomba\ANFORCOM DDSC COMPETITION 2025 - PENYISIHAN\penyisihan-ddsc25",
    )
)

train = pd.read_csv(DATA_DIR / "train.csv")
test = pd.read_csv(DATA_DIR / "test.csv")

In [26]:
# Fit the encoder on the string labels, then store the resulting integer codes
# next to them; `le` is kept so predictions can be mapped back to label names.
le = LabelEncoder().fit(train["label"])
train["label_enc"] = le.transform(train["label"])

In [None]:
# Lookup table: abbreviated/informal spelling -> replacement word.
# Keys are lower-case; normalize_slang lower-cases each token before lookup.
slang_dict = {
    "gak": "tidak",
    "ga": "tidak",
    "nggak": "tidak",
    "aja": "saja",
    "tp": "tapi",
    "yg": "yang",
    "dgn": "dengan",
    "dlm": "dalam",
    "utk": "untuk",
    "jg": "juga",
    "tdk": "tidak",
    "sdh": "sudah",
    "blm": "belum"
}

def normalize_slang(text):
    """Replace whitespace-separated tokens found in ``slang_dict``.

    Matching is case-insensitive ("Gak", "GAK" and "gak" are all replaced);
    tokens without an entry are returned unchanged, including their original
    casing. Runs of whitespace are collapsed to single spaces because the
    text is split and re-joined.

    Parameters
    ----------
    text : str
        Input text. Any non-string value (e.g. ``None`` or a NaN cell from a
        DataFrame) is treated as empty text.

    Returns
    -------
    str
        The text with known slang tokens replaced.
    """
    if not isinstance(text, str):
        # Missing cells arrive as None/float NaN; there is nothing to normalize.
        return ""
    return " ".join(slang_dict.get(token.lower(), token) for token in text.split())

def clean_text(text):
    """Strip URLs, @mentions, #hashtags and special characters from ``text``.

    The substitutions run in the order listed below. URLs, mentions and
    hashtags are deleted outright; every remaining character that is not an
    ASCII letter, digit or whitespace is replaced by a single space. Leading
    and trailing whitespace is trimmed from the result (interior runs of
    spaces are left as they are).
    """
    substitutions = (
        (r"http\S+|www\S+|https\S+", ""),  # URLs
        (r"@\w+", ""),                     # user mentions
        (r"#\w+", ""),                     # hashtags
        (r"[^A-Za-z0-9\s]", " "),          # remaining special characters
    )
    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

def add_features(df):
    """Return a copy of ``df`` with cleaned text and simple surface features.

    Added columns:

    * ``text_clean``    - ``text`` passed through ``clean_text`` and then
      ``normalize_slang``. Cleaning runs first so that slang tokens attached
      to punctuation (e.g. "gak,") are isolated before the dictionary lookup.
    * ``text_len``      - number of characters in ``text_clean``.
    * ``word_count``    - number of whitespace-separated tokens in ``text_clean``.
    * ``capital_ratio`` - count of upper-case characters in the raw ``text``
      divided by ``len(text) + 1`` (the ``+ 1`` keeps the ratio defined for
      empty strings).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``text`` column. Missing values in it are treated as
        empty strings for feature computation; the ``text`` column itself is
        left untouched.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input frame is not modified.
    """
    # Work on a copy so re-running the cell never depends on earlier mutation.
    out = df.copy()
    raw_text = out["text"].fillna("").astype(str)

    out["text_clean"] = raw_text.apply(clean_text).apply(normalize_slang)
    out["text_len"] = out["text_clean"].str.len()
    out["word_count"] = out["text_clean"].str.split().str.len()
    out["capital_ratio"] = raw_text.apply(
        lambda x: sum(1 for c in x if c.isupper()) / (len(x) + 1)
    )
    return out

In [28]:
# Derive the cleaned-text and surface-statistic columns for both splits.
train, test = (add_features(frame) for frame in (train, test))

In [29]:
# Display the training frame to inspect the columns produced by add_features.
train

Unnamed: 0,text,label,label_enc,text_clean,text_len,word_count,capital_ratio
0,@hyalfay @NatharElyas @BosPurwa @NatharElyas @...,ideologi,2,Prabowo menang di 2024 karena strategi cerdas ...,200,33,0.046875
1,@sienchao @Sherly0ctaviany @jokowi Halo @Sienc...,ideologi,2,Halo mungkin terus mengangkat isu negatif Jo...,195,25,0.023438
2,@giovannikurnwn @BudiBukanIntel Nama Budi Berl...,ideologi,2,Nama Budi BerlarI mungkin mencerminkan sikap k...,208,29,0.117188
3,@LeotardosMy @easyanying Kita akui keberhasila...,ideologi,2,Kita akui keberhasilannya kita akui juga kekur...,231,36,0.031250
4,@LuckyGaben @susipudjiastuti @prabowo Indonesi...,ideologi,2,Indonesia masih bisa menuju kesuksesan tapi RU...,217,33,0.042969
...,...,...,...,...,...,...,...
4995,Memang kristen zionis itu rata-rata IslamoPhob...,harmoni,0,Memang kristen zionis itu rata rata IslamoPhob...,146,23,0.058480
4996,Masjid Lafadz Allah Dan wanita yang lekat deng...,harmoni,0,Masjid Lafadz Allah Dan wanita yang lekat deng...,275,41,0.046205
4997,@_MbakSri_ Mulut yang selalu jual toleransi ma...,harmoni,0,Mulut yang selalu jual toleransi malah dia sen...,238,34,0.011952
4998,@itsmesoya_ kalo masalah fisik dan lisan masi ...,pekerjaan,3,kalo masalah fisik dan lisan masi bisa di tole...,148,26,0.000000


In [30]:
# Display the test frame to inspect the columns produced by add_features.
test

Unnamed: 0,id,text,text_clean,text_len,word_count,capital_ratio
0,0,@xuminghao_ogf @ShopeeID @ShopeePay_ID Bapak g...,Bapak gua juga begini bedanya dia bahkan gaper...,109,18,0.053691
1,1,tidak hanya membayangkan masa depan sebagai ki...,tidak hanya membayangkan masa depan sebagai ki...,198,26,0.005025
2,2,@feliscxtus Kalau aku jadi pemimpin RI visiku ...,Kalau aku jadi pemimpin RI visiku bikin Indone...,243,37,0.035156
3,3,@EkoDwiSant73293 @Dahnilanzar @prabowo Masyara...,Masyarakat bisa menerapkan strategi seperti be...,218,27,0.019380
4,4,@goonjol Mengamankan data pribadi rakyat RI si...,Mengamankan data pribadi rakyat RI sik durung ...,96,13,0.028302
...,...,...,...,...,...,...
4995,4995,Cuma mau ngingetin den presiden @prabowo Sama ...,Cuma mau ngingetin den presiden Sama kil den ...,161,27,0.020513
4996,4996,@BebySoSweet Bikin usaha aja selalu bubar ini ...,Bikin usaha saja selalu bubar ini pakai janji ...,153,24,0.024390
4997,4997,@BebySoSweet Sebenar nya @gibran_tweet ini uda...,Sebenar nya ini udah banyak buka lapangan pek...,243,35,0.014981
4998,4998,@WongAlasRoban Bentuk pengusiran secara Halus....,Bentuk pengusiran secara Halus Atau Memang Ki...,183,27,0.059113


In [31]:
# Hold out 20% of the labelled rows for validation. The split is stratified on
# the encoded label and seeded so it is reproducible.
train_df, val_df = train_test_split(
    train,
    test_size=0.2,
    random_state=42,
    stratify=train["label_enc"],
)

In [32]:
# Checkpoint identifier. The same name is passed to from_pretrained inside
# model_init, so the tokenizer and the model weights come from one checkpoint.
model_name = "indolem/indobertweet-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [33]:
def preprocess_function(examples):
    """Tokenize the raw ``text`` field of a batch of examples.

    Every sequence is truncated and padded to exactly 128 tokens, so all
    encoded rows have the same length. Returns whatever the tokenizer
    returns for the batch.
    """
    encode_options = dict(truncation=True, padding="max_length", max_length=128)
    return tokenizer(examples["text"], **encode_options)

def _to_torch_dataset(frame):
    """Turn one pandas split into a tokenized, torch-formatted HF dataset.

    Keeps only ``text`` and ``label_enc``, tokenizes in batches with
    ``preprocess_function``, renames ``label_enc`` to ``labels`` and exposes
    ``input_ids``, ``attention_mask`` and ``labels`` as torch tensors.
    """
    dataset = HFDataset.from_pandas(frame[["text", "label_enc"]])
    dataset = dataset.map(preprocess_function, batched=True)
    dataset = dataset.rename_column("label_enc", "labels")
    dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
    return dataset

train_dataset = _to_torch_dataset(train_df)
val_dataset = _to_torch_dataset(val_df)

Map: 100%|██████████| 4000/4000 [00:00<00:00, 5281.54 examples/s]
Map: 100%|██████████| 1000/1000 [00:00<00:00, 5589.04 examples/s]


In [34]:
def compute_metrics(eval_pred):
    """Score an evaluation pass with balanced accuracy.

    ``eval_pred`` unpacks into ``(logits, labels)``. The predicted class of
    each row is the argmax over the last axis of the logits. Returns a dict
    with the single key ``"balanced_accuracy"``.
    """
    raw_scores, gold = eval_pred
    predicted = np.argmax(raw_scores, axis=-1)
    return {"balanced_accuracy": balanced_accuracy_score(gold, predicted)}

In [35]:
def model_init():
    """Load a fresh sequence-classification model from ``model_name``.

    The size of the classification head is taken from the number of classes
    the label encoder ``le`` was fitted on.
    """
    n_classes = len(le.classes_)
    return AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=n_classes)

def optuna_hp_space(trial):
    """Sample one hyperparameter configuration from ``trial``.

    Search space:

    * ``learning_rate``               - float in [1e-5, 5e-5], log scale
    * ``num_train_epochs``            - int in [2, 6]
    * ``per_device_train_batch_size`` - one of 8, 16, 32
    * ``warmup_ratio``                - float in [0.0, 0.2]
    * ``weight_decay``                - float in [0.0, 0.3]

    Returns a dict keyed by those names.
    """
    space = {}
    space["learning_rate"] = trial.suggest_float("learning_rate", 1e-5, 5e-5, log=True)
    space["num_train_epochs"] = trial.suggest_int("num_train_epochs", 2, 6)
    space["per_device_train_batch_size"] = trial.suggest_categorical(
        "per_device_train_batch_size", [8, 16, 32]
    )
    space["warmup_ratio"] = trial.suggest_float("warmup_ratio", 0.0, 0.2)
    space["weight_decay"] = trial.suggest_float("weight_decay", 0.0, 0.3)
    return space

In [36]:
# Baseline configuration for the search trainer: evaluate and checkpoint once
# per epoch, and reload the checkpoint with the best balanced accuracy when a
# run finishes. The learning rate, batch size, epoch count and weight decay
# set here are starting values only; the hyperparameter search samples its own.
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="balanced_accuracy",
)

# model_init (a factory, not a model instance) lets the Trainer build a fresh
# model for every run. The tokenizer is passed as `processing_class`, the
# replacement for the deprecated `tokenizer=` keyword.
trainer = Trainer(
    model_init=model_init,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indolem/indobertweet-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [37]:
# Run an Optuna-backed search over optuna_hp_space, maximizing the objective
# reported by each trial, and keep the best run for the final training.
best_trial = trainer.hyperparameter_search(
    hp_space=optuna_hp_space,
    backend="optuna",
    direction="maximize",
    n_trials=10,  # can be increased for a more thorough search
)

print("Best trial:", best_trial)

[I 2025-08-31 23:56:21,340] A new study created in memory with name: no-name-85bf2daf-a3fd-4b8e-8bd6-22353306e8b3
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indolem/indobertweet-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Balanced Accuracy
1,No log,1.282884,0.430252
2,No log,1.172827,0.492563
3,No log,1.154815,0.525694


[I 2025-09-01 01:26:32,809] Trial 0 finished with value: 0.5256935224329947 and parameters: {'learning_rate': 1.6871865539597492e-05, 'num_train_epochs': 3, 'per_device_train_batch_size': 32, 'warmup_ratio': 0.09633310348803154, 'weight_decay': 0.13317646922307683}. Best is trial 0 with value: 0.5256935224329947.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indolem/indobertweet-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss


[W 2025-09-01 01:29:28,874] Trial 1 failed with parameters: {'learning_rate': 3.400360179120958e-05, 'num_train_epochs': 4, 'per_device_train_batch_size': 8, 'warmup_ratio': 0.0994313357038509, 'weight_decay': 0.012768747985179407} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "c:\Users\MSI_PC\anaconda3\envs\proyek_nlp\lib\site-packages\optuna\study\_optimize.py", line 201, in _run_trial
    value_or_values = func(trial)
  File "c:\Users\MSI_PC\anaconda3\envs\proyek_nlp\lib\site-packages\transformers\integrations\integration_utils.py", line 274, in _objective
    trainer.train(resume_from_checkpoint=checkpoint, trial=trial)
  File "c:\Users\MSI_PC\anaconda3\envs\proyek_nlp\lib\site-packages\transformers\trainer.py", line 2238, in train
    return inner_training_loop(
  File "c:\Users\MSI_PC\anaconda3\envs\proyek_nlp\lib\site-packages\transformers\trainer.py", line 2582, in _inner_training_loop
    tr_loss_step = self.training_step(model,

KeyboardInterrupt: 

In [None]:
# Hyperparameters of the best search run, keyed by the names used in
# optuna_hp_space.
best_params = best_trial.hyperparameters

# Same evaluation/checkpoint policy as the search configuration, but with the
# selected hyperparameters and a separate output directory. The evaluation
# batch size reuses the selected training batch size.
training_args = TrainingArguments(
    output_dir="./results_best",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=best_params["learning_rate"],
    per_device_train_batch_size=best_params["per_device_train_batch_size"],
    per_device_eval_batch_size=best_params["per_device_train_batch_size"],
    num_train_epochs=best_params["num_train_epochs"],
    warmup_ratio=best_params["warmup_ratio"],
    weight_decay=best_params["weight_decay"],
    load_best_model_at_end=True,
    metric_for_best_model="balanced_accuracy",
)

# The tokenizer is passed as `processing_class`, the replacement for the
# deprecated `tokenizer=` keyword.
trainer = Trainer(
    model_init=model_init,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Fine-tune with the selected hyperparameters. The arguments above set
# load_best_model_at_end=True with metric_for_best_model="balanced_accuracy".
trainer.train()

In [None]:
# Trainer.predict needs a tokenized dataset, not the raw pandas frame, so the
# test texts are encoded exactly like the training texts first. Row order is
# preserved, which keeps predictions aligned with `test`.
test_dataset = HFDataset.from_pandas(test[["text"]])
test_dataset = test_dataset.map(preprocess_function, batched=True)
test_dataset.set_format("torch", columns=["input_ids", "attention_mask"])

preds = trainer.predict(test_dataset)
final_preds = np.argmax(preds.predictions, axis=-1)

# Map the predicted class indices back to the original label strings.
test["label"] = le.inverse_transform(final_preds)

test[["id", "label"]].to_csv("submission_15.csv", index=False)
