In [None]:
# %pip install --upgrade torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
# %pip install transformers[torch]==4.30.2
# %pip install accelerate -U
# %pip install optuna
%pip install ipywidgets

In [14]:
import ipywidgets as widgets

In [None]:
model_choice_int = 1
bert_model_path = 'indolem/indobert-base-uncased'
distilbert_model_path = 'cahya/distilbert-base-indonesian'

# 0 = distilbert, 1 = bert

widgets.Dropdown(
    options=[('Distilbert', 1), ('Indobert', 2)],
    value=1,
    description='Model:',
    disabled=False,
    
)

Dropdown(description='Model:', options=(('Distilbert', 1), ('Indobert', 2)), value=1)

In [3]:
import pandas as pd

datasets = {}

df = pd.read_csv('Dataset\Pre-Processed Dataset\combined_dataset.csv')
datasets['combined_dataset'] = df[['encoded_label', 'clean_text']]

datasets['combined_dataset'].head(10)

Unnamed: 0,encoded_label,clean_text
0,1.0,kaka tidur yaa sudah pagi tidak boleh capek2
1,1.0,makan nasi padang saja badannya
2,0.0,suka cukur jembut manggung
3,1.0,hai kak isyana ngefans sekali kak isyana suka ...
4,1.0,manusia bidadari sih herann deh cantik
5,0.0,ayu kinantii isyan sekarang berubah ya baju ny...
6,1.0,gemesnya isyan mirip tango berlapis lapis ciaaaa
7,0.0,jelek saja anaknya ayahnya cakep2
8,0.0,anaknya mirip sudah tua begitu ya mukanya kart...
9,0.0,muka anak nya ko tua sekali yaa tidak ngegemes...


## Train-Test Split

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import torch
from torch.utils.data import Dataset

# Custom dataset
class CustomDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
        item['labels'] = torch.tensor(int(self.labels[idx]))
        return item

    def __len__(self):
        return len(self.labels)

# Load tokenizer
if model_choice_int == 0:
    tokenizer = AutoTokenizer.from_pretrained(distilbert_model_path)
    
else:
    tokenizer = AutoTokenizer.from_pretrained(bert_model_path)


# Dictionary to hold train and test custom datasets
train_test_datasets = {}

for name, df in datasets.items():
    # Get text and labels
    X = df['clean_text'].tolist()
    y = df['encoded_label'].tolist()

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Tokenize
    train_encodings = tokenizer(X_train, truncation=True, padding=True)
    test_encodings = tokenizer(X_test, truncation=True, padding=True)

    # Create datasets
    train_dataset = CustomDataset(train_encodings, y_train)
    test_dataset = CustomDataset(test_encodings, y_test)

    # Store in Dict
    train_test_datasets[name] = {
        "train": train_dataset,
        "test": test_dataset,
        # "val": val_dataset

    }

    print(f"Dataset: {name}, Train size: {len(train_dataset)}, Test size: {len(test_dataset)}")

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Dataset: combined_dataset, Train size: 3033, Test size: 759


## Train and Evaluation


In [6]:
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments, BertForSequenceClassification
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm

In [7]:
output_dir = f'./results/{name}'
log_dir = f'./logs/{name}'
os.makedirs(output_dir, exist_ok=True)
os.makedirs(log_dir, exist_ok=True)

default_training_args = TrainingArguments(
    output_dir=output_dir,
    run_name=f'training_{name}',
    num_train_epochs=4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # gradient_accumulation_steps=2,
    warmup_steps=50,
    weight_decay=0.01,
    logging_dir=log_dir,
    logging_steps=10,
    do_eval=True,
    eval_steps=500,
    save_strategy="epoch",
    save_total_limit=1,
    report_to=[],
    fp16=True,
    
)

best_training_args_distil = TrainingArguments(
    output_dir=output_dir,
    run_name=f'training_{name}',
    per_device_eval_batch_size=16,  # can stay as-is for evaluation
    num_train_epochs=6,            # ⬅️ from Optuna
    per_device_train_batch_size=16,  # ⬅️ from Optuna
    warmup_steps=29,               # ⬅️ from Optuna
    weight_decay=0.09793901282245424,  # ⬅️ from Optuna
    learning_rate=3.694163912198525e-05,  # ⬅️ from Optuna
    logging_dir=log_dir,
    logging_steps=10,
    do_eval=True,
    eval_steps=500,
    save_strategy="epoch",
    save_total_limit=1,
    report_to=[],
    fp16=True,
)

best_training_args_bert = TrainingArguments(
    output_dir=output_dir,
    run_name=f'training_{name}',
    per_device_eval_batch_size=8,
    per_device_train_batch_size=8,  # From Optuna
    num_train_epochs=9,             # From Optuna
    warmup_steps=117,               # From Optuna
    weight_decay=0.12399629519542921,  # From Optuna
    learning_rate=2.9677882858655988e-05,  # From Optuna
    logging_dir=log_dir,
    logging_steps=10,
    do_eval=True,
    eval_steps=500,
    save_strategy="epoch",
    save_total_limit=1,
    report_to=[],
    fp16=True,
)


### Default


In [8]:
os.environ["WANDB_MODE"] = "disabled"

In [9]:
def evaluate_model(model, test_dataset, batch_size, output_dir, dataset_name):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()

    dataloader = DataLoader(test_dataset, batch_size=batch_size)

    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            # Move inputs to device
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            # Forward pass
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits

            # Move to CPU and accumulate
            preds = torch.argmax(logits, dim=1).cpu().numpy()
            all_preds.extend(preds)
            all_labels.extend(labels.cpu().numpy())

    # Metrics
    report = classification_report(all_labels, all_preds, digits=4)
    with open(os.path.join(output_dir, "classification_report.txt"), "w") as f:
        f.write(report)

    cm = confusion_matrix(all_labels, all_preds)
    plt.figure(figsize=(5, 4))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title(f'Confusion Matrix: {dataset_name}')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, "confusion_matrix.png"))
    plt.close()

    print(f"✅ Evaluation complete for {dataset_name}")

In [10]:
print(f"\n🚀 Training on dataset: {name}")

# Initialize model
if model_choice_int == 0:
    model = DistilBertForSequenceClassification.from_pretrained(
            distilbert_model_path, num_labels=2
    )

    training_args = best_training_args_distil

elif model_choice_int == 1:
    model = BertForSequenceClassification.from_pretrained(
            bert_model_path, num_labels=2
    )

    training_args = best_training_args_bert

else:
    model = DistilBertForSequenceClassification.from_pretrained(
            distilbert_model_path, num_labels=2
    )

    training_args = default_training_args


print(training_args)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_test_datasets[name]['train'],
    eval_dataset=train_test_datasets[name]['test'],
)

trainer.train()

# Save model
trainer.save_model(output_dir)


🚀 Training on dataset: combined_dataset


  return torch.load(checkpoint_file, map_location="cpu")
Some weights of the model checkpoint at indolem/indobert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not

TrainingArguments(
_n_gpu=1,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_backend=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_steps=500,
evaluation_strategy=no,
fp16=True,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False},
fsdp_min_num_params=0,
fsdp_transformer_layer_cls_to_wrap=None,
full_determinism=False,
gradient_accumulation_steps=1,
gradient_checkpointing=False,
greater_is_better=None,
group_by_length=False,
half_precision_backend=auto,
hub_model_id=None,
hub_private_repo=False,
hub_strategy=every_save,
hub_token=<HU



  0%|          | 0/3420 [00:00<?, ?it/s]

{'loss': 0.6858, 'learning_rate': 1.7755998291503582e-06, 'epoch': 0.03}
{'loss': 0.7449, 'learning_rate': 4.31217101365087e-06, 'epoch': 0.05}
{'loss': 0.6916, 'learning_rate': 6.59508507970133e-06, 'epoch': 0.08}
{'loss': 0.7761, 'learning_rate': 9.131656264201844e-06, 'epoch': 0.11}
{'loss': 0.7092, 'learning_rate': 1.1668227448702355e-05, 'epoch': 0.13}
{'loss': 0.7449, 'learning_rate': 1.3951141514752815e-05, 'epoch': 0.16}
{'loss': 0.7206, 'learning_rate': 1.6234055580803276e-05, 'epoch': 0.18}
{'loss': 0.7211, 'learning_rate': 1.8770626765303786e-05, 'epoch': 0.21}
{'loss': 0.7213, 'learning_rate': 2.13071979498043e-05, 'epoch': 0.24}
{'loss': 0.7765, 'learning_rate': 2.384376913430481e-05, 'epoch': 0.26}
{'loss': 0.639, 'learning_rate': 2.638034031880532e-05, 'epoch': 0.29}
{'loss': 0.6782, 'learning_rate': 2.8916911503305833e-05, 'epoch': 0.32}
{'loss': 0.695, 'learning_rate': 2.9614986951901344e-05, 'epoch': 0.34}
{'loss': 0.6061, 'learning_rate': 2.9525135656537564e-05, 'epo

In [11]:
# Evaluate and log
evaluate_model(model, train_test_datasets[name]['test'], 4, output_dir, name)

Evaluating: 100%|██████████| 190/190 [00:04<00:00, 43.02it/s]


✅ Evaluation complete for combined_dataset


### Optuna

In [12]:
import torch
import numpy as np
from sklearn.metrics import accuracy_score
from transformers import Trainer, DistilBertForSequenceClassification, BertForSequenceClassification
import gc
from torch.utils.data import DataLoader
from tqdm import tqdm
from optuna import create_study
from sklearn.metrics import f1_score

def optuna_hp_space(trial):
    return {
        "learning_rate": trial.suggest_float("learning_rate", 1e-5, 5e-5),
        "per_device_train_batch_size": trial.suggest_categorical("per_device_train_batch_size", [4, 8, 16]),
        "weight_decay": trial.suggest_float("weight_decay", 0.0, 0.3),
        "num_train_epochs": trial.suggest_int("num_train_epochs", 2, 10),
        "warmup_steps": trial.suggest_int("warmup_steps", 0, 300),
    }

def manual_evaluate(model, dataset, batch_size=8):
    model.eval()
    model.to("cuda" if torch.cuda.is_available() else "cpu")
    dataloader = DataLoader(dataset, batch_size=batch_size)

    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            inputs = {k: v.to(model.device) for k, v in batch.items() if k in ["input_ids", "attention_mask"]}
            labels = batch["labels"].to(model.device)

            outputs = model(**inputs)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=-1)

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    acc = f1_score(all_labels, all_preds)

    torch.cuda.empty_cache()
    gc.collect()

    return acc
    

# Subclass Trainer to inject memory cleanup
class CleanTrainer(Trainer):
    def train(self, *args, **kwargs):
        result = super().train(*args, **kwargs)
        torch.cuda.empty_cache()
        gc.collect()
        return result

if model_choice_int == 0:
    # Initialize model
    model_init = lambda: DistilBertForSequenceClassification.from_pretrained(
        distilbert_model_path, num_labels=2
    )

else:
    model_init = lambda: BertForSequenceClassification.from_pretrained(
        bert_model_path, num_labels=2
    )



# Initialize the Trainer
trainer = CleanTrainer(
    model_init=model_init,
    args=training_args,
    train_dataset=train_test_datasets[name]["train"],
    eval_dataset=train_test_datasets[name]["test"],
    compute_metrics= manual_evaluate(trainer.model, train_test_datasets[name]['test'], batch_size=8),
)

def objective(trial):
    hp = optuna_hp_space(trial)

    training_args = TrainingArguments(
        output_dir=f"./results/optuna_trial_{trial.number}",
        num_train_epochs=hp["num_train_epochs"],
        per_device_train_batch_size=hp["per_device_train_batch_size"],
        warmup_steps=hp["warmup_steps"],
        weight_decay=hp["weight_decay"],
        save_strategy="no",        # ← disables saving
        save_total_limit=0,         # ← just in case (optional safety)
        logging_dir=f"./logs/optuna_trial_{trial.number}",
        report_to=[],  # Disable wandb
        fp16=True,
    )

    model = model_init()

    trainer = CleanTrainer(
        model=model,
        args=training_args,
        train_dataset=train_test_datasets[name]["train"],
        eval_dataset=train_test_datasets[name]["test"],
    )

    trainer.train()

    acc = manual_evaluate(trainer.model, train_test_datasets[name]["test"], batch_size=8)
    return acc

study = create_study(direction="maximize")
study.optimize(objective, n_trials=30)

print("✅ Best trial:")
print(study.best_trial)



Evaluating: 100%|██████████| 95/95 [00:02<00:00, 41.57it/s]
  return torch.load(checkpoint_file, map_location="cpu")
Some weights of the model checkpoint at indolem/indobert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification mo

  0%|          | 0/1520 [00:00<?, ?it/s]

{'loss': 0.463, 'learning_rate': 3.5904628330995795e-05, 'epoch': 1.32}
{'loss': 0.1532, 'learning_rate': 1.8373071528751755e-05, 'epoch': 2.63}
{'loss': 0.053, 'learning_rate': 8.415147265077139e-07, 'epoch': 3.95}
{'train_runtime': 130.8412, 'train_samples_per_second': 92.723, 'train_steps_per_second': 11.617, 'train_loss': 0.22079304056732277, 'epoch': 4.0}


Evaluating: 100%|██████████| 95/95 [00:01<00:00, 55.88it/s]
[I 2025-08-18 18:48:26,155] Trial 0 finished with value: 0.9654088050314465 and parameters: {'learning_rate': 2.4111941151269226e-05, 'per_device_train_batch_size': 8, 'weight_decay': 0.2871265242030436, 'num_train_epochs': 4, 'warmup_steps': 94}. Best is trial 0 with value: 0.9654088050314465.
  return torch.load(checkpoint_file, map_location="cpu")
Some weights of the model checkpoint at indolem/indobert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequen

  0%|          | 0/1520 [00:00<?, ?it/s]

{'loss': 0.3439, 'learning_rate': 3.6528866714183895e-05, 'epoch': 2.63}
{'loss': 0.0648, 'learning_rate': 1.870990734141126e-05, 'epoch': 5.26}
{'loss': 0.0225, 'learning_rate': 8.909479686386316e-07, 'epoch': 7.89}
{'train_runtime': 148.6926, 'train_samples_per_second': 163.182, 'train_steps_per_second': 10.222, 'train_loss': 0.1423149692776956, 'epoch': 8.0}


Evaluating: 100%|██████████| 95/95 [00:01<00:00, 51.25it/s]
[I 2025-08-18 18:50:58,459] Trial 1 finished with value: 0.9640062597809077 and parameters: {'learning_rate': 2.3341108257567182e-05, 'per_device_train_batch_size': 16, 'weight_decay': 0.06286913365127207, 'num_train_epochs': 8, 'warmup_steps': 117}. Best is trial 0 with value: 0.9654088050314465.
  return torch.load(checkpoint_file, map_location="cpu")
Some weights of the model checkpoint at indolem/indobert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSeq

  0%|          | 0/1140 [00:00<?, ?it/s]

{'loss': 0.3216, 'learning_rate': 2.8808243727598565e-05, 'epoch': 2.63}
{'loss': 0.0442, 'learning_rate': 6.406810035842293e-06, 'epoch': 5.26}
{'train_runtime': 114.5073, 'train_samples_per_second': 158.924, 'train_steps_per_second': 9.956, 'train_loss': 0.16310628966281288, 'epoch': 6.0}


Evaluating: 100%|██████████| 95/95 [00:01<00:00, 53.02it/s]
[I 2025-08-18 18:52:56,568] Trial 2 finished with value: 0.9796557120500783 and parameters: {'learning_rate': 1.0673160333375926e-05, 'per_device_train_batch_size': 16, 'weight_decay': 0.04668752250156914, 'num_train_epochs': 6, 'warmup_steps': 24}. Best is trial 2 with value: 0.9796557120500783.
  return torch.load(checkpoint_file, map_location="cpu")
Some weights of the model checkpoint at indolem/indobert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequ

  0%|          | 0/760 [00:00<?, ?it/s]

{'loss': 0.4205, 'learning_rate': 2.6383399209486165e-05, 'epoch': 2.63}
{'train_runtime': 74.1405, 'train_samples_per_second': 163.635, 'train_steps_per_second': 10.251, 'train_loss': 0.30330435602288497, 'epoch': 4.0}


Evaluating: 100%|██████████| 95/95 [00:01<00:00, 58.82it/s]
[I 2025-08-18 18:54:13,960] Trial 3 finished with value: 0.9651898734177214 and parameters: {'learning_rate': 1.6726306539686333e-05, 'per_device_train_batch_size': 16, 'weight_decay': 0.2859658053547885, 'num_train_epochs': 4, 'warmup_steps': 254}. Best is trial 2 with value: 0.9796557120500783.
  return torch.load(checkpoint_file, map_location="cpu")
Some weights of the model checkpoint at indolem/indobert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequ

  0%|          | 0/2660 [00:00<?, ?it/s]

{'loss': 0.479, 'learning_rate': 4.316460741331208e-05, 'epoch': 1.32}
{'loss': 0.177, 'learning_rate': 3.3200478278198486e-05, 'epoch': 2.63}
{'loss': 0.0615, 'learning_rate': 2.3236349143084895e-05, 'epoch': 3.95}
{'loss': 0.0207, 'learning_rate': 1.3272220007971303e-05, 'epoch': 5.26}
{'loss': 0.0142, 'learning_rate': 3.3080908728577124e-06, 'epoch': 6.58}
{'train_runtime': 224.0772, 'train_samples_per_second': 94.749, 'train_steps_per_second': 11.871, 'train_loss': 0.1423984403897049, 'epoch': 7.0}


Evaluating: 100%|██████████| 95/95 [00:01<00:00, 52.24it/s]
[I 2025-08-18 18:58:01,644] Trial 4 finished with value: 0.9826224328593998 and parameters: {'learning_rate': 3.918392179981142e-05, 'per_device_train_batch_size': 8, 'weight_decay': 0.1081716243429309, 'num_train_epochs': 7, 'warmup_steps': 151}. Best is trial 4 with value: 0.9826224328593998.
  return torch.load(checkpoint_file, map_location="cpu")
Some weights of the model checkpoint at indolem/indobert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequen

  0%|          | 0/6831 [00:00<?, ?it/s]

{'loss': 0.579, 'learning_rate': 4.6926836492891e-05, 'epoch': 0.66}
{'loss': 0.4516, 'learning_rate': 4.322422985781991e-05, 'epoch': 1.32}
{'loss': 0.3133, 'learning_rate': 3.9521623222748815e-05, 'epoch': 1.98}
{'loss': 0.2329, 'learning_rate': 3.581901658767773e-05, 'epoch': 2.64}
{'loss': 0.1362, 'learning_rate': 3.211640995260663e-05, 'epoch': 3.29}
{'loss': 0.1677, 'learning_rate': 2.842120853080569e-05, 'epoch': 3.95}
{'loss': 0.1259, 'learning_rate': 2.4718601895734598e-05, 'epoch': 4.61}
{'loss': 0.0971, 'learning_rate': 2.1015995260663507e-05, 'epoch': 5.27}
{'loss': 0.1039, 'learning_rate': 1.7313388625592416e-05, 'epoch': 5.93}
{'loss': 0.0702, 'learning_rate': 1.3610781990521326e-05, 'epoch': 6.59}
{'loss': 0.0734, 'learning_rate': 9.908175355450237e-06, 'epoch': 7.25}
{'loss': 0.0554, 'learning_rate': 6.205568720379147e-06, 'epoch': 7.91}
{'loss': 0.0436, 'learning_rate': 2.502962085308057e-06, 'epoch': 8.56}
{'train_runtime': 558.4683, 'train_samples_per_second': 48.878

Evaluating: 100%|██████████| 95/95 [00:01<00:00, 53.67it/s]
[I 2025-08-18 19:07:23,555] Trial 5 finished with value: 0.9671361502347419 and parameters: {'learning_rate': 1.692254452337742e-05, 'per_device_train_batch_size': 4, 'weight_decay': 0.08019437096042144, 'num_train_epochs': 9, 'warmup_steps': 79}. Best is trial 4 with value: 0.9826224328593998.
  return torch.load(checkpoint_file, map_location="cpu")
Some weights of the model checkpoint at indolem/indobert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequen

  0%|          | 0/6072 [00:00<?, ?it/s]

{'loss': 0.6425, 'learning_rate': 4.6913372582001685e-05, 'epoch': 0.66}
{'loss': 0.5304, 'learning_rate': 4.2716568544995794e-05, 'epoch': 1.32}
{'loss': 0.4849, 'learning_rate': 3.851976450798991e-05, 'epoch': 1.98}
{'loss': 0.5976, 'learning_rate': 3.4314550042052145e-05, 'epoch': 2.64}
{'loss': 0.7017, 'learning_rate': 3.0109335576114384e-05, 'epoch': 3.29}
{'loss': 0.6985, 'learning_rate': 2.590412111017662e-05, 'epoch': 3.95}
{'loss': 0.6999, 'learning_rate': 2.1698906644238856e-05, 'epoch': 4.61}
{'loss': 0.6931, 'learning_rate': 1.7493692178301092e-05, 'epoch': 5.27}
{'loss': 0.6955, 'learning_rate': 1.3288477712363332e-05, 'epoch': 5.93}
{'loss': 0.6966, 'learning_rate': 9.083263246425568e-06, 'epoch': 6.59}
{'loss': 0.6901, 'learning_rate': 4.8780487804878055e-06, 'epoch': 7.25}
{'loss': 0.6886, 'learning_rate': 6.72834314550042e-07, 'epoch': 7.91}
{'train_runtime': 487.6471, 'train_samples_per_second': 49.757, 'train_steps_per_second': 12.452, 'train_loss': 0.652151502325286

Evaluating: 100%|██████████| 95/95 [00:01<00:00, 52.24it/s]
[I 2025-08-18 19:15:34,689] Trial 6 finished with value: 0.0 and parameters: {'learning_rate': 4.6864538825921724e-05, 'per_device_train_batch_size': 4, 'weight_decay': 0.008295471453653835, 'num_train_epochs': 8, 'warmup_steps': 127}. Best is trial 4 with value: 0.9826224328593998.
  return torch.load(checkpoint_file, map_location="cpu")
Some weights of the model checkpoint at indolem/indobert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassifica

  0%|          | 0/2660 [00:00<?, ?it/s]

{'loss': 0.5005, 'learning_rate': 4.432841932841933e-05, 'epoch': 1.32}
{'loss': 0.1774, 'learning_rate': 3.409090909090909e-05, 'epoch': 2.63}
{'loss': 0.0637, 'learning_rate': 2.3853398853398853e-05, 'epoch': 3.95}
{'loss': 0.0177, 'learning_rate': 1.3615888615888617e-05, 'epoch': 5.26}
{'loss': 0.009, 'learning_rate': 3.3783783783783788e-06, 'epoch': 6.58}
{'train_runtime': 213.7748, 'train_samples_per_second': 99.315, 'train_steps_per_second': 12.443, 'train_loss': 0.14545012129876847, 'epoch': 7.0}


Evaluating: 100%|██████████| 95/95 [00:01<00:00, 54.29it/s]
[I 2025-08-18 19:19:11,892] Trial 7 finished with value: 0.967948717948718 and parameters: {'learning_rate': 3.3850662096055935e-05, 'per_device_train_batch_size': 8, 'weight_decay': 0.1913676089047663, 'num_train_epochs': 7, 'warmup_steps': 218}. Best is trial 4 with value: 0.9826224328593998.
  return torch.load(checkpoint_file, map_location="cpu")
Some weights of the model checkpoint at indolem/indobert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequen

  0%|          | 0/570 [00:00<?, ?it/s]

{'loss': 0.3445, 'learning_rate': 8.296943231441049e-06, 'epoch': 2.63}
{'train_runtime': 55.9941, 'train_samples_per_second': 162.499, 'train_steps_per_second': 10.18, 'train_loss': 0.31050122244316236, 'epoch': 3.0}


Evaluating: 100%|██████████| 95/95 [00:01<00:00, 51.69it/s]
[I 2025-08-18 19:20:11,330] Trial 8 finished with value: 0.9438202247191011 and parameters: {'learning_rate': 4.426963547581735e-05, 'per_device_train_batch_size': 16, 'weight_decay': 0.13113619037197863, 'num_train_epochs': 3, 'warmup_steps': 112}. Best is trial 4 with value: 0.9826224328593998.
  return torch.load(checkpoint_file, map_location="cpu")
Some weights of the model checkpoint at indolem/indobert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequ

  0%|          | 0/6831 [00:00<?, ?it/s]

{'loss': 0.6057, 'learning_rate': 4.75390156062425e-05, 'epoch': 0.66}
{'loss': 0.4223, 'learning_rate': 4.379501800720288e-05, 'epoch': 1.32}
{'loss': 0.2217, 'learning_rate': 4.004351740696279e-05, 'epoch': 1.98}
{'loss': 0.1473, 'learning_rate': 3.629201680672269e-05, 'epoch': 2.64}
{'loss': 0.12, 'learning_rate': 3.25405162064826e-05, 'epoch': 3.29}
{'loss': 0.0788, 'learning_rate': 2.8789015606242496e-05, 'epoch': 3.95}
{'loss': 0.0371, 'learning_rate': 2.5037515006002404e-05, 'epoch': 4.61}
{'loss': 0.0524, 'learning_rate': 2.1286014405762305e-05, 'epoch': 5.27}
{'loss': 0.0189, 'learning_rate': 1.754201680672269e-05, 'epoch': 5.93}
{'loss': 0.0095, 'learning_rate': 1.3790516206482592e-05, 'epoch': 6.59}
{'loss': 0.0133, 'learning_rate': 1.0039015606242498e-05, 'epoch': 7.25}
{'loss': 0.0047, 'learning_rate': 6.2875150060024004e-06, 'epoch': 7.91}
{'loss': 0.0108, 'learning_rate': 2.543517406962785e-06, 'epoch': 8.56}
{'train_runtime': 606.8944, 'train_samples_per_second': 44.978

Evaluating: 100%|██████████| 95/95 [00:01<00:00, 54.87it/s]
[I 2025-08-18 19:30:21,543] Trial 9 finished with value: 0.9748427672955976 and parameters: {'learning_rate': 2.1647774646251175e-05, 'per_device_train_batch_size': 4, 'weight_decay': 0.08535011955162826, 'num_train_epochs': 9, 'warmup_steps': 167}. Best is trial 4 with value: 0.9826224328593998.
  return torch.load(checkpoint_file, map_location="cpu")
Some weights of the model checkpoint at indolem/indobert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequ

  0%|          | 0/2280 [00:00<?, ?it/s]

{'loss': 0.5244, 'learning_rate': 4.292929292929293e-05, 'epoch': 1.32}
{'loss': 0.209, 'learning_rate': 3.090428090428091e-05, 'epoch': 2.63}
{'loss': 0.0805, 'learning_rate': 1.887926887926888e-05, 'epoch': 3.95}
{'loss': 0.0235, 'learning_rate': 6.854256854256854e-06, 'epoch': 5.26}
{'train_runtime': 187.0095, 'train_samples_per_second': 97.311, 'train_steps_per_second': 12.192, 'train_loss': 0.18532144521412097, 'epoch': 6.0}


Evaluating: 100%|██████████| 95/95 [00:01<00:00, 51.83it/s]
[I 2025-08-18 19:33:32,115] Trial 10 finished with value: 0.9685534591194969 and parameters: {'learning_rate': 3.313971781960163e-05, 'per_device_train_batch_size': 8, 'weight_decay': 0.19432666625223874, 'num_train_epochs': 6, 'warmup_steps': 201}. Best is trial 4 with value: 0.9826224328593998.
  return torch.load(checkpoint_file, map_location="cpu")
Some weights of the model checkpoint at indolem/indobert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequ

  0%|          | 0/2280 [00:00<?, ?it/s]

{'loss': 0.4619, 'learning_rate': 3.9609236234458264e-05, 'epoch': 1.32}
{'loss': 0.1572, 'learning_rate': 2.8507992895204267e-05, 'epoch': 2.63}
{'loss': 0.0558, 'learning_rate': 1.7406749555950266e-05, 'epoch': 3.95}
{'loss': 0.0181, 'learning_rate': 6.305506216696271e-06, 'epoch': 5.26}
{'train_runtime': 197.2122, 'train_samples_per_second': 92.276, 'train_steps_per_second': 11.561, 'train_loss': 0.1546648067340516, 'epoch': 6.0}


Evaluating: 100%|██████████| 95/95 [00:01<00:00, 51.33it/s]
[I 2025-08-18 19:36:52,856] Trial 11 finished with value: 0.9750778816199377 and parameters: {'learning_rate': 1.0563481837762657e-05, 'per_device_train_batch_size': 8, 'weight_decay': 0.007174323851631512, 'num_train_epochs': 6, 'warmup_steps': 28}. Best is trial 4 with value: 0.9826224328593998.
  return torch.load(checkpoint_file, map_location="cpu")
Some weights of the model checkpoint at indolem/indobert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSeq

  0%|          | 0/950 [00:00<?, ?it/s]

{'loss': 0.3249, 'learning_rate': 2.4252136752136752e-05, 'epoch': 2.63}
{'train_runtime': 92.1551, 'train_samples_per_second': 164.559, 'train_steps_per_second': 10.309, 'train_loss': 0.19048501667223477, 'epoch': 5.0}


Evaluating: 100%|██████████| 95/95 [00:01<00:00, 55.20it/s]
[I 2025-08-18 19:38:28,366] Trial 12 finished with value: 0.9728867623604466 and parameters: {'learning_rate': 1.0261255510351852e-05, 'per_device_train_batch_size': 16, 'weight_decay': 0.12874328939594196, 'num_train_epochs': 5, 'warmup_steps': 14}. Best is trial 4 with value: 0.9826224328593998.
  return torch.load(checkpoint_file, map_location="cpu")
Some weights of the model checkpoint at indolem/indobert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSeq

  0%|          | 0/380 [00:00<?, ?it/s]

{'train_runtime': 37.2263, 'train_samples_per_second': 162.949, 'train_steps_per_second': 10.208, 'train_loss': 0.3621800472861842, 'epoch': 2.0}


Evaluating: 100%|██████████| 95/95 [00:01<00:00, 52.11it/s]
[I 2025-08-18 19:39:09,335] Trial 13 finished with value: 0.9290322580645162 and parameters: {'learning_rate': 1.3380754679059802e-05, 'per_device_train_batch_size': 16, 'weight_decay': 0.048579036780905124, 'num_train_epochs': 2, 'warmup_steps': 58}. Best is trial 4 with value: 0.9826224328593998.
  return torch.load(checkpoint_file, map_location="cpu")
Some weights of the model checkpoint at indolem/indobert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSe

  0%|          | 0/3800 [00:00<?, ?it/s]

{'loss': 0.5123, 'learning_rate': 4.721428571428572e-05, 'epoch': 1.32}
{'loss': 0.1876, 'learning_rate': 4.007142857142857e-05, 'epoch': 2.63}
{'loss': 0.0516, 'learning_rate': 3.292857142857143e-05, 'epoch': 3.95}
{'loss': 0.0226, 'learning_rate': 2.5785714285714284e-05, 'epoch': 5.26}
{'loss': 0.0091, 'learning_rate': 1.8657142857142858e-05, 'epoch': 6.58}
{'loss': 0.0054, 'learning_rate': 1.1514285714285715e-05, 'epoch': 7.89}
{'loss': 0.0069, 'learning_rate': 4.371428571428571e-06, 'epoch': 9.21}
{'train_runtime': 291.3101, 'train_samples_per_second': 104.116, 'train_steps_per_second': 13.045, 'train_loss': 0.10548730743558783, 'epoch': 10.0}


Evaluating: 100%|██████████| 95/95 [00:01<00:00, 58.32it/s]
[I 2025-08-18 19:44:03,943] Trial 14 finished with value: 0.9922239502332814 and parameters: {'learning_rate': 3.164159687360286e-05, 'per_device_train_batch_size': 8, 'weight_decay': 0.1640557421098752, 'num_train_epochs': 10, 'warmup_steps': 300}. Best is trial 14 with value: 0.9922239502332814.
  return torch.load(checkpoint_file, map_location="cpu")
Some weights of the model checkpoint at indolem/indobert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSeq

  0%|          | 0/3800 [00:00<?, ?it/s]

{'loss': 0.5141, 'learning_rate': 4.682629640124682e-05, 'epoch': 1.32}
{'loss': 0.1834, 'learning_rate': 3.974213658260131e-05, 'epoch': 2.63}
{'loss': 0.0717, 'learning_rate': 3.265797676395579e-05, 'epoch': 3.95}
{'loss': 0.0322, 'learning_rate': 2.5573816945310287e-05, 'epoch': 5.26}
{'loss': 0.0096, 'learning_rate': 1.8489657126664778e-05, 'epoch': 6.58}
{'loss': 0.0109, 'learning_rate': 1.140549730801927e-05, 'epoch': 7.89}
{'loss': 0.0105, 'learning_rate': 4.3213374893737605e-06, 'epoch': 9.21}
{'train_runtime': 286.5391, 'train_samples_per_second': 105.849, 'train_steps_per_second': 13.262, 'train_loss': 0.1100570777842873, 'epoch': 10.0}


Evaluating: 100%|██████████| 95/95 [00:01<00:00, 56.57it/s]
[I 2025-08-18 19:48:53,958] Trial 15 finished with value: 0.9732283464566929 and parameters: {'learning_rate': 3.415483189042178e-05, 'per_device_train_batch_size': 8, 'weight_decay': 0.17481038013263972, 'num_train_epochs': 10, 'warmup_steps': 271}. Best is trial 14 with value: 0.9922239502332814.
  return torch.load(checkpoint_file, map_location="cpu")
Some weights of the model checkpoint at indolem/indobert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSe

  0%|          | 0/3800 [00:00<?, ?it/s]

{'loss': 0.5293, 'learning_rate': 4.54858243875585e-05, 'epoch': 1.32}
{'loss': 0.2905, 'learning_rate': 3.861822185521607e-05, 'epoch': 2.63}
{'loss': 0.1689, 'learning_rate': 3.1736856592347925e-05, 'epoch': 3.95}
{'loss': 0.082, 'learning_rate': 2.485549132947977e-05, 'epoch': 5.26}
{'loss': 0.0586, 'learning_rate': 1.7974126066611615e-05, 'epoch': 6.58}
{'loss': 0.0401, 'learning_rate': 1.1092760803743462e-05, 'epoch': 7.89}
{'loss': 0.038, 'learning_rate': 4.21139554087531e-06, 'epoch': 9.21}
{'train_runtime': 291.8411, 'train_samples_per_second': 103.926, 'train_steps_per_second': 13.021, 'train_loss': 0.16147889839975457, 'epoch': 10.0}


Evaluating: 100%|██████████| 95/95 [00:01<00:00, 52.53it/s]
[I 2025-08-18 19:53:49,366] Trial 16 finished with value: 0.9611197511664075 and parameters: {'learning_rate': 2.8727684502572765e-05, 'per_device_train_batch_size': 8, 'weight_decay': 0.2234835666926877, 'num_train_epochs': 10, 'warmup_steps': 167}. Best is trial 14 with value: 0.9922239502332814.
  return torch.load(checkpoint_file, map_location="cpu")
Some weights of the model checkpoint at indolem/indobert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSe

  0%|          | 0/3040 [00:00<?, ?it/s]

{'loss': 0.5194, 'learning_rate': 4.52991452991453e-05, 'epoch': 1.32}
{'loss': 0.2098, 'learning_rate': 3.6396011396011395e-05, 'epoch': 2.63}
{'loss': 0.0859, 'learning_rate': 2.7492877492877494e-05, 'epoch': 3.95}
{'loss': 0.0222, 'learning_rate': 1.860754985754986e-05, 'epoch': 5.26}
{'loss': 0.009, 'learning_rate': 9.704415954415955e-06, 'epoch': 6.58}
{'loss': 0.0067, 'learning_rate': 8.012820512820512e-07, 'epoch': 7.89}
{'train_runtime': 245.85, 'train_samples_per_second': 98.694, 'train_steps_per_second': 12.365, 'train_loss': 0.1406049379980878, 'epoch': 8.0}


Evaluating: 100%|██████████| 95/95 [00:01<00:00, 56.32it/s]
[I 2025-08-18 19:57:58,724] Trial 17 finished with value: 0.9635499207606973 and parameters: {'learning_rate': 3.9544372954998644e-05, 'per_device_train_batch_size': 8, 'weight_decay': 0.2319519653792289, 'num_train_epochs': 8, 'warmup_steps': 232}. Best is trial 14 with value: 0.9922239502332814.
  return torch.load(checkpoint_file, map_location="cpu")
Some weights of the model checkpoint at indolem/indobert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSeq

  0%|          | 0/3420 [00:00<?, ?it/s]

{'loss': 0.5088, 'learning_rate': 4.6844971172325435e-05, 'epoch': 1.32}
{'loss': 0.1953, 'learning_rate': 3.883728379244074e-05, 'epoch': 2.63}
{'loss': 0.0567, 'learning_rate': 3.082959641255605e-05, 'epoch': 3.95}
{'loss': 0.0249, 'learning_rate': 2.2821909032671366e-05, 'epoch': 5.26}
{'loss': 0.0094, 'learning_rate': 1.4814221652786675e-05, 'epoch': 6.58}
{'loss': 0.0113, 'learning_rate': 6.8065342729019865e-06, 'epoch': 7.89}
{'train_runtime': 278.4956, 'train_samples_per_second': 98.016, 'train_steps_per_second': 12.28, 'train_loss': 0.11817602431565, 'epoch': 9.0}


Evaluating: 100%|██████████| 95/95 [00:01<00:00, 53.41it/s]
[I 2025-08-18 20:02:41,967] Trial 18 finished with value: 0.9905956112852665 and parameters: {'learning_rate': 2.985409318583019e-05, 'per_device_train_batch_size': 8, 'weight_decay': 0.10637062096469205, 'num_train_epochs': 9, 'warmup_steps': 298}. Best is trial 14 with value: 0.9922239502332814.
  return torch.load(checkpoint_file, map_location="cpu")
Some weights of the model checkpoint at indolem/indobert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSeq

  0%|          | 0/3800 [00:00<?, ?it/s]

{'loss': 0.5197, 'learning_rate': 4.690604598353676e-05, 'epoch': 1.32}
{'loss': 0.1889, 'learning_rate': 3.980982117513483e-05, 'epoch': 2.63}
{'loss': 0.0606, 'learning_rate': 3.27135963667329e-05, 'epoch': 3.95}
{'loss': 0.0238, 'learning_rate': 2.5617371558330973e-05, 'epoch': 5.26}
{'loss': 0.0163, 'learning_rate': 1.852114674992904e-05, 'epoch': 6.58}
{'loss': 0.0022, 'learning_rate': 1.1424921941527108e-05, 'epoch': 7.89}
{'loss': 0.0007, 'learning_rate': 4.328697133125178e-06, 'epoch': 9.21}
{'train_runtime': 316.7309, 'train_samples_per_second': 95.76, 'train_steps_per_second': 11.998, 'train_loss': 0.10688148015423825, 'epoch': 10.0}


Evaluating: 100%|██████████| 95/95 [00:01<00:00, 57.49it/s]
[I 2025-08-18 20:08:02,176] Trial 19 finished with value: 0.9811320754716981 and parameters: {'learning_rate': 2.7573728191928755e-05, 'per_device_train_batch_size': 8, 'weight_decay': 0.15824457740424994, 'num_train_epochs': 10, 'warmup_steps': 277}. Best is trial 14 with value: 0.9922239502332814.
  return torch.load(checkpoint_file, map_location="cpu")
Some weights of the model checkpoint at indolem/indobert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForS

  0%|          | 0/3420 [00:00<?, ?it/s]

{'loss': 0.5521, 'learning_rate': 4.6875e-05, 'epoch': 1.32}
{'loss': 0.229, 'learning_rate': 3.886217948717949e-05, 'epoch': 2.63}
{'loss': 0.0985, 'learning_rate': 3.0865384615384616e-05, 'epoch': 3.95}
{'loss': 0.0647, 'learning_rate': 2.2852564102564103e-05, 'epoch': 5.26}
{'loss': 0.0304, 'learning_rate': 1.483974358974359e-05, 'epoch': 6.58}
{'loss': 0.0111, 'learning_rate': 6.826923076923076e-06, 'epoch': 7.89}
{'train_runtime': 274.3172, 'train_samples_per_second': 99.509, 'train_steps_per_second': 12.467, 'train_loss': 0.14523575389594362, 'epoch': 9.0}


Evaluating: 100%|██████████| 95/95 [00:01<00:00, 57.78it/s]
[I 2025-08-18 20:12:39,836] Trial 20 finished with value: 0.9683544303797469 and parameters: {'learning_rate': 1.8076339697662663e-05, 'per_device_train_batch_size': 8, 'weight_decay': 0.22779026810826972, 'num_train_epochs': 9, 'warmup_steps': 300}. Best is trial 14 with value: 0.9922239502332814.
  return torch.load(checkpoint_file, map_location="cpu")
Some weights of the model checkpoint at indolem/indobert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSe

  0%|          | 0/2660 [00:00<?, ?it/s]

{'loss': 0.4918, 'learning_rate': 4.5829805249788315e-05, 'epoch': 1.32}
{'loss': 0.207, 'learning_rate': 3.524555461473328e-05, 'epoch': 2.63}
{'loss': 0.0779, 'learning_rate': 2.466130397967824e-05, 'epoch': 3.95}
{'loss': 0.0416, 'learning_rate': 1.40770533446232e-05, 'epoch': 5.26}
{'loss': 0.0159, 'learning_rate': 3.4928027095681624e-06, 'epoch': 6.58}
{'train_runtime': 212.0542, 'train_samples_per_second': 100.121, 'train_steps_per_second': 12.544, 'train_loss': 0.15751062425455653, 'epoch': 7.0}


Evaluating: 100%|██████████| 95/95 [00:01<00:00, 56.73it/s]
[I 2025-08-18 20:16:15,225] Trial 21 finished with value: 0.965625 and parameters: {'learning_rate': 3.9326469827980505e-05, 'per_device_train_batch_size': 8, 'weight_decay': 0.10651663285181716, 'num_train_epochs': 7, 'warmup_steps': 298}. Best is trial 14 with value: 0.9922239502332814.
  return torch.load(checkpoint_file, map_location="cpu")
Some weights of the model checkpoint at indolem/indobert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClas

  0%|          | 0/3420 [00:00<?, ?it/s]

{'loss': 0.4974, 'learning_rate': 4.5333333333333335e-05, 'epoch': 1.32}
{'loss': 0.1938, 'learning_rate': 3.758139534883721e-05, 'epoch': 2.63}
{'loss': 0.0765, 'learning_rate': 2.9829457364341084e-05, 'epoch': 3.95}
{'loss': 0.0488, 'learning_rate': 2.2077519379844965e-05, 'epoch': 5.26}
{'loss': 0.0342, 'learning_rate': 1.434108527131783e-05, 'epoch': 6.58}
{'loss': 0.0239, 'learning_rate': 6.589147286821707e-06, 'epoch': 7.89}
{'train_runtime': 277.1369, 'train_samples_per_second': 98.496, 'train_steps_per_second': 12.34, 'train_loss': 0.13029738560057522, 'epoch': 9.0}


Evaluating: 100%|██████████| 95/95 [00:01<00:00, 54.60it/s]
[I 2025-08-18 20:20:55,730] Trial 22 finished with value: 0.9716088328075709 and parameters: {'learning_rate': 2.8466883706833626e-05, 'per_device_train_batch_size': 8, 'weight_decay': 0.12073369898976337, 'num_train_epochs': 9, 'warmup_steps': 195}. Best is trial 14 with value: 0.9922239502332814.
  return torch.load(checkpoint_file, map_location="cpu")
Some weights of the model checkpoint at indolem/indobert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSe

  0%|          | 0/2660 [00:00<?, ?it/s]

{'loss': 0.5016, 'learning_rate': 4.484258492129246e-05, 'epoch': 1.32}
{'loss': 0.1956, 'learning_rate': 3.4486329743164876e-05, 'epoch': 2.63}
{'loss': 0.0745, 'learning_rate': 2.413007456503728e-05, 'epoch': 3.95}
{'loss': 0.0444, 'learning_rate': 1.3773819386909696e-05, 'epoch': 5.26}
{'loss': 0.0271, 'learning_rate': 3.4175642087821044e-06, 'epoch': 6.58}
{'train_runtime': 221.7271, 'train_samples_per_second': 95.753, 'train_steps_per_second': 11.997, 'train_loss': 0.15920075523225885, 'epoch': 7.0}


Evaluating: 100%|██████████| 95/95 [00:01<00:00, 57.34it/s]
[I 2025-08-18 20:24:41,058] Trial 23 finished with value: 0.9797191887675506 and parameters: {'learning_rate': 3.841117027981422e-05, 'per_device_train_batch_size': 8, 'weight_decay': 0.09693061191752174, 'num_train_epochs': 7, 'warmup_steps': 246}. Best is trial 14 with value: 0.9922239502332814.
  return torch.load(checkpoint_file, map_location="cpu")
Some weights of the model checkpoint at indolem/indobert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSeq

  0%|          | 0/3800 [00:00<?, ?it/s]

{'loss': 0.5436, 'learning_rate': 4.695849914724275e-05, 'epoch': 1.32}
{'loss': 0.2059, 'learning_rate': 3.985218874360433e-05, 'epoch': 2.63}
{'loss': 0.0842, 'learning_rate': 3.274587833996589e-05, 'epoch': 3.95}
{'loss': 0.0275, 'learning_rate': 2.5639567936327456e-05, 'epoch': 5.26}
{'loss': 0.0234, 'learning_rate': 1.8547470153496305e-05, 'epoch': 6.58}
{'loss': 0.0223, 'learning_rate': 1.1441159749857874e-05, 'epoch': 7.89}
{'loss': 0.0137, 'learning_rate': 4.34906196702672e-06, 'epoch': 9.21}
{'train_runtime': 295.9213, 'train_samples_per_second': 102.493, 'train_steps_per_second': 12.841, 'train_loss': 0.12171447082569725, 'epoch': 10.0}


Evaluating: 100%|██████████| 95/95 [00:01<00:00, 55.57it/s]
[I 2025-08-18 20:29:40,384] Trial 24 finished with value: 0.978125 and parameters: {'learning_rate': 2.9942886916170066e-05, 'per_device_train_batch_size': 8, 'weight_decay': 0.15497679014198357, 'num_train_epochs': 10, 'warmup_steps': 282}. Best is trial 14 with value: 0.9922239502332814.
  return torch.load(checkpoint_file, map_location="cpu")
Some weights of the model checkpoint at indolem/indobert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceCla

  0%|          | 0/6072 [00:00<?, ?it/s]

{'loss': 0.6802, 'learning_rate': 4.712692242690553e-05, 'epoch': 0.66}
{'loss': 0.4601, 'learning_rate': 4.2910258576981584e-05, 'epoch': 1.32}
{'loss': 0.3592, 'learning_rate': 3.868514449890147e-05, 'epoch': 1.98}
{'loss': 0.2512, 'learning_rate': 3.4460030420821365e-05, 'epoch': 2.64}
{'loss': 0.1843, 'learning_rate': 3.0243366570897414e-05, 'epoch': 3.29}
{'loss': 0.1523, 'learning_rate': 2.6018252492817308e-05, 'epoch': 3.95}
{'loss': 0.1612, 'learning_rate': 2.1793138414737198e-05, 'epoch': 4.61}
{'loss': 0.0907, 'learning_rate': 1.756802433665709e-05, 'epoch': 5.27}
{'loss': 0.1344, 'learning_rate': 1.3342910258576981e-05, 'epoch': 5.93}
{'loss': 0.0974, 'learning_rate': 9.117796180496873e-06, 'epoch': 6.59}
{'loss': 0.081, 'learning_rate': 4.892682102416765e-06, 'epoch': 7.25}
{'loss': 0.0886, 'learning_rate': 6.675680243366571e-07, 'epoch': 7.91}
{'train_runtime': 465.2218, 'train_samples_per_second': 52.156, 'train_steps_per_second': 13.052, 'train_loss': 0.22673306948896768

Evaluating: 100%|██████████| 95/95 [00:01<00:00, 54.81it/s]
[I 2025-08-18 20:37:29,082] Trial 25 finished with value: 0.9513343799058085 and parameters: {'learning_rate': 4.890209235597461e-05, 'per_device_train_batch_size': 4, 'weight_decay': 0.14036593121687996, 'num_train_epochs': 8, 'warmup_steps': 155}. Best is trial 14 with value: 0.9922239502332814.
  return torch.load(checkpoint_file, map_location="cpu")
Some weights of the model checkpoint at indolem/indobert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSeq

  0%|          | 0/3420 [00:00<?, ?it/s]

{'loss': 0.5083, 'learning_rate': 4.5530987231392094e-05, 'epoch': 1.32}
{'loss': 0.1724, 'learning_rate': 3.774525070071629e-05, 'epoch': 2.63}
{'loss': 0.056, 'learning_rate': 2.9959514170040487e-05, 'epoch': 3.95}
{'loss': 0.0259, 'learning_rate': 2.2173777639364685e-05, 'epoch': 5.26}
{'loss': 0.0137, 'learning_rate': 1.4388041108688881e-05, 'epoch': 6.58}
{'loss': 0.0096, 'learning_rate': 6.617876051074432e-06, 'epoch': 7.89}
{'train_runtime': 300.6995, 'train_samples_per_second': 90.778, 'train_steps_per_second': 11.373, 'train_loss': 0.11613313900796991, 'epoch': 9.0}


Evaluating: 100%|██████████| 95/95 [00:02<00:00, 42.09it/s]
[I 2025-08-18 20:42:33,827] Trial 26 finished with value: 0.978125 and parameters: {'learning_rate': 2.0543374996039574e-05, 'per_device_train_batch_size': 8, 'weight_decay': 0.1064471553629868, 'num_train_epochs': 9, 'warmup_steps': 209}. Best is trial 14 with value: 0.9922239502332814.
  return torch.load(checkpoint_file, map_location="cpu")
Some weights of the model checkpoint at indolem/indobert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClass

  0%|          | 0/2660 [00:00<?, ?it/s]

{'loss': 0.5025, 'learning_rate': 4.4975083056478404e-05, 'epoch': 1.32}
{'loss': 0.186, 'learning_rate': 3.4593023255813954e-05, 'epoch': 2.63}
{'loss': 0.0598, 'learning_rate': 2.4210963455149503e-05, 'epoch': 3.95}
{'loss': 0.028, 'learning_rate': 1.382890365448505e-05, 'epoch': 5.26}
{'loss': 0.0048, 'learning_rate': 3.446843853820598e-06, 'epoch': 6.58}
{'train_runtime': 223.1161, 'train_samples_per_second': 95.157, 'train_steps_per_second': 11.922, 'train_loss': 0.14683581692047584, 'epoch': 7.0}


Evaluating: 100%|██████████| 95/95 [00:01<00:00, 53.29it/s]
[I 2025-08-18 20:46:20,637] Trial 27 finished with value: 0.9716088328075709 and parameters: {'learning_rate': 2.553046282895703e-05, 'per_device_train_batch_size': 8, 'weight_decay': 0.17337773779947113, 'num_train_epochs': 7, 'warmup_steps': 252}. Best is trial 14 with value: 0.9922239502332814.
  return torch.load(checkpoint_file, map_location="cpu")
Some weights of the model checkpoint at indolem/indobert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSeq

  0%|          | 0/1900 [00:00<?, ?it/s]

{'loss': 0.4806, 'learning_rate': 4.092765460910152e-05, 'epoch': 1.32}
{'loss': 0.1697, 'learning_rate': 2.637106184364061e-05, 'epoch': 2.63}
{'loss': 0.0486, 'learning_rate': 1.1814469078179698e-05, 'epoch': 3.95}
{'train_runtime': 151.3257, 'train_samples_per_second': 100.214, 'train_steps_per_second': 12.556, 'train_loss': 0.18869317054748536, 'epoch': 5.0}


Evaluating: 100%|██████████| 95/95 [00:01<00:00, 56.95it/s]
[I 2025-08-18 20:48:55,226] Trial 28 finished with value: 0.9795918367346939 and parameters: {'learning_rate': 3.217732490546123e-05, 'per_device_train_batch_size': 8, 'weight_decay': 0.03351354423237021, 'num_train_epochs': 5, 'warmup_steps': 186}. Best is trial 14 with value: 0.9922239502332814.
  return torch.load(checkpoint_file, map_location="cpu")
Some weights of the model checkpoint at indolem/indobert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSeq

  0%|          | 0/3800 [00:00<?, ?it/s]

{'loss': 0.5101, 'learning_rate': 4.63684800897364e-05, 'epoch': 1.32}
{'loss': 0.2169, 'learning_rate': 3.935782389231632e-05, 'epoch': 2.63}
{'loss': 0.0893, 'learning_rate': 3.2347167694896246e-05, 'epoch': 3.95}
{'loss': 0.0633, 'learning_rate': 2.533651149747617e-05, 'epoch': 5.26}
{'loss': 0.0488, 'learning_rate': 1.8325855300056086e-05, 'epoch': 6.58}
{'loss': 0.0298, 'learning_rate': 1.1315199102636007e-05, 'epoch': 7.89}
{'loss': 0.0326, 'learning_rate': 4.304542905215928e-06, 'epoch': 9.21}
{'train_runtime': 315.7421, 'train_samples_per_second': 96.059, 'train_steps_per_second': 12.035, 'train_loss': 0.13254492232674048, 'epoch': 10.0}


Evaluating: 100%|██████████| 95/95 [00:02<00:00, 47.10it/s]
[I 2025-08-18 20:54:14,619] Trial 29 finished with value: 0.9761526232114467 and parameters: {'learning_rate': 4.258744059975119e-05, 'per_device_train_batch_size': 8, 'weight_decay': 0.2639083181574777, 'num_train_epochs': 10, 'warmup_steps': 234}. Best is trial 14 with value: 0.9922239502332814.


✅ Best trial:
FrozenTrial(number=14, state=TrialState.COMPLETE, values=[0.9922239502332814], datetime_start=datetime.datetime(2025, 8, 18, 19, 39, 9, 336919), datetime_complete=datetime.datetime(2025, 8, 18, 19, 44, 3, 943244), params={'learning_rate': 3.164159687360286e-05, 'per_device_train_batch_size': 8, 'weight_decay': 0.1640557421098752, 'num_train_epochs': 10, 'warmup_steps': 300}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'learning_rate': FloatDistribution(high=5e-05, log=True, low=1e-05, step=None), 'per_device_train_batch_size': CategoricalDistribution(choices=(4, 8, 16)), 'weight_decay': FloatDistribution(high=0.3, log=False, low=0.0, step=None), 'num_train_epochs': IntDistribution(high=10, log=False, low=2, step=1), 'warmup_steps': IntDistribution(high=300, log=False, low=0, step=1)}, trial_id=14, value=None)


In [13]:
from transformers import DataCollatorWithPadding
from transformers import AutoTokenizer  # Only if tokenizer not yet loaded


# 1. Retrieve best hyperparameters from Optuna
best_hp = study.best_trial.params

# 2. Re-initialize training arguments with best parameters
final_training_args = TrainingArguments(
    output_dir="./results/final_eval",
    num_train_epochs=best_hp["num_train_epochs"],
    per_device_train_batch_size=best_hp["per_device_train_batch_size"],
    warmup_steps=best_hp["warmup_steps"],
    weight_decay=best_hp["weight_decay"],
    learning_rate=best_hp["learning_rate"],
    save_strategy="no",
    report_to=[],
    fp16=True,
    logging_dir="./logs/final_eval"
)

# 3. Re-initialize model with same init function
final_model = model_init()

# 4. Retrain model on TRAIN + VAL before final test evaluation (optional but recommended)
from torch.utils.data import ConcatDataset
train_val_dataset = ConcatDataset([
    train_test_datasets[name]["train"],
    train_test_datasets[name]["val"]
])

# Load the same tokenizer used to tokenize the dataset
tokenizer = AutoTokenizer.from_pretrained(distilbert_model_path if model_choice_int == 0 else bert_model_path)

# Create a data collator to dynamically pad batches
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


final_trainer = CleanTrainer(
    model=final_model,
    args=final_training_args,
    train_dataset=train_val_dataset,
    data_collator=data_collator  # <- Fixes padding issue
)

# 5. Train on full train+val
final_trainer.train()

# 6. Final evaluation on the test set
final_f1 = manual_evaluate(final_model, train_test_datasets[name]["test"], batch_size=8)

print(f"\n🎯 Final Test F1 Score (macro): {final_f1:.4f}")


  return torch.load(checkpoint_file, map_location="cpu")
Some weights of the model checkpoint at indolem/indobert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not

KeyError: 'val'