In [155]:
# %pip install --upgrade torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
# %pip install transformers[torch]==4.30.2
# %pip install accelerate -U
# %pip install optuna
# %pip install ipywidgets

In [156]:
from transformers import DataCollatorWithPadding
from transformers import AutoTokenizer

In [157]:
import pandas as pd

# Dictionary of dataset names and file paths
dataset_files = {
    "combined_dataset": "Dataset/Pre-Processed Dataset/combined_dataset.csv",
    "dataset_1_Cyberbullying_Bahasa_Indonesia": "Dataset/Pre-Processed Dataset/dataset_1_Cyberbullying_Bahasa_Indonesia-Kaggle-CitaTiaraHanni.csv",
    "dataset_2_cyberbullying_dataset": "Dataset/Pre-Processed Dataset/dataset_2_cyberbullying_dataset-Huggingface-aditdwi123.csv",
    "dataset_3_dataset_komentar_instagram_cyberbullying": "Dataset/Pre-Processed Dataset/dataset_3_dataset_komentar_instagram_cyberbullying-github-rizalespe.csv",
    "dataset_4_dataset_luqyana": "Dataset/Pre-Processed Dataset/dataset_4_dataset_luqyana.csv",
}

# Load datasets into a dictionary
datasets = {
    name: pd.read_csv(path)[["encoded_label", "clean_text"]]
    for name, path in dataset_files.items()
}

# Example: show first 10 rows of combined_dataset
datasets["combined_dataset"].head(10)

Unnamed: 0,encoded_label,clean_text
0,1.0,kaka tidur yaa sudah pagi tidak boleh capek2
1,1.0,makan nasi padang saja badannya
2,0.0,suka cukur jembut manggung
3,1.0,hai kak isyana ngefans sekali kak isyana suka ...
4,1.0,manusia bidadari sih herann deh cantik
5,0.0,ayu kinantii isyan sekarang berubah ya baju ny...
6,1.0,gemesnya isyan mirip tango berlapis lapis ciaaaa
7,0.0,jelek saja anaknya ayahnya cakep2
8,0.0,anaknya mirip sudah tua begitu ya mukanya kart...
9,0.0,muka anak nya ko tua sekali yaa tidak ngegemes...


## Options

In [158]:
import ipywidgets as widgets
from IPython.display import display

In [None]:
bert_model_path = 'indolem/indobert-base-uncased'
distilbert_model_path = 'cahya/distilbert-base-indonesian'

# 0 = distilbert, 1 = bert

model_choice_int = widgets.Dropdown(
    options=[('Distilbert', 0), ('Indobert', 1)],
    value=1,
    description='Model:',
    disabled=False,
)

optuna_trials = widgets.BoundedIntText(
    value=30,
    min=0,
    max=50,
    step=1,
    description='optuna_trials:',
    disabled=False
)

dataset_selector = widgets.Dropdown(
    options=list(datasets.keys()),  # take all dataset names
    description='Dataset:',
    disabled=False
)

display(model_choice_int, optuna_trials, dataset_selector)


Dropdown(description='Model:', index=1, options=(('Distilbert', 0), ('Indobert', 1)), value=1)

BoundedIntText(value=30, description='optuna_trials:', max=50)

Dropdown(description='Dataset:', options=('combined_dataset', 'dataset_1_Cyberbullying_Bahasa_Indonesia', 'dat…

In [160]:
import ipywidgets as widgets
from IPython.display import display, clear_output

# Create an output area
out = widgets.Output()

def on_train_click(b):
    with out:  # capture all stdout/stderr into this widget
        clear_output(wait=True)  # clears previous runs
        print("Starting training...")  
        default_start_model_training_and_evaluation(
            model_choice_int.value,
            dataset_selector.value
        )
        print("Training finished!")

def on_optuna_click(b):
    with out:  # capture all stdout/stderr into this widget
        clear_output(wait=True)  # clears previous runs
        print("Starting training...")  
        start_optuna()
        print("Training finished!")

train_button = widgets.Button(description="Run Default Training and Evaluation", layout={'width': 'max-content'})
train_button.on_click(on_train_click)

optuna_button = widgets.Button(description="Run Optuna", layout={'width': 'max-content'})
optuna_button.on_click(on_optuna_click)

# Display everything together
display(train_button, optuna_button, out)


Button(description='Run Default Training and Evaluation', layout=Layout(width='max-content'), style=ButtonStyl…

Button(description='Run Optuna', layout=Layout(width='max-content'), style=ButtonStyle())

Output()

## Train-Test Split

In [161]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import torch
from torch.utils.data import Dataset

# Custom dataset
class CustomDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
        item['labels'] = torch.tensor(int(self.labels[idx]))
        return item

    def __len__(self):
        return len(self.labels)

# Load tokenizer
if model_choice_int.value == 0:
    tokenizer = AutoTokenizer.from_pretrained(distilbert_model_path)
    
else:
    tokenizer = AutoTokenizer.from_pretrained(bert_model_path)


# Dictionary to hold train and test custom datasets
train_test_datasets = {}

for name, df in datasets.items():
    # Get text and labels
    X = df['clean_text'].tolist()
    y = df['encoded_label'].tolist()

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Tokenize
    train_encodings = tokenizer(X_train, truncation=True, padding=True)
    test_encodings = tokenizer(X_test, truncation=True, padding=True)

    # Create datasets
    train_dataset = CustomDataset(train_encodings, y_train)
    test_dataset = CustomDataset(test_encodings, y_test)

    # Store in Dict
    train_test_datasets[name] = {
        "train": train_dataset,
        "test": test_dataset,
        # "val": val_dataset

    }

    print(f"Dataset: {name}, Train size: {len(train_dataset)}, Test size: {len(test_dataset)}")

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Dataset: combined_dataset, Train size: 2748, Test size: 687
Dataset: dataset_1_Cyberbullying_Bahasa_Indonesia, Train size: 520, Test size: 130
Dataset: dataset_2_cyberbullying_dataset, Train size: 815, Test size: 204
Dataset: dataset_3_dataset_komentar_instagram_cyberbullying, Train size: 317, Test size: 80
Dataset: dataset_4_dataset_luqyana, Train size: 1095, Test size: 274


## Train and Evaluation


In [162]:
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments, BertForSequenceClassification
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm

In [163]:
output_dir = f'./results/{dataset_selector.value}_{"distilbert" if model_choice_int.value == 0 else "bert"}'
log_dir = f'./logs/{dataset_selector.value}_{"distilbert" if model_choice_int.value == 0 else "bert"}'
os.makedirs(output_dir, exist_ok=True)
os.makedirs(log_dir, exist_ok=True)

default_training_args = TrainingArguments(
    output_dir=output_dir,
    run_name=f'training_{dataset_selector.value}_{"distilbert" if model_choice_int.value == 0 else "bert"}',
    num_train_epochs=4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # gradient_accumulation_steps=2,
    warmup_steps=50,
    weight_decay=0.01,
    logging_dir=log_dir,
    logging_steps=10,
    do_eval=True,
    eval_steps=500,
    save_strategy="epoch",
    save_total_limit=1,
    report_to=[],
    fp16=True,
    
)

best_training_args_distil = TrainingArguments(
    output_dir=output_dir,
    run_name=f'training_{dataset_selector.value}_{"distilbert" if model_choice_int.value == 0 else "bert"}',
    per_device_eval_batch_size=16,  # can stay as-is for evaluation
    num_train_epochs=6,            # ⬅️ from Optuna
    per_device_train_batch_size=16,  # ⬅️ from Optuna
    warmup_steps=29,               # ⬅️ from Optuna
    weight_decay=0.09793901282245424,  # ⬅️ from Optuna
    learning_rate=3.694163912198525e-05,  # ⬅️ from Optuna
    logging_dir=log_dir,
    logging_steps=10,
    do_eval=True,
    eval_steps=500,
    save_strategy="epoch",
    save_total_limit=1,
    report_to=[],
    fp16=True,
)

best_training_args_bert = TrainingArguments(
    output_dir=output_dir,
    run_name=f'training_{dataset_selector.value}_{"distilbert" if model_choice_int.value == 0 else "bert"}',
    per_device_eval_batch_size=8,
    per_device_train_batch_size=8,  # From Optuna
    num_train_epochs=9,             # From Optuna
    warmup_steps=117,               # From Optuna
    weight_decay=0.12399629519542921,  # From Optuna
    learning_rate=2.9677882858655988e-05,  # From Optuna
    logging_dir=log_dir,
    logging_steps=10,
    do_eval=True,
    eval_steps=500,
    save_strategy="epoch",
    save_total_limit=1,
    report_to=[],
    fp16=True,
)


### Default


In [164]:
os.environ["WANDB_MODE"] = "disabled"

In [165]:
def evaluate_model(model, test_dataset, batch_size, output_dir, dataset_name):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()

    dataloader = DataLoader(test_dataset, batch_size=batch_size)

    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            # Move inputs to device
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            # Forward pass
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits

            # Move to CPU and accumulate
            preds = torch.argmax(logits, dim=1).cpu().numpy()
            all_preds.extend(preds)
            all_labels.extend(labels.cpu().numpy())

    # Metrics
    report = classification_report(all_labels, all_preds, digits=4)
    with open(os.path.join(output_dir, "classification_report.txt"), "w") as f:
        f.write(report)

    cm = confusion_matrix(all_labels, all_preds)
    plt.figure(figsize=(5, 4))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title(f'Confusion Matrix: {dataset_name}')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, "confusion_matrix.png"))
    plt.close()

    print(f"Evaluation complete for {dataset_name}")

In [166]:
def default_start_model_training_and_evaluation(model_choice = model_choice_int.value, dataset_name = dataset_selector.value):
    print(f"\nTraining on dataset: {dataset_name}")

    # Initialize model
    if model_choice == 0:
        model = DistilBertForSequenceClassification.from_pretrained(
                distilbert_model_path, num_labels=2
        )

        training_args = best_training_args_distil

    elif model_choice == 1:
        model = BertForSequenceClassification.from_pretrained(
                bert_model_path, num_labels=2
        )

        training_args = best_training_args_bert

    else:
        model = DistilBertForSequenceClassification.from_pretrained(
                distilbert_model_path, num_labels=2
        )

        training_args = default_training_args


    print(training_args)

    # Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_test_datasets[dataset_name]['train'],
        eval_dataset=train_test_datasets[dataset_name]['test'],
    )

    trainer.train()

    # Save model
    trainer.save_model(output_dir)

    # Evaluate and log
    evaluate_model(model, train_test_datasets[dataset_selector.value]['test'], 4, output_dir, dataset_selector.value)

### Optuna

In [None]:
import torch
import numpy as np
from sklearn.metrics import accuracy_score
from transformers import Trainer, DistilBertForSequenceClassification, BertForSequenceClassification
import gc
from torch.utils.data import DataLoader
from tqdm import tqdm
from optuna import create_study
from sklearn.metrics import f1_score

output_dir_optuna = output_dir + '_optuna'

def optuna_hp_space(trial):
    return {
        "learning_rate": trial.suggest_float("learning_rate", 1e-5, 5e-5),
        "per_device_train_batch_size": trial.suggest_categorical("per_device_train_batch_size", [4, 8, 16]),
        "weight_decay": trial.suggest_float("weight_decay", 0.0, 0.3),
        "num_train_epochs": trial.suggest_int("num_train_epochs", 2, 10),
        "warmup_steps": trial.suggest_int("warmup_steps", 0, 300),
    }

def manual_evaluate(model, dataset, batch_size=8):
    model.eval()
    model.to("cuda" if torch.cuda.is_available() else "cpu")
    dataloader = DataLoader(dataset, batch_size=batch_size)

    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            inputs = {k: v.to(model.device) for k, v in batch.items() if k in ["input_ids", "attention_mask"]}
            labels = batch["labels"].to(model.device)

            outputs = model(**inputs)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=-1)

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    f1 = f1_score(all_labels, all_preds)

    torch.cuda.empty_cache()
    gc.collect()

    return f1
    

# Subclass Trainer to inject memory cleanup
class CleanTrainer(Trainer):
    def train(self, *args, **kwargs):
        result = super().train(*args, **kwargs)
        torch.cuda.empty_cache()
        gc.collect()
        return result

if model_choice_int.value == 0:
    # Initialize model
    model_init = lambda: DistilBertForSequenceClassification.from_pretrained(
        distilbert_model_path, num_labels=2
    )

else:
    model_init = lambda: BertForSequenceClassification.from_pretrained(
        bert_model_path, num_labels=2
    )


# Initialize model
if model_choice_int.value == 0:
    model = DistilBertForSequenceClassification.from_pretrained(
            distilbert_model_path, num_labels=2
    )

    training_args = best_training_args_distil

elif model_choice_int.value == 1:
    model = BertForSequenceClassification.from_pretrained(
            bert_model_path, num_labels=2
    )

    training_args = best_training_args_bert

else:
    model = DistilBertForSequenceClassification.from_pretrained(
            distilbert_model_path, num_labels=2
    )

    training_args = default_training_args

# Initialize the Trainer
trainer = CleanTrainer(
    model_init=model_init,
    args=training_args,
    train_dataset=train_test_datasets[dataset_selector.value]["train"],
    eval_dataset=train_test_datasets[dataset_selector.value]["test"],
    compute_metrics= manual_evaluate(trainer.model, train_test_datasets[dataset_selector.value]['test'], batch_size=8),
)

def objective(trial):
    hp = optuna_hp_space(trial)

    training_args = TrainingArguments(
        output_dir=f"{output_dir_optuna}_{trial.number}",
        num_train_epochs=hp["num_train_epochs"],
        per_device_train_batch_size=hp["per_device_train_batch_size"],
        warmup_steps=hp["warmup_steps"],
        weight_decay=hp["weight_decay"],
        save_strategy="no",        
        save_total_limit=0,         
        # logging_dir=f"./logs/optuna_trial_{trial.number}",
        report_to=[], # Disable wandb
        fp16=True,
        logging_strategy="no"
    )

    model = model_init()

    trainer = CleanTrainer(
        model=model,
        args=training_args,
        train_dataset=train_test_datasets[dataset_selector.value]["train"],
        eval_dataset=train_test_datasets[dataset_selector.value]["test"],
    )

    trainer.train()

    f1 = manual_evaluate(trainer.model, train_test_datasets[dataset_selector.value]["test"], batch_size=8)
    return f1




  return torch.load(checkpoint_file, map_location="cpu")
Some weights of the model checkpoint at indolem/indobert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not

In [168]:
def final_optuna_model(study, output_dir=f"./results/final_eval"):
    # 1. Retrieve best hyperparameters from Optuna
    best_hp = study.best_trial.params

    # 2. Re-initialize training arguments with best parameters
    final_training_args = TrainingArguments(
        output_dir="./results/final_eval",
        num_train_epochs=best_hp["num_train_epochs"],
        per_device_train_batch_size=best_hp["per_device_train_batch_size"],
        warmup_steps=best_hp["warmup_steps"],
        weight_decay=best_hp["weight_decay"],
        learning_rate=best_hp["learning_rate"],
        save_strategy="no",
        report_to=[],
        fp16=True,
        logging_dir="./logs/final_eval"
    )

    # 3. Re-initialize model with same init function
    final_model = model_init()

    # 4. Retrain model on TRAIN + VAL before final test evaluation (optional but recommended)
    from torch.utils.data import ConcatDataset
    train_val_dataset = ConcatDataset([
        train_test_datasets[dataset_selector.value]["train"],
        train_test_datasets[dataset_selector.value]["test"]
    ])

    # Load the same tokenizer used to tokenize the dataset
    tokenizer = AutoTokenizer.from_pretrained(distilbert_model_path if model_choice_int == 0 else bert_model_path)

    # Create a data collator to dynamically pad batches
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


    final_trainer = CleanTrainer(
        model=final_model,
        args=final_training_args,
        train_dataset=train_val_dataset,
        data_collator=data_collator  # <- Fixes padding issue
    )

    # 5. Train on full train+val
    final_trainer.train()

    # 6. Final evaluation on the test set
    final_f1 = manual_evaluate(final_model, train_test_datasets[dataset_selector.value]["test"], batch_size=8)

    print(f"\n🎯 Final Test F1 Score (macro): {final_f1:.4f}")

    final_trainer.save_model(output_dir)
    print(f"Model saved to {output_dir}")


def start_optuna():
    # if use_optuna is False:
    #     print("Optuna is disabled.")
    #     return
    study = create_study(direction="maximize")
    study.optimize(objective, n_trials=optuna_trials.value)

    print("✅ Best trial:")
    print(study.best_trial)

    final_optuna_model(study, output_dir_optuna)