In [27]:
# !pip install --upgrade numpy

In [None]:
# !pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
# !pip install transformers datasets accelerate
# !pip install captum  # for explainability methods
# !pip install onnx onnxruntime optimum scikit-learn  # for quantization and metrics

In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import load_dataset, DatasetDict
import numpy as np
import os
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# IndoBERT FP32 Fine-tuning Pipeline

This notebook fine-tunes the IndoBERT base model (`indobenchmark/indobert-base-p1`) for binary sentiment classification on Indonesian sentiment analysis datasets.

**Datasets:**
1. **TODO:** name and source of the first dataset
2. **TODO:** name and source of the second dataset

**Pipeline:**
1. Load and preprocess datasets
2. Standardize to binary sentiment (0: negative, 1: positive)
3. Tokenize using IndoBERT tokenizer
4. Combine all datasets into one dataset
5. Fine-tune model
6. Evaluate on test sets

**Hyperparameters (consistent across all experiments):**
- Learning rate: 2e-5
- Batch size: 16
- Epochs: 3
- Precision: FP32
- Optimizer: AdamW
- Max sequence length: 128

In [30]:
# Tokenizer that ships with the IndoBERT base (phase 1) checkpoint.
tokenizer = AutoTokenizer.from_pretrained("indobenchmark/indobert-base-p1")

In [31]:
# tokenization
def tokenize_fn(batch):
    """
    Encode the ``text`` field of a batch with the IndoBERT tokenizer.

    Relies on the module-level ``tokenizer``. The call requests:
    - truncation, so inputs longer than the limit are cut off
    - padding to ``max_length``, so every encoded sequence has the same length
    - a limit of 128 tokens, trading context for efficiency

    Args:
        batch: Mapping with a ``"text"`` entry holding the raw input text(s).
            Presumably a batch supplied by ``datasets.map`` -- confirm
            against the caller.

    Returns:
        The result of the tokenizer call for those texts.
    """
    encode_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 128,
    }
    return tokenizer(batch["text"], **encode_options)

In [32]:
# Load the base IndoBERT checkpoint with a sequence-classification head
# configured for two labels (binary sentiment: 0 = negative, 1 = positive).
model_fp32 = AutoModelForSequenceClassification.from_pretrained("indobenchmark/indobert-base-p1", num_labels=2)

# Sanity report: concrete model class and total parameter count.
print(
    f"Model loaded: {model_fp32.__class__.__name__}",
    f"Number of parameters: {model_fp32.num_parameters():,}",
    sep="\n",
)