In [4]:
# 1. Setup and Imports
from transformers import (
    AutoModelForMaskedLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments
)
from datasets import Dataset, DatasetDict
import pandas as pd
import torch
from accelerate import Accelerator
import math
from huggingface_hub import notebook_login

# 2. Initialize Model and Tokenizer (Global Scope)
# These are module-level globals: tokenize_function and train_with_accelerate
# read `tokenizer` directly, and the main workflow passes `model` to training.
model_checkpoint = "indobenchmark/indobert-base-p2"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
# NOTE: the load warning captured in this cell's output lists the
# `cls.predictions.*` weights as "newly initialized" for this checkpoint, i.e.
# they are not restored from the checkpoint.
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)

# 3. Data Preparation
def load_and_preprocess_data(file_path, test_size=0.1, seed=42):
    """Load a CSV of text pairs and return a train/test split of it.

    Only the ``Context`` and ``Response`` columns are kept, and every row with
    a missing value in either of them is dropped.

    Args:
        file_path: Path of the CSV file, passed to ``pd.read_csv``.
        test_size: Forwarded to ``Dataset.train_test_split``; defaults to 0.1.
        seed: Forwarded to ``Dataset.train_test_split``; defaults to 42.

    Returns:
        Whatever ``Dataset.train_test_split`` returns for the cleaned data.

    Raises:
        KeyError: If the CSV has no ``Context`` or ``Response`` column.
    """
    df = pd.read_csv(file_path)
    pairs = df[['Context', 'Response']].dropna()
    # Dropping rows leaves gaps in the index. A non-trivial pandas index is
    # carried into the Dataset as an extra `__index_level_0__` column, which
    # would then travel with every example into the training batches. Resetting
    # to a plain 0..n-1 index avoids that.
    pairs = pairs.reset_index(drop=True)
    dataset = Dataset.from_pandas(pairs)
    return dataset.train_test_split(test_size=test_size, seed=seed)

# 4. Tokenization
def tokenize_function(examples):
    """Tokenize the ``Context`` entries of a batch with the global tokenizer.

    Args:
        examples: Mapping that has a ``"Context"`` key holding the text(s) to
            encode.

    Returns:
        The tokenizer's output for those texts, truncated and padded to a
        fixed length of 128 and including the special-tokens mask.
    """
    encode_options = dict(
        truncation=True,
        max_length=128,
        padding="max_length",
        return_special_tokens_mask=True,
    )
    contexts = examples["Context"]
    return tokenizer(contexts, **encode_options)

# 5. Training Function (Fixed)
def _perplexity_from_losses(batch_losses):
    """Return exp(mean(batch_losses)); NaN for no losses, inf on overflow."""
    if not batch_losses:
        return float("nan")
    mean_loss = sum(batch_losses) / len(batch_losses)
    try:
        return math.exp(mean_loss)
    except OverflowError:
        return float("inf")


def train_with_accelerate(train_dataset, eval_dataset, model, batch_size=64, epochs=5):
    """Fine-tune ``model`` on masked-language-modelling batches.

    Both datasets are batched through a ``DataCollatorForLanguageModeling``
    built from the global ``tokenizer`` (``mlm_probability=0.20``). The model
    is optimised with AdamW (``lr=5e-5``), and after every epoch the perplexity
    on ``eval_dataset`` is printed.

    Args:
        train_dataset: Examples used for optimisation; iterated in shuffled
            order.
        eval_dataset: Examples used for the per-epoch perplexity report.
        model: Module called as ``model(**batch)`` whose result exposes a
            ``.loss`` attribute.
        batch_size: Batch size of both data loaders.
        epochs: Number of passes over ``train_dataset``.

    Returns:
        The trained model with any wrapper added by ``accelerator.prepare``
        removed, so model-specific methods such as ``save_pretrained`` can be
        called on it directly.
    """
    accelerator = Accelerator()

    # DataLoaders
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm_probability=0.20
    )
    train_dataloader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,  # visit training examples in a new order every epoch
        collate_fn=data_collator,
    )
    eval_dataloader = torch.utils.data.DataLoader(
        eval_dataset, batch_size=batch_size, collate_fn=data_collator
    )

    # Prepare with Accelerator
    optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
    model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
        model,
        optimizer,
        train_dataloader,
        eval_dataloader
    )

    for epoch in range(epochs):
        # Training
        model.train()
        for batch in train_dataloader:
            loss = model(**batch).loss
            accelerator.backward(loss)
            optimizer.step()
            optimizer.zero_grad()

        # Evaluation
        model.eval()
        eval_losses = []
        with torch.no_grad():
            for batch in eval_dataloader:
                eval_losses.append(model(**batch).loss.item())

        # Perplexity is exp of the unweighted mean of the per-batch losses.
        # NOTE(review): under multi-process launches each process presumably
        # reports on its own shard only — confirm before relying on the number.
        print(f"Epoch {epoch}: Perplexity = {_perplexity_from_losses(eval_losses):.2f}")

    # Return the underlying model rather than the prepared wrapper.
    return accelerator.unwrap_model(model)

# 6. Main Workflow
if __name__ == "__main__":
    # Load the CSV and split it into train/test parts
    dataset = load_and_preprocess_data("/kaggle/input/psychikadataset-7b/data.csv")

    # Tokenize. Every original column is dropped (not just "Context" and
    # "Response") so that only the tokenizer's outputs reach the collator and,
    # through `model(**batch)`, the model; any stray column left behind would
    # be passed to the model as an unexpected keyword argument.
    tokenized_datasets = dataset.map(
        tokenize_function,
        batched=True,
        remove_columns=dataset["train"].column_names
    )

    # Train (the pre-loaded global model is passed explicitly)
    trained_model = train_with_accelerate(
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["test"],
        model=model
    )

    # Save model and tokenizer side by side so the directory can be loaded
    # later by path
    trained_model.save_pretrained("./indobert-finetuned")
    tokenizer.save_pretrained("./indobert-finetuned")
    print("Model saved successfully!")

Some weights of BertForMaskedLM were not initialized from the model checkpoint at indobenchmark/indobert-base-p2 and are newly initialized: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/5100 [00:00<?, ? examples/s]

Map:   0%|          | 0/567 [00:00<?, ? examples/s]

Epoch 0: Perplexity = 26.66
Epoch 1: Perplexity = 12.64
Epoch 2: Perplexity = 8.52
Epoch 3: Perplexity = 7.07
Epoch 4: Perplexity = 5.57
Model saved successfully!


In [6]:
from transformers import pipeline
from tabulate import tabulate

# Build a fill-mask pipeline from the locally saved fine-tuned checkpoint
mask_filler = pipeline("fill-mask", model="./indobert-finetuned")

# Ask the pipeline to complete the masked sentence
predictions = mask_filler("Aku merasa sangat [MASK].")

# One table row per prediction: rank (from 1), token, score to three decimals,
# and the completed sentence
output_data = [
    [
        rank,
        candidate['token_str'],
        f"{candidate['score']:.3f}",
        candidate['sequence'],
    ]
    for rank, candidate in enumerate(predictions, start=1)
]

# Render the rows as a grid table
headers = ["Rank", "Prediction", "Score", "Complete Sentence"]
print(tabulate(output_data, headers=headers, tablefmt="grid"))

Device set to use cuda:0


+--------+--------------+---------+------------------------------+
|   Rank | Prediction   |   Score | Complete Sentence            |
|      1 | sedih        |   0.119 | aku merasa sangat sedih.     |
+--------+--------------+---------+------------------------------+
|      2 | tertekan     |   0.114 | aku merasa sangat tertekan.  |
+--------+--------------+---------+------------------------------+
|      3 | sendirian    |   0.058 | aku merasa sangat sendirian. |
+--------+--------------+---------+------------------------------+
|      4 | sakit        |   0.055 | aku merasa sangat sakit.     |
+--------+--------------+---------+------------------------------+
|      5 | gugup        |   0.046 | aku merasa sangat gugup.     |
+--------+--------------+---------+------------------------------+


In [7]:
# Initialize both models: the published base checkpoint and the locally saved
# fine-tuned one
original_model = pipeline("fill-mask", model="indobenchmark/indobert-base-p2")
finetuned_model = pipeline("fill-mask", model="./indobert-finetuned")

# Get the top five predictions from both models for the same masked sentence
text = "Aku merasa sangat [MASK]."
original_preds = original_model(text, top_k=5)
finetuned_preds = finetuned_model(text, top_k=5)

# Prepare comparison data. The two result lists are paired rank by rank with
# zip, so a shorter list simply yields fewer rows instead of an IndexError.
comparison_data = [
    [
        rank,
        base_pred['token_str'],
        f"{base_pred['score']:.3f}",
        tuned_pred['token_str'],
        f"{tuned_pred['score']:.3f}",
    ]
    for rank, (base_pred, tuned_pred) in enumerate(
        zip(original_preds, finetuned_preds), start=1
    )
]

# Print comparison table
headers = [
    "Rank",
    "Original Prediction",
    "Original Score",
    "Fine-tuned Prediction",
    "Fine-tuned Score"
]
print(tabulate(comparison_data, headers=headers, tablefmt="grid"))

# NOTE(review): loading the base checkpoint warns that its `cls.predictions.*`
# weights are newly initialized, and its rows in the table are word pieces with
# a score that rounds to 0. The observations below are hard-coded text, not
# derived from the table — read them with that caveat in mind.
print("\nKey Observations:")
print("- Fine-tuned model shows stronger confidence in emotional words ('sedih', 'tertekan')")
print("- Original model may include more generic completions")
print("- Score differences highlight domain adaptation benefits")

Some weights of BertForMaskedLM were not initialized from the model checkpoint at indobenchmark/indobert-base-p2 and are newly initialized: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cuda:0
Device set to use cuda:0


+--------+-----------------------+------------------+-------------------------+--------------------+
|   Rank | Original Prediction   |   Original Score | Fine-tuned Prediction   |   Fine-tuned Score |
|      1 | ##vian                |                0 | sedih                   |              0.119 |
+--------+-----------------------+------------------+-------------------------+--------------------+
|      2 | ##ans                 |                0 | tertekan                |              0.114 |
+--------+-----------------------+------------------+-------------------------+--------------------+
|      3 | gif                   |                0 | sendirian               |              0.058 |
+--------+-----------------------+------------------+-------------------------+--------------------+
|      4 | ##arit                |                0 | sakit                   |              0.055 |
+--------+-----------------------+------------------+-------------------------+------------