In [1]:
!pip install -U transformers datasets scikit-learn evaluate

Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.6.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m5.

In [2]:
import os
os.environ["WANDB_DISABLED"] = "true"

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from transformers import DataCollatorWithPadding
from datasets import Dataset, DatasetDict
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import torch
import os

In [4]:
import transformers
print(f"Transformers version: {transformers.__version__}")

Transformers version: 4.51.3


In [5]:
cache_dir = "./model_cache"
os.makedirs(cache_dir, exist_ok=True)
print(f"Model akan disimpan di: {cache_dir}")

Model akan disimpan di: ./model_cache


In [6]:
file_path = "/content/Review Instagram.csv"

In [8]:
from google.colab import files
uploaded = files.upload()

Saving Review Instagram.csv to Review Instagram.csv


In [9]:
df = pd.read_csv(file_path)

In [10]:
def map_sentiment(score):
    if score <= 2:
        return "negative"
    elif score == 3:
        return "neutral"
    else:
        return "positive"

In [11]:
df = df[["Review Text", "Rating"]].dropna()
df["label"] = df["Rating"].apply(map_sentiment)

In [12]:
le = LabelEncoder()
df["label_id"] = le.fit_transform(df["label"])

In [13]:
print(f"Label distribution: {df['label'].value_counts()}")
print(f"Label mapping: {dict(zip(le.classes_, le.transform(le.classes_)))}")

Label distribution: label
negative    637
positive    229
neutral     134
Name: count, dtype: int64
Label mapping: {'negative': np.int64(0), 'neutral': np.int64(1), 'positive': np.int64(2)}


In [14]:
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df["Review Text"].tolist(),
    df["label_id"].tolist(),
    test_size=0.2,
    stratify=df["label_id"],
    random_state=42
)
print(f"Data training: {len(train_texts)}, Data validasi: {len(val_texts)}")

Data training: 800, Data validasi: 200


In [15]:
print("Memuat model IndoBERT - ini mungkin akan mengunduh model jika belum ada di cache...")
model_name = "indolem/indobert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir, local_files_only=False)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=3,
    cache_dir=cache_dir,
    local_files_only=False
)
print("Model berhasil dimuat!")

Memuat model IndoBERT - ini mungkin akan mengunduh model jika belum ada di cache...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.01k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/234k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/445M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indolem/indobert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model berhasil dimuat!


In [16]:
dataset = DatasetDict({
    'train': Dataset.from_dict({"text": train_texts, "label": train_labels}),
    'validation': Dataset.from_dict({"text": val_texts, "label": val_labels})
})

In [17]:
def tokenize(batch):
    return tokenizer(batch["text"], truncation=True, padding=True)

print("Melakukan tokenisasi dataset...")
tokenized_dataset = dataset.map(tokenize, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
print("Tokenisasi selesai!")

Melakukan tokenisasi dataset...


Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Tokenisasi selesai!


In [18]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='weighted')
    acc = accuracy_score(labels, predictions)
    return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}

In [19]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    learning_rate=2e-5,
    logging_dir="./logs",
    save_steps=500,
    eval_steps=500,
    logging_steps=100,
    do_eval=True,
    report_to=["none"],        # <<< tambahkan ini
)

In [20]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [None]:
# Train
print("Mulai fine-tuning...")
trainer.train()
print("Fine-tuning selesai!")

Mulai fine-tuning...


Step,Training Loss


In [None]:
model_save_path = "./finetuned-indobert-instagram"
model.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)
print(f"Model disimpan di: {model_save_path}")

In [None]:
sample_texts = [
    "Akun saya tidak bisa login, sangat mengecewakan",
    "Update terbaru sangat membantu, makin keren",
    "Biasa saja, belum terlalu terasa bedanya"
]

In [None]:
print("\nPrediksi Sampel:")
encoded = tokenizer(sample_texts, padding=True, truncation=True, return_tensors="pt")

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
encoded = {k: v.to(device) for k, v in encoded.items()}

In [None]:
with torch.no_grad():
    outputs = model(**encoded)
    logits = outputs.logits
    probs = torch.nn.functional.softmax(logits, dim=-1)
    preds = torch.argmax(probs, dim=-1)

In [None]:
for text, pred, prob in zip(sample_texts, preds, probs):
    label = le.inverse_transform([pred.item()])[0]
    confidence = prob[pred.item()].item()
    print(f"Text: {text}")
    print(f"Predicted Sentiment: {label} (Confidence: {confidence:.4f})")
    print("-" * 50)