In [None]:
!pip install transformers datasets evaluate accelerate scikit-learn matplotlib seaborn
!pip install torch --upgrade

In [None]:
import os
from getpass import getpass

import torch
from huggingface_hub import login

# Authenticate against the Hugging Face Hub (needed later to push the model).
# The token is taken from the HF_TOKEN environment variable, falling back to an
# interactive prompt, so the secret is never stored in the notebook itself.
# SECURITY: an access token used to be hard-coded on this line. Treat that token
# as leaked and revoke it in the Hugging Face account settings.
TOKEN = (os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: ")).strip()
if not TOKEN:
    raise ValueError("No Hugging Face token provided (set HF_TOKEN or enter it at the prompt).")
login(token=TOKEN)

# Check whether CUDA is available; fine-tuning on CPU works but is much slower.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Hardware Akselerator: {device}")
MODEL_CKPT = "distilbert-base-uncased"

# Training hyperparameters
BATCH_SIZE = 16
EPOCHS = 3
# 2e-5 (0.00002) is a commonly used learning rate for fine-tuning Transformers.
LEARNING_RATE = 2e-5
MAX_LENGTH = 128  # maximum number of tokens per premise/hypothesis pair

# Sampling
SAMPLE_SIZE = 5000  # number of training examples drawn from the full train split
SEED = 42           # fixed seed so the sub-sampling is repeatable

In [None]:
from datasets import load_dataset

print("\n--- Memuat Dataset GLUE Benchmark (Subset MNLI) ---")
# MNLI (Multi-Genre Natural Language Inference) is a standard benchmark for
# testing a model's ability to reason about sentence pairs.
dataset = load_dataset("glue", "mnli")

# Human-readable names for the three class ids.
# NOTE(review): assumed to match the label ids used by the GLUE MNLI dataset.
id2label = {0: "Entailment", 1: "Neutral", 2: "Contradiction"}
# Derived from id2label so the two mappings and the class count cannot drift apart.
label2id = {name: idx for idx, name in id2label.items()}
NUM_LABELS = len(id2label)

# Number of validation examples used for evaluation after each epoch.
EVAL_SAMPLE_SIZE = 1000

print(f"Skema Label: {id2label}")
print(f"\nMelakukan Sub-sampling data ({SAMPLE_SIZE} sampel)...")


def _subsample(split, size):
    """Shuffle `split` with the fixed SEED and keep at most `size` examples."""
    # Clamp to the split length so an oversized request does not raise an index error.
    return split.shuffle(seed=SEED).select(range(min(size, len(split))))


# Shuffling with a fixed SEED keeps the sub-samples reproducible.
train_dataset = _subsample(dataset["train"], SAMPLE_SIZE)
eval_dataset = _subsample(dataset["validation_matched"], EVAL_SAMPLE_SIZE)

print("Dataset siap. Contoh data mentah:", train_dataset[0])

In [None]:
from transformers import AutoTokenizer, DataCollatorWithPadding

print("\n--- Tokenisasi Input ---")
# Tokenizer that belongs to the checkpoint being fine-tuned.
tokenizer = AutoTokenizer.from_pretrained(MODEL_CKPT)


def preprocess_nli(examples):
    """Tokenize a batch of premise/hypothesis pairs.

    NLI needs two inputs per example. Passing them as a pair lets the
    BERT/DistilBERT tokenizer build `[CLS] premise [SEP] hypothesis [SEP]`,
    where `[SEP]` marks the boundary between the two sentences.

    Args:
        examples: Batch with "premise" and "hypothesis" columns.

    Returns:
        The tokenizer output for the batch.
    """
    premises, hypotheses = examples["premise"], examples["hypothesis"]
    # Pairs whose total token count exceeds MAX_LENGTH are truncated.
    encoded_pairs = tokenizer(premises, hypotheses, max_length=MAX_LENGTH, truncation=True)
    return encoded_pairs

# Raw-text and index columns that are not needed once the text is tokenized.
cols_to_remove = ["premise", "hypothesis", "idx"]


def _tokenize_split(split):
    """Tokenize one dataset split and reduce it to the columns the model consumes."""
    # batched=True hands preprocess_nli whole batches instead of single rows.
    encoded = split.map(preprocess_nli, batched=True)
    encoded = encoded.remove_columns(cols_to_remove)
    # The target column is renamed to "labels", the name the Hugging Face
    # model uses to compute its (cross-entropy) loss.
    return encoded.rename_column("label", "labels")


tokenized_train = _tokenize_split(train_dataset)
tokenized_eval = _tokenize_split(eval_dataset)

# Serve rows as PyTorch tensors instead of Python lists.
for _split in (tokenized_train, tokenized_eval):
    _split.set_format("torch")

print("Preprocessing selesai. Struktur Input IDs sudah mengandung separator token.")

In [None]:
import evaluate
import numpy as np

# Metric handles, loaded once and reused by compute_metrics.
accuracy = evaluate.load("accuracy")
f1 = evaluate.load("f1")


def compute_metrics(eval_pred):
    """Compute accuracy and weighted F1 for one evaluation pass.

    Args:
        eval_pred: Pair of (model scores per class, reference label ids).

    Returns:
        Dict with the keys "accuracy" and "f1".
    """
    logits, references = eval_pred
    # The predicted class is the index with the highest score.
    predicted_ids = np.argmax(logits, axis=1)

    scores = {}
    # Accuracy: overall share of correct predictions.
    scores["accuracy"] = accuracy.compute(
        predictions=predicted_ids, references=references
    )["accuracy"]
    # Weighted F1: balances per-class performance in this multi-class task,
    # which matters when the validation labels are not perfectly balanced.
    scores["f1"] = f1.compute(
        predictions=predicted_ids, references=references, average="weighted"
    )["f1"]
    return scores

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer, set_seed

print("\n--- Memuat Model Pre-trained ---")

# Seed the random number generators BEFORE building the model: the classification
# head added on top of the pre-trained encoder is randomly initialised, so without
# this call two runs of the notebook start from different weights.
set_seed(SEED)

model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_CKPT,
    num_labels=NUM_LABELS,
    id2label=id2label,
    label2id=label2id
)
model.to(device)  # move the model to the selected device (GPU memory when CUDA is available)

# Used as the local output directory for checkpoints and for the Hub upload.
repo_name = "finetuning-distilbert-mnli"

# Training configuration
training_args = TrainingArguments(
    output_dir=repo_name,
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=EPOCHS,
    weight_decay=0.01,            # weight-decay regularisation to limit overfitting
    eval_strategy="epoch",        # evaluate at the end of every epoch
    save_strategy="epoch",        # save a checkpoint at the end of every epoch
    load_best_model_at_end=True,  # reload the best checkpoint after training; with no
                                  # metric_for_best_model set, "best" means lowest eval loss
    push_to_hub=True,
    report_to="none",
    logging_steps=50,
    seed=SEED,                    # same seed as the data sub-sampling, stated explicitly
)

# Pads each batch dynamically to its longest sequence.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
print("Trainer siap dieksekusi.")

In [None]:
import matplotlib.pyplot as plt

# Run the fine-tuning loop.
print("Mulai Training MNLI...")
trainer.train()

# Publish the trained model to the Hugging Face Hub.
trainer.push_to_hub()
print("Model ter-upload!")

# Visualisation: pull the logged loss values out of the trainer's log history.
history = trainer.state.log_history
# Only log entries that carry a "loss" key are plotted.
loss_records = [record for record in history if "loss" in record]
steps = [record["step"] for record in loss_records]
losses = [record["loss"] for record in loss_records]

fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(steps, losses, label="Training Loss", color="purple", marker='o')
ax.set_xlabel("Steps")
ax.set_ylabel("Loss")
ax.set_title("Grafik Training MNLI (DistilBERT)")
ax.legend()
ax.grid(True, linestyle='--', alpha=0.5)
plt.show()

In [None]:
import torch
import seaborn as sns

def predict_logic(premise, hypothesis):
    """Predict the logical relation between two sentences, then print and plot it.

    Args:
        premise: The premise sentence.
        hypothesis: The hypothesis sentence to judge against the premise.

    Returns:
        None. The predicted label with its probability is printed and a bar
        chart of the class probabilities is shown.
    """
    # Switch to inference mode so dropout is disabled and repeated calls on the
    # same input give the same probabilities.
    model.eval()

    # Same truncation limit as used for training; tensors are placed on the
    # device the model actually lives on (the Trainer may have moved it).
    inputs = tokenizer(
        premise,
        hypothesis,
        return_tensors="pt",
        truncation=True,
        max_length=MAX_LENGTH,
        padding=True,
    ).to(model.device)

    with torch.no_grad():
        outputs = model(**inputs)

    # Softmax over the class dimension; [0] selects the single example in the batch.
    probs = torch.nn.functional.softmax(outputs.logits, dim=-1)[0].cpu().numpy()

    # Take the most probable class (plain int so it is a clean dict key).
    pred_idx = int(probs.argmax())
    pred_label = id2label[pred_idx]
    pred_score = probs[pred_idx]

    print(f"\nPremise:    {premise}")
    print(f"Hypothesis: {hypothesis}")
    print(f"👉 Hubungan: {pred_label} ({pred_score:.2%})")

    # Visualisation: one bar per class, labelled in class-id order so each bar
    # lines up with its probability regardless of dict ordering.
    class_names = [id2label[i] for i in range(len(probs))]
    plt.figure(figsize=(6, 3))
    colors = ["#2ca02c", "#7f7f7f", "#d62728"]
    # `hue` is assigned explicitly because passing `palette` without `hue` is
    # deprecated in seaborn; legend=False (seaborn >= 0.13) keeps the plot unchanged.
    sns.barplot(x=probs, y=class_names, hue=class_names, palette=colors, legend=False)
    plt.xlim(0, 1.1)
    plt.title("Probabilitas Logika")
    plt.show()

# Demo sentence pairs; each is annotated with the relation it is meant to illustrate.
demo_pairs = [
    # Entailment
    ("A soccer player is running across the field.", "A person is moving."),
    # Contradiction
    ("A man is inspecting the uniform of a figure in some East Asian country.", "The man is sleeping on the couch."),
    # Neutral
    ("The product was launched in 2010.", "The product is very expensive."),
]

for demo_premise, demo_hypothesis in demo_pairs:
    predict_logic(demo_premise, demo_hypothesis)