# **TASK 2**

# **INSTALASI**

In [None]:
import os
!pip install -U transformers accelerate datasets evaluate torch

# **SETUP & LOAD DATASET**

In [None]:
import torch
import shutil
import os
from huggingface_hub import login
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq

# Hugging Face authentication.
# Never hardcode an access token in a notebook: it is exposed to everyone who
# can read the file. Read it from the HF_TOKEN environment variable and fall
# back to a hidden interactive prompt when the variable is unset or empty.
# NOTE(review): any token that was ever saved in this notebook should be
# revoked in the Hugging Face account settings.
from getpass import getpass

TOKEN = os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")
login(token=TOKEN)

# Hardware check: use the GPU when torch can see one, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"✅ Hardware: {device}")

# Import check: reaching this line means the imports above all resolved.
print("✅ Import Library Berhasil (clear_device_cache error fixed).")

# Remove any SQuAD copy left in the default Hugging Face datasets cache.
# NOTE(review): download_mode="force_redownload" below presumably bypasses the
# cache already, which would make this manual cleanup redundant — confirm.
cache_dir = os.path.expanduser("~/.cache/huggingface/datasets/squad")
if os.path.exists(cache_dir):
    shutil.rmtree(cache_dir)

# Load the dataset, always fetching a fresh copy.
print("--- Memuat Dataset SQuAD ---")
raw_datasets = load_dataset(
    "squad",
    download_mode="force_redownload",
)

# Sub-sample both splits: shuffle with a fixed seed, then keep the first rows
# (4000 for training, 800 for validation).
shuffled_train = raw_datasets["train"].shuffle(seed=2024)
shuffled_val = raw_datasets["validation"].shuffle(seed=2024)
train_data = shuffled_train.select(range(4000))
val_data = shuffled_val.select(range(800))

print(f"Data Siap: {len(train_data)} train, {len(val_data)} validation")

# **PREPROCESSING**

In [None]:
from transformers import AutoTokenizer

# Token budgets used when truncating/padding prompts and answers.
MAX_INPUT = 512
MAX_TARGET = 64

# Checkpoint whose tokenizer is loaded here.
MODEL_CHECKPOINT = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

def preprocess_function(examples):
    """Tokenize a batch of QA examples into text-to-text training features.

    Args:
        examples: Batched mapping with "question", "context" and "answers"
            columns. Each "answers" entry is a dict whose "text" value is a
            list of reference answers.

    Returns:
        The tokenizer output for the prompts (truncated/padded to MAX_INPUT)
        with an added "labels" entry: the token ids of the first reference
        answer (truncated/padded to MAX_TARGET) where every padding id has
        been replaced by -100.
    """
    # Input: one "question: ... context: ..." prompt per example.
    inputs = [
        f"question: {q} context: {c}"
        for q, c in zip(examples["question"], examples["context"])
    ]

    # Target: the first reference answer. An example without any answer gets
    # an empty target instead of raising IndexError.
    targets = [ans["text"][0] if ans["text"] else "" for ans in examples["answers"]]

    # Tokenize the prompts.
    model_inputs = tokenizer(inputs, max_length=MAX_INPUT, truncation=True, padding="max_length")

    # Tokenize the answers.
    labels = tokenizer(targets, max_length=MAX_TARGET, truncation=True, padding="max_length")

    # Replace padding ids with -100 in the labels so that the loss ignores
    # those positions. The pad id is looked up once, outside the loops.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [(tok if tok != pad_id else -100) for tok in seq]
        for seq in labels["input_ids"]
    ]
    return model_inputs

# Tokenize both subsets in batches and drop the original text columns so that
# only the features produced by preprocess_function remain.
print("Memproses data menjadi format token...")
tokenized_train = train_data.map(
    preprocess_function,
    batched=True,
    remove_columns=train_data.column_names,
)
tokenized_val = val_data.map(
    preprocess_function,
    batched=True,
    remove_columns=val_data.column_names,
)
print("Selesai.")

# **SETUP MODEL - SEQ2SEQ**

In [None]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq

# Settings for this cell (MODEL_CHECKPOINT is re-declared here so the cell
# does not rely on the preprocessing cell having been run).
MODEL_CHECKPOINT = "t5-small"
batch_size = 8

# Output directory / repository name for the fine-tuned model.
repo_name = "finetuning-t5-question-answering"

# Load the pre-trained T5 checkpoint and move it to the selected device.
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_CHECKPOINT)
model.to(device)

# Training configuration: evaluate once per epoch, generate during evaluation,
# keep at most two checkpoints and push the result to the Hugging Face Hub.
args = Seq2SeqTrainingArguments(
    output_dir=repo_name,
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=2,
    num_train_epochs=3,
    predict_with_generate=True,
    push_to_hub=True,
    logging_steps=50,
    report_to="none"
)

# Collator that batches the tokenized features for the seq2seq model.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# `processing_class` replaces the deprecated `tokenizer` keyword of the
# Trainer. The install cell upgrades transformers to the latest release, so
# the current keyword is used here.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    data_collator=data_collator,
    processing_class=tokenizer
)

print("Trainer siap.")

# **TRAINING**

In [None]:
# Step 1: run the fine-tuning loop configured on the trainer.
print("Mulai Fine-Tuning T5...")
trainer.train()

# Step 2: upload the trained model to the Hugging Face Hub.
print("\n--- Uploading ke Hugging Face ---")
trainer.push_to_hub()
print("Selesai! Model sudah online.")

# **VISUALISASI**

In [None]:
import matplotlib.pyplot as plt

# Collect the (step, loss) pairs from the trainer's log history; only entries
# that carry a "loss" key are kept.
history = trainer.state.log_history
loss_entries = [entry for entry in history if "loss" in entry]
steps = [entry["step"] for entry in loss_entries]
losses = [entry["loss"] for entry in loss_entries]

# Draw the training-loss curve on an explicit figure/axes pair.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(steps, losses, label="Training Loss", color="#008080", marker='o')
ax.set_xlabel("Langkah Training (Steps)")
ax.set_ylabel("Loss")
ax.set_title("Kurva Pembelajaran Model T5 (SQuAD)")
ax.legend()
ax.grid(True, linestyle='--', alpha=0.6)
plt.show()

# **INFERENCE**

In [None]:
# Helper that runs a single question/context pair through the model.
def ask_t5(context, question, max_input_length=512, max_answer_length=64):
    """Answer `question` from `context` with the model and print the result.

    Args:
        context: Passage that should contain the answer.
        question: Question to ask about the passage.
        max_input_length: Prompt length limit in tokens; longer prompts are
            truncated, matching the truncation applied during preprocessing.
        max_answer_length: `max_length` passed to `model.generate`.

    Returns:
        The decoded answer string (also printed together with the question).
    """
    # Build the prompt in the same "question: ... context: ..." layout that
    # was used for training.
    input_text = f"question: {question} context: {context}"

    # Tokenize with truncation and place the tensors on the model's own
    # device, so inference still works if the model is not on `device`.
    inputs = tokenizer(
        input_text,
        max_length=max_input_length,
        truncation=True,
        return_tensors="pt",
    ).to(model.device)

    # Make sure the model is in evaluation mode before generating.
    model.eval()
    outputs = model.generate(**inputs, max_length=max_answer_length)

    # Decode the first (only) generated sequence.
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)

    print("-" * 50)
    print(f"TANYA: {question}")
    print(f"JAWAB: {answer}")
    print("-" * 50)
    return answer

# --- TEST 1: passage about Borobudur ---
context_1 = """
Borobudur is a 9th-century Mahayana Buddhist temple in Magelang Regency,
not far from the town of Muntilan, in Central Java, Indonesia.
It is the world's largest Buddhist temple. The temple consists of nine stacked platforms,
six square and three circular, topped by a central dome.
"""

# --- TEST 2: passage about Python ---
context_2 = """
Python is a high-level, general-purpose programming language.
Its design philosophy emphasizes code readability with the use of significant indentation.
Python is dynamically typed and garbage-collected.
"""

# Ask every question against its passage, in the order listed.
demo_cases = [
    (context_1, ["Where is Borobudur located?", "What consists of nine stacked platforms?"]),
    (context_2, ["What is Python?", "What does Python design emphasize?"]),
]
for demo_context, demo_questions in demo_cases:
    for demo_question in demo_questions:
        ask_t5(demo_context, demo_question)