# **Task 1**
---

# **Install Library**

In [None]:
!pip install transformers datasets evaluate accelerate scikit-learn matplotlib seaborn
!pip install torch --upgrade


# **SETUP & KONFIGURASI**

In [None]:
import os
from getpass import getpass

import evaluate
import numpy as np
import torch
from datasets import load_dataset
from huggingface_hub import login
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
    set_seed,
)

# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

if device == "cuda":
    print(f"GPU Name: {torch.cuda.get_device_name(0)}")
    print(f"Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
else:
    print("⚠️ PERINGATAN: Anda tidak menggunakan GPU. Training akan sangat lambat!")

# SECURITY: never store an access token in the notebook. The token is read from the
# HF_TOKEN environment variable, and only if that is unset is it requested through a
# hidden prompt. A literal token was previously committed on this line; treat it as
# leaked and revoke it in the Hugging Face account settings.
TOKEN = os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")
login(token=TOKEN)

# GoEmotions fine-tuning configuration
MODEL_CKPT = "distilbert-base-uncased"  # checkpoint that gets fine-tuned
BATCH_SIZE = 16                         # per-device batch size (train and eval)
EPOCHS = 3
LEARNING_RATE = 3e-5
MAX_LENGTH = 128                        # token limit applied when truncating inputs
SAMPLE_SIZE = 4000                      # number of training samples drawn from the dataset
SEED = 2025                             # seed used to keep results consistent

# **LOAD DATASET & LABEL MAPPING**

In [None]:
from datasets import load_dataset

print("\n--- Loading Dataset GoEmotions ---")

# The 'simplified' configuration is used because it is the lighter variant.
dataset = load_dataset("go_emotions", "simplified")

# Class names in id order, taken from the label feature of the train split.
labels_list = dataset["train"].features["labels"].feature.names
NUM_LABELS = len(labels_list)

# Lookup tables in both directions: id -> name and name -> id.
id2label = dict(enumerate(labels_list))
label2id = {name: idx for idx, name in id2label.items()}

print(f"Total Labels: {NUM_LABELS}")
print(f"Contoh Labels: {labels_list[:5]}...")  # show only the first five names

# Only a subset of the data is used so the run stays fast.
print(f"\nMelakukan sampling {SAMPLE_SIZE} data...")

def transform_labels(example, fallback_label=27):
    """Collapse a multi-label example into a single class id.

    Only the first label is kept, which turns the task into plain multi-class
    classification.

    Args:
        example: Mapping with a "labels" sequence of class ids.
        fallback_label: Class id returned when "labels" is empty. Defaults to 27,
            the value that was previously hard-coded here (presumably the
            "neutral" class -- verify against labels_list).

    Returns:
        dict: {"labels": first id of the sequence, or fallback_label if it is empty}.
    """
    label_ids = example["labels"]
    return {"labels": label_ids[0] if len(label_ids) > 0 else fallback_label}

# Replace the multi-label "labels" column with the single id from transform_labels.
formatted_dataset = dataset.map(transform_labels, remove_columns=["labels"])

# Shuffle with a fixed seed so the same subset is drawn on every run.
train_pool = formatted_dataset["train"].shuffle(seed=SEED)
eval_pool = formatted_dataset["test"].shuffle(seed=SEED)

# Clamp the requested sizes to what each split actually holds; select() raises an
# index error when asked for more rows than exist.
# NOTE(review): the "test" split is used for per-epoch evaluation and best-model
# selection further down; consider a separate validation split -- confirm intent.
train_dataset = train_pool.select(range(min(SAMPLE_SIZE, len(train_pool))))
eval_dataset = eval_pool.select(range(min(SAMPLE_SIZE // 5, len(eval_pool))))

print("Sampling & Formatting Selesai!")

# **TOKENISASI**

In [None]:
from transformers import AutoTokenizer, DataCollatorWithPadding

print("\n--- 3. Tokenizing ---")
tokenizer = AutoTokenizer.from_pretrained(MODEL_CKPT)


def preprocess_function(examples):
    """Tokenize the "text" field of a batch, truncating to MAX_LENGTH tokens."""
    return tokenizer(examples["text"], truncation=True, max_length=MAX_LENGTH)


# Columns dropped once the token ids have been produced.
cols_to_remove = ["text", "id"]

# Tokenize -> drop the dropped columns -> expose rows as PyTorch tensors.
tokenized_train = (
    train_dataset.map(preprocess_function, batched=True)
    .remove_columns(cols_to_remove)
    .with_format("torch")
)
tokenized_test = (
    eval_dataset.map(preprocess_function, batched=True)
    .remove_columns(cols_to_remove)
    .with_format("torch")
)

print("Tokenisasi Selesai. Kolom yang tersedia:", tokenized_train.column_names)

# **METRICS**

In [None]:
import evaluate
import numpy as np

# Metric objects are loaded once and reused on every evaluation pass.
accuracy = evaluate.load("accuracy")
f1 = evaluate.load("f1")


def compute_metrics(eval_pred):
    """Turn raw model outputs into accuracy and weighted F1.

    Args:
        eval_pred: Pair of (predictions, labels); the predicted class is taken
            as the argmax of the predictions along axis 1.

    Returns:
        dict: {"accuracy": ..., "f1": ...}
    """
    logits, references = eval_pred
    predicted_ids = np.argmax(logits, axis=1)

    accuracy_result = accuracy.compute(predictions=predicted_ids, references=references)
    # Weighted averaging is used because the GoEmotions classes are imbalanced.
    f1_result = f1.compute(
        predictions=predicted_ids,
        references=references,
        average="weighted",
    )

    return dict(accuracy=accuracy_result["accuracy"], f1=f1_result["f1"])

# **SETUP MODEL & TRAINER**

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

print("\n--- 5. Setup Model ---")

# Seed the RNGs before the model is built: the classification head is freshly
# initialised on top of the base checkpoint, so without this every run would
# start from different weights even though SEED is declared in the config.
set_seed(SEED)

model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_CKPT,
    num_labels=NUM_LABELS,  # derived from the dataset's label list
    id2label=id2label,
    label2id=label2id,
)
model.to(device)

# Used both as the local output directory and as the Hub repository name.
repo_name = "finetuning-bert-text-classification-goemotions"

training_args = TrainingArguments(
    output_dir=repo_name,
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=EPOCHS,
    weight_decay=0.01,
    # Evaluate and checkpoint once per epoch so the best epoch can be restored.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    push_to_hub=True,
    report_to="none",
    logging_steps=10,  # log often so the loss curve has points even with little data
    seed=SEED,  # same seed as the data sampling, for a reproducible training run
)

# Pads each batch at collation time; tokenisation itself applies no padding.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
print("Setup Siap.")

# **EKSEKUSI TRAINING**

In [None]:
# Run the fine-tuning loop configured on `trainer`.
print("\n--- 6. Mulai Training... ---")
trainer.train()

# Upload the result to the Hugging Face Hub; this runs only after training finishes.
print("\n--- Uploading Model ---")
trainer.push_to_hub()
print("Selesai! Cek akun Hugging Face Anda.")

# **GRAFIK**

In [None]:
import matplotlib.pyplot as plt

history = trainer.state.log_history

# Keep only the log entries that carry a "loss" key; all other entries are skipped.
loss_entries = [entry for entry in history if "loss" in entry]
steps = [entry["step"] for entry in loss_entries]
losses = [entry["loss"] for entry in loss_entries]

# Single-panel figure: training loss against the global step.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(steps, losses, label="Training Loss", color="#2ca02c")
ax.set_xlabel("Langkah (Steps)")
ax.set_ylabel("Loss")
ax.set_title("Grafik Training GoEmotions")
ax.legend()
ax.grid(True, linestyle="--", alpha=0.7)
plt.show()

# **TES MANUAL**

In [None]:
import torch
import seaborn as sns
import matplotlib.pyplot as plt

def predict_emotion(text, top_k=5):
    """Classify one sentence and plot its most likely emotions.

    Prints the sentence and its top prediction, then draws a horizontal bar
    chart of the `top_k` highest-probability labels.

    Args:
        text: Sentence to classify.
        top_k: Number of labels to show. Defaults to 5 and is clamped to the
            range [1, number of classes].

    Returns:
        None. The result is printed and plotted.
    """
    # Switch to inference mode so dropout is disabled and scores are deterministic.
    model.eval()

    # Use the shared MAX_LENGTH so inference truncates exactly like training did,
    # and follow the model's own device rather than assuming the global one.
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=MAX_LENGTH,
    ).to(model.device)
    with torch.no_grad():
        outputs = model(**inputs)

    # Softmax over the class dimension; [0] selects the single sentence in the batch.
    probs = torch.nn.functional.softmax(outputs.logits, dim=-1)[0].cpu().numpy()

    # Highest-probability classes first.
    top_k = max(1, min(top_k, len(probs)))
    sorted_indices = probs.argsort()[::-1][:top_k]
    sorted_probs = probs[sorted_indices]
    sorted_labels = [id2label[int(idx)] for idx in sorted_indices]

    print(f"\n'{text}'")
    print(f"Emosi Utama: {sorted_labels[0]} ({sorted_probs[0]:.2%})")

    plt.figure(figsize=(10, 4))
    # NOTE(review): recent seaborn releases warn when `palette` is passed without
    # `hue`; revisit this call when pinning the seaborn version.
    ax = sns.barplot(x=sorted_probs, y=sorted_labels, palette="cool")
    plt.xlabel("Confidence")
    plt.title(f"Top {top_k} Prediksi Emosi")
    plt.xlim(0, 1.1)  # headroom to the right of 1.0 for the percentage labels

    # Write the percentage just past the end of each bar.
    for i, v in enumerate(sorted_probs):
        ax.text(v + 0.01, i, f"{v:.2%}", color='black', va='center')
    plt.show()

# Example checks with emotionally loaded sentences.
sample_sentences = [
    "I am so happy and grateful for this amazing gift!",
    "This is absolutely terrible, I hate it so much.",
    "I'm really worried about the exam tomorrow.",
]
for sentence in sample_sentences:
    predict_emotion(sentence)

# **VALIDATION METRICS**

In [None]:
import matplotlib.pyplot as plt

print("\n--- 9. Menganalisis Performa Model ---")

# Log history recorded by the Trainer.
history = trainer.state.log_history

# Entries with a "loss" key are training logs; of the remaining entries, those
# with an "eval_loss" key are evaluation logs.
training_logs = [entry for entry in history if "loss" in entry]
evaluation_logs = [
    entry for entry in history if "loss" not in entry and "eval_loss" in entry
]

train_steps = [entry["step"] for entry in training_logs]
train_loss = [entry["loss"] for entry in training_logs]
eval_steps = [entry["step"] for entry in evaluation_logs]
eval_loss = [entry["eval_loss"] for entry in evaluation_logs]
eval_acc = [entry["eval_accuracy"] for entry in evaluation_logs]
# F1 is collected only from the entries that report it.
eval_f1 = [entry["eval_f1"] for entry in evaluation_logs if "eval_f1" in entry]

fig, (loss_ax, score_ax) = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: training loss vs. validation loss.
loss_ax.plot(train_steps, train_loss, label="Training Loss", color="orange", alpha=0.6)
loss_ax.plot(eval_steps, eval_loss, label="Validation Loss", color="blue", marker="o", linewidth=2)
loss_ax.set_xlabel("Steps")
loss_ax.set_ylabel("Loss")
loss_ax.set_title("Training vs Validation Loss")
loss_ax.legend()
loss_ax.grid(True, linestyle="--", alpha=0.5)

# Right panel: validation accuracy and, when available, F1 score.
score_ax.plot(eval_steps, eval_acc, label="Val Accuracy", color="green", marker="s", linewidth=2)
if eval_f1:
    score_ax.plot(eval_steps, eval_f1, label="Val F1 Score", color="purple", marker="^", linestyle="--")

score_ax.set_xlabel("Steps")
score_ax.set_ylabel("Score (0.0 - 1.0)")
score_ax.set_title("Peningkatan Akurasi Model")
score_ax.legend()
score_ax.grid(True, linestyle="--", alpha=0.5)
score_ax.set_ylim(0, 1.0)  # fix the score axis to the 0-100% range

fig.tight_layout()
plt.show()

# Final summary: best value of each tracked metric.
if not eval_acc:
    print("Data evaluasi tidak ditemukan")
else:
    print(f"\nStatistik Akhir:")
    print(f"Akurasi Terbaik: {max(eval_acc):.2%}")
    print(f"Validation Loss Terendah: {min(eval_loss):.4f}")
    if eval_f1:
        print(f"F1 Score Terbaik: {max(eval_f1):.2%}")