In [None]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer
from peft import get_peft_model, AdaLoraConfig, TaskType
from datasets import load_from_disk
import evaluate
import numpy as np

# 1. SETUP & LOAD DATA
checkpoint = "prajjwal1/bert-tiny"
dataset = load_from_disk("processed_data/smsa_processed")

# SmSA sentiment classes. label2id is derived from id2label so the two
# mappings cannot drift apart.
id2label = {0: "positif", 1: "netral", 2: "negatif"}
label2id = {name: idx for idx, name in id2label.items()}

# 2. LOAD BASE MODEL
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

# --- HYPERPARAMETERS (anti-overfitting revision) ---
BATCH_SIZE = 32
NUM_EPOCHS = 30  # reduced from 100
train_samples = len(dataset["train"])

# Total optimizer steps for the run. The Trainer's dataloader keeps the last
# partial batch (dataloader_drop_last defaults to False), so steps per epoch is
# the CEILING of samples / batch size; floor division under-counts and yields 0
# when the split is smaller than one batch.
# Assumes a single device and no gradient accumulation, as configured below.
steps_per_epoch = -(-train_samples // BATCH_SIZE)
total_steps = steps_per_epoch * NUM_EPOCHS

# AdaLoRA schedule expressed as fractions of the run.
t_init_steps = int(total_steps * 0.15)
t_final_steps = int(total_steps * 0.20)

print(f"Total Steps Baru: {total_steps}")
print(f"Jadwal AdaLoRA -> Init: {t_init_steps}, Final: {t_final_steps}")

# 3. ADALORA CONFIGURATION
# NOTE(review): PEFT's AdaLoRA appears to take its starting rank from `init_r`
# and to ignore `r` -- confirm against the installed PEFT version before
# relying on r=32 for extra capacity.
# NOTE(review): AdaLoRA's rank allocation needs
# `model.base_model.update_and_allocate(step)` to be called every training
# step; a plain Trainer does not seem to do this -- confirm.
peft_config = AdaLoraConfig(
    task_type=TaskType.SEQ_CLS,
    r=32,
    lora_alpha=64,
    target_modules=["query", "value"],
    lora_dropout=0.1,
    bias="none",
    init_r=12,
    target_r=8,
    beta1=0.85,
    beta2=0.85,
    tinit=t_init_steps,
    tfinal=t_final_steps,
    deltaT=10,
    total_step=total_steps,
)

model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

# 4. METRIC
accuracy = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Return the accuracy of arg-max predictions against the reference labels.

    `eval_pred` unpacks into (scores, labels); the predicted class of each row
    is the index of its highest score along axis 1.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

# 5. TRAINING ARGUMENTS
training_args = TrainingArguments(
    output_dir="model_output/smsa_adalora_tuned",
    report_to="none",
    logging_steps=50,  # log every 50 steps to keep the output readable
    # --- optimisation ---
    num_train_epochs=NUM_EPOCHS,
    learning_rate=5e-4,
    weight_decay=0.01,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    # --- evaluate and checkpoint once per epoch ---
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,  # cap the number of checkpoints kept on disk
    # --- model selection: highest eval accuracy wins ---
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
)

# Build the Trainer; the "test" split doubles as the evaluation set.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    tokenizer=AutoTokenizer.from_pretrained(checkpoint),
    eval_dataset=dataset["test"],
    train_dataset=dataset["train"],
)

print("Mulai Training Revisi (30 Epoch, Rank 32)...")
trainer.train()

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# 1. Turn the Trainer's log history into a DataFrame
history = trainer.state.log_history
df = pd.DataFrame(history)

# 2. Separate training rows (have 'loss') from evaluation rows
#    (have 'eval_loss' / 'eval_accuracy')
train_loss = df.loc[df['loss'].notna(), ['epoch', 'loss']]
val_loss = df.loc[df['eval_loss'].notna(), ['epoch', 'eval_loss']]
val_acc = df.loc[df['eval_accuracy'].notna(), ['epoch', 'eval_accuracy']]

# 3. Plot
fig = plt.figure(figsize=(15, 5))
sns.set_style("whitegrid")

# --- Panel 1: training vs validation loss ---
ax_loss = fig.add_subplot(1, 2, 1)
ax_loss.plot(train_loss['epoch'], train_loss['loss'],
             label='Training Loss', color='blue', linestyle='--')
ax_loss.plot(val_loss['epoch'], val_loss['eval_loss'],
             label='Validation Loss', color='red', linewidth=2)
ax_loss.set_title('Kurva Loss (Semakin Rendah Semakin Bagus)', fontsize=14)
ax_loss.set_xlabel('Epoch')
ax_loss.set_ylabel('Loss')
ax_loss.legend()

# --- Panel 2: validation accuracy ---
ax_acc = fig.add_subplot(1, 2, 2)
ax_acc.plot(val_acc['epoch'], val_acc['eval_accuracy'],
            label='Validation Accuracy', color='green', marker='o')
ax_acc.set_title('Kurva Akurasi (Semakin Tinggi Semakin Bagus)', fontsize=14)
ax_acc.set_xlabel('Epoch')
ax_acc.set_ylabel('Accuracy')
ax_acc.legend()
ax_acc.grid(True)

fig.tight_layout()
plt.show()

In [None]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer, DataCollatorWithPadding
from peft import PeftModel, PeftConfig
from datasets import load_from_disk
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, classification_report
from torch.utils.data import DataLoader
from tqdm.auto import tqdm

# ==========================================
# 1. SETUP & LOAD THE SAVED MODEL
# ==========================================
# NOTE: the adapter directory loaded here must already exist on disk.
print("üìÇ Sedang meload Model SMSA dari: saved_models/smsa_adalora_best ...")

checkpoint = "prajjwal1/bert-tiny"
model_path = "saved_models/smsa_adalora_best"

# A. SmSA data
dataset = load_from_disk("processed_data/smsa_processed")

# B. Label mapping (three classes); the inverse map is derived from id2label
id2label = {0: "positif", 1: "netral", 2: "negatif"}
label2id = {name: idx for idx, name in id2label.items()}
label_names = ["Positif", "Netral", "Negatif"]

# C. Plain base model with a 3-way classification head
base_model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=3,
    id2label=id2label,
    label2id=label2id,
)

# D. Attach the saved AdaLoRA adapter; report the failure, then re-raise so the
#    cell stops instead of continuing without a model.
try:
    model = PeftModel.from_pretrained(base_model, model_path)
    print("‚úÖ Model berhasil diload! Siap untuk ujian.")
except Exception as e:
    for message in (
        f"‚ùå Gagal load model di path: {model_path}",
        f"Error: {e}",
        "Pastikan nama foldernya benar-benar ada.",
    ):
        print(message)
    raise e

# ==========================================
# 2. MANUAL INFERENCE LOOP
# ==========================================
print("\nüîç Sedang menjalankan Prediksi (Manual Loop)...")

# Pick GPU when available, otherwise CPU, and switch the model to eval mode
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# Batches are padded by the tokenizer-aware collator
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
test_loader = DataLoader(dataset["test"], batch_size=32, collate_fn=data_collator)

all_preds, all_labels = [], []

# Gradients are not needed for prediction
with torch.no_grad():
    for raw_batch in tqdm(test_loader, desc="Testing"):
        # Move every tensor to the target device, then split off the labels so
        # the forward pass only receives model inputs
        on_device = {name: tensor.to(device) for name, tensor in raw_batch.items()}
        labels = on_device.pop("labels")

        logits = model(**on_device).logits

        # Predicted class = index of the largest logit
        preds = torch.argmax(logits, dim=-1)

        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# ==========================================
# 3. VISUALISE RESULTS
# ==========================================
print("\nüìä Membuat Visualisasi...")

# Pin the class ids explicitly. Without `labels=`, scikit-learn infers the
# classes from the data, so a class that never occurs in the test split or the
# predictions would shrink the matrix and no longer match `label_names`.
class_ids = list(range(len(label_names)))

# A. Confusion matrix (rows = true label, columns = predicted label)
cm = confusion_matrix(all_labels, all_preds, labels=class_ids)

plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Greens',
            xticklabels=label_names, yticklabels=label_names,
            cbar_kws={'label': 'Jumlah Sampel'})

plt.title('Confusion Matrix: SMSA (Sentiment Analysis)', fontsize=14, fontweight='bold', pad=20)
plt.xlabel('Prediksi Model', fontsize=12)
plt.ylabel('Label Sebenarnya', fontsize=12)
plt.tight_layout()
plt.show()

# B. Per-class precision / recall / F1
print("\n" + "="*50)
print("HASIL AKHIR SMSA (DARI SAVED MODEL)")
print("="*50)
report = classification_report(
    all_labels,
    all_preds,
    labels=class_ids,
    target_names=label_names,
    digits=4,
    zero_division=0,  # report 0 instead of warning for a class with no samples
)
print(report)

In [None]:
import os

# Destination folder for the SmSA adapter
save_path = "saved_models/smsa_adalora_best"
os.makedirs(save_path, exist_ok=True)

# 1. Save the model currently held by the trainer.
# NOTE(review): the training cell sets load_best_model_at_end=True, so this is
# expected to be the best checkpoint -- confirm via the message printed below.
trainer.save_model(save_path)

# 2. Save the tokenizer next to it so the folder is self-contained
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
tokenizer.save_pretrained(save_path)

print(f"‚úÖ Best Model sukses disimpan di: {save_path}")

# 3. Report which checkpoint the trainer selected, when it recorded one
best_checkpoint = trainer.state.best_model_checkpoint
if not best_checkpoint:
    print("‚ÑπÔ∏è Info checkpoint tidak tersedia, tapi model yang tersimpan tetap yang terbaik.")
else:
    print(f"‚ÑπÔ∏è Model ini diambil dari checkpoint: {best_checkpoint}")

## EmoT — Emotion Classification (5 classes)

In [None]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer
from peft import get_peft_model, AdaLoraConfig, TaskType
from datasets import load_from_disk
import evaluate
import numpy as np

# --- 1. DATA & MODEL (reloaded from scratch) ---
checkpoint = "prajjwal1/bert-tiny"
dataset = load_from_disk("processed_data/emot_processed")

# EmoT label mapping; label2id is derived so the two maps always agree
id2label = {0: "sadness", 1: "anger", 2: "love", 3: "fear", 4: "happy"}
label2id = {name: idx for idx, name in id2label.items()}

# Fresh base model: do not reuse the `model` variable left over from SmSA
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=len(id2label),  # EmoT has 5 labels
    id2label=id2label,
    label2id=label2id,
)

# --- 2. STEP BUDGET FOR THE EMOT DATA ---
BATCH_SIZE = 32
NUM_EPOCHS = 30
train_samples = len(dataset["train"])

# The Trainer's dataloader keeps the last partial batch (dataloader_drop_last
# defaults to False), so steps per epoch is the CEILING of samples / batch
# size; floor division under-counts the run.
# Assumes a single device and no gradient accumulation, as configured below.
steps_per_epoch = -(-train_samples // BATCH_SIZE)
total_steps = steps_per_epoch * NUM_EPOCHS
t_init_steps = int(total_steps * 0.15)
t_final_steps = int(total_steps * 0.20)

print(f"Data EMOT: {train_samples} sampel.")
print(f"Jadwal AdaLoRA: Init={t_init_steps}, Final={t_final_steps}, Total={total_steps}")

# --- 3. ADALORA CONFIGURATION ---
# NOTE(review): PEFT's AdaLoRA appears to take its starting rank from `init_r`
# and to ignore `r` -- confirm against the installed PEFT version.
peft_config = AdaLoraConfig(
    task_type=TaskType.SEQ_CLS,
    r=32,
    lora_alpha=64,
    target_modules=["query", "value"],
    lora_dropout=0.1,
    bias="none",
    init_r=12,
    target_r=8,
    beta1=0.85,
    beta2=0.85,
    tinit=t_init_steps,
    tfinal=t_final_steps,
    deltaT=10,
    total_step=total_steps,
)

# Wrap the base model with the adapter
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

# --- 4. METRIC & TRAINER ---
accuracy = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Return the accuracy of arg-max predictions against the reference labels.

    `eval_pred` unpacks into (scores, labels); each row's predicted class is the
    index of its highest score along axis 1.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

training_args = TrainingArguments(
    output_dir="model_output/emot_adalora_tuned",
    report_to="none",
    logging_steps=50,
    # --- optimisation ---
    num_train_epochs=NUM_EPOCHS,
    learning_rate=5e-4,
    weight_decay=0.01,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    # --- evaluate and checkpoint once per epoch, keep at most 2 checkpoints ---
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    # --- model selection: highest eval accuracy wins ---
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
)

# Custom Trainer (bug workaround)
class CustomTrainer(Trainer):
    """Trainer whose loss computation never forwards `num_items_in_batch`.

    NOTE(review): presumably a workaround for a transformers/PEFT version
    mismatch around the `num_items_in_batch` argument -- confirm before
    removing.
    """

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the loss via `Trainer.compute_loss`.

        Accepts `num_items_in_batch` for signature compatibility but does not
        pass it on, and strips a same-named key from `inputs` if present.
        """
        inputs.pop("num_items_in_batch", None)
        return super().compute_loss(model, inputs, return_outputs)

# The "test" split doubles as the evaluation set.
trainer = CustomTrainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    tokenizer=AutoTokenizer.from_pretrained(checkpoint),
    eval_dataset=dataset["test"],
    train_dataset=dataset["train"],
)

print("üöÄ Mulai Training EMOT (Revisi Strategi)...")
trainer.train()

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np
from sklearn.metrics import confusion_matrix, classification_report


# ==========================================
# PART 1: LEARNING CURVES
# ==========================================
print("Sedang membuat grafik Kurva Belajar...")

# 1. Trainer log history -> DataFrame
history = trainer.state.log_history
df = pd.DataFrame(history)

# 2. Training rows carry 'loss'; evaluation rows carry 'eval_loss'
#    (and 'eval_accuracy')
train_history = df.loc[df['loss'].notna()]
val_history = df.loc[df['eval_loss'].notna()]

# 3. Plot
fig = plt.figure(figsize=(16, 6))
sns.set_theme(style="whitegrid")

# --- Panel 1: loss, with epoch on the x-axis ---
ax_loss = fig.add_subplot(1, 2, 1)
ax_loss.plot(train_history['epoch'], train_history['loss'],
             label='Training Loss', color='blue', alpha=0.4, linestyle='--')
ax_loss.plot(val_history['epoch'], val_history['eval_loss'],
             label='Validation Loss', color='red', linewidth=2, marker='o')
ax_loss.set_title('Kurva Loss: Training vs Validation', fontsize=12, fontweight='bold')
ax_loss.set_xlabel('Epoch')
ax_loss.set_ylabel('Loss')
ax_loss.legend()

# --- Panel 2: validation accuracy on a fixed 0..1 scale ---
ax_acc = fig.add_subplot(1, 2, 2)
ax_acc.plot(val_history['epoch'], val_history['eval_accuracy'],
            label='Validation Accuracy', color='green', linewidth=2, marker='s')
ax_acc.set_title('Kurva Akurasi (Validation)', fontsize=12, fontweight='bold')
ax_acc.set_xlabel('Epoch')
ax_acc.set_ylabel('Accuracy')
ax_acc.set_ylim(bottom=0, top=1.0)
ax_acc.legend()

fig.tight_layout()
plt.show()

In [None]:
# ==========================================
# PART 1.5: PREDICT ON THE TEST SPLIT
# ==========================================
print("\nSedang melakukan prediksi pada Data Test...")

# 1. Let the Trainer run inference over the test split
predictions_output = trainer.predict(dataset["test"])

# 2. Predicted class = arg-max over the class axis; label_ids are the references
y_pred = np.argmax(predictions_output.predictions, axis=1)
y_true = predictions_output.label_ids

# 3. Display names, ordered by class id 0-4 as used in training
label_names = ["Sadness (0)", "Anger (1)", "Love (2)", "Fear (3)", "Happy (4)"]

# Pin the class ids explicitly. Without `labels=`, scikit-learn infers the
# classes from the data, so an emotion that never occurs in y_true/y_pred would
# shrink the matrix and no longer match `label_names`.
class_ids = list(range(len(label_names)))

# ==========================================
# PART 2: CONFUSION MATRIX
# ==========================================
print("Sedang membuat Confusion Matrix...")

# 4. Rows = true label, columns = predicted label
cm = confusion_matrix(y_true, y_pred, labels=class_ids)

# 5. Heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=label_names, yticklabels=label_names,
            cbar_kws={'label': 'Jumlah Sampel'}, annot_kws={"size": 12})

plt.title('Confusion Matrix: Detail Kesalahan Prediksi EMOT', fontsize=14, fontweight='bold', pad=20)
plt.xlabel('Prediksi Model', fontsize=12)
plt.ylabel('Label Sebenarnya (Kunci Jawaban)', fontsize=12)
plt.xticks(rotation=45)
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()

# ==========================================
# PART 3: CLASSIFICATION REPORT
# ==========================================
print("\n" + "="*50)
print("DETAIL PERFORMA PER KATEGORI EMOSI")
print("="*50)
# Precision, recall and F1 for every emotion
report = classification_report(
    y_true,
    y_pred,
    labels=class_ids,
    target_names=label_names,
    digits=4,
    zero_division=0,  # report 0 instead of warning for a class with no samples
)
print(report)

In [None]:
import os

# Destination folder for the EmoT adapter
save_path = "saved_models/emot_adalora_best"
os.makedirs(save_path, exist_ok=True)

# 1. Save the model currently held by the trainer
trainer.save_model(save_path)

# 2. Save the tokenizer alongside it so the folder is self-contained
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
tokenizer.save_pretrained(save_path)

print(f"‚úÖ Model EMOT Terbaik sukses disimpan di: {save_path}")

# Report which checkpoint the trainer selected, when it recorded one
best_checkpoint = trainer.state.best_model_checkpoint
if best_checkpoint:
    print(f"‚ÑπÔ∏è Model ini adalah juara dari: {best_checkpoint}")

## NERP — Named Entity Recognition

In [None]:
# Inspect the label metadata of the NERP dataset.
# The dataset is loaded explicitly here: reusing whatever `dataset` variable is
# left in the kernel would inspect a different task's data.
try:
    from datasets import load_from_disk  # local import keeps this cell self-contained

    nerp_raw = load_from_disk("processed_data/nerp_processed")
    if hasattr(nerp_raw, "keys"):
        # Split container: prefer "train", otherwise take the first split
        split_name = "train" if "train" in nerp_raw else next(iter(nerp_raw.keys()))
        nerp_split = nerp_raw[split_name]
    else:
        nerp_split = nerp_raw

    # Tag names are read from the 'ner_tags' feature, or 'labels' as a fallback
    tag_column = "ner_tags" if "ner_tags" in nerp_split.features else "labels"
    labels_names = nerp_split.features[tag_column].feature.names

    print("‚úÖ BERHASIL DITEMUKAN!")
    print(f"Total Label: {len(labels_names)}")
    print(f"Daftar Label: {labels_names}")

except Exception as e:
    print("‚ö†Ô∏è Gagal membaca metadata otomatis.")
    print(f"Error: {e!r}")  # show the real cause instead of hiding it
    print("Label tersimpan sebagai angka mentah. Kita harus pakai nama sementara.")
    # Fallback: placeholder names so training can still run
    labels_names = ["O", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "B-MISC", "I-MISC", "LABEL_9", "LABEL_10"]
    print(f"Saran Label List Sementara: {labels_names}")

In [None]:
import torch
from transformers import (
    AutoTokenizer, 
    AutoModelForTokenClassification, 
    DataCollatorForTokenClassification,
    TrainingArguments,
    Trainer
)
from peft import get_peft_model, AdaLoraConfig, TaskType
from datasets import load_from_disk
import evaluate
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from sklearn.metrics import confusion_matrix, classification_report

from datasets import load_from_disk, concatenate_datasets

# ==========================================
# 1. DATA PREPARATION (FORCED 80:20 SPLIT)
# ==========================================
print("üöÄ Memulai Pipeline Training NERP...")

# Fixed seed so the train/test partition is the same on every run
SPLIT_SEED = 42

# Load the processed data. The original cause is chained onto the ValueError so
# a failure other than a missing folder is still visible in the traceback.
try:
    dataset_raw = load_from_disk("processed_data/nerp_processed")
    print("‚úÖ Dataset raw berhasil diload.")
except Exception as err:
    raise ValueError("‚ùå Folder 'processed_data/nerp_processed' tidak ditemukan!") from err

# --- FORCE A 0.2 RE-SPLIT ---
print("üîÑ Sedang membagi ulang dataset menjadi 80% Train : 20% Test...")

if hasattr(dataset_raw, "keys"):
    # Dict-like container of splits
    keys = list(dataset_raw.keys())
    if "train" in keys and "test" in keys:
        # Already split (e.g. 90/10): merge train + test before re-splitting
        full_data = concatenate_datasets([dataset_raw["train"], dataset_raw["test"]])
    elif "train" in keys:
        # Only a train split exists
        full_data = dataset_raw["train"]
    else:
        # No "train" key: pool every available split. A split container has no
        # train_test_split of its own, so it cannot be split directly.
        full_data = concatenate_datasets([dataset_raw[k] for k in keys])
else:
    # Single, unsplit dataset
    full_data = dataset_raw

dataset = full_data.train_test_split(test_size=0.2, seed=SPLIT_SEED)

# Column validation: a NER dataset must carry a tag column
sample_cols = dataset["train"].column_names
if "labels" not in sample_cols and "ner_tags" not in sample_cols:
    raise ValueError(f"‚ùå INI BUKAN DATA NER! Kolom: {sample_cols}")

print(f"‚úÖ DATA READY! Train: {len(dataset['train'])}, Test: {len(dataset['test'])}")
print(f"   (Rasio Test: {len(dataset['test']) / (len(dataset['train']) + len(dataset['test'])):.2%})")

# ==========================================
# 2. MODEL & LABEL CONFIG
# ==========================================
# Prefer the tag names stored in the dataset's feature metadata; fall back to
# the 11 placeholder names when the tag column carries no names.
_train_features = dataset["train"].features
_tag_column = "ner_tags" if "ner_tags" in _train_features else "labels"
_tag_feature = _train_features.get(_tag_column)
_metadata_names = getattr(getattr(_tag_feature, "feature", None), "names", None)

if _metadata_names:
    label_list = list(_metadata_names)
else:
    label_list = [
        "O", "B-PER", "I-PER", "B-ORG", "I-ORG",
        "B-LOC", "I-LOC", "B-MISC", "I-MISC",
        "LABEL_9", "LABEL_10"
    ]
id2label = dict(enumerate(label_list))
label2id = {name: idx for idx, name in enumerate(label_list)}

checkpoint = "prajjwal1/bert-tiny"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# Base model
model = AutoModelForTokenClassification.from_pretrained(
    checkpoint, num_labels=len(label_list), id2label=id2label, label2id=label2id
)

# AdaLoRA schedule derived from the real run length rather than a fixed 10000
# steps. These two values must stay in sync with the TrainingArguments in this
# cell (per-device batch size 32, 10 epochs). Steps per epoch is a ceiling
# because the Trainer's dataloader keeps the last partial batch.
BATCH_SIZE = 32
NUM_EPOCHS = 10
steps_per_epoch = -(-len(dataset["train"]) // BATCH_SIZE)
total_steps = steps_per_epoch * NUM_EPOCHS

# AdaLoRA config; TOKEN_CLS is required for NER
peft_config = AdaLoraConfig(
    task_type=TaskType.TOKEN_CLS,
    inference_mode=False,
    r=16, lora_alpha=32, lora_dropout=0.1, bias="none",
    target_modules=["query", "value"],
    init_r=12, target_r=8, beta1=0.85, beta2=0.85,
    tinit=int(total_steps * 0.1),
    tfinal=int(total_steps * 0.2),
    deltaT=10,
    total_step=total_steps,
)

model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

# ==========================================
# 3. METRICS & TRAINING
# ==========================================
seqeval = evaluate.load("seqeval")


def compute_metrics(p):
    """Score token-level predictions with seqeval.

    `p` unpacks into (scores, labels). Predicted tag ids are the arg-max over
    axis 2. Positions whose label is -100 are dropped from both the predicted
    and the reference sequences, and the remaining ids are mapped to tag names
    through `label_list`.

    Returns a dict with seqeval's overall precision, recall, f1 and accuracy.
    """
    scores, gold_ids = p
    pred_ids = np.argmax(scores, axis=2)

    true_predictions, true_labels = [], []
    for pred_row, gold_row in zip(pred_ids, gold_ids):
        # Keep only positions with a real label (-100 marks ignored positions)
        kept = [(pr, gd) for pr, gd in zip(pred_row, gold_row) if gd != -100]
        true_predictions.append([label_list[pr] for pr, _ in kept])
        true_labels.append([label_list[gd] for _, gd in kept])

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        name: results[f"overall_{name}"]
        for name in ("precision", "recall", "f1", "accuracy")
    }

# Custom Trainer (bug workaround)
class CustomTrainer(Trainer):
    """Trainer whose loss computation never forwards `num_items_in_batch`.

    NOTE(review): presumably a workaround for a transformers/PEFT version
    mismatch around the `num_items_in_batch` argument -- confirm before
    removing.
    """

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the loss via `Trainer.compute_loss`.

        Accepts `num_items_in_batch` for signature compatibility but does not
        pass it on, and strips a same-named key from `inputs` if present.
        """
        inputs.pop("num_items_in_batch", None)
        return super().compute_loss(model, inputs, return_outputs)

training_args = TrainingArguments(
    output_dir="model_output/nerp_adalora_final",
    report_to="none",
    logging_steps=50,
    # --- optimisation ---
    num_train_epochs=10,
    learning_rate=1e-3,
    weight_decay=0.01,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    # --- evaluate and checkpoint once per epoch; reload the best at the end ---
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

# The "test" split doubles as the evaluation set; compute_metrics supplies the
# seqeval scores logged at every evaluation.
trainer = CustomTrainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    eval_dataset=dataset["test"],
    train_dataset=dataset["train"],
)

print("\n‚è≥ Sedang Melatih Model NER...")
trainer.train()


In [None]:

# SAVE THE MODEL
# The adapter (via the trainer) and the tokenizer go into the same folder.
save_path = "saved_models/nerp_adalora_best"
for persist in (trainer.save_model, tokenizer.save_pretrained):
    persist(save_path)
print(f"\nüíæ Model Berhasil Disimpan di: {save_path}")

In [None]:
# A. Loss & accuracy curves
# Build the log table from the current trainer. The previous code read a
# leftover `history` variable, which is a plain list from an earlier cell (no
# .dropna / .columns) and belongs to a different training run.
history = pd.DataFrame(trainer.state.log_history)

plt.figure(figsize=(12, 5))

# Loss panel (left)
plt.subplot(1, 2, 1)
# Training loss (dashed line); the column is absent if nothing was logged yet
if 'loss' in history.columns:
    loss_data = history.dropna(subset=['loss'])
    plt.plot(loss_data['epoch'], loss_data['loss'], label='Train Loss', alpha=0.6, linestyle='--')

# Validation loss (red markers)
if 'eval_loss' in history.columns:
    val_loss_data = history.dropna(subset=['eval_loss'])
    plt.plot(val_loss_data['epoch'], val_loss_data['eval_loss'], label='Val Loss', marker='o', color='red')

plt.title("Kurva Loss (Semakin Rendah Semakin Baik)")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()
plt.grid(True)

# Accuracy panel (right)
plt.subplot(1, 2, 2)
if 'eval_accuracy' in history.columns:
    val_acc_data = history.dropna(subset=['eval_accuracy'])
    plt.plot(val_acc_data['epoch'], val_acc_data['eval_accuracy'], label='Val Accuracy', marker='s', color='orange')
    plt.title("Kurva Accuracy (Semakin Tinggi Semakin Baik)")
    plt.xlabel("Epoch")
    plt.ylabel("Accuracy")
    plt.legend()
    plt.grid(True)
else:
    # The metric key can differ between library versions
    print("‚ö†Ô∏è Kolom 'eval_accuracy' tidak ditemukan di history. Cek nama kolom history.columns")

plt.tight_layout()
plt.show()

In [None]:


# B. Confusion matrix
print("\nüìä Membuat Confusion Matrix...")
predictions, labels, _ = trainer.predict(dataset["test"])
predictions = np.argmax(predictions, axis=2)

# Flatten to token level, keeping only positions with a real label
# (-100 marks ignored positions). Boolean masking walks the arrays in
# row-major order, i.e. sentence by sentence, token by token.
keep = labels != -100
true_labels = [label_list[tag_id] for tag_id in labels[keep]]
true_preds = [label_list[tag_id] for tag_id in predictions[keep]]

# Tags that occur in either list, minus the "LABEL_*" placeholders
unique_labels = sorted(set(true_labels) | set(true_preds))
display_labels = [tag for tag in unique_labels if "LABEL" not in tag]

cm = confusion_matrix(true_labels, true_preds, labels=display_labels)
plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=display_labels, yticklabels=display_labels)
plt.title('Confusion Matrix NER')
plt.ylabel('Asli')
plt.xlabel('Prediksi')
plt.show()

print("\nüèÜ DETAIL RAPOR:")
print(classification_report(true_labels, true_preds, labels=display_labels, digits=4))

## QA — Question Answering (SQuAD)

In [None]:
import torch
from transformers import (
    AutoModelForQuestionAnswering, 
    AutoTokenizer, 
    TrainingArguments, 
    Trainer,
    DefaultDataCollator
)
from peft import get_peft_model, AdaLoraConfig, TaskType
from datasets import load_from_disk
import numpy as np

# --- 1. DATA & MODEL ---
checkpoint = "prajjwal1/bert-tiny"
dataset_raw = load_from_disk("processed_data/squad_processed")

# dataset_raw has no 'train' key, so the split is made here:
# 90% for training, 10% for validation/testing.
# A fixed seed keeps the partition identical on every run.
SPLIT_SEED = 42
print("Sedang membagi dataset menjadi Train & Validation...")
dataset = dataset_raw.train_test_split(test_size=0.1, seed=SPLIT_SEED)

print(f"‚úÖ Data Siap! Train: {len(dataset['train'])}, Test: {len(dataset['test'])}")

# QA model (span-prediction head on top of the base checkpoint)
model = AutoModelForQuestionAnswering.from_pretrained(checkpoint)

# --- 2. ADALORA CONFIG ---
BATCH_SIZE = 32
NUM_EPOCHS = 20
train_samples = len(dataset["train"])

# The Trainer's dataloader keeps the last partial batch (dataloader_drop_last
# defaults to False), so steps per epoch is the CEILING of samples / batch
# size; floor division under-counts the run.
# Assumes a single device and no gradient accumulation, as configured below.
steps_per_epoch = -(-train_samples // BATCH_SIZE)
total_steps = steps_per_epoch * NUM_EPOCHS

peft_config = AdaLoraConfig(
    task_type=TaskType.QUESTION_ANS,  # question-answering task
    inference_mode=False,
    r=8,
    lora_alpha=32,
    target_modules=["query", "value"],
    lora_dropout=0.01,
    bias="none",
    init_r=12,
    target_r=8,
    beta1=0.85,
    beta2=0.85,
    tinit=int(total_steps * 0.1),
    tfinal=int(total_steps * 0.2),
    deltaT=10,
    total_step=total_steps,
)

model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

# --- 3. DATA COLLATOR ---
data_collator = DefaultDataCollator()


# --- 4. CUSTOM TRAINER (bug workaround) ---
class CustomTrainer(Trainer):
    """Trainer whose loss computation never forwards `num_items_in_batch`.

    NOTE(review): presumably a workaround for a transformers/PEFT version
    mismatch around the `num_items_in_batch` argument -- confirm before
    removing.
    """

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the loss via `Trainer.compute_loss`.

        Accepts `num_items_in_batch` for signature compatibility but does not
        pass it on, and strips a same-named key from `inputs` if present.
        """
        inputs.pop("num_items_in_batch", None)
        return super().compute_loss(model, inputs, return_outputs)

# --- 5. TRAINING ARGUMENTS ---
training_args = TrainingArguments(
    output_dir="model_output/squad_adalora",
    report_to="none",
    logging_steps=10,
    # --- optimisation ---
    num_train_epochs=NUM_EPOCHS,
    learning_rate=1e-3,
    weight_decay=0.01,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    # --- evaluate and checkpoint once per epoch; reload the best at the end ---
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

# Both splits come from the train_test_split performed earlier in this cell.
trainer = CustomTrainer(
    args=training_args,
    model=model,
    tokenizer=AutoTokenizer.from_pretrained(checkpoint),
    data_collator=data_collator,
    eval_dataset=dataset["test"],
    train_dataset=dataset["train"],
)

print("Mulai Training QA SQuAD...")
trainer.train()

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# ==========================================
# 1. TRAINING CURVES (QA)
# ==========================================
# The status message previously said "NERP" although this cell plots the QA run.
print("üìä Sedang membuat Grafik Kurva Belajar QA...")

# 1. Trainer log history -> DataFrame
history = trainer.state.log_history
df = pd.DataFrame(history)

# 2. Training rows carry 'loss'; evaluation rows carry 'eval_loss'
train_history = df[df['loss'].notna()]
val_history = df[df['eval_loss'].notna()]

# Debug aid: show which columns the evaluation log actually contains
print("Kolom yang tersedia di log validasi:", val_history.columns.tolist())

plt.figure(figsize=(16, 6))
sns.set_theme(style="whitegrid")

# --- LOSS ---
# Only one panel is drawn, so it takes the full figure; the former
# subplot(1, 2, 1) left the right half of the figure blank.
plt.subplot(1, 1, 1)
plt.plot(train_history['epoch'], train_history['loss'],
         label='Training Loss', color='blue', alpha=0.4, linestyle='--')
plt.plot(val_history['epoch'], val_history['eval_loss'],
         label='Validation Loss', color='red', linewidth=2, marker='o')

plt.title('Kurva Loss (Makin Rendah Makin Bagus)', fontsize=12, fontweight='bold')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()

plt.tight_layout()
plt.show()

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, DefaultDataCollator
from datasets import load_from_disk
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
import pandas as pd
import numpy as np

# ==========================================
# 1. QA DATA SETUP
# ==========================================
print("üöÄ Mempersiapkan Data QA...")

# Load the processed QA data (a single, unsplit dataset)
dataset_raw = load_from_disk("processed_data/squad_processed")

# Split into train/test so dataset['train'] and dataset['test'] are available.
# A fixed seed makes the partition reproducible; an unseeded split draws a
# different "test" set on every run.
SPLIT_SEED = 42
dataset = dataset_raw.train_test_split(test_size=0.1, seed=SPLIT_SEED)
print(f"‚úÖ Data Siap! Train: {len(dataset['train'])}, Test: {len(dataset['test'])}")

# ==========================================
# 2. MODEL & TOKENIZER
# ==========================================
checkpoint = "prajjwal1/bert-tiny"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# NOTE: this loads the plain base checkpoint, NOT the fine-tuned adapter, so
# the answers are expected to be poor; only the output format is meaningful.
# It also rebinds the `model` variable.
model = AutoModelForQuestionAnswering.from_pretrained(checkpoint)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# ==========================================
# 3. ANSWER PREVIEW
# ==========================================
print("\nüîç Mulai Visualisasi QA...")

data_collator = DefaultDataCollator()
# Preview up to 10 test samples; the cap avoids an out-of-range selection when
# the test split holds fewer than 10 rows.
n_preview = min(10, len(dataset["test"]))
eval_loader = DataLoader(
    dataset["test"].select(range(n_preview)),
    batch_size=1,
    collate_fn=data_collator,
)

results = []

with torch.no_grad():
    for batch in tqdm(eval_loader):
        batch = {k: v.to(device) for k, v in batch.items()}

        # Forward pass of the QA model
        outputs = model(**batch)
        start_logits = outputs.start_logits
        end_logits = outputs.end_logits

        # 1. MODEL PREDICTION
        # batch_size is 1, so the flat arg-max is the token index. Converted to
        # plain ints before slicing the Python list of input ids.
        answer_start = int(torch.argmax(start_logits))
        answer_end = int(torch.argmax(end_logits)) + 1  # slice end is exclusive

        input_ids = batch["input_ids"][0].cpu().tolist()
        # An end before the start yields an empty slice, i.e. an empty answer
        pred_tokens = input_ids[answer_start:answer_end]
        pred_text = tokenizer.decode(pred_tokens, skip_special_tokens=True)

        # 2. REFERENCE ANSWER (from the dataset's start/end positions)
        if "start_positions" in batch:
            true_start = batch["start_positions"][0].item()
            true_end = batch["end_positions"][0].item() + 1
            true_tokens = input_ids[true_start:true_end]
            true_text = tokenizer.decode(true_tokens, skip_special_tokens=True)
        else:
            true_text = "N/A"

        results.append({
            "Jawaban Model": pred_text,
            "Kunci Jawaban": true_text,
            # A match only counts when the prediction is non-empty
            "Cek": "‚úÖ" if pred_text == true_text and pred_text != "" else "‚ùå"
        })

# Show the comparison table
df = pd.DataFrame(results)
print("\n=== HASIL VISUALISASI QA (SQuAD) ===")
pd.set_option('display.max_colwidth', None)
print(df)

In [None]:
import os
# Destination folder for the QA adapter
save_path = "saved_models/squad_adalora_best"
os.makedirs(save_path, exist_ok=True)

# Save the trainer's model, then a freshly loaded tokenizer next to it
trainer.save_model(save_path)

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
tokenizer.save_pretrained(save_path)

print(f"‚úÖ Model QA tersimpan di: {save_path}")

## NLG — Summarization (WikiLingua, mT5)

In [None]:
import torch
from transformers import (
    AutoModelForSeq2SeqLM, 
    AutoTokenizer, 
    Seq2SeqTrainingArguments, 
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq
)
from peft import get_peft_model, AdaLoraConfig, TaskType
from datasets import load_from_disk
import evaluate
import numpy as np

# --- 1. DATA & MODEL ---
model_checkpoint = "google/mt5-small"
dataset_raw = load_from_disk("processed_data/wikilingua_mt5_processed")

# --- INSPECT STRUCTURE & SPLIT IF NEEDED ---
print(f"Struktur Awal: {dataset_raw}")

# Fixed seed so an automatic split is identical on every run
SPLIT_SEED = 42

# Detect a split container by its dict interface. Using `"train" in dataset_raw`
# on a single unsplit dataset would iterate over every row instead.
has_splits = hasattr(dataset_raw, "keys")
if has_splits and "train" in dataset_raw and "test" in dataset_raw:
    dataset = dataset_raw
else:
    print("‚ö†Ô∏è Dataset belum di-split. Melakukan split otomatis 90/10...")
    if not has_splits:
        unsplit = dataset_raw
    elif "train" in dataset_raw:
        # A container with "train" but no "test": split the train part
        unsplit = dataset_raw["train"]
    else:
        raise ValueError(
            f"Dataset tidak punya split 'train'. Split tersedia: {list(dataset_raw.keys())}"
        )
    dataset = unsplit.train_test_split(test_size=0.1, seed=SPLIT_SEED)

print(f"‚úÖ Data Siap! Train: {len(dataset['train'])}, Test: {len(dataset['test'])}")

# Tokenizer & model
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

# --- 2. ADALORA CONFIG (T5 family) ---
BATCH_SIZE = 8
NUM_EPOCHS = 10
train_samples = len(dataset["train"])
# The Trainer's dataloader keeps the last partial batch (dataloader_drop_last
# defaults to False), so steps per epoch is the CEILING of samples / batch
# size. Assumes a single device and no gradient accumulation.
steps_per_epoch = -(-train_samples // BATCH_SIZE)
total_steps = steps_per_epoch * NUM_EPOCHS

peft_config = AdaLoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,  # sequence-to-sequence task
    inference_mode=False,
    r=8,
    lora_alpha=32,
    target_modules=["q", "v"],  # attention projections targeted for T5
    lora_dropout=0.01,
    bias="none",
    init_r=12,
    target_r=8,
    beta1=0.85,
    beta2=0.85,
    tinit=int(total_steps * 0.1),
    tfinal=int(total_steps * 0.2),
    deltaT=10,
    total_step=total_steps,
)

model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

# --- 3. DATA COLLATOR ---
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# --- 4. METRIC (ROUGE) ---
rouge = evaluate.load("rouge")


def compute_metrics(eval_pred):
    """Decode generated ids and reference ids, then score them with ROUGE.

    `eval_pred` unpacks into (predictions, labels), both arrays of token ids.
    The value -100 cannot be decoded by the tokenizer, so it is replaced with
    the pad token id in BOTH arrays before decoding.

    Returns the ROUGE scores plus `gen_len` (mean number of non-pad predicted
    tokens), all rounded to 4 decimals.
    """
    predictions, labels = eval_pred

    # Some model outputs arrive as a tuple; the token ids are its first element
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # Replace -100 with the pad token so batch_decode does not fail
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    # Average generated length, counting only non-pad tokens
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

# --- 5. TRAINING ARGUMENTS ---
training_args = Seq2SeqTrainingArguments(
    output_dir="model_output/wikilingua_adalora",
    report_to="none",
    logging_steps=10,
    # --- optimisation ---
    num_train_epochs=NUM_EPOCHS,
    learning_rate=1e-3,
    weight_decay=0.01,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    fp16=False,
    # --- evaluate and checkpoint once per epoch; reload the best at the end ---
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Evaluation scores generated text, which the ROUGE metric needs
    predict_with_generate=True,
)

# --- 6. TRAINER ---
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    eval_dataset=dataset["test"],
    train_dataset=dataset["train"],
)

print("Mulai Training Summarization...")
trainer.train()

In [None]:
# 1. Save the adapter (via the trainer) and the tokenizer into the same folder
save_path = "saved_models/wikilingua_adalora_best"
for persist in (trainer.save_model, tokenizer.save_pretrained):
    persist(save_path)
print("‚úÖ Model Summarization Tersimpan!")
