In [None]:
# ============================================================
# AMBIL CONTOH SALAH PREDIKSI DARI MODEL YANG SUDAH DISIMPAN
# ============================================================

import json
import csv
import os
from typing import List, Dict, Any

import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# ============================================================
# KONFIGURASI PATH
# ============================================================

# Folder model kamu (isi: model.safetensors, config.json, vocab.txt, dst)
MODEL_DIR = "./best_fold_model_minilm"

DATA_PATH = "./dataset_chatbot.json"

# File label_names
LABEL_FILE = os.path.join(MODEL_DIR, "label_names.json")

# Berapa contoh salah prediksi yang mau ditampilkan
MAX_EXAMPLES = 3

# Panjang max token
MAX_LENGTH = 64

# ============================================================
# LOAD DEVICE
# ============================================================

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ============================================================
# LOAD TOKENIZER & MODEL
# ============================================================

tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_DIR).to(device)
model.eval()

# ============================================================
# LOAD LABEL MAPPING
# ============================================================

# Fungsi bantu: baca label_names.json
def load_label_names(path: str) -> List[str]:
    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)

    # label_names.json kamu bisa bentuk:
    # 1) list: ["intent_a", "intent_b", ...]
    # 2) dict: {"0":"intent_a","1":"intent_b"} atau {"intent_a":0,...}
    if isinstance(data, list):
        return data

    if isinstance(data, dict):
        # kalau key-nya angka (string) -> urutkan berdasarkan id
        if all(str(k).isdigit() for k in data.keys()):
            return [data[str(i)] for i in sorted(map(int, data.keys()))]

        # kalau key-nya nama label -> balik mapping
        # urutkan berdasarkan id
        if all(isinstance(v, int) for v in data.values()):
            inv = {v: k for k, v in data.items()}
            return [inv[i] for i in sorted(inv.keys())]

    raise ValueError("Format label_names.json tidak dikenali.")

# Ambil label list dari file, kalau tidak ada -> pakai label2id dari config model
if os.path.exists(LABEL_FILE):
    label_list = load_label_names(LABEL_FILE)
else:
    # fallback: ambil dari config
    # (biasanya config.id2label ada)
    id2label = model.config.id2label
    if isinstance(id2label, dict) and len(id2label) > 0:
        # id2label sering key-nya string angka
        label_list = [id2label[str(i)] if str(i) in id2label else id2label[i] for i in range(len(id2label))]
    else:
        raise FileNotFoundError("Tidak menemukan label_names.json dan config model tidak punya id2label.")

label2id = {label: idx for idx, label in enumerate(label_list)}

# ============================================================
# LOAD DATASET JSON
# ============================================================

with open(DATA_PATH, "r", encoding="utf-8") as f:
    dataset = json.load(f)

# dataset harus list of dict
if not isinstance(dataset, list):
    raise ValueError("Dataset JSON harus berupa list of objects.")

# Fungsi bantu untuk ambil field pertanyaan dan intent
def get_field(item: Dict[str, Any], keys: List[str]) -> Any:
    for k in keys:
        if k in item and item[k] is not None:
            return item[k]
    return None

# ============================================================
# INFERENCE + AMBIL YANG SALAH PREDIKSI
# ============================================================

wrong_cases = []

for item in dataset:
    # Ambil pertanyaan
    question = get_field(item, ["question", "pertanyaan", "Pertanyaan"])
    # Ambil label asli
    true_intent = get_field(item, ["intent", "Intent", "label"])

    # Skip kalau data tidak lengkap
    if not question or not true_intent:
        continue

    # Kalau intent di dataset tidak ada di label model, skip (biar tidak error)
    if true_intent not in label2id:
        continue

    # Tokenisasi
    enc = tokenizer(
        str(question),
        truncation=True,
        padding="max_length",
        max_length=MAX_LENGTH,
        return_tensors="pt"
    )

    # Pindah ke device
    enc = {k: v.to(device) for k, v in enc.items()}

    # Prediksi
    with torch.no_grad():
        outputs = model(**enc)
        logits = outputs.logits.squeeze(0)              # shape: [num_labels]
        probs = F.softmax(logits, dim=-1)               # probability tiap label
        pred_id = int(torch.argmax(probs).item())
        pred_intent = label_list[pred_id]
        conf = float(probs[pred_id].item())

    # Bandingkan
    if pred_intent != true_intent:
        wrong_cases.append({
            "question": question,
            "true_intent": true_intent,
            "pred_intent": pred_intent,
            "confidence": round(conf, 4)
        })

# Urutkan: yang paling yakin tapi salah (menarik untuk analisis)
wrong_cases.sort(key=lambda x: x["confidence"], reverse=True)

unique_examples = []
seen_pred = set()

for ex in wrong_cases:
    # pastikan pred_intent belum pernah muncul
    if ex["pred_intent"] in seen_pred:
        continue

    unique_examples.append(ex)
    seen_pred.add(ex["pred_intent"])

    if len(unique_examples) >= MAX_EXAMPLES:
        break

examples = unique_examples

# ============================================================
# TAMPILKAN DI TERMINAL
# ============================================================

print(f"Total data terbaca       : {len(dataset)}")
print(f"Total salah prediksi     : {len(wrong_cases)}")
print(f"Contoh yang ditampilkan  : {len(examples)}\n")

for i, ex in enumerate(examples, 1):
    print(f"[{i}] Q: {ex['question']}")
    print(f"    True: {ex['true_intent']}")
    print(f"    Pred: {ex['pred_intent']} (conf={ex['confidence']})")
    print("-" * 60)


  from .autonotebook import tqdm as notebook_tqdm
The tokenizer you are loading from './best_fold_model_minilm' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.


Total data terbaca       : 2040
Total salah prediksi     : 584
Contoh yang ditampilkan  : 3

[1] Q: Gimana cara saya lihat transkrip nilai?
    True: prosedur_akses_cetak_transkrip_nilai
    Pred: prosedur_pengajuan_legalisir_transkrip_nilai (conf=0.0565)
------------------------------------------------------------
[2] Q: Bagaimana proses lengkap pengajuan hingga pelaksanaan program magang bagi mahasiswa TI Unpad?
    True: alur_pengajuan_dan_pelaksanaan_program_magang_hingga_selesai
    Pred: prosedur_pengajuan_program_magang (conf=0.0402)
------------------------------------------------------------
[3] Q: Bagaimana cara mengetahui apakah pembayaran UKT sudah berhasil atau belum?
    True: prosedur_cek_status_pembayaran_ukt
    Pred: konsekuensi_tidak_bayar_ukt_tepat_waktu (conf=0.0387)
------------------------------------------------------------


In [None]:
# ============================================================
# AMBIL CONTOH SALAH PREDIKSI DARI MODEL YANG SUDAH DISIMPAN
# (tanpa retraining)
# ============================================================

import json
import csv
import os
from typing import List, Dict, Any

import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# ============================================================
# KONFIGURASI PATH
# ============================================================

# Folder model kamu (isi: model.safetensors, config.json, vocab.txt, dst)
MODEL_DIR = "./best_fold_model_indobert"

DATA_PATH = "./dataset_chatbot.json" 

# File label_names
LABEL_FILE = os.path.join(MODEL_DIR, "label_names.json")


# Berapa contoh salah prediksi yang mau ditampilkan
MAX_EXAMPLES = 3

# Panjang max token
MAX_LENGTH = 64

# ============================================================
# LOAD DEVICE
# ============================================================

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ============================================================
# LOAD TOKENIZER & MODEL
# ============================================================

tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_DIR).to(device)
model.eval()

# ============================================================
# LOAD LABEL MAPPING
# ============================================================

# Fungsi bantu: baca label_names.json
def load_label_names(path: str) -> List[str]:
    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)

    # label_names.json kamu bisa bentuk:
    # 1) list: ["intent_a", "intent_b", ...]
    # 2) dict: {"0":"intent_a","1":"intent_b"} atau {"intent_a":0,...}
    if isinstance(data, list):
        return data

    if isinstance(data, dict):
        # kalau key-nya angka (string) -> urutkan berdasarkan id
        if all(str(k).isdigit() for k in data.keys()):
            return [data[str(i)] for i in sorted(map(int, data.keys()))]

        # kalau key-nya nama label -> balik mapping
        # urutkan berdasarkan id
        if all(isinstance(v, int) for v in data.values()):
            inv = {v: k for k, v in data.items()}
            return [inv[i] for i in sorted(inv.keys())]

    raise ValueError("Format label_names.json tidak dikenali.")

# Ambil label list dari file, kalau tidak ada -> pakai label2id dari config model
if os.path.exists(LABEL_FILE):
    label_list = load_label_names(LABEL_FILE)
else:
    # fallback: ambil dari config
    # (biasanya config.id2label ada)
    id2label = model.config.id2label
    if isinstance(id2label, dict) and len(id2label) > 0:
        # id2label sering key-nya string angka
        label_list = [id2label[str(i)] if str(i) in id2label else id2label[i] for i in range(len(id2label))]
    else:
        raise FileNotFoundError("Tidak menemukan label_names.json dan config model tidak punya id2label.")

label2id = {label: idx for idx, label in enumerate(label_list)}

# ============================================================
# LOAD DATASET JSON
# ============================================================

with open(DATA_PATH, "r", encoding="utf-8") as f:
    dataset = json.load(f)

# dataset harus list of dict
if not isinstance(dataset, list):
    raise ValueError("Dataset JSON harus berupa list of objects.")

# Fungsi bantu untuk ambil field pertanyaan dan intent
def get_field(item: Dict[str, Any], keys: List[str]) -> Any:
    for k in keys:
        if k in item and item[k] is not None:
            return item[k]
    return None

# ============================================================
# INFERENCE + AMBIL YANG SALAH PREDIKSI
# ============================================================

wrong_cases = []

for item in dataset:
    # Ambil pertanyaan
    question = get_field(item, ["question", "pertanyaan", "Pertanyaan"])
    # Ambil label asli
    true_intent = get_field(item, ["intent", "Intent", "label"])

    # Skip kalau data tidak lengkap
    if not question or not true_intent:
        continue

    # Kalau intent di dataset tidak ada di label model, skip (biar tidak error)
    if true_intent not in label2id:
        continue

    # Tokenisasi
    enc = tokenizer(
        str(question),
        truncation=True,
        padding="max_length",
        max_length=MAX_LENGTH,
        return_tensors="pt"
    )

    # Pindah ke device
    enc = {k: v.to(device) for k, v in enc.items()}

    # Prediksi
    with torch.no_grad():
        outputs = model(**enc)
        logits = outputs.logits.squeeze(0)              # shape: [num_labels]
        probs = F.softmax(logits, dim=-1)               # probability tiap label
        pred_id = int(torch.argmax(probs).item())
        pred_intent = label_list[pred_id]
        conf = float(probs[pred_id].item())

    # Bandingkan
    if pred_intent != true_intent:
        wrong_cases.append({
            "question": question,
            "true_intent": true_intent,
            "pred_intent": pred_intent,
            "confidence": round(conf, 4)
        })

# Urutkan: yang paling yakin tapi salah (menarik untuk analisis)
wrong_cases.sort(key=lambda x: x["confidence"], reverse=True)

# Ambil contoh secukupnya
examples = wrong_cases[:MAX_EXAMPLES]

# ============================================================
# TAMPILKAN DI TERMINAL
# ============================================================

print(f"Total data terbaca       : {len(dataset)}")
print(f"Total salah prediksi     : {len(wrong_cases)}")
print(f"Contoh yang ditampilkan  : {len(examples)}\n")

for i, ex in enumerate(examples, 1):
    print(f"[{i}] Q: {ex['question']}")
    print(f"    True: {ex['true_intent']}")
    print(f"    Pred: {ex['pred_intent']} (conf={ex['confidence']})")
    print("-" * 60)


Total data terbaca       : 2040
Total salah prediksi     : 27
Contoh yang ditampilkan  : 3

[1] Q: LiVE Unpad itu bisa dipakai buat ngapain aja?
    True: info_fitur_platform_elearning
    Pred: info_akses_lms_mobile (conf=0.7648)
------------------------------------------------------------
[2] Q: Cara bayar UKT lewat mana aja?
    True: info_metode_pembayaran_ukt
    Pred: prosedur_pembayaran_ukt (conf=0.7426)
------------------------------------------------------------
[3] Q: Repository Tugas Akhir itu apa sih?
    True: info_repository_tugas_akhir
    Pred: info_durasi_akses_repository_tugas_akhir (conf=0.7248)
------------------------------------------------------------


In [5]:
# ============================================================
# CONFIDENCE THRESHOLD ANALYSIS
# Menghitung error rate dan fallback rate berdasarkan confidence
# ============================================================
# Mengimpor library json untuk membaca file JSON
import json
# Mengimpor library os untuk pengelolaan path file
import os
# Mengimpor defaultdict untuk inisialisasi dictionary otomatis
from collections import defaultdict
# Mengimpor type hint untuk kejelasan struktur data
from typing import List, Dict, Any
# Mengimpor library PyTorch
import torch
# Mengimpor fungsi softmax dari PyTorch
import torch.nn.functional as F
# Mengimpor tokenizer dan model dari HuggingFace Transformers
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# ============================================================
# KONFIGURASI
# ============================================================
MODEL_DIR = "./best_fold_model_indobert"
DATA_PATH = "./dataset_chatbot.json"
# Menentukan path file mapping label
LABEL_FILE = os.path.join(MODEL_DIR, "label_names.json")
# Menentukan panjang maksimum token input
MAX_LENGTH = 64
# Menentukan rentang confidence untuk analisis error rate
CONFIDENCE_BINS = [
    (0.60, 0.69),   # Rentang confidence rendah
    (0.70, 0.79),   # Rentang confidence menengah
    (0.80, 1.00),   # Rentang confidence tinggi
]
# Menentukan threshold confidence yang dianalisis fallback-nya
THRESHOLDS = [0.6,0.7, 0.8]
# ============================================================
# DEVICE
# ============================================================
# Menentukan device (GPU jika tersedia, jika tidak CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# ============================================================
# LOAD MODEL & TOKENIZER
# ============================================================
# Memuat tokenizer berdasarkan direktori model
tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
# Memuat model klasifikasi dan memindahkannya ke device
model = AutoModelForSequenceClassification.from_pretrained(MODEL_DIR).to(device)
# Mengatur model ke mode evaluasi
model.eval()
# ============================================================
# LOAD LABEL NAMES
# ============================================================
# Fungsi untuk memuat daftar label dari file JSON
def load_label_names(path: str) -> List[str]:
    # Membuka file label_names.json
    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)

    # Jika format berupa list
    if isinstance(data, list):
        return data

    # Jika format berupa dictionary
    if isinstance(data, dict):
        # Jika key berupa angka (string)
        if all(str(k).isdigit() for k in data.keys()):
            return [data[str(i)] for i in sorted(map(int, data.keys()))]

        # Jika value berupa integer (id label)
        if all(isinstance(v, int) for v in data.values()):
            inv = {v: k for k, v in data.items()}
            return [inv[i] for i in sorted(inv.keys())]

    # Error jika format tidak dikenali
    raise ValueError("Format label_names.json tidak dikenali.")

# Mengecek apakah file label tersedia
if os.path.exists(LABEL_FILE):
    # Memuat label dari file
    label_list = load_label_names(LABEL_FILE)
else:
    # Mengambil label dari konfigurasi model
    id2label = model.config.id2label
    label_list = [id2label[str(i)] for i in range(len(id2label))]

# Membuat mapping dari label ke id
label2id = {label: idx for idx, label in enumerate(label_list)}

# ============================================================
# LOAD DATASET
# ============================================================

# Membuka file dataset JSON
with open(DATA_PATH, "r", encoding="utf-8") as f:
    dataset = json.load(f)

# Fungsi untuk mengambil field dengan beberapa kemungkinan nama
def get_field(item: Dict[str, Any], keys: List[str]) -> Any:
    # Iterasi setiap kemungkinan nama field
    for k in keys:
        # Mengembalikan nilai jika field ditemukan dan tidak None
        if k in item and item[k] is not None:
            return item[k]
    # Mengembalikan None jika tidak ditemukan
    return None

# ============================================================
# INFERENSI & SIMPAN HASIL
# ============================================================

# List untuk menyimpan hasil prediksi
results = []

# Iterasi setiap data pada dataset
for item in dataset:
    # Mengambil teks pertanyaan
    question = get_field(item, ["question", "pertanyaan", "Pertanyaan"])

    # Mengambil label asli
    true_intent = get_field(item, ["intent", "Intent", "label"])

    # Melewati data jika tidak lengkap
    if not question or not true_intent:
        continue

    # Melewati data jika label tidak dikenali model
    if true_intent not in label2id:
        continue

    # Melakukan tokenisasi teks pertanyaan
    enc = tokenizer(
        str(question),
        truncation=True,
        padding="max_length",
        max_length=MAX_LENGTH,
        return_tensors="pt"
    )

    # Memindahkan tensor ke device
    enc = {k: v.to(device) for k, v in enc.items()}

    # Melakukan inferensi tanpa gradien
    with torch.no_grad():
        # Menghasilkan output model
        outputs = model(**enc)

        # Mengambil logits hasil klasifikasi
        logits = outputs.logits.squeeze(0)

        # Menghitung probabilitas dengan softmax
        probs = F.softmax(logits, dim=-1)

        # Mengambil indeks label dengan probabilitas tertinggi
        pred_id = int(torch.argmax(probs).item())

        # Mengambil nama intent hasil prediksi
        pred_intent = label_list[pred_id]

        # Mengambil confidence prediksi
        confidence = float(probs[pred_id].item())

    # Menyimpan hasil confidence dan kebenaran prediksi
    results.append({
        "confidence": confidence,
        "correct": pred_intent == true_intent
    })

# Menghitung total data yang dianalisis
total_data = len(results)

# ============================================================
# ERROR RATE PER RENTANG CONFIDENCE
# ============================================================

# Dictionary untuk menyimpan statistik error per rentang
bin_stats = defaultdict(lambda: {"total": 0, "wrong": 0})

# Iterasi setiap hasil prediksi
for r in results:
    # Mengambil nilai confidence
    conf = r["confidence"]

    # Menentukan rentang confidence
    for low, high in CONFIDENCE_BINS:
        if low <= conf <= high:
            # Menentukan label rentang
            key = f"{low:.2f}-{high:.2f}" if high < 1.0 else ">=0.80"

            # Menambah jumlah data pada rentang tersebut
            bin_stats[key]["total"] += 1

            # Menambah jumlah salah prediksi jika tidak benar
            if not r["correct"]:
                bin_stats[key]["wrong"] += 1

# ============================================================
# FALLBACK RATE PER THRESHOLD
# ============================================================

# Dictionary untuk menyimpan statistik fallback
fallback_stats = {}

# Iterasi setiap threshold
for t in THRESHOLDS:
    # Menghitung jumlah data dengan confidence di bawah threshold
    fallback_count = sum(1 for r in results if r["confidence"] < t)

    # Menghitung fallback rate
    fallback_rate = fallback_count / total_data if total_data > 0 else 0.0

    # Menyimpan hasil fallback
    fallback_stats[t] = {
        "count": fallback_count,
        "rate": fallback_rate
    }

# ============================================================
# TAMPILKAN HASIL
# ============================================================

# Menampilkan judul analisis
print("\n===== CONFIDENCE THRESHOLD ANALYSIS =====")

# Menampilkan total data yang dianalisis
print(f"Total data dianalisis : {total_data}\n")

# Menampilkan header tabel error rate
print("Confidence Range | Total | Wrong | Error Rate")
print("-" * 55)

# Menampilkan hasil error rate per rentang
for key in ["0.60-0.69", "0.70-0.79", ">=0.80"]:
    stat = bin_stats.get(key, {"total": 0, "wrong": 0})
    total = stat["total"]
    wrong = stat["wrong"]
    error_rate = (wrong / total) if total > 0 else 0.0
    print(f"{key:15} | {total:5} | {wrong:5} | {error_rate:.2%}")

# Garis pemisah
print("-" * 55)

# Menampilkan header tabel fallback rate
print("\nFallback Rate per Threshold")
print("Threshold | Fallback Count | Fallback Rate")
print("-" * 55)

# Menampilkan hasil fallback rate
for t in THRESHOLDS:
    fs = fallback_stats[t]
    print(f"{t:<9} | {fs['count']:14} | {fs['rate']:.2%}")

# Garis penutup
print("-" * 55)



===== CONFIDENCE THRESHOLD ANALYSIS =====
Total data dianalisis : 2040

Confidence Range | Total | Wrong | Error Rate
-------------------------------------------------------
0.60-0.69       |    14 |     1 | 7.14%
0.70-0.79       |    32 |     3 | 9.38%
>=0.80          |  1919 |     0 | 0.00%
-------------------------------------------------------

Fallback Rate per Threshold
Threshold | Fallback Count | Fallback Rate
-------------------------------------------------------
0.6       |             66 | 3.24%
0.7       |             81 | 3.97%
0.8       |            121 | 5.93%
-------------------------------------------------------
