In [7]:
import pandas as pd
import json
import re
from pathlib import Path
from collections import Counter

In [2]:
# Load the evaluation queries together with their ground-truth case IDs.
queries_file = "D:/SEMESTER 6/PROJECT CBR/data/eval/queries.json"
with open(queries_file, "r", encoding="utf-8") as f:
    queries = json.loads(f.read())

# Load the processed case table used to map a fact summary back to its case ID.
cases_file = "D:/SEMESTER 6/PROJECT CBR/data/processed/cases.csv"
df = pd.read_csv(cases_file)
# Missing summaries become empty strings (this column is later fed to the
# TF-IDF vectorizer and to the embedding model).
df['ringkasan_fakta'] = df['ringkasan_fakta'].fillna("")

In [3]:
# TF-IDF Vectorizer dan Matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Learn the TF-IDF vocabulary from all case summaries and keep the resulting
# document-term matrix so queries can be compared against it later.
vectorizer = TfidfVectorizer()
summary_corpus = df['ringkasan_fakta']
tfidf_matrix = vectorizer.fit_transform(summary_corpus)

# Generic TF-IDF retrieval function
def retrieve_tfidf(query: str, k: int = 5):
    """Return the case IDs of the ``k`` summaries most similar to ``query``.

    The query is lower-cased, projected with the module-level ``vectorizer``
    and compared against ``tfidf_matrix`` using cosine similarity.

    Args:
        query: Free-text query.
        k: Number of cases to return. Values <= 0 yield an empty list.

    Returns:
        A list of ``case_id`` values taken from ``df``, ordered from the most
        to the least similar document. At most ``k`` items are returned.
    """
    # With k == 0 the slice ``[-k:]`` below degenerates to ``[0:]`` and would
    # return every case instead of none; negative k is equally meaningless.
    if k <= 0:
        return []
    q_vec = vectorizer.transform([query.lower()])
    sim = cosine_similarity(q_vec, tfidf_matrix).flatten()
    # argsort is ascending: take the last k indices and reverse them.
    top_k = sim.argsort()[-k:][::-1]
    return df.iloc[top_k]['case_id'].tolist()

# Evaluation helper: compute Precision@k, Recall@k and F1@k per query
def eval_retrieval(model_name: str, retrieve_func, queries: list, k=5):
    """Score a retrieval function against ground-truth case IDs.

    Args:
        model_name: Label stored in the ``model`` column of every row.
        retrieve_func: Callable invoked as ``retrieve_func(query_text, k=k)``
            that returns an iterable of retrieved case IDs.
        queries: Iterable of dicts with the keys ``query_id``, ``query`` and
            ``ground_truth``.
        k: Cut-off forwarded to ``retrieve_func``.

    Returns:
        A ``pandas.DataFrame`` with one row per query and the columns
        ``model``, ``query_id``, ``precision@k``, ``recall@k`` and ``f1@k``
        (each metric rounded to 3 decimals).

    NOTE(review): IDs are matched by plain set intersection, so ground-truth
    IDs must have the same type as the IDs returned by ``retrieve_func``
    (e.g. ``"1"`` never matches ``1``) - verify this if every metric is zero.
    """
    def _score(entry):
        # One result row for a single query record.
        qid = entry['query_id']
        text = entry['query']
        relevant = set(entry['ground_truth'])
        fetched = set(retrieve_func(text, k=k))

        hits = relevant & fetched
        # Precision is taken over the distinct retrieved IDs, not over k.
        precision = len(hits) / len(fetched) if fetched else 0
        recall = len(hits) / len(relevant) if relevant else 0
        total = precision + recall
        f1 = 2 * (precision * recall) / total if total != 0 else 0

        return {
            "model": model_name,
            "query_id": qid,
            "precision@k": round(precision, 3),
            "recall@k": round(recall, 3),
            "f1@k": round(f1, 3),
        }

    return pd.DataFrame([_score(q) for q in queries])

# Dense retrieval with IndoBERT embeddings.
# NOTE(review): the original heading also mentioned an SVM model, but no SVM
# retriever is defined in this cell.
try:
    from transformers import AutoTokenizer, AutoModel
    import torch
    import numpy as np

    tokenizer = AutoTokenizer.from_pretrained("indobenchmark/indobert-base-p1")
    model = AutoModel.from_pretrained("indobenchmark/indobert-base-p1")
    model.eval()  # evaluation mode: the model is only used for inference here

    def get_embedding(text):
        """Encode ``text`` into a single embedding vector.

        The text is tokenized with truncation at 128 tokens, passed through
        the model without gradient tracking, and the last-layer hidden state
        of the first token is returned as a NumPy array (presumably the
        ``[CLS]`` position for this BERT-style tokenizer - TODO confirm).
        """
        inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
        with torch.no_grad():
            outputs = model(**inputs)
        return outputs.last_hidden_state[:, 0, :].squeeze().numpy()

    # Embed every case summary once so that queries only need one forward pass.
    doc_embeddings = np.array([get_embedding(text) for text in df['ringkasan_fakta']])

    def retrieve_bert(query, k=5):
        """Return the case IDs of the ``k`` summaries closest to ``query``.

        Similarity is the cosine similarity between the query embedding and
        the precomputed ``doc_embeddings``. Results are ordered from most to
        least similar; ``k <= 0`` yields an empty list.
        """
        # With k == 0 the slice ``[-k:]`` below would be ``[0:]`` and return
        # every case instead of none.
        if k <= 0:
            return []
        query_emb = get_embedding(query)
        sim = cosine_similarity([query_emb], doc_embeddings).flatten()
        top_k = sim.argsort()[-k:][::-1]
        return df.iloc[top_k]['case_id'].tolist()

except Exception as e:
    # Best effort: if the libraries or the model are unavailable, report the
    # reason and let the rest of the notebook continue with TF-IDF only.
    print("⚠️ IndoBERT model belum dimuat:", e)
    retrieve_bert = None

# Run the evaluation for TF-IDF (always) and IndoBERT (only if it was loaded)
metrics_tfidf = eval_retrieval("TF-IDF", retrieve_tfidf, queries)

if not retrieve_bert:
    # IndoBERT could not be loaded: report the TF-IDF results only.
    df_all = metrics_tfidf
else:
    metrics_bert = eval_retrieval("IndoBERT", retrieve_bert, queries)
    df_all = pd.concat([metrics_tfidf, metrics_bert], ignore_index=True)

# Save the per-query results to CSV
eval_dir = Path("D:/SEMESTER 6/PROJECT CBR/data/eval")
eval_dir.mkdir(parents=True, exist_ok=True)
metrics_file = "D:/SEMESTER 6/PROJECT CBR/data/eval/retrieval_metrics.csv"
df_all.to_csv(metrics_file, index=False)
print(" retrieval_metrics.csv disimpan di D:/SEMESTER 6/PROJECT CBR/data/eval")

# Print the table of per-model averages
metric_columns = ['precision@k', 'recall@k', 'f1@k']
avg = df_all.groupby('model')[metric_columns].mean().round(3)
print("\n Rata-rata metrik per model:")
print(avg)

 retrieval_metrics.csv disimpan di D:/SEMESTER 6/PROJECT CBR/data/eval

 Rata-rata metrik per model:
          precision@k  recall@k  f1@k
model                                
IndoBERT          0.0       0.0   0.0
TF-IDF            0.0       0.0   0.0


In [8]:
# Fact-summary helper
def ringkasan_fakta(teks):
    """Return the first two sentences of ``teks`` as a short fact summary.

    Sentences are delimited by the literal separator ``'. '``; the first two
    pieces are re-joined with the same separator and surrounding whitespace
    is stripped.

    Args:
        teks: Source text. Non-string values are converted with ``str``.

    Returns:
        The summary string, or ``""`` when ``teks`` is ``None`` or NaN.
    """
    # Missing values (None, or the float NaN pandas uses for empty cells)
    # would otherwise be stringified into the literal text "None" / "nan".
    if teks is None or (isinstance(teks, float) and teks != teks):
        return ""
    kalimat = str(teks).split('. ')
    return '. '.join(kalimat[:2]).strip()

# Legal-argument helper
def argumen_hukum(teks):
    """Return up to two sentences of ``teks`` that mention "pasal".

    The text is converted with ``str`` and split on the literal ``'. '``.
    Sentences containing "pasal" (case-insensitive) are collected in their
    original order; the first two are joined with ``'. '`` and stripped.
    An empty string is returned when no sentence matches.
    """
    pasal_sentences = []
    for sentence in str(teks).split('. '):
        if 'pasal' in sentence.lower():
            pasal_sentences.append(sentence)
    first_two = pasal_sentences[:2]
    return '. '.join(first_two).strip()

# Word count
def jumlah_kata(teks):
    """Count the whitespace-separated words in ``teks``.

    Args:
        teks: Source text. Non-string values are converted with ``str``.

    Returns:
        The number of words, or ``0`` when ``teks`` is ``None`` or NaN.
    """
    # A missing value would otherwise be stringified ("None" / "nan") and
    # counted as one word.
    if teks is None or (isinstance(teks, float) and teks != teks):
        return 0
    return len(str(teks).split())

# Bag-of-words (keep the top-N most frequent words)
def bag_of_words(teks, top_n=5, stopwords=None):
    """Return the ``top_n`` most frequent words of ``teks`` as one string.

    The text is lower-cased, split into ``\\w+`` tokens, filtered against a
    stopword set and counted. Words with equal counts keep the order in which
    they first appear.

    Args:
        teks: Source text. Non-string values are converted with ``str``.
        top_n: Maximum number of words to keep.
        stopwords: Optional collection of lower-case words to ignore. When
            omitted, a small built-in Indonesian stopword set is used.

    Returns:
        The selected words joined by ``', '``; ``""`` when ``teks`` is
        ``None``/NaN or no word survives the filtering.
    """
    # A missing value would otherwise be stringified and reported as the
    # "word" nan / none.
    if teks is None or (isinstance(teks, float) and teks != teks):
        return ''
    if stopwords is None:
        stopwords = {'yang', 'dan', 'di', 'ke', 'dengan', 'untuk', 'dari', 'pada', 'adalah'}
    words = re.findall(r'\b\w+\b', str(teks).lower())
    counts = Counter(w for w in words if w not in stopwords)
    return ', '.join(w for w, _ in counts.most_common(top_n))

# Simple QA pairs
def qa_pair(fakta, amar):
    """Build a two-line "Q: ... / A: ..." string from a fact summary and a verdict.

    Args:
        fakta: Fact summary text.
        amar: Verdict text.

    Returns:
        ``"-"`` unless both arguments are ``str`` instances; otherwise a
        string whose first line starts with ``"Q: "`` and contains ``fakta``
        and whose second line starts with ``"A: "`` and contains ``amar``.
    """
    if isinstance(fakta, str) and isinstance(amar, str):
        question_line = f"Apa yang terjadi dalam kasus ini? {fakta}"
        answer_line = f"Apa hasil putusannya? {amar}"
        return f"Q: {question_line}\nA: {answer_line}"
    # Any non-string input (e.g. NaN from an empty cell) yields a placeholder.
    return "-"

# Add the derived columns.
# NOTE(review): this cell expects a `df` that has `text_pdf` and `amar`
# columns, and it overwrites the same cases.csv that the loading cell reads -
# confirm the intended execution order of the notebook.
df['case_id'] = df.index + 1  # 1-based ID derived from the row index
raw_text = df['text_pdf']
df['ringkasan_fakta'] = raw_text.apply(ringkasan_fakta)
df['argumen_hukum'] = raw_text.apply(argumen_hukum)
df['text_length'] = raw_text.apply(jumlah_kata)
df['bag_of_words'] = raw_text.apply(bag_of_words)
df['qa_pair'] = df.apply(lambda row: qa_pair(row['ringkasan_fakta'], row['amar']), axis=1)

# Final set of columns to export, in output order
cols = [
    'case_id', 'nomor', 'tanggal_register', 'klasifikasi',
    'ringkasan_fakta', 'argumen_hukum', 'amar', 'hakim_ketua',
    'text_pdf', 'text_length', 'bag_of_words', 'qa_pair',
]

# Create the 'data/processed' folder if it does not exist yet
processed_dir = Path(r"D:/SEMESTER 6/PROJECT CBR/data/processed")
processed_dir.mkdir(parents=True, exist_ok=True)

# Save as CSV
csv_file = "D:/SEMESTER 6/PROJECT CBR/data/processed/cases.csv"
df[cols].to_csv(csv_file, index=False, encoding='utf-8')

# Save as JSON as well (one record per case, non-ASCII characters kept as-is)
json_file = "D:/SEMESTER 6/PROJECT CBR/data/processed/cases.json"
df[cols].to_json(json_file, orient='records', force_ascii=False, indent=2)