In [79]:
import re, json, torch, numpy as np, pandas as pd
from pathlib import Path
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from transformers import AutoTokenizer, AutoModel

In [80]:
# Persiapan Data
# Project folder layout. NOTE(review): BASE is an absolute, machine-specific
# path; the notebook only runs as-is on a machine that has this folder.
BASE = Path(r"D:/SEMESTER 6/PROJECT CBR")
DATA = BASE / "data"
PROC, EVAL = DATA / "processed", DATA / "eval"

# Create the processed/eval folders if they are missing (a no-op when they exist).
PROC.mkdir(parents=True, exist_ok=True)
EVAL.mkdir(parents=True, exist_ok=True)

# Load the case table and normalise the raw PDF text column: missing values
# become empty strings and every entry is cast to str.
cases_fp = PROC / "cases.csv"
df = pd.read_csv(cases_fp).assign(
    text_pdf=lambda frame: frame["text_pdf"].fillna("").astype(str)
)

In [81]:
# Clean + Ringkasan
def clean_header(t: str) -> str:
    """Replace recurring header/footer fragments in ``t`` with a single space.

    Three patterns are applied in order, all case-insensitively:
      1. "Mahkamah Agung" up to and including the next newline,
      2. "Direktori Putusan" up to and including the next newline,
      3. "Halaman <digits>" plus any whitespace that follows.

    Patterns 1 and 2 need a newline after the phrase; without one, nothing
    is removed for that pattern.

    Args:
        t: Raw document text.

    Returns:
        The text with every match replaced by one space.
    """
    substitutions = (
        (r"Mahkamah Agung.*?\n", re.I | re.S),
        (r"Direktori Putusan.*?\n", re.I | re.S),
        (r"Halaman \d+\s*", re.I),
    )
    for pattern, flags in substitutions:
        t = re.sub(pattern, " ", t, flags=flags)
    return t

def ringkasan_fakta(t: str, n=3) -> str:
    """Build a short summary from the first ``n`` usable sentences of ``t``.

    The text is coerced to ``str``, passed through ``clean_header`` and split
    on a period followed by whitespace. Each piece is stripped, and only
    pieces longer than 20 characters are kept.

    Args:
        t: Raw document text (any value; it is converted with ``str``).
        n: Maximum number of sentences to keep.

    Returns:
        The kept sentences joined with ". " (no trailing period is added).
    """
    sentences = []
    for piece in re.split(r"\.\s+", clean_header(str(t))):
        piece = piece.strip()
        if len(piece) > 20:
            sentences.append(piece)
    return ". ".join(sentences[:n])

# Derive one short summary per case from its raw PDF text.
df["ringkasan_fakta"] = df["text_pdf"].map(ringkasan_fakta)

In [82]:
# TF-IDF vectorizer (single instance)
# One shared vectoriser for every TF-IDF based retriever in this notebook.
vectorizer = TfidfVectorizer(
    token_pattern=r"(?u)\b[a-zA-Z]{2,}\b",  # alphabetic tokens of length >= 2 only
    ngram_range=(1, 2),                     # unigrams and bigrams
    max_df=0.95,                            # drop terms present in more than 95% of summaries
    min_df=1,
)
# Rows of X_tfidf are in the same order as the rows of df.
X_tfidf = vectorizer.fit_transform(df["ringkasan_fakta"])
print("TF-IDF features:", X_tfidf.shape[1])

TF-IDF features: 8866


In [83]:
# Cosine Retrieval (TF-IDF)
def retrieve_cosine(q: str, k=5):
    """Return the ``case_id`` values of the ``k`` summaries most similar to ``q``.

    The query is projected with the fitted ``vectorizer`` and compared against
    every row of ``X_tfidf`` using cosine similarity.

    Args:
        q: Free-text query.
        k: Number of results to return.

    Returns:
        A list of ``case_id`` values ordered from most to least similar.
    """
    query_vec = vectorizer.transform([q])
    # Single query -> the similarity matrix has exactly one row.
    scores = cosine_similarity(query_vec, X_tfidf)[0]
    best = scores.argsort()[::-1][:k]
    return df["case_id"].iloc[best].tolist()

In [85]:
# SVM Retrieval (TF-IDF)
# Hold out 30% of the cases; the classifier is fit on the remaining 70% only.
X_tr, X_te, y_tr, y_te = train_test_split(X_tfidf, df["klasifikasi"], test_size=0.3, random_state=42)
# Seed the estimator as well as the split: LinearSVC's solver draws random
# numbers, so without random_state the fitted model (and therefore every
# retrieve_svm result) can differ between runs of the notebook.
svm_clf = LinearSVC(random_state=42).fit(X_tr, y_tr)

def retrieve_svm(q: str, k=5):
    """Return the top-``k`` ``case_id`` values within the class predicted for ``q``.

    ``svm_clf`` predicts a ``klasifikasi`` label for the query; only cases
    carrying that label are ranked, by cosine similarity of their TF-IDF
    vectors to the query.

    The candidate vectors are sliced out of ``X_tfidf`` instead of being
    re-vectorised on every call. This relies on ``X_tfidf`` rows being in the
    same order as ``df`` rows (the same assumption ``retrieve_cosine`` makes).

    Args:
        q: Free-text query.
        k: Number of results to return.

    Returns:
        A list of ``case_id`` values ordered from most to least similar, or
        an empty list when no case has the predicted label.
    """
    # Vectorise the query once; it is used for both classification and ranking.
    q_vec = vectorizer.transform([q])
    cls = svm_clf.predict(q_vec)[0]

    # Positional row numbers of the cases in the predicted class.
    rows = np.flatnonzero((df["klasifikasi"] == cls).to_numpy())
    if rows.size == 0:
        return []

    sims = cosine_similarity(q_vec, X_tfidf[rows]).flatten()
    top = sims.argsort()[::-1][:k]
    return df.iloc[rows[top]]["case_id"].tolist()

In [86]:
# IndoBERT Embedding
print("⏳ Memuat IndoBERT…")
# Tokeniser and encoder are loaded from the same checkpoint.
tok = AutoTokenizer.from_pretrained("indobenchmark/indobert-base-p1")
bert = AutoModel.from_pretrained("indobenchmark/indobert-base-p1")
# Switch to evaluation mode; eval() returns the module itself.
bert = bert.eval()

@torch.no_grad()
def emb(text: str):
    """Encode ``text`` into a single vector with the loaded ``bert`` model.

    The text is tokenised with truncation at 128 tokens, run through ``bert``
    with gradient tracking disabled, and the hidden state at sequence
    position 0 is returned (presumably the [CLS] slot for this tokeniser --
    TODO confirm).

    Args:
        text: Input text.

    Returns:
        A NumPy array holding the position-0 hidden state, with size-1
        dimensions squeezed out.
    """
    encoded = tok(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
    hidden = bert(**encoded).last_hidden_state
    first_token = hidden[:, 0, :].squeeze()
    return first_token.cpu().numpy()

print("🔄 Menghitung embedding dokumen…")
# One forward pass per summary; rows of doc_emb follow the row order of df.
doc_emb = np.vstack(list(map(emb, df["ringkasan_fakta"])))

def retrieve_bert(q: str, k=5):
    """Return the ``case_id`` values of the ``k`` cases closest to ``q`` in embedding space.

    The query is embedded with ``emb`` and compared against the precomputed
    ``doc_emb`` matrix using cosine similarity.

    Args:
        q: Free-text query.
        k: Number of results to return.

    Returns:
        A list of ``case_id`` values ordered from most to least similar.
    """
    query_vec = emb(q).reshape(1, -1)
    # Single query -> the similarity matrix has exactly one row.
    scores = cosine_similarity(query_vec, doc_emb)[0]
    best = scores.argsort()[::-1][:k]
    return df["case_id"].iloc[best].tolist()

⏳ Memuat IndoBERT…
🔄 Menghitung embedding dokumen…
