In [40]:
import json
import os
import re
from collections import Counter
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [41]:
# Setup & data loading
# The project root defaults to the original local folder, but it can be
# redirected with the CBR_PROJECT_DIR environment variable so the notebook
# also runs on machines where that drive/folder does not exist.
BASE = Path(os.environ.get("CBR_PROJECT_DIR", r"D:/SEMESTER 6/PROJECT CBR"))
DATA = BASE / "data"
PROC = DATA / "processed"
EVAL = DATA / "eval"
RES  = DATA / "results"
RES.mkdir(parents=True, exist_ok=True)  # create the results folder if it is missing

# Load the case base. Missing values in the two text columns are replaced by
# empty strings and the columns are forced to `str`, so later text processing
# never receives NaN.
df = pd.read_csv(PROC / "cases.csv")
df["amar"] = df["amar"].fillna("").astype(str)
df["ringkasan_fakta"] = df["ringkasan_fakta"].fillna("").astype(str)

In [42]:
# TF-IDF vectorizer for the Reuse stage

vectorizer = TfidfVectorizer(
    token_pattern=r"(?u)\b[a-zA-Z]{2,}\b",  # tokens = runs of 2+ ASCII letters
    ngram_range=(1, 2),                     # unigrams and bigrams
    max_df=0.95,
    min_df=1,
)
# Fit on the fact summaries and keep the resulting document-term matrix,
# which the retrieval step compares queries against.
tfidf_matrix = vectorizer.fit_transform(df["ringkasan_fakta"])

In [43]:
# Lookup table {case_id: amar} pairing every case with its verdict text.
# If a case_id occurs more than once, the last row wins.
case_solutions = {
    case_id: amar
    for case_id, amar in zip(df["case_id"], df["amar"])
}


In [44]:
# Retrieval Function (TF-IDF cosine)
def retrieve(query: str, k=5):
    """Return the ``k`` cases whose fact summaries best match ``query``.

    The query is transformed with the module-level fitted ``vectorizer`` and
    scored against every row of ``tfidf_matrix`` using cosine similarity.

    Args:
        query: Free-text description of the new case.
        k: Maximum number of cases to return.

    Returns:
        A ``(case_ids, sim_scores)`` pair: the list of ``case_id`` values
        ordered from highest to lowest similarity, and a NumPy array holding
        the corresponding similarity scores in the same order.
    """
    query_vec = vectorizer.transform([query])
    similarities = cosine_similarity(query_vec, tfidf_matrix).ravel()
    # argsort is ascending, so reverse it before taking the first k positions.
    ranked_positions = similarities.argsort()[::-1]
    best = ranked_positions[:k]
    return df.iloc[best]["case_id"].tolist(), similarities[best]


In [45]:
# Predict Outcome – Majority Vote
def predict_majority(query: str, k=5):
    """Predict the outcome by an unweighted vote among the retrieved cases.

    Each retrieved case that has an entry in ``case_solutions`` casts one vote
    for its stored solution; the solution with the most votes wins.

    Args:
        query: Free-text description of the new case.
        k: Number of cases to retrieve.

    Returns:
        ``(pred, case_ids)`` where ``pred`` is the winning solution (``""``
        when no retrieved case has a stored solution) and ``case_ids`` is the
        list of retrieved case ids.
    """
    case_ids, _scores = retrieve(query, k)

    votes = Counter()
    for cid in case_ids:
        if cid in case_solutions:
            votes[case_solutions[cid]] += 1

    if not votes:
        return "", case_ids

    winner, _n_votes = votes.most_common(1)[0]
    return winner, case_ids

In [46]:
# Predict Outcome – Weighted Similarity
def predict_weighted(query: str, k=5):
    """Predict the outcome by a similarity-weighted vote among retrieved cases.

    Every retrieved case adds its cosine-similarity score to the tally of its
    stored solution; the solution with the largest total wins.

    Args:
        query: Free-text description of the new case.
        k: Number of cases to retrieve.

    Returns:
        ``(pred, case_ids)`` where ``pred`` is the solution with the highest
        accumulated similarity (``""`` when no retrieved case has a stored
        solution) and ``case_ids`` is the list of retrieved case ids.
    """
    case_ids, scores = retrieve(query, k)
    weights = {}
    for cid, score in zip(case_ids, scores):
        # Same rule as predict_majority: a case without a stored solution does
        # not vote. Previously such cases were tallied under "", so a failed
        # lookup could win the vote and be reported as an empty prediction.
        if cid not in case_solutions:
            continue
        sol = case_solutions[cid]
        weights[sol] = weights.get(sol, 0) + score
    pred = max(weights, key=weights.get) if weights else ""
    return pred, case_ids


In [47]:
# Manual evaluation – 5 new queries
queries = json.loads((EVAL / "queries.json").read_text(encoding="utf-8"))

# Every query is run through both predictors. Each (query, method) pair yields
# one output row, with the "majority" row always preceding the "weighted" row.
predictors = (
    ("majority", predict_majority),
    ("weighted", predict_weighted),
)

rows = []
for q in queries:
    qid, qtext = q["query_id"], q["query"]
    for method_name, predict in predictors:
        predicted, retrieved_ids = predict(qtext)
        rows.append({
            "query_id": qid,
            "query": qtext,
            "method": method_name,
            "predicted_solution": predicted,
            "top_5_case_ids": retrieved_ids
        })


In [48]:
# Save the predictions to CSV
predictions_path = RES / "predictions.csv"
df_out = pd.DataFrame(rows)
df_out.to_csv(predictions_path, index=False)

# Report where the file went and preview the first ten rows.
print("✅ Disimpan ke:", predictions_path)
print(df_out.head(10))

✅ Disimpan ke: D:\SEMESTER 6\PROJECT CBR\data\results\predictions.csv
   query_id                                 query    method  \
0         1  terdakwa mencuri motor di malam hari  majority   
1         1  terdakwa mencuri motor di malam hari  weighted   
2         2       kasus penggelapan dana koperasi  majority   
3         2       kasus penggelapan dana koperasi  weighted   
4         3   tersangka membawa narkoba dalam tas  majority   
5         3   tersangka membawa narkoba dalam tas  weighted   
6         4              sengketa jual beli tanah  majority   
7         4              sengketa jual beli tanah  weighted   
8         5          kekerasan dalam rumah tangga  majority   
9         5          kekerasan dalam rumah tangga  weighted   

  predicted_solution          top_5_case_ids  
0          Lain-lain   [57, 89, 94, 108, 80]  
1          Lain-lain   [57, 89, 94, 108, 80]  
2          Lain-lain  [112, 111, 30, 31, 32]  
3          Lain-lain  [112, 111, 30, 31, 32]  
4