In [6]:
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModel
from pathlib import Path
import torch
import numpy as np
import json
import pandas as pd

In [7]:
# Read the processed case table into the working DataFrame used by every later cell
cases_csv_path = "D:/SEMESTER 6/PROJECT CBR/data/processed/cases.csv"
df = pd.read_csv(cases_csv_path)

In [8]:
# Make sure the text column has no NaN: missing summaries become empty strings
# and every value is cast to str before vectorization.
df['ringkasan_fakta'] = df['ringkasan_fakta'].fillna("").astype(str)

# TF-IDF vectorization: learn the vocabulary / IDF weights from the whole column
# and encode every document in the same pass.
# NOTE(review): the vectorizer is fitted on all rows, including the rows that the
# later train_test_split holds out for testing.
vectorizer = TfidfVectorizer()
X_vector = vectorizer.fit_transform(df['ringkasan_fakta'])

# Full document-term matrix for cosine-similarity retrieval.
# fit_transform already returned the encoding of exactly these documents, so the
# matrix is reused instead of vectorizing the same corpus a second time.
tfidf_matrix_full = X_vector

In [9]:
# SVM classifier: hold out 30% of the TF-IDF rows for evaluation (seeded split).
# NOTE(review): the split is not stratified, so a rare class can be missing from
# the test rows; consider stratify=df['klasifikasi'] once every class has enough samples.
X_train, X_test, y_train, y_test = train_test_split(
    X_vector, df['klasifikasi'], test_size=0.3, random_state=42
)

# Seed the classifier as well as the split so a re-run reproduces the same model.
svm_model = LinearSVC(random_state=42)
svm_model.fit(X_train, y_train)

# Evaluate the classifier on the held-out rows; predictions are kept in a variable
# so they can be inspected after the report is printed.
y_pred = svm_model.predict(X_test)
print("=== Evaluasi SVM Klasifikasi ===")
print(classification_report(y_test, y_pred))

=== Evaluasi SVM Klasifikasi ===
                                           precision    recall  f1-score   support

Perdata Agama 
 Perdata Agama  Perceraian       1.00      1.00      1.00        34

                                 accuracy                           1.00        34
                                macro avg       1.00      1.00      1.00        34
                             weighted avg       1.00      1.00      1.00        34



In [10]:
# IndoBERT embedding: load the pretrained tokenizer and encoder once for the whole notebook
print("⏳ Memuat model IndoBERT...")
model_name = "indobenchmark/indobert-base-p1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# eval() returns the module itself, so loading and switching to inference mode
# can be written as a single expression.
bert_model = AutoModel.from_pretrained(model_name).eval()

def get_embedding(text, max_length=128):
    """Encode text with the module-level IndoBERT model and return a NumPy vector.

    The text is tokenized (truncated to ``max_length`` tokens), passed through
    ``bert_model`` with gradient tracking disabled, and the hidden state at
    sequence position 0 is extracted, squeezed and converted to NumPy.

    Args:
        text: Text to embed, passed straight to the tokenizer.
        max_length: Maximum number of tokens kept by the tokenizer; longer
            inputs are truncated. Defaults to 128, the value that used to be
            hard-coded, so existing callers are unaffected.

    Returns:
        numpy.ndarray holding the position-0 hidden state with all size-1
        dimensions removed (presumably a 1-D vector for a single string and the
        [CLS] representation - TODO confirm against the model/tokenizer config).
    """
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=max_length,
    )
    # Inference only: no_grad avoids building the autograd graph.
    with torch.no_grad():
        outputs = bert_model(**inputs)
    # Keep only the first token position of every sequence in the batch.
    first_token_state = outputs.last_hidden_state[:, 0, :]
    return first_token_state.squeeze().numpy()

print("🔄 Menghitung embedding dokumen...")
# Embed the fact summaries one at a time, in DataFrame row order, then combine
# the per-document vectors into a single NumPy array.
embedding_rows = []
for fact_text in df['ringkasan_fakta']:
    embedding_rows.append(get_embedding(fact_text))
doc_embeddings = np.array(embedding_rows)

⏳ Memuat model IndoBERT...
🔄 Menghitung embedding dokumen...


In [11]:
# Query preprocessing
def preprocess_query(query: str) -> str:
    """Normalize a raw query string by converting it to lowercase."""
    lowered = query.lower()
    return lowered

In [12]:
# Retrieval Functions

# TF-IDF + cosine
def retrieve_tfidf(query: str, k: int = 5):
    """Return the case ids of the k documents most similar to the query under TF-IDF.

    The query is lowercased via ``preprocess_query``, encoded with the fitted
    module-level ``vectorizer`` and compared to ``tfidf_matrix_full`` by cosine
    similarity.

    Args:
        query: Free-text query.
        k: Number of results wanted. Values <= 0 yield an empty list; values
            larger than the corpus yield every document.

    Returns:
        List of ``case_id`` values taken from ``df``, ordered from highest to
        lowest similarity.
    """
    # Guard first: with k == 0 the former slice [-k:] became [-0:], which selects
    # the whole array, so "zero results" returned every case; negative k was
    # similarly mis-sliced.
    if k <= 0:
        return []
    query_clean = preprocess_query(query)
    query_vec = vectorizer.transform([query_clean])
    sims = cosine_similarity(query_vec, tfidf_matrix_full).flatten()
    # Ascending argsort, reversed to descending, then the first k row positions.
    top_k = sims.argsort()[::-1][:k]
    return df.iloc[top_k]['case_id'].tolist()

# IndoBERT + cosine
def retrieve_bert(query: str, k: int = 5):
    """Return the case ids of the k documents most similar to the query under IndoBERT.

    The raw query is embedded with ``get_embedding`` and compared to the
    precomputed ``doc_embeddings`` by cosine similarity.

    Args:
        query: Free-text query; it is embedded as given, without going through
            ``preprocess_query``.
        k: Number of results wanted. Values <= 0 yield an empty list; values
            larger than the corpus yield every document.

    Returns:
        List of ``case_id`` values taken from ``df``, ordered from highest to
        lowest similarity.
    """
    # Guard first: with k == 0 the former slice [-k:] became [-0:], which selects
    # the whole array, so "zero results" returned every case; negative k was
    # similarly mis-sliced.
    if k <= 0:
        return []
    query_embedding = get_embedding(query)
    sims = cosine_similarity([query_embedding], doc_embeddings).flatten()
    # Ascending argsort, reversed to descending, then the first k row positions.
    top_k = sims.argsort()[::-1][:k]
    return df.iloc[top_k]['case_id'].tolist()

# SVM classification
def retrieve_svm_class(query: str, k: int = 5):
    """Predict the query's class with the SVM and list case ids that carry that class.

    Args:
        query: Free-text query; lowercased by ``preprocess_query`` before it is
            encoded with the TF-IDF ``vectorizer``.
        k: Slice bound applied to the matching case ids (first k entries).

    Returns:
        Tuple ``(label, case_ids)`` where ``label`` is the SVM prediction and
        ``case_ids`` are the ``case_id`` values of rows whose ``klasifikasi``
        equals that label, in DataFrame row order (no similarity ranking).
    """
    features = vectorizer.transform([preprocess_query(query)])
    predicted = svm_model.predict(features)
    label = predicted[0]
    # Select the id column for rows of the predicted class, then cut to k entries.
    same_class_ids = df.loc[df['klasifikasi'] == label, 'case_id']
    return label, same_class_ids.iloc[:k].tolist()

In [13]:
# Make sure the eval folder exists; the folder and file path are defined once so
# the directory created, the file written and the message printed cannot drift apart.
eval_dir = Path("D:/SEMESTER 6/PROJECT CBR/data/eval")
eval_dir.mkdir(parents=True, exist_ok=True)
queries_path = eval_dir / "queries.json"

# Test queries and their ground truth.
# NOTE(review): ground_truth values are presumably case_id values from cases.csv,
# and the queries describe other case types than the class shown in the SVM
# report - confirm both against the dataset.
queries = [
    {"query_id": 1, "query": "terdakwa mencuri motor di malam hari", "ground_truth": [1]},
    {"query_id": 2, "query": "kasus penggelapan dana koperasi", "ground_truth": [3]},
    {"query_id": 3, "query": "tersangka membawa narkoba dalam tas", "ground_truth": [5]},
    {"query_id": 4, "query": "sengketa jual beli tanah", "ground_truth": [9]},
    {"query_id": 5, "query": "kekerasan dalam rumah tangga", "ground_truth": [12]}
]

# Save to a JSON file (UTF-8, non-ASCII characters kept readable).
with open(queries_path, "w", encoding="utf-8") as f:
    json.dump(queries, f, indent=2, ensure_ascii=False)

# Report the path that was actually written; the previous hard-coded message
# misspelled the file name as "queries.jso". as_posix() keeps forward slashes.
print(f"{queries_path.as_posix()} berhasil dibuat.")

D:/SEMESTER 6/PROJECT CBR/data/eval/queries.jso berhasil dibuat.
