In [None]:
# Mount Google Drive into the Colab runtime at /content/drive so that the
# dataset and output folders under MyDrive can be used as ordinary file paths.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# === STAGE 3: CASE RETRIEVAL ===

import pandas as pd
import numpy as np
import os
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics.pairwise import cosine_similarity

# === Load Data ===
# Read the processed case table from Drive. The columns used in this file are
# 'ringkasan_fakta' (text), 'pasal' (label) and 'case_id' (identifier).
df = pd.read_csv('/content/drive/MyDrive/ProyekA/data/processed/cases.csv')
# Replace missing summaries with empty strings so the text pipelines below
# always receive str values.
df['ringkasan_fakta'] = df['ringkasan_fakta'].fillna('')

# === TF-IDF + SVM ===
# Fit TF-IDF on the whole corpus. `vectorizer` and `X_tfidf` are module-level
# and are reused by retrieve() for similarity search.
# NOTE(review): the vectorizer is fitted before the train/test split, so the
# held-out rows contribute to the vocabulary/IDF weights used in the
# classifier evaluation below.
vectorizer = TfidfVectorizer()
X_tfidf = vectorizer.fit_transform(df['ringkasan_fakta'])
# Labels are cast to str before being handed to the classifier.
y = df['pasal'].astype(str)

# 70/30 split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.3, random_state=42)

# Linear-kernel SVM trained on the TF-IDF features.
clf = SVC(kernel='linear')
clf.fit(X_train, y_train)

# Evaluate on the held-out 30% and print the per-class report.
y_pred = clf.predict(X_test)
print("=== TF-IDF + SVM Evaluation ===")
print(classification_report(y_test, y_pred))

# === IndoBERT Embedding ===
# Notebook shell magic: quietly (-q) install the packages needed for the
# transformer model below.
!pip install -q transformers sentencepiece

from transformers import AutoTokenizer, AutoModel
import torch

# Load the tokenizer and encoder for the "indobenchmark/indobert-base-p1"
# checkpoint. Both are module-level globals that bert_embed() reads.
tokenizer = AutoTokenizer.from_pretrained("indobenchmark/indobert-base-p1")
model = AutoModel.from_pretrained("indobenchmark/indobert-base-p1")

def bert_embed(text):
    """Embed ``text`` with the module-level IndoBERT ``tokenizer``/``model``.

    The input is tokenized (truncated to at most 512 tokens, padded) and run
    through the model without gradient tracking. The hidden state at the
    first token position of each sequence is used as its embedding.

    Args:
        text: Input passed straight to the tokenizer (a single string in
            every call site visible in this file).

    Returns:
        A 2-D NumPy array with one row per input sequence; callers in this
        file take ``[0]`` to get the vector for a single string.
    """
    encoded = tokenizer(
        text,
        max_length=512,
        truncation=True,
        padding=True,
        return_tensors="pt",
    )
    # Inference only: no autograd graph is needed for the forward pass.
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
    # Keep only position 0 along the token axis (one vector per sequence).
    first_token_vectors = hidden_states[:, 0, :]
    return first_token_vectors.numpy()

# Compute the BERT embedding of every case summary only once, up front, so
# that retrieve() can reuse them instead of re-encoding the corpus per query.
# bert_embed() returns a 2-D array; [0] takes the single row for that text.
df['embedding'] = df['ringkasan_fakta'].apply(lambda x: bert_embed(x)[0])
# Stack the per-row vectors into one 2-D matrix (one row per case) in the
# same row order as df, as required by cosine_similarity in retrieve().
X_embed = np.vstack(df['embedding'].values)

# === Function retrieve(query, k, mode) ===
def retrieve(query: str, k: int = 5, mode: str = 'tfidf') -> list:
    """Return the ``case_id`` values of the ``k`` cases most similar to ``query``.

    Similarity is the cosine similarity between the query vector and the
    precomputed corpus matrix for the selected representation.

    Args:
        query: Free-text query.
        k: Maximum number of results. Values ``<= 0`` yield an empty list.
            If ``k`` exceeds the corpus size, all cases are returned.
        mode: ``'bert'`` compares against the IndoBERT matrix ``X_embed``;
            any other value uses the TF-IDF matrix ``X_tfidf``.

    Returns:
        A list of ``case_id`` strings ordered from most to least similar.
    """
    # Guard first: with k == 0 the slice below becomes [-0:], which selects
    # every row instead of none; negative k is equally meaningless.
    if k <= 0:
        return []

    if mode == 'bert':
        # bert_embed returns a (1, hidden) array for a single string.
        q_vec = bert_embed(query)
        sims = cosine_similarity(q_vec, X_embed)[0]
    else:
        # Reuse the vectorizer fitted on the corpus so dimensions match.
        q_vec = vectorizer.transform([query])
        sims = cosine_similarity(q_vec, X_tfidf).flatten()

    # argsort is ascending: take the last k indices and reverse them so the
    # best match comes first.
    topk_idx = sims.argsort()[-k:][::-1]
    return df.iloc[topk_idx]['case_id'].astype(str).tolist()

# === Build the evaluation queries and save them to queries.json ===
# Each spec is (query_id, query_text, ground_truth) and is expanded into the
# dict layout that gets serialized below.
# NOTE(review): ground_truth values are zero-padded strings while retrieve()
# returns case_id cast with astype(str) -- confirm the two formats match.
_QUERY_SPECS = [
    ("q001", "Terdakwa membawa sabu seberat 3 gram dalam plastik kecil", "001"),
    ("q002", "Tersangka mengedarkan ekstasi di tempat hiburan malam", "002"),
    ("q003", "Polisi menangkap pelaku narkoba saat razia di terminal", "003"),
    ("q004", "Petugas menemukan ganja di dalam jok motor terdakwa", "004"),
    ("q005", "Tersangka menyimpan sabu di laci rumahnya untuk dijual", "005"),
]
eval_queries = [
    {"query_id": query_id, "query_text": query_text, "ground_truth": ground_truth}
    for query_id, query_text, ground_truth in _QUERY_SPECS
]

# Write the queries into the project's data/eval folder on Drive, creating
# the folder first if it does not exist yet.
eval_path = "/content/drive/MyDrive/ProyekA/data/eval"
os.makedirs(eval_path, exist_ok=True)
queries_file = os.path.join(eval_path, "queries.json")
with open(queries_file, "w", encoding="utf-8") as fh:
    # ensure_ascii=False keeps non-ASCII characters readable in the file.
    json.dump(eval_queries, fh, indent=2, ensure_ascii=False)

print("✓ File queries.json berhasil disimpan.")
