In [1]:
# Mount Google Drive inside the Colab runtime at /content/drive.
# The data paths read and written later in this notebook live under this mount.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import glob
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter
import json
import matplotlib.pyplot as plt
from transformers import AutoTokenizer, AutoModel
import torch

In [3]:
# === SETUP ===
# Load the case data. Missing `ringkasan_fakta` values become empty strings so
# the vectorizer and the tokenizer always receive text.
df = pd.read_csv("/content/drive/MyDrive/ProyekA/data/processed/cases.csv")
df['ringkasan_fakta'] = df['ringkasan_fakta'].fillna('')

# Normalise the `pasal` label: fill gaps with 'UNKNOWN', then lowercase, trim
# surrounding whitespace and keep at most the first 100 characters.
pasal_text = df['pasal'].fillna('UNKNOWN').astype(str)
df['pasal'] = pasal_text.str.lower().str.strip().str[:100]

# === Make case_id consistent ===
# case_id is rebuilt from the row index (index + 1) and zero-padded to three
# characters: '001', '002', ...
row_numbers = pd.Series(df.index + 1, index=df.index)
df["case_id"] = row_numbers.astype(str).str.zfill(3)

# === TF-IDF Vectorizer ===
# Fit on every `ringkasan_fakta` text; X_tfidf holds one row per case.
vectorizer = TfidfVectorizer()
X_tfidf = vectorizer.fit_transform(df["ringkasan_fakta"])

# === IndoBERT Setup ===
# Tokenizer and model are loaded from the same pretrained checkpoint.
tokenizer = AutoTokenizer.from_pretrained("indobenchmark/indobert-base-p1")
model = AutoModel.from_pretrained("indobenchmark/indobert-base-p1")

def bert_embed(text):
    """Embed `text` with the module-level IndoBERT `tokenizer` and `model`.

    The input is tokenized into PyTorch tensors (truncated to at most 512
    tokens), run through the model with gradient tracking disabled, and the
    hidden state at sequence position 0 is returned.

    Returns:
        A NumPy array with one row per input sequence; callers that pass a
        single string take row 0.
    """
    encoded = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=512,
    )
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        model_output = model(**encoded)
    # Keep only the first token position of every sequence.
    first_token_state = model_output.last_hidden_state[:, 0, :]
    return first_token_state.numpy()

# === Precompute BERT embeddings ===
print("⏳ Menghitung IndoBERT embeddings...")


def _embed_single_text(text):
    """Return the first (only) row of `bert_embed(text)` as a 1-D vector."""
    return bert_embed(text)[0]


# One embedding vector per case, stored on the frame and also stacked into a
# single 2-D matrix (one row per case) for similarity search.
df['embedding'] = df['ringkasan_fakta'].apply(_embed_single_text)
X_embed = np.vstack(df['embedding'].tolist())

# === Mapping case_id -> pasal ===
case_solutions = {
    case_id: pasal for case_id, pasal in zip(df["case_id"], df["pasal"])
}

# === RETRIEVAL ===
def retrieve(query: str, k: int = 5, mode: str = 'tfidf') -> list:
    """Return the case_ids of the `k` cases most similar to `query`.

    Args:
        query: Free-text query to compare against the stored cases.
        k: Number of cases to return. Values <= 0 yield an empty list; values
            larger than the number of cases return every case.
        mode: 'bert' compares IndoBERT embeddings against `X_embed`; any other
            value uses the TF-IDF vectorizer against `X_tfidf`.

    Returns:
        A list of case_id strings ordered from most to least similar
        (cosine similarity).
    """
    # Guard the degenerate sizes explicitly. With the old `argsort()[-k:]`
    # slice, k == 0 became `[-0:]` (the WHOLE array) and a negative k dropped
    # an arbitrary prefix, so "no neighbours" silently returned many cases.
    if k <= 0:
        return []

    if mode == 'bert':
        q_vec = bert_embed(query)
        sims = cosine_similarity(q_vec, X_embed)[0]
    else:
        q_vec = vectorizer.transform([query])
        sims = cosine_similarity(q_vec, X_tfidf).flatten()

    # Descending order first, then take the first k. For k >= 1 this is the
    # same ordering as `argsort()[-k:][::-1]`.
    topk_idx = sims.argsort()[::-1][:k]
    return df.iloc[topk_idx]['case_id'].astype(str).tolist()

# === PREDIKSI ===
def predict_outcome(query, k=5, mode='tfidf', weighted=True):
    """Predict the `pasal` label for `query` by voting over its nearest cases.

    Args:
        query: Free-text query.
        k: Number of most-similar cases that take part in the vote.
        mode: 'bert' for IndoBERT embeddings; any other value uses TF-IDF.
        weighted: If True, each neighbour's vote is weighted by its cosine
            similarity to the query; if False, a plain majority vote is used.

    Returns:
        A tuple `(predicted, top_k_ids)`: the winning label and the list of
        neighbour case_id strings, most similar first. When there are no
        neighbours to vote (k <= 0), the label is 'UNKNOWN' and the list is
        empty.
    """
    # No neighbours requested: nothing can vote. Handling this up front also
    # avoids the `[-0:]` slice pitfall (which selects every case).
    if k <= 0:
        return 'UNKNOWN', []

    if not weighted:
        top_k_ids = retrieve(query, k=k, mode=mode)
        solusi_list = [case_solutions.get(cid, 'UNKNOWN') for cid in top_k_ids]
        if not solusi_list:
            return 'UNKNOWN', top_k_ids
        predicted = Counter(solusi_list).most_common(1)[0][0]
        return predicted, top_k_ids

    # Weighted vote: the similarity scores themselves are needed, so compute
    # them once here. Previously retrieve() was called first and its result
    # discarded, which embedded the query twice per prediction.
    if mode == 'bert':
        q_vec = bert_embed(query)
        sims = cosine_similarity(q_vec, X_embed)[0]
    else:
        q_vec = vectorizer.transform([query])
        sims = cosine_similarity(q_vec, X_tfidf).flatten()

    # Same ordering as `argsort()[-k:][::-1]` for k >= 1.
    topk_idx = sims.argsort()[::-1][:k]
    top_k_ids = df.iloc[topk_idx]['case_id'].astype(str).tolist()
    solusi_list = [case_solutions.get(cid, 'UNKNOWN') for cid in top_k_ids]
    weights = sims[topk_idx]

    # Sum the similarity mass per label; the label with the largest total wins
    # (on a tie, the label seen first, i.e. from the more similar case).
    score = {}
    for sol, w in zip(solusi_list, weights):
        score[sol] = score.get(sol, 0) + w
    if not score:
        return 'UNKNOWN', top_k_ids
    predicted = max(score.items(), key=lambda x: x[1])[0]

    return predicted, top_k_ids

# === LOAD QUERIES & CONVERT GROUND TRUTH ===
with open("/content/drive/MyDrive/ProyekA/data/eval/queries.json", encoding="utf-8") as f:
    queries = json.load(f)

# The ground truth is read as a case_id; replace it with the `pasal` mapped to
# that case (zero-padded to match the case_id format), or "UNKNOWN" when the
# id is not in the mapping.
for q in queries:
    q['ground_truth'] = case_solutions.get(str(q['ground_truth']).zfill(3), "UNKNOWN")

# === RUN PREDICTIONS ===
# One result row per query, using similarity-weighted voting over the
# 5 nearest cases by IndoBERT embedding.
results = []
for q in queries:
    pred, top_k = predict_outcome(q["query_text"], k=5, mode='bert', weighted=True)
    row = dict(
        query_id=q["query_id"],
        predicted_solution=pred,
        top_5_case_ids=top_k,
        ground_truth=q["ground_truth"],
        query_text=q["query_text"],
    )
    results.append(row)

# === SAVE RESULTS ===
os.makedirs("/content/drive/MyDrive/ProyekA/data/results", exist_ok=True)
predictions_df = pd.DataFrame(results)
predictions_df.to_csv("/content/drive/MyDrive/ProyekA/data/results/predictions.csv", index=False)
print("✅ Hasil prediksi berhasil disimpan ke predictions.csv")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/498M [00:00<?, ?B/s]

⏳ Menghitung IndoBERT embeddings...


model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

✅ Hasil prediksi berhasil disimpan ke predictions.csv
