In [8]:
# 04_Solution_Reuse.ipynb

import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import os
import json
import re
from collections import Counter
import joblib
from scipy.sparse import load_npz
import nltk
nltk.download('stopwords')
from google.colab import drive

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [2]:
# Mount Google Drive at /content/drive; BASE_DRIVE_PATH (configured in the next
# cell) lives under this mount point, so it must be mounted before any file I/O.
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# --- 1. Path configuration ---
# Every folder used by this notebook is derived from BASE_DRIVE_PATH.
BASE_DRIVE_PATH = "/content/drive/MyDrive/Semester 6/PK/UAS" # Adjust if your Drive layout differs
PATH_PROCESSED_DATA = os.path.join(BASE_DRIVE_PATH, "data/processed")
PATH_EVAL_DATA = os.path.join(BASE_DRIVE_PATH, "data/eval")
PATH_MODELS_CACHE = os.path.join(BASE_DRIVE_PATH, "models_cache") # Optional
PATH_RESULTS = os.path.join(BASE_DRIVE_PATH, "data/results")
# Only the results folder is created here; the other folders are expected to exist already.
os.makedirs(PATH_RESULTS, exist_ok=True)

In [4]:
# --- 2. Load all data, models, and embeddings ---
print("Memuat semua komponen dari tahap sebelumnya...")

# Case table, indexed by case_id so retrieved ids can be looked up with .loc
CASES_REPRESENTED_CSV = os.path.join(PATH_PROCESSED_DATA, "cases_represented.csv")
df_cases = pd.read_csv(CASES_REPRESENTED_CSV).set_index('case_id')

# BERT components: tokenizer + model (for embedding new queries) and the
# precomputed case embeddings with their id list. The retrieval code indexes
# bert_case_ids by row position of bert_case_embeddings.
try:
    DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    MODEL_NAME = 'indobenchmark/indobert-base-p1'
    bert_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, cache_dir=PATH_MODELS_CACHE)
    bert_model = AutoModel.from_pretrained(MODEL_NAME, cache_dir=PATH_MODELS_CACHE)
    bert_model.to(DEVICE)
    bert_model.eval()  # inference mode
    bert_case_embeddings = np.load(os.path.join(PATH_PROCESSED_DATA, "case_embeddings_bert.npy"))
    with open(os.path.join(PATH_PROCESSED_DATA, "case_ids_bert.json"), 'r', encoding='utf-8') as f:
        bert_case_ids = json.load(f)
    print("Komponen BERT berhasil dimuat.")
except Exception as e:
    # Raise instead of print + exit(): inside a notebook kernel exit() gives no
    # traceback and does not reliably stop execution at this point, so later
    # code would fail with confusing NameErrors. Chaining keeps the root cause.
    raise RuntimeError(
        f"Error saat memuat komponen BERT: {e}. Pastikan Tahap 3 (BERT) sudah dijalankan."
    ) from e

# TF-IDF components: fitted vectorizer, sparse case matrix, and matching id list.
try:
    # NOTE(security): joblib.load unpickles the file and can execute arbitrary
    # code; only load artifacts produced by your own Stage 3 run.
    tfidf_vectorizer = joblib.load(os.path.join(PATH_PROCESSED_DATA, "tfidf_vectorizer.pkl"))
    tfidf_matrix = load_npz(os.path.join(PATH_PROCESSED_DATA, "tfidf_matrix.npz"))
    with open(os.path.join(PATH_PROCESSED_DATA, "case_ids_tfidf.json"), 'r', encoding='utf-8') as f:
        tfidf_case_ids = json.load(f)
    print("Komponen TF-IDF berhasil dimuat.")
except Exception as e:
    # Same reasoning as for the BERT branch: fail loudly with the original cause.
    raise RuntimeError(
        f"Error saat memuat komponen TF-IDF: {e}. Pastikan Tahap 3 (TF-IDF) sudah dijalankan."
    ) from e

# Test queries; if the file is missing, fall back to a single hand-written demo query
# so the rest of the notebook can still be exercised.
QUERIES_JSON_FILE = os.path.join(PATH_EVAL_DATA, "queries.json")
try:
    with open(QUERIES_JSON_FILE, 'r', encoding='utf-8') as f:
        test_queries = json.load(f)
except FileNotFoundError:
    print(f"Error: File {QUERIES_JSON_FILE} tidak ditemukan. Buat contoh manual.")
    test_queries = [{"query_id": "Q_DEMO_01", "query_text": "Istri ditinggal suami tanpa nafkah."}]

Memuat semua komponen dari tahap sebelumnya...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Komponen BERT berhasil dimuat.
Komponen TF-IDF berhasil dimuat.


In [9]:
# --- 3. Re-define the functions from Stage 3 ---
# Preprocessing for TF-IDF
# Indonesian stopword list from NLTK's 'stopwords' corpus (downloaded in the import cell).
stop_words = list(nltk.corpus.stopwords.words('indonesian'))
def preprocess_for_tfidf(text):
    """Normalise raw text into the token string fed to the TF-IDF vectorizer.

    Steps: lowercase, replace every character that is not a-z or whitespace
    with a space, split on whitespace, then drop stopwords and tokens of
    length <= 2.

    Args:
        text: Input text. Any non-str value (None, NaN, numbers) yields "".

    Returns:
        The surviving tokens joined by single spaces.
    """
    if not isinstance(text, str):
        return ""
    letters_only = re.sub(r'[^a-z\s]', ' ', text.lower())
    kept_tokens = []
    for token in letters_only.split():
        if len(token) > 2 and token not in stop_words:
            kept_tokens.append(token)
    return " ".join(kept_tokens)

# BERT sentence embedding via attention-masked mean pooling
def get_bert_embedding(text, model, tokenizer, device):
    """Embed `text` as the attention-masked mean of the model's last hidden states.

    Args:
        text: Text passed to the tokenizer.
        model: Callable accepting the tokenizer output as keyword arguments and
            returning an object with a `last_hidden_state` tensor.
        tokenizer: Callable returning a mapping of tensors that includes
            'attention_mask'. It is invoked with padding='max_length',
            truncation=True, max_length=512, return_tensors='pt'.
        device: Torch device the input tensors are moved to.

    Returns:
        A flattened NumPy array holding the pooled embedding.
    """
    batch = tokenizer(
        text,
        padding='max_length',
        truncation=True,
        max_length=512,
        return_tensors='pt',
    )
    batch = {name: tensor.to(device) for name, tensor in batch.items()}

    with torch.no_grad():
        model_out = model(**batch)

    token_states = model_out.last_hidden_state
    # Broadcast the mask over the hidden dimension so padded positions contribute zero.
    weights = batch['attention_mask'].unsqueeze(-1).expand(token_states.size()).float()
    summed = (token_states * weights).sum(dim=1)
    # Clamp guards against division by zero if the mask were all zeros.
    counts = weights.sum(dim=1).clamp(min=1e-9)
    pooled = summed / counts
    return pooled.cpu().numpy().flatten()

# TF-IDF retrieval
def retrieve_cases_tfidf(query_text, k=5):
    """Return the k cases most similar to the query under the TF-IDF model.

    The query is preprocessed with `preprocess_for_tfidf`, vectorised with the
    loaded `tfidf_vectorizer`, and compared by cosine similarity against every
    row of `tfidf_matrix`. Row i of the matrix is mapped to `tfidf_case_ids[i]`.

    Args:
        query_text: Raw query text.
        k: Number of cases to return. Values <= 0 yield empty results.

    Returns:
        (case_ids, scores): two parallel lists ordered by descending similarity.
        Fewer than k items are returned if the corpus is smaller than k.
    """
    # Without this guard k == 0 would slice with [-0:], i.e. the whole array,
    # returning every case instead of none; negative k is equally meaningless.
    if k <= 0:
        return [], []

    processed_query = preprocess_for_tfidf(query_text)
    query_vector = tfidf_vectorizer.transform([processed_query])
    similarities = cosine_similarity(query_vector, tfidf_matrix)[0]
    # argsort is ascending: take the last k positions and reverse for best-first.
    top_k_indices = np.argsort(similarities)[-k:][::-1]
    top_k_case_ids = [tfidf_case_ids[i] for i in top_k_indices]
    top_k_scores = [similarities[i] for i in top_k_indices]
    return top_k_case_ids, top_k_scores

# BERT retrieval
def retrieve_cases_bert(query_text, k=5):
    """Return the k cases most similar to the query under the BERT embeddings.

    The query is embedded with `get_bert_embedding` (using the loaded
    `bert_model` / `bert_tokenizer` on `DEVICE`) and compared by cosine
    similarity against every row of `bert_case_embeddings`. Row i is mapped to
    `bert_case_ids[i]`.

    Args:
        query_text: Raw query text.
        k: Number of cases to return. Values <= 0 yield empty results.

    Returns:
        (case_ids, scores): two parallel lists ordered by descending similarity.
        Fewer than k items are returned if the corpus is smaller than k.
    """
    # Without this guard k == 0 would slice with [-0:], i.e. the whole array,
    # returning every case instead of none. Checked first so no model forward
    # pass is spent on a request that returns nothing.
    if k <= 0:
        return [], []

    query_embedding = get_bert_embedding(query_text, bert_model, bert_tokenizer, DEVICE).reshape(1, -1)
    similarities = cosine_similarity(query_embedding, bert_case_embeddings)[0]
    # argsort is ascending: take the last k positions and reverse for best-first.
    top_k_indices = np.argsort(similarities)[-k:][::-1]
    top_k_case_ids = [bert_case_ids[i] for i in top_k_indices]
    top_k_scores = [similarities[i] for i in top_k_indices]
    return top_k_case_ids, top_k_scores

In [10]:
# --- 4. Prediction / solution-reuse functions (MODIFIED) ---
# Ordered (pattern, label) rules applied to the lower-cased verdict text; the
# first matching rule wins. Every rule accepts both "penggugat" and
# "para penggugat" (plural plaintiffs) with arbitrary whitespace between words,
# so single- and multi-plaintiff verdicts are labelled consistently.
_AMAR_OUTCOME_RULES = (
    (re.compile(r"mengabulkan\s+gugatan\s+(?:para\s+)?penggugat"),
     "MENGABULKAN GUGATAN"),
    (re.compile(r"menolak\s+gugatan\s+(?:para\s+)?penggugat"),
     "MENOLAK GUGATAN"),
    (re.compile(r"menyatakan\s+gugatan\s+(?:para\s+)?penggugat\s+tidak\s+dapat\s+diterima"),
     "GUGATAN TIDAK DAPAT DITERIMA (NO)"),
)


def classify_amar_outcome(amar_text):
    """Map a verdict ("amar putusan") text to a coarse outcome label.

    Args:
        amar_text: Verdict text. Non-str values (None, NaN, ...) are treated
            as unknown.

    Returns:
        One of "MENGABULKAN GUGATAN", "MENOLAK GUGATAN",
        "GUGATAN TIDAK DAPAT DITERIMA (NO)", "LAIN-LAIN" (no rule matched),
        or "TIDAK DIKETAHUI" (input is not a string).
    """
    if not isinstance(amar_text, str):
        return "TIDAK DIKETAHUI"
    amar_text_lower = amar_text.lower()
    for pattern, label in _AMAR_OUTCOME_RULES:
        if pattern.search(amar_text_lower):
            return label
    return "LAIN-LAIN"

def predict_outcome(query_text, k=5, vote_method='weighted_similarity', retrieval_method='bert'):
    """Predict the outcome label for a new query from its most similar cases.

    The top-k cases are retrieved with the chosen method, each retrieved case
    found in `df_cases` has its 'amar_putusan' text classified with
    `classify_amar_outcome`, and the labels are combined by voting.

    Args:
        query_text: Raw query text.
        k: Number of similar cases to retrieve.
        vote_method: 'weighted_similarity' (sum of similarity scores per label,
            highest total wins) or 'majority_vote' (most frequent label wins).
            Any other value leaves the prediction at "Tidak dapat diprediksi".
        retrieval_method: 'bert' or 'tfidf'.

    Returns:
        (predicted_label, top_k_ids). If retrieval returns nothing the result is
        ("Tidak ada kasus serupa yang ditemukan", []). If none of the retrieved
        ids exist in `df_cases` the label is "Tidak dapat diprediksi".

    Raises:
        ValueError: If `retrieval_method` is neither 'bert' nor 'tfidf'.
    """
    # Step 1: retrieve with the requested back-end
    if retrieval_method == 'bert':
        top_k_ids, top_k_scores = retrieve_cases_bert(query_text, k)
    elif retrieval_method == 'tfidf':
        top_k_ids, top_k_scores = retrieve_cases_tfidf(query_text, k)
    else:
        raise ValueError("Metode retrieval tidak valid. Pilih 'bert' atau 'tfidf'.")

    if not top_k_ids:
        return "Tidak ada kasus serupa yang ditemukan", []

    # Step 2: classify each retrieved case. Label and score are paired BEFORE
    # filtering out ids missing from df_cases; filtering the labels alone and
    # then indexing the score list by position would attach scores to the wrong
    # cases whenever an id is skipped.
    known_ids = df_cases.index
    labelled = [
        (classify_amar_outcome(df_cases.loc[case_id, 'amar_putusan']), score)
        for case_id, score in zip(top_k_ids, top_k_scores)
        if case_id in known_ids
    ]

    # Step 3: vote
    predicted_solution = "Tidak dapat diprediksi"
    if not labelled:
        return predicted_solution, top_k_ids

    if vote_method == 'majority_vote':
        vote_counts = Counter(label for label, _ in labelled)
        predicted_solution = vote_counts.most_common(1)[0][0]
    elif vote_method == 'weighted_similarity':
        weighted_scores = {}
        for label, score in labelled:
            weighted_scores[label] = weighted_scores.get(label, 0) + score
        predicted_solution = max(weighted_scores, key=weighted_scores.get)

    return predicted_solution, top_k_ids

In [11]:
# --- 5. Manual demo & saving the results ---
# Runs every test query through both retrieval back-ends, prints the outcome,
# and collects one record per (query, method) pair for the comparison CSV.
print("\n--- Demo dan Penyimpanan Hasil Prediksi (Untuk Kedua Model) ---")

prediction_results_all = []
if not test_queries:
    print("Tidak ada query untuk diuji.")
else:
    for query_data in test_queries:
        query_id = query_data['query_id']
        query_text = query_data['query_text']
        print(f"\n==================== Memproses Query ID: {query_id} ====================")

        # Same query, both retrieval methods
        for method in ('bert', 'tfidf'):
            method_label = method.upper()
            print(f"\n--- Menggunakan Metode Retrieval: {method_label} ---")
            predicted_solution, top_5_ids = predict_outcome(query_text, k=5, retrieval_method=method)

            print(f"-> Top 5 Similar Case IDs: {top_5_ids}")
            print(f"-> Predicted Outcome/Solution: '{predicted_solution}'")

            # One CSV row; the id list is stored as a JSON string in a single column
            prediction_results_all.append(dict(
                query_id=query_id,
                retrieval_method=method_label,
                predicted_solution=predicted_solution,
                top_5_case_ids=json.dumps(top_5_ids),
            ))

# Write the comparison table to CSV (skipped when nothing was predicted)
if prediction_results_all:
    PREDICTIONS_CSV_FILE = os.path.join(PATH_RESULTS, "predictions_comparison.csv")
    df_predictions = pd.DataFrame(prediction_results_all)
    df_predictions.to_csv(PREDICTIONS_CSV_FILE, index=False, encoding='utf-8-sig')
    print("\n\n==================================================================")
    print(f"Hasil perbandingan prediksi berhasil disimpan ke: {PREDICTIONS_CSV_FILE}")
    print("Cuplikan hasil perbandingan prediksi:")
    print(df_predictions)

print("\n--- Tahap 4 (Versi Perbandingan) Selesai ---")


--- Demo dan Penyimpanan Hasil Prediksi (Untuk Kedua Model) ---


--- Menggunakan Metode Retrieval: BERT ---
-> Top 5 Similar Case IDs: [30, 2, 29, 7, 23]
-> Predicted Outcome/Solution: 'LAIN-LAIN'

--- Menggunakan Metode Retrieval: TFIDF ---
-> Top 5 Similar Case IDs: [4, 12, 23, 14, 34]
-> Predicted Outcome/Solution: 'LAIN-LAIN'


--- Menggunakan Metode Retrieval: BERT ---
-> Top 5 Similar Case IDs: [2, 30, 22, 23, 12]
-> Predicted Outcome/Solution: 'LAIN-LAIN'

--- Menggunakan Metode Retrieval: TFIDF ---
-> Top 5 Similar Case IDs: [10, 22, 4, 14, 1]
-> Predicted Outcome/Solution: 'LAIN-LAIN'


--- Menggunakan Metode Retrieval: BERT ---
-> Top 5 Similar Case IDs: [30, 7, 2, 22, 10]
-> Predicted Outcome/Solution: 'LAIN-LAIN'

--- Menggunakan Metode Retrieval: TFIDF ---
-> Top 5 Similar Case IDs: [1, 10, 33, 2, 7]
-> Predicted Outcome/Solution: 'LAIN-LAIN'


Hasil perbandingan prediksi berhasil disimpan ke: /content/drive/MyDrive/Semester 6/PK/UAS/data/results/predictions_comparison.c