# TAHAP 4
## Penalaran Komputer UAS
## **Anggota:**
## Haidar Dimas Heryanto - 202210370311088
## Zeedan Mustami Argani - 202210370311104

In [1]:
# 04_Solution_Reuse.ipynb

import os
import pandas as pd
import numpy as np
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from collections import Counter # For majority vote
import nltk
import re

In [2]:
# Install necessary libraries
!pip install pandas requests beautifulsoup4 pdfminer.six lxml > /dev/null 2>&1

import argparse
import io
import os
import re
import time
import urllib
from concurrent.futures import ThreadPoolExecutor, wait
from datetime import date
import pandas as pd
import requests
from bs4 import BeautifulSoup
from pdfminer import high_level # For PDF text extraction
from google.colab import drive

# Make Google Drive available under /content/drive
drive.mount('/content/drive')

# --- NLTK Setup (for word counting) ---
# Probe both tokenizer resources in order; a miss on either one raises
# LookupError and triggers a download of both packages.
try:
    list(map(nltk.data.find, ('tokenizers/punkt', 'tokenizers/punkt_tab')))
except LookupError:
    print("NLTK 'punkt' or 'punkt_tab' not found. Downloading...")
    nltk.download('punkt')
    nltk.download('punkt_tab')
    print("NLTK 'punkt' and 'punkt_tab' downloaded.")
except Exception as setup_error:
    # Anything other than a missing resource is reported but not re-raised.
    print(f"An unexpected error occurred during NLTK setup: {setup_error}")

Mounted at /content/drive
NLTK 'punkt' or 'punkt_tab' not found. Downloading...


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...


NLTK 'punkt' and 'punkt_tab' downloaded.


[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [3]:
# For BERT (if chosen for retrieval)
# !pip install transformers sentence-transformers > /dev/null 2>&1 # Already installed in N3 if used
import torch
from transformers import AutoTokenizer, AutoModel

from google.colab import drive

# --- Configuration Section ---
# IMPORTANT: these paths must match your Google Drive folder structure and the
# files written by the previous notebooks.
# Project root on the mounted drive; every other path below is derived from it.
BASE_DRIVE_PATH = "/content/drive/MyDrive/Penalaran Komputer UAS/" # Change to your project folder

# Paths for input data: the processed case base (CSV) and the evaluation queries (JSON)
PATH_PROCESSED_INPUT = os.path.join(BASE_DRIVE_PATH, "data/processed")
PROCESSED_CSV_FILENAME = "cases_processed.csv"
PATH_EVAL_INPUT = os.path.join(BASE_DRIVE_PATH, "data/eval")
QUERIES_JSON_FILENAME = "queries.json"

# Path for output results; the directory is created here if it does not exist yet
PATH_RESULTS_OUTPUT = os.path.join(BASE_DRIVE_PATH, "data/results")
os.makedirs(PATH_RESULTS_OUTPUT, exist_ok=True)
PREDICTIONS_CSV_FILENAME = "predictions.csv"

# BERT Model (must be same as in Notebook 3 if using BERT)
BERT_MODEL_NAME = 'indobenchmark/indobert-base-p1'
# Use the GPU when torch reports one, otherwise fall back to the CPU
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {DEVICE}")

# Field in processed_cases_df to be used as "solution"
# NOTE(review): the recorded run of this notebook reports that 'amar_kategori'
# is not a column of the loaded CSV, and every prediction came back as
# NO_VALID_SOLUTIONS_IN_TOP_K -- confirm the column name written by the earlier
# notebook and update this constant accordingly.
SOLUTION_FIELD = 'amar_kategori' # As per PDF, can be 'amar putusan' or 'ringkasan dakwaan'

# --- NLTK Setup (for preprocessing if needed for TF-IDF) ---
# Every required resource is probed on its own and exactly the missing packages
# are downloaded, so the set that is checked and the set that is fetched cannot
# drift apart (both tokenizer variants are covered: 'punkt' and 'punkt_tab').
try:
    _missing_nltk_packages = []
    for _resource_path, _package_name in (
        ('tokenizers/punkt', 'punkt'),
        ('tokenizers/punkt_tab', 'punkt_tab'),
        ('corpora/stopwords', 'stopwords'),
    ):
        try:
            nltk.data.find(_resource_path)
        except LookupError:  # raised by nltk.data.find when a resource is absent
            _missing_nltk_packages.append(_package_name)

    if _missing_nltk_packages:
        print(f"NLTK resources not found: {', '.join(_missing_nltk_packages)}. Downloading...")
        for _package_name in _missing_nltk_packages:
            nltk.download(_package_name, quiet=True)
        print("NLTK resources downloaded.")
except Exception as e:
    # Report unexpected failures but keep the notebook running; the stop-word
    # load below falls back to an empty list if the corpus is unavailable.
    print(f"An unexpected error occurred during NLTK setup: {e}")


# Check if stopwords were downloaded successfully
try:
    indonesian_stopwords = nltk.corpus.stopwords.words('indonesian')
except LookupError:
    print("Could not load Indonesian stopwords. Please check NLTK download.")
    indonesian_stopwords = []

Using device: cpu
NLTK 'punkt' or 'stopwords' not found. Downloading...
NLTK resources downloaded.


In [4]:
# --- Helper Functions (Reused/Adapted from Notebook 3) ---
def preprocess_text_for_tfidf(text):
    """Normalise raw text into a space-separated token string for TF-IDF.

    Steps: lower-case, strip every character that is neither a word character
    nor whitespace, tokenise with ``nltk.word_tokenize``, then keep only purely
    alphabetic tokens that are not in the module-level ``indonesian_stopwords``.

    Args:
        text: Raw text; may be None/NaN/empty.

    Returns:
        str: The cleaned tokens joined by single spaces, or "" for
        missing/empty input.
    """
    if pd.isna(text) or not text:
        return ""

    # Build the lookup once per call: membership tests against a set are O(1),
    # whereas testing every token against the stop-word list is O(len(list)).
    stopword_lookup = set(indonesian_stopwords)

    cleaned = re.sub(r'[^\w\s]', '', str(text).lower())
    tokens = nltk.word_tokenize(cleaned)
    return " ".join(
        token for token in tokens
        if token.isalpha() and token not in stopword_lookup
    )

def load_data_and_models():
    """Load the case base and build the TF-IDF and BERT retrieval components.

    Nothing is returned to the caller except a success flag; all results are
    published through the module-level globals declared at the top of the body
    (the cases DataFrame, the fitted TF-IDF vectorizer with its document matrix
    and case-id list, and the BERT tokenizer/model with one embedding per case
    and a matching case-id list).

    Returns:
        bool: False if the processed CSV cannot be read or prepared (missing
        file, or any other exception in the loading block); True otherwise.
        A TF-IDF or BERT failure does NOT produce False: it is printed, and
        then only ``fitted_tfidf_vectorizer`` (TF-IDF failure) or
        ``bert_tokenizer``/``bert_model``/``case_embeddings_bert`` (BERT
        failure) are set to None.
    """
    global df_cases, fitted_tfidf_vectorizer, case_vectors_tfidf, case_ids_global_tfidf
    global bert_tokenizer, bert_model, case_embeddings_bert, case_ids_global_bert
    global RETRIEVAL_TEXT_SOURCE_COLUMN

    # Load processed cases
    processed_data_filepath = os.path.join(PATH_PROCESSED_INPUT, PROCESSED_CSV_FILENAME)
    try:
        df_cases = pd.read_csv(processed_data_filepath)
        print(f"Successfully loaded processed data from: {processed_data_filepath} with shape {df_cases.shape}")

        # Determine retrieval text source (consistent with N3):
        # prefer 'ringkasan_fakta' when it exists and holds at least one
        # non-null value, otherwise use 'text_full'.
        if 'ringkasan_fakta' in df_cases.columns and not df_cases['ringkasan_fakta'].isna().all():
            RETRIEVAL_TEXT_SOURCE_COLUMN = 'ringkasan_fakta'
        else:
            RETRIEVAL_TEXT_SOURCE_COLUMN = 'text_full'
        # Missing texts become empty strings so both vectorisers get str input.
        df_cases[RETRIEVAL_TEXT_SOURCE_COLUMN] = df_cases[RETRIEVAL_TEXT_SOURCE_COLUMN].fillna('')
        print(f"Using '{RETRIEVAL_TEXT_SOURCE_COLUMN}' for retrieval text source.")

        if SOLUTION_FIELD not in df_cases.columns:
            print(f"Error: Solution field '{SOLUTION_FIELD}' not found in df_cases. Prediction will fail.")
            # Fallback: create the column filled with a sentinel string. The
            # prediction helpers treat this sentinel as "no valid solution",
            # so the notebook keeps running but every prediction is empty.
            df_cases[SOLUTION_FIELD] = "SOLUTION_NOT_AVAILABLE"


    except FileNotFoundError:
        print(f"Error: Processed data file not found at {processed_data_filepath}.")
        return False
    except Exception as e:
        # Also reached when neither text column exists (KeyError on lookup).
        print(f"Error loading processed data: {e}")
        return False

    # Initialize TF-IDF components (re-fit based on loaded data)
    print("\n[Initializing and Fitting TF-IDF Components...]")
    tfidf_vectorizer_local = TfidfVectorizer(preprocessor=preprocess_text_for_tfidf, max_df=0.95, min_df=2, ngram_range=(1,2))
    texts_for_tfidf = df_cases[RETRIEVAL_TEXT_SOURCE_COLUMN].tolist()
    try:
        case_vectors_tfidf = tfidf_vectorizer_local.fit_transform(texts_for_tfidf)
        fitted_tfidf_vectorizer = tfidf_vectorizer_local # Publish only after a successful fit
        case_ids_global_tfidf = df_cases['case_id'].tolist()
        print(f"TF-IDF fitting complete. Shape: {case_vectors_tfidf.shape}")
    except Exception as e:
        print(f"Error fitting TF-IDF: {e}")
        # Only the vectorizer is reset here; the matrix and id list are left
        # as they were (possibly never assigned).
        fitted_tfidf_vectorizer = None # Ensure it's None if failed

    # Initialize BERT components (re-generate embeddings or load if saved - for simplicity, re-generate)
    print("\n[Initializing BERT Components and Generating Embeddings...]")
    try:
        bert_tokenizer = AutoTokenizer.from_pretrained(BERT_MODEL_NAME)
        bert_model = AutoModel.from_pretrained(BERT_MODEL_NAME).to(DEVICE)
        bert_model.eval()
        print(f"BERT model '{BERT_MODEL_NAME}' loaded successfully.")

        texts_for_bert = df_cases[RETRIEVAL_TEXT_SOURCE_COLUMN].tolist()
        embeddings_list = []
        BERT_EMBEDDING_DIM = bert_model.config.hidden_size # Get dim from model
        # One forward pass per document; progress is printed every 20 documents
        # and for the last one.
        for i, text in enumerate(texts_for_bert):
            if (i + 1) % 20 == 0 or i == len(texts_for_bert) - 1 : print(f"  BERT embedding for doc {i+1}/{len(texts_for_bert)}")
            # Inputs are truncated and padded to 512 tokens.
            inputs = bert_tokenizer(str(text).strip() if pd.notna(text) else "", return_tensors='pt', max_length=512, truncation=True, padding='max_length')
            inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
            with torch.no_grad(): outputs = bert_model(**inputs)
            # The document vector is the last-layer hidden state at position 0.
            cls_embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy().squeeze()
            embeddings_list.append(cls_embedding if cls_embedding.size > 0 else np.zeros(BERT_EMBEDDING_DIM))

        if embeddings_list:
            case_embeddings_bert = np.array(embeddings_list)
            case_ids_global_bert = df_cases['case_id'].tolist()
            print(f"BERT embeddings generation complete. Shape: {case_embeddings_bert.shape}")
        else: case_embeddings_bert = None  # no documents -> no embeddings

    except Exception as e:
        print(f"Could not load/run BERT model '{BERT_MODEL_NAME}': {e}. BERT retrieval will not be available.")
        bert_tokenizer, bert_model, case_embeddings_bert = None, None, None
    # TF-IDF/BERT problems are non-fatal: only a data-loading failure returns False.
    return True


# --- Retrieval Function (Adapted from N3 to return scores) ---
def retrieve_cases_with_scores(query_text, retrieval_method="tfidf", k=5):
    """Retrieve the top-k most similar cases together with their scores.

    Args:
        query_text: Free-text query.
        retrieval_method: "tfidf" or "bert"; any other value yields [].
        k: Maximum number of cases to return. Values <= 0 yield [].

    Returns:
        list[tuple]: ``(case_id, cosine_similarity)`` pairs ordered from most
        to least similar; [] when the requested retrieval backend has not been
        initialised (module-level globals are None).
    """
    # Guard first: with a plain `[-k:]` slice, k == 0 turns into `[-0:]` and
    # returns EVERY case, and a negative k returns an arbitrary tail.
    if k <= 0:
        return []

    results = []  # List of (case_id, score)
    if retrieval_method == "tfidf":
        if fitted_tfidf_vectorizer is None or case_vectors_tfidf is None:
            return []
        processed_query = preprocess_text_for_tfidf(query_text)
        query_vector = fitted_tfidf_vectorizer.transform([processed_query])
        similarities = cosine_similarity(query_vector, case_vectors_tfidf).flatten()
        # Descending order, then cut: identical ranking to `[-k:][::-1]` for k >= 1.
        top_k_indices = similarities.argsort()[::-1][:k]
        results = [(case_ids_global_tfidf[i], similarities[i]) for i in top_k_indices]

    elif retrieval_method == "bert":
        if case_embeddings_bert is None or not bert_model:
            return []
        BERT_EMBEDDING_DIM = bert_model.config.hidden_size
        query_input_text = str(query_text).strip()
        if not query_input_text:
            # An empty query is represented by a zero vector.
            query_embedding = np.zeros(BERT_EMBEDDING_DIM)
        else:
            inputs = bert_tokenizer(query_input_text, return_tensors='pt', max_length=512, truncation=True, padding='max_length')
            inputs = {k_in: v_in.to(DEVICE) for k_in, v_in in inputs.items()}
            with torch.no_grad():
                outputs = bert_model(**inputs)
            # Same pooling as for the case base: hidden state at position 0.
            query_embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy().squeeze()

        query_embedding = query_embedding.reshape(1, -1)
        similarities = cosine_similarity(query_embedding, case_embeddings_bert).flatten()
        top_k_indices = similarities.argsort()[::-1][:k]
        results = [(case_ids_global_bert[i], similarities[i]) for i in top_k_indices]
    return results

# --- Tahap 4.a: Ekstrak Solusi ---
def get_solution_for_case(case_id, df_cases_local, solution_field_local=SOLUTION_FIELD):
    """Look up the stored solution for one case.

    Args:
        case_id: Value to match against the 'case_id' column.
        df_cases_local: DataFrame holding the case base.
        solution_field_local: Name of the column that stores the solution.

    Returns:
        The solution value of the first matching row, or the sentinel
        "SOLUTION_NOT_FOUND" when no row matches or the stored value is
        missing (NaN/None).
    """
    matches = df_cases_local.loc[df_cases_local['case_id'] == case_id, solution_field_local]
    if matches.empty:
        return "SOLUTION_NOT_FOUND"

    solution = matches.iloc[0]
    # A missing cell must not be handed out as a "solution": NaN is not equal
    # to any sentinel string, so it would pass the callers' validity filters
    # and later break string operations on the prediction.
    if pd.isna(solution):
        return "SOLUTION_NOT_FOUND"
    return solution

# --- Tahap 4.b: Algoritma Prediksi ---
def majority_vote_solution(retrieved_solutions_texts):
    """Pick the solution that occurs most often among the retrieved ones.

    Returns "NO_SOLUTION_RETRIEVED" for a falsy input and
    "TIE_OR_EMPTY_VOTE" when counting produces no entries at all; otherwise
    the most frequent element (first-seen wins among equally frequent ones).
    """
    if not retrieved_solutions_texts:
        return "NO_SOLUTION_RETRIEVED"

    leaders = Counter(retrieved_solutions_texts).most_common(1)
    if not leaders:
        return "TIE_OR_EMPTY_VOTE"

    winner, _votes = leaders[0]
    return winner

def weighted_similarity_solution(retrieved_cases_with_scores_list, df_cases_local, solution_field_local=SOLUTION_FIELD):
    """Choose the solution whose supporting cases have the largest summed score.

    Each (case_id, score) pair contributes its score to the solution stored for
    that case; cases whose lookup yields a sentinel are ignored. Returns
    "NO_SOLUTION_RETRIEVED_FOR_WEIGHTED" for an empty input and
    "NO_VALID_SOLUTIONS_FOR_WEIGHTING" when no case had a usable solution.
    """
    if not retrieved_cases_with_scores_list:
        return "NO_SOLUTION_RETRIEVED_FOR_WEIGHTED"

    unusable = ("SOLUTION_NOT_FOUND", "SOLUTION_NOT_AVAILABLE")
    totals = {}  # solution text -> accumulated similarity
    for cid, similarity in retrieved_cases_with_scores_list:
        candidate = get_solution_for_case(cid, df_cases_local, solution_field_local)
        if candidate in unusable:
            continue  # nothing to vote for
        running_total = totals[candidate] if candidate in totals else 0
        totals[candidate] = running_total + similarity

    if not totals:
        return "NO_VALID_SOLUTIONS_FOR_WEIGHTING"

    # First-inserted solution wins when two totals are exactly equal.
    best_solution, _best_total = max(totals.items(), key=lambda item: item[1])
    return best_solution


# --- Tahap 4.c: Implementasi Fungsi predict_outcome ---
def predict_outcome(query_text, df_cases_local, retrieval_method="tfidf", k_retrieve=5,
                    prediction_algorithm="majority_vote", solution_field_param=SOLUTION_FIELD):
    """Predict an outcome for a query from the solutions of similar cases.

    Args:
        query_text: Free-text description of the new case.
        df_cases_local: DataFrame holding the case base.
        retrieval_method: Passed through to ``retrieve_cases_with_scores``.
        k_retrieve: Number of similar cases to retrieve.
        prediction_algorithm: "majority_vote" or "weighted_similarity".
        solution_field_param: Column of ``df_cases_local`` used as the solution.

    Returns:
        tuple: ``(predicted_solution, top_k_case_ids)``. ``predicted_solution``
        is one of the sentinels "NO_SIMILAR_CASES_FOUND",
        "NO_VALID_SOLUTIONS_IN_TOP_K" or "UNKNOWN_PREDICTION_ALGORITHM" when no
        prediction can be made; ``top_k_case_ids`` lists every retrieved case,
        including those without a usable solution.
    """
    top_k_cases_with_scores = retrieve_cases_with_scores(
        query_text, retrieval_method=retrieval_method, k=k_retrieve
    )
    if not top_k_cases_with_scores:
        return "NO_SIMILAR_CASES_FOUND", []

    top_k_case_ids_only = [case_id for case_id, _score in top_k_cases_with_scores]

    # Look every solution up exactly once and keep it next to its case id and
    # score, so both algorithms work from the same filtered view.
    invalid_markers = ("SOLUTION_NOT_FOUND", "SOLUTION_NOT_AVAILABLE")
    valid_entries = []  # (case_id, score, solution)
    for case_id, score in top_k_cases_with_scores:
        solution = get_solution_for_case(case_id, df_cases_local, solution_field_param)
        # Missing cells (NaN/None) are as unusable as the sentinel strings.
        if pd.isna(solution) or solution in invalid_markers:
            continue
        valid_entries.append((case_id, score, solution))

    if not valid_entries:
        return "NO_VALID_SOLUTIONS_IN_TOP_K", top_k_case_ids_only

    if prediction_algorithm == "majority_vote":
        predicted_solution = majority_vote_solution(
            [solution for _cid, _score, solution in valid_entries]
        )
    elif prediction_algorithm == "weighted_similarity":
        # Only cases with a usable solution take part in the weighting.
        predicted_solution = weighted_similarity_solution(
            [(case_id, score) for case_id, score, _solution in valid_entries],
            df_cases_local, solution_field_param
        )
    else:
        predicted_solution = "UNKNOWN_PREDICTION_ALGORITHM"

    return predicted_solution, top_k_case_ids_only


# --- Tahap 4.d: Demo Manual & Output ---
def run_prediction_demo_and_save():
    """Run the prediction demo on up to five queries and save the results.

    Reads the queries JSON (falling back to one dummy query when the file is
    missing), predicts an outcome for each query with both the majority-vote
    and the weighted-similarity algorithm, prints a preview of every
    prediction, and writes all records to the predictions CSV before
    displaying them.
    """
    global df_cases # Ensure df_cases is accessible
    queries_json_filepath = os.path.join(PATH_EVAL_INPUT, QUERIES_JSON_FILENAME)
    try:
        with open(queries_json_filepath, 'r', encoding='utf-8') as f:
            queries_for_demo = json.load(f)
    except FileNotFoundError:
        print(f"Error: Queries file not found at {queries_json_filepath}. Cannot run demo.")
        # Create some dummy queries if file not found for demo purposes
        queries_for_demo = [{"query_id": "DemoQ1", "query_text": "Contoh query pidana militer tentang desersi"}]
        if df_cases is not None and not df_cases.empty: # try to get a real query text from df_cases
            queries_for_demo[0]["query_text"] = df_cases[RETRIEVAL_TEXT_SOURCE_COLUMN].iloc[0][:100] # use first case's text
        print("Using dummy queries for demo as queries.json was not found.")

    predictions_log = []
    retrieval_choice = "tfidf" # Choose 'tfidf' or 'bert'
    # Check if BERT is usable
    if bert_model is None or case_embeddings_bert is None:
        print("BERT model/embeddings not available, defaulting retrieval to TF-IDF for predictions.")
        retrieval_choice = "tfidf"
    if fitted_tfidf_vectorizer is None:
        print("TF-IDF model not available. Predictions might fail.")

    print(f"\n--- Running Prediction Demo (using {retrieval_choice} for retrieval and {SOLUTION_FIELD} as solution) ---")

    # (algorithm key passed to predict_outcome, label used in the printout)
    algorithms = (
        ("majority_vote", "Majority Vote"),
        ("weighted_similarity", "Weighted Similarity"),
    )

    # As per PDF: "Siapkan 5 contoh kasus baru" - we'll use queries from queries.json
    for query_data in queries_for_demo[:5]: # Demo with up to 5 queries
        query_id = query_data['query_id']
        query_text = query_data['query_text']
        print(f"\nProcessing Query ID: {query_id} - Query: \"{query_text[:100]}...\"")

        for algorithm_name, algorithm_label in algorithms:
            predicted_solution, top_k_ids = predict_outcome(
                query_text, df_cases, retrieval_method=retrieval_choice, k_retrieve=5,
                prediction_algorithm=algorithm_name
            )
            # str() keeps the preview working when the stored solution is not
            # a string (e.g. a numeric or missing value).
            print(f"  Predicted Solution ({algorithm_label}): {str(predicted_solution)[:200]}...") # Print preview
            predictions_log.append({
                "query_id": query_id,
                "query_text_preview": query_text[:100]+"...",
                "retrieval_method": retrieval_choice,
                "prediction_algorithm": algorithm_name,
                "predicted_solution": predicted_solution,
                # Case ids may be non-string (e.g. integers); join needs str.
                "top_5_case_ids": ", ".join(str(case_id) for case_id in top_k_ids)
            })

    # Save predictions to CSV
    if predictions_log:
        df_predictions = pd.DataFrame(predictions_log)
        predictions_csv_filepath = os.path.join(PATH_RESULTS_OUTPUT, PREDICTIONS_CSV_FILENAME)
        df_predictions.to_csv(predictions_csv_filepath, index=False, encoding='utf-8')
        print(f"\nPredictions saved to: {predictions_csv_filepath}")
        display(df_predictions)
    else:
        print("No predictions were logged.")

In [5]:
# --- Main Execution ---
if __name__ == "__main__":
    print("Starting Tahap 4: Solution Reuse")
    drive.mount('/content/drive', force_remount=True)

    # The prediction demo reuses the retrieval components, and those can only
    # be built once the case base has been loaded -- so load first, then run.
    components_ready = load_data_and_models()
    if not components_ready:
        print("Failed to load data or initialize models. Cannot run prediction demo.")
    else:
        run_prediction_demo_and_save()

    print("\nTahap 4: Solution Reuse - Complete.")

Starting Tahap 4: Solution Reuse
Mounted at /content/drive
Successfully loaded processed data from: /content/drive/MyDrive/Penalaran Komputer UAS/data/processed/cases_processed.csv with shape (50, 24)
Using 'ringkasan_fakta' for retrieval text source.
Error: Solution field 'amar_kategori' not found in df_cases. Prediction will fail.

[Initializing and Fitting TF-IDF Components...]
TF-IDF fitting complete. Shape: (50, 292)

[Initializing BERT Components and Generating Embeddings...]


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/498M [00:00<?, ?B/s]

BERT model 'indobenchmark/indobert-base-p1' loaded successfully.


model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

  BERT embedding for doc 20/50
  BERT embedding for doc 40/50
  BERT embedding for doc 50/50
BERT embeddings generation complete. Shape: (50, 768)

--- Running Prediction Demo (using tfidf for retrieval and amar_kategori as solution) ---

Processing Query ID: Q001_PO - Query: "Terdakwa merekrut korban dengan janji palsu pekerjaan sebagai pramusaji di kota besar, namun sesampa..."
  Predicted Solution (Majority Vote): NO_VALID_SOLUTIONS_IN_TOP_K...
  Predicted Solution (Weighted Similarity): NO_VALID_SOLUTIONS_IN_TOP_K...

Processing Query ID: Q002_PO - Query: "Kasus melibatkan seorang asisten rumah tangga (ART) yang direkrut untuk bekerja di luar negeri. Pela..."
  Predicted Solution (Majority Vote): NO_VALID_SOLUTIONS_IN_TOP_K...
  Predicted Solution (Weighted Similarity): NO_VALID_SOLUTIONS_IN_TOP_K...

Processing Query ID: Q003_PO - Query: "Pelaku memberikan pinjaman uang kepada keluarga korban dengan dalih untuk biaya pengobatan. Sebagai ..."
  Predicted Solution (Majority Vote): N

Unnamed: 0,query_id,query_text_preview,retrieval_method,prediction_algorithm,predicted_solution,top_5_case_ids
0,Q001_PO,Terdakwa merekrut korban dengan janji palsu pe...,tfidf,majority_vote,NO_VALID_SOLUTIONS_IN_TOP_K,"case_027, case_020, case_038, case_082, case_095"
1,Q001_PO,Terdakwa merekrut korban dengan janji palsu pe...,tfidf,weighted_similarity,NO_VALID_SOLUTIONS_IN_TOP_K,"case_027, case_020, case_038, case_082, case_095"
2,Q002_PO,Kasus melibatkan seorang asisten rumah tangga ...,tfidf,majority_vote,NO_VALID_SOLUTIONS_IN_TOP_K,"case_027, case_020, case_038, case_087, case_097"
3,Q002_PO,Kasus melibatkan seorang asisten rumah tangga ...,tfidf,weighted_similarity,NO_VALID_SOLUTIONS_IN_TOP_K,"case_027, case_020, case_038, case_087, case_097"
4,Q003_PO,Pelaku memberikan pinjaman uang kepada keluarg...,tfidf,majority_vote,NO_VALID_SOLUTIONS_IN_TOP_K,"case_027, case_038, case_020, case_095, case_082"
5,Q003_PO,Pelaku memberikan pinjaman uang kepada keluarg...,tfidf,weighted_similarity,NO_VALID_SOLUTIONS_IN_TOP_K,"case_027, case_038, case_020, case_095, case_082"
6,Q004_PO,Tindak pidana perdagangan anak di bawah umur y...,tfidf,majority_vote,NO_VALID_SOLUTIONS_IN_TOP_K,"case_027, case_038, case_096, case_020, case_087"
7,Q004_PO,Tindak pidana perdagangan anak di bawah umur y...,tfidf,weighted_similarity,NO_VALID_SOLUTIONS_IN_TOP_K,"case_027, case_038, case_096, case_020, case_087"
8,Q005_PO,Terdakwa menggunakan media sosial untuk memika...,tfidf,majority_vote,NO_VALID_SOLUTIONS_IN_TOP_K,"case_027, case_020, case_038, case_082, case_096"
9,Q005_PO,Terdakwa menggunakan media sosial untuk memika...,tfidf,weighted_similarity,NO_VALID_SOLUTIONS_IN_TOP_K,"case_027, case_020, case_038, case_082, case_096"



Tahap 4: Solution Reuse - Complete.
