# TAHAP 3
## Penalaran Komputer UAS
## **Anggota:**
## Haidar Dimas Heryanto - 202210370311088
## Zeedan Mustami Argani - 202210370311104

In [None]:
import os
import pandas as pd
import numpy as np
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split # For splitting data
import nltk
import re

In [None]:
# Install necessary libraries
!pip install pandas requests beautifulsoup4 pdfminer.six lxml > /dev/null 2>&1

import argparse
import io
import os
import re
import time
import urllib
from concurrent.futures import ThreadPoolExecutor, wait
from datetime import date
import pandas as pd
import requests
from bs4 import BeautifulSoup
from pdfminer import high_level # For PDF text extraction
from google.colab import drive

# Make the Google Drive project folder reachable from the Colab runtime.
drive.mount('/content/drive')

# --- NLTK setup: tokenizer resources ---
# Both lookups share one handler: if either resource is missing, both packages
# are downloaded again.
try:
    for tokenizer_resource in ('tokenizers/punkt', 'tokenizers/punkt_tab'):
        nltk.data.find(tokenizer_resource)
except LookupError:
    print("NLTK 'punkt' or 'punkt_tab' not found. Downloading...")
    for tokenizer_package in ('punkt', 'punkt_tab'):
        nltk.download(tokenizer_package)
    print("NLTK 'punkt' and 'punkt_tab' downloaded.")
except Exception as setup_error:
    # Anything other than a missing resource is reported but does not stop the notebook.
    print(f"An unexpected error occurred during NLTK setup: {setup_error}")

In [None]:
# For BERT (if chosen)
# !pip install transformers sentence-transformers > /dev/null 2>&1 # sentence-transformers is often easier for embeddings
import torch
from transformers import AutoTokenizer, AutoModel
# from sentence_transformers import SentenceTransformer # Alternative for easier embeddings

from google.colab import drive

In [None]:
# For BERT (if chosen)
# !pip install transformers sentence-transformers > /dev/null 2>&1 # sentence-transformers is often easier for embeddings
import torch
from transformers import AutoTokenizer, AutoModel
# from sentence_transformers import SentenceTransformer # Alternative for easier embeddings

from google.colab import drive
import nltk # Import NLTK here as well if used in this cell
import re # Import re if used in this cell

# --- Configuration Section ---
# !!! IMPORTANT: Ensure these paths match your Google Drive structure
# and the outputs from Notebook 2 !!!
BASE_DRIVE_PATH = "/content/drive/MyDrive/Penalaran Komputer UAS/" # Change to your project folder

# Path for input processed data from Notebook 2
PATH_PROCESSED_INPUT = os.path.join(BASE_DRIVE_PATH, "data/processed")
PROCESSED_CSV_FILENAME = "cases_processed.csv" # Assuming this is the output from N2

# Path for evaluation data output (created on the spot if it does not exist yet)
PATH_EVAL_OUTPUT = os.path.join(BASE_DRIVE_PATH, "data/eval")
os.makedirs(PATH_EVAL_OUTPUT, exist_ok=True)
QUERIES_JSON_FILENAME = "queries.json"

# BERT Model (example)
BERT_MODEL_NAME = 'indobenchmark/indobert-base-p1'
# Or using SentenceTransformer: 'paraphrase-multilingual-MiniLM-L12-v2' or an Indonesian specific one if available

# Determine device for PyTorch (BERT): GPU when available, CPU otherwise
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {DEVICE}")

# --- NLTK Setup (for preprocessing if needed) ---
# Every resource is checked on its own. Previously the 'stopwords' check was
# nested inside the 'punkt' check, so a missing 'punkt' skipped it entirely and
# the stopword load below could fail with LookupError.
for _nltk_resource_path, _nltk_package in (
    ('tokenizers/punkt', 'punkt'),
    ('corpora/stopwords', 'stopwords'),
):
    try:
        nltk.data.find(_nltk_resource_path)
    except LookupError:
        print(f"NLTK '{_nltk_package}' not found. Downloading...")
        nltk.download(_nltk_package, quiet=True)
        print(f"NLTK '{_nltk_package}' downloaded.")
    except Exception as e:
        # Best effort: report unexpected problems and keep going.
        print(f"An unexpected error occurred during NLTK setup: {e}")

# Now, safely load stopwords
indonesian_stopwords = nltk.corpus.stopwords.words('indonesian')

In [None]:
# --- Helper Functions ---
def preprocess_text_for_tfidf(text):
    """Normalise raw text before TF-IDF vectorisation.

    The text is lowercased, every character that is neither a word character
    nor whitespace is deleted, the result is tokenised with NLTK, and only
    purely alphabetic tokens that are not Indonesian stopwords are kept.

    Args:
        text: Raw text; missing (NaN/None) or empty values are accepted.

    Returns:
        The surviving tokens joined by single spaces, or "" for missing or
        empty input.
    """
    if pd.isna(text) or not text:
        return ""

    lowered = str(text).lower()
    without_punctuation = re.sub(r'[^\w\s]', '', lowered)

    kept_tokens = []
    for token in nltk.word_tokenize(without_punctuation):
        if token.isalpha() and token not in indonesian_stopwords:
            kept_tokens.append(token)
    return " ".join(kept_tokens)

def load_processed_data(filepath):
    """Read the processed case CSV and attach the text column used for retrieval.

    A 'retrieval_text_source' column is added to the frame. It is copied from
    'ringkasan_fakta' when that column exists and holds at least one value;
    otherwise it is copied from 'text_full'. Missing values in the chosen
    column become empty strings.

    Args:
        filepath: Path of the CSV file to read.

    Returns:
        The loaded DataFrame, or None when the file does not exist or any
        other error occurs while loading (a message is printed in both cases).
    """
    try:
        frame = pd.read_csv(filepath)
        print(f"Successfully loaded processed data from: {filepath} with shape {frame.shape}")

        # Decide which column feeds retrieval; the summary is preferred over the full text.
        if 'ringkasan_fakta' not in frame.columns:
            print("Warning: 'ringkasan_fakta' not found. Using 'text_full'.")
            source_column = 'text_full'
        elif frame['ringkasan_fakta'].isna().all():
            print("Warning: 'ringkasan_fakta' is all NaN. Falling back to 'text_full'.")
            source_column = 'text_full'
        else:
            source_column = 'ringkasan_fakta'

        # Copy the chosen column, replacing NaN with '' so downstream code only sees strings.
        frame['retrieval_text_source'] = frame[source_column].fillna('')
        return frame
    except FileNotFoundError:
        print(f"Error: Processed data file not found at {filepath}. Please run Notebook 2 first.")
    except Exception as load_error:
        print(f"Error loading processed data: {load_error}")
    return None

In [None]:
# --- Stage 3.a: Vector Representation ---

# Option 1: TF-IDF
print("\n--- Initializing TF-IDF Components ---")

# Vectorizer settings:
#   preprocessor : the notebook's own text cleaning function
#   ngram_range  : unigrams and bigrams
#   min_df=2     : ignore terms that appear in fewer than 2 documents
#   max_df=0.95  : ignore terms that appear in more than 95% of the documents
tfidf_vectorizer = TfidfVectorizer(
    preprocessor=preprocess_text_for_tfidf,
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.95,
)

# Module-level TF-IDF state, populated by fit_vector_models:
# the fitted vectorizer, the case vectors, and the case_ids aligned with those vectors.
fitted_tfidf_vectorizer = case_vectors_tfidf = case_ids_global_tfidf = None

In [None]:
# Option 2: BERT Embeddings
print("\n--- Initializing BERT Components ---")
try:
    bert_tokenizer = AutoTokenizer.from_pretrained(BERT_MODEL_NAME)
    bert_model = AutoModel.from_pretrained(BERT_MODEL_NAME).to(DEVICE)
    bert_model.eval() # Set model to evaluation mode
    print(f"BERT model '{BERT_MODEL_NAME}' loaded successfully.")
except Exception as e:
    # Deliberate best effort: without the model the notebook continues with TF-IDF only.
    print(f"Could not load BERT model '{BERT_MODEL_NAME}': {e}. BERT retrieval will not be available.")
    bert_tokenizer = None
    bert_model = None

# Global variables for BERT embeddings
case_embeddings_bert = None # To store NumPy array of embeddings
case_ids_global_bert = None # To store the case_ids corresponding to case_embeddings_bert

# Width of the zero vectors returned for empty texts. It is read from the
# loaded model so it stays correct when BERT_MODEL_NAME is changed; 768 (the
# indobert-base width) is kept as the fallback when no model is available or
# the config does not expose `hidden_size`.
BERT_EMBEDDING_DIM = 768
if bert_model is not None:
    BERT_EMBEDDING_DIM = getattr(bert_model.config, "hidden_size", BERT_EMBEDDING_DIM)

In [None]:
def get_bert_embedding(text, tokenizer, model, device, max_length=512):
    """Return the [CLS] embedding of *text* from a pre-trained BERT-style model.

    Args:
        text: Text to embed. None, NaN, and blank strings are treated as empty.
        tokenizer: Callable tokenizer returning a mapping of tensors, or None.
        model: Model whose output exposes `last_hidden_state`, or None.
        device: Device the input tensors are moved to before the forward pass.
        max_length: Maximum number of tokens; longer inputs are truncated.

    Returns:
        A 1-D NumPy array. It is all zeros when the tokenizer or model is
        missing or when the text is empty.
    """
    if tokenizer is None or model is None:
        return _zero_embedding(model)

    # Missing values must not be embedded as the literal words "None" / "nan".
    if text is None or (isinstance(text, float) and np.isnan(text)):
        text = ""

    # BERT has its own tokenizer, so only surrounding whitespace is cleaned here.
    text = str(text).strip()
    if not text:
        return _zero_embedding(model)

    # A single sequence needs no padding; padding it to max_length only made
    # every forward pass run over the full 512 positions.
    inputs = tokenizer(text, return_tensors='pt', max_length=max_length, truncation=True)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)

    # First token of the only sequence in the batch, i.e. the [CLS] position.
    return outputs.last_hidden_state[0, 0, :].cpu().numpy()


def _zero_embedding(model):
    """Build an all-zero vector sized like the model's embeddings."""
    hidden_size = getattr(getattr(model, "config", None), "hidden_size", None)
    if hidden_size is None:
        # No model (or no size on its config): fall back to the notebook-wide constant.
        hidden_size = BERT_EMBEDDING_DIM
    return np.zeros(hidden_size)

def fit_vector_models(df_cases):
    """Fit TF-IDF and generate BERT embeddings for the case base.

    Results are stored in module-level globals (`fitted_tfidf_vectorizer`,
    `case_vectors_tfidf`, `case_ids_global_tfidf`, `case_embeddings_bert`,
    `case_ids_global_bert`). Each group of globals is assigned only after its
    computation succeeded, so a failure never leaves a half-updated state.

    Args:
        df_cases: DataFrame with the columns 'case_id' and
            'retrieval_text_source'.

    Returns:
        None. Problems are reported with printed messages.
    """
    global fitted_tfidf_vectorizer, case_vectors_tfidf, case_ids_global_tfidf
    global case_embeddings_bert, case_ids_global_bert

    if df_cases is None or df_cases.empty:
        print("DataFrame is empty. Cannot fit vector models.")
        return

    # Validate up front: a missing column used to raise KeyError, for 'case_id'
    # only after the expensive work had already been done.
    missing_columns = [
        column for column in ('case_id', 'retrieval_text_source')
        if column not in df_cases.columns
    ]
    if missing_columns:
        print(f"DataFrame is missing required column(s) {missing_columns}. Cannot fit vector models.")
        return

    case_ids = df_cases['case_id'].tolist()
    texts = df_cases['retrieval_text_source'].tolist()

    print("\n[Fitting TF-IDF Vectorizer...]")
    try:
        tfidf_matrix = tfidf_vectorizer.fit_transform(texts)
    except Exception as e:
        # Best effort: TF-IDF failing should not prevent the BERT step below.
        print(f"Error fitting TF-IDF: {e}")
    else:
        case_vectors_tfidf = tfidf_matrix
        fitted_tfidf_vectorizer = tfidf_vectorizer
        case_ids_global_tfidf = list(case_ids)
        print(f"TF-IDF fitting complete. Shape of case_vectors_tfidf: {case_vectors_tfidf.shape}")

    print("\n[Generating BERT Embeddings... (this may take a while)]")
    if bert_model is None or bert_tokenizer is None:
        print("BERT model not loaded. Skipping BERT embedding generation.")
        return

    total = len(texts)
    embeddings_list = []
    for position, text in enumerate(texts, start=1):
        # Progress feedback every 10 documents and on the last one.
        if position % 10 == 0 or position == total:
            print(f"  Generating BERT embedding for document {position}/{total}...")
        embeddings_list.append(get_bert_embedding(text, bert_tokenizer, bert_model, DEVICE))

    if embeddings_list:
        case_embeddings_bert = np.array(embeddings_list)
        case_ids_global_bert = list(case_ids)
        print(f"BERT embeddings generation complete. Shape: {case_embeddings_bert.shape}")
    else:
        print("No BERT embeddings were generated.")

In [None]:
# --- Tahap 3.c: Fungsi Retrieval ---
# As per PDF: def retrieve(query: str, k: int = 5) -> List[case_id]

def retrieve_cases(query_text, retrieval_method="tfidf", k=5):
    """Retrieve the IDs of the top-k cases most similar to a query.

    Similarity is the cosine similarity between the query vector and the case
    vectors prepared by `fit_vector_models`.

    Args:
        query_text: Free-text query.
        retrieval_method: "tfidf" or "bert".
        k: Number of results wanted. Values <= 0 yield an empty list.

    Returns:
        A list of case IDs ordered from most to least similar. The list is
        empty when k <= 0, when the chosen model has not been prepared, or when
        the method is unknown.
    """
    # Without this guard k == 0 made the slice `[-0:]` select *every* case.
    if k <= 0:
        print(f"k must be a positive integer (got {k}). Returning no results.")
        return []

    if retrieval_method == "tfidf":
        if fitted_tfidf_vectorizer is None or case_vectors_tfidf is None:
            print("TF-IDF model not fitted. Please run `fit_vector_models` first.")
            return []
        # Preprocess query same way as documents
        processed_query = preprocess_text_for_tfidf(query_text)
        query_vector = fitted_tfidf_vectorizer.transform([processed_query])
        similarities = cosine_similarity(query_vector, case_vectors_tfidf).flatten()
        return _select_top_k(similarities, case_ids_global_tfidf, k, "TF-IDF")

    elif retrieval_method == "bert":
        if case_embeddings_bert is None or bert_model is None or bert_tokenizer is None:
            print("BERT embeddings not generated or model not loaded. Please run `fit_vector_models` or check BERT setup.")
            return []
        query_embedding = get_bert_embedding(query_text, bert_tokenizer, bert_model, DEVICE)
        query_embedding = query_embedding.reshape(1, -1) # cosine_similarity expects 2-D input
        similarities = cosine_similarity(query_embedding, case_embeddings_bert).flatten()
        return _select_top_k(similarities, case_ids_global_bert, k, "BERT")

    else:
        print(f"Unknown retrieval_method: {retrieval_method}. Choose 'tfidf' or 'bert'.")
        return []


def _select_top_k(similarities, case_ids, k, method_label):
    """Map the k highest similarity scores to their case IDs, best first."""
    # argsort is ascending: take the last k positions and reverse them.
    top_k_indices = similarities.argsort()[-k:][::-1]
    top_k_scores = [similarities[i] for i in top_k_indices]
    print(f"Retrieved using {method_label}. Scores: {top_k_scores}")
    return [case_ids[i] for i in top_k_indices]

In [None]:
# --- Tahap 3.b: Splitting Data ---
# The PDF mentions splitting data. For a pure retrieval system where the entire dataset forms the case base,
# a formal train/test split of the *cases themselves* might not be for training the retrieval model (like TF-IDF or BERT embeddings)
# but rather for evaluation purposes (e.g. if some cases are held out as queries).
# Here, we consider the entire loaded dataset as our "case base".
# The "test queries" will be defined separately in `queries.json`.

# --- Tahap 3.d: Pengujian Awal & queries.json ---
def create_and_save_sample_queries(filepath, df_cases_sample=None):
    """Build the fixed set of sample evaluation queries and write them as JSON.

    Each query is a dict with the keys 'query_id', 'query_text',
    'ground_truth_ids' and 'ground_truth_solution', in that order.

    Args:
        filepath: Destination of the JSON file (UTF-8, indent of 4).
        df_cases_sample: Accepted for interface compatibility; not used.

    Returns:
        The list of query dicts. It is returned even when saving fails (a
        message is printed in that case).
    """
    # The ground-truth IDs are maintained by hand and must match your own data.
    expected_solution = "Menolak Permohonan"  # example value shared by all queries
    query_specs = (
        (
            "Q001_PO",
            "Terdakwa merekrut korban dengan janji palsu pekerjaan sebagai pramusaji di kota besar, namun sesampainya di tujuan, korban justru disekap dan dipaksa menjadi pekerja seks komersial untuk keuntungan terdakwa.",
            ("case_027", "case_046", "case_068", "case_086", "case_095"),
        ),
        (
            "Q002_PO",
            "Kasus melibatkan seorang asisten rumah tangga (ART) yang direkrut untuk bekerja di luar negeri. Pelaku menahan paspor dan alat komunikasi korban, memaksanya bekerja tanpa upah selama berbulan-bulan, dan melakukan kekerasan fisik jika korban menolak.",
            ("case_078", "case_020"),
        ),
        (
            "Q003_PO",
            "Pelaku memberikan pinjaman uang kepada keluarga korban dengan dalih untuk biaya pengobatan. Sebagai jaminan, anak korban dibawa untuk dipekerjakan. Namun, utang tersebut terus membengkak sehingga korban terjerat dalam situasi kerja paksa untuk melunasi utang.",
            ("case_067", "case_047"),
        ),
        (
            "Q004_PO",
            "Tindak pidana perdagangan anak di bawah umur yang dipekerjakan secara paksa di sebuah perkebunan. Para korban tidak diberi upah yang layak, dipaksa bekerja melebihi jam kerja normal, dan ditempatkan dalam kondisi hidup yang tidak manusiawi.",
            ("case_049", "case_086", "case_027", "case_046", "case_095", "case_020", "case_078", "case_068", "case_072"),
        ),
        (
            "Q005_PO",
            "Terdakwa menggunakan media sosial untuk memikat korban dengan menawarkan menjadi 'talent' atau model. Setelah korban bertemu, ia diancam dan dipaksa membuat konten pornografi yang hasilnya dijual oleh terdakwa secara online untuk keuntungan pribadi.",
            ("case_049", "case_072", "case_078"),
        ),
        (
            "Q006_PO",
            "Pelaku menggunakan Instagram dan TikTok untuk merekrut remaja perempuan dengan tawaran menjadi model atau influencer terkenal. Setelah bertemu, korban dibawa ke sebuah rumah, difoto dan direkam dalam kondisi tidak senonoh, kemudian dipaksa melayani klien dengan ancaman akan menyebar foto tersebut.",
            ("case_096", "case_089"),
        ),
        (
            "Q007_PO",
            "Sebuah sindikat perdagangan orang terorganisir yang melibatkan beberapa pelaku dengan peran berbeda: satu sebagai perekrut di desa, satu sebagai pengemudi, dan satu lagi sebagai penampung yang melakukan eksploitasi seksual terhadap korban.",
            ("case_067", "case_066", "case_056", "case_051", "case_047"),
        ),
    )

    sample_queries_data = [
        {
            "query_id": query_id,
            "query_text": query_text,
            "ground_truth_ids": list(relevant_ids),
            "ground_truth_solution": expected_solution,
        }
        for query_id, query_text, relevant_ids in query_specs
    ]

    try:
        with open(filepath, 'w', encoding='utf-8') as out_file:
            # ensure_ascii=False keeps non-ASCII characters readable in the file.
            json.dump(sample_queries_data, out_file, indent=4, ensure_ascii=False)
        print(f"Sample queries saved to: {filepath}")
    except Exception as save_error:
        print(f"Error saving sample queries: {save_error}")
    return sample_queries_data

In [None]:
# --- Main Execution Logic ---
if __name__ == "__main__":
    print("Starting Tahap 3: Case Retrieval")
    drive.mount('/content/drive', force_remount=True)

    # Step 1: load the processed case base.
    processed_data_filepath = os.path.join(PATH_PROCESSED_INPUT, PROCESSED_CSV_FILENAME)
    df_cases = load_processed_data(processed_data_filepath)

    if df_cases is None or df_cases.empty:
        print("Failed to load data. Cannot proceed with retrieval.")
    else:
        # Step 2: prepare the case base for retrieval (TF-IDF and/or BERT).
        fit_vector_models(df_cases)

        # Step 3: write the sample queries used for the initial test.
        queries_json_filepath = os.path.join(PATH_EVAL_OUTPUT, QUERIES_JSON_FILENAME)
        sample_queries = create_and_save_sample_queries(queries_json_filepath, df_cases.head())

        # Step 4: smoke-test the retrieval function with the first sample query.
        print("\n--- Testing Retrieval Function ---")
        if not sample_queries:
            print("No sample queries loaded to test retrieval.")
        else:
            test_query_text = sample_queries[0]['query_text']
            k_results = 3  # top 3 is enough for a quick check
            preview_columns = ['case_id', 'no_perkara', 'retrieval_text_source']

            print(f"\nQuery: '{test_query_text}'")

            # TF-IDF retrieval, only when the vectorizer was fitted.
            if fitted_tfidf_vectorizer:
                print(f"\nRetrieving top {k_results} using TF-IDF...")
                retrieved_ids_tfidf = retrieve_cases(test_query_text, retrieval_method="tfidf", k=k_results)
                print(f"TF-IDF Retrieved Case IDs: {retrieved_ids_tfidf}")
                if retrieved_ids_tfidf:
                    # Rows are shown in DataFrame order, not in ranking order.
                    tfidf_hits = df_cases[df_cases['case_id'].isin(retrieved_ids_tfidf)]
                    display(tfidf_hits[preview_columns].head())

            # BERT retrieval, only when embeddings were generated.
            if case_embeddings_bert is not None:
                print(f"\nRetrieving top {k_results} using BERT...")
                retrieved_ids_bert = retrieve_cases(test_query_text, retrieval_method="bert", k=k_results)
                print(f"BERT Retrieved Case IDs: {retrieved_ids_bert}")
                if retrieved_ids_bert:
                    bert_hits = df_cases[df_cases['case_id'].isin(retrieved_ids_bert)]
                    display(bert_hits[preview_columns].head())

        # Closing note on SVM/Naive Bayes for "classification/retrieval".
        for note_line in (
            "\n--- Note on SVM/Naive Bayes for Classification/Retrieval ---",
            "The project mentions using SVM or Naive Bayes on TF-IDF for classification/retrieval.",
            " - For CLASSIFICATION: If your cases have labels (e.g., 'outcome: guilty/not_guilty', 'type: Perdagangan Orang '),",
            "   you could train SVM/Naive Bayes on the TF-IDF vectors to predict these labels for new cases.",
            "   This requires a labeled dataset and splitting into train/test for the classifier.",
            " - For RETRIEVAL using these classifiers: One approach could be to classify a query to a specific category,",
            "   and then retrieve all cases from the case base belonging to that predicted category.",
            "   This is different from direct similarity-based retrieval like cosine similarity.",
            "   Implementation of this classification task is beyond the scope of this initial retrieval notebook but can be an extension.",
        ):
            print(note_line)

    print("\nTahap 3: Case Retrieval - Complete.")