# TAHAP 2
## Penalaran Komputer UAS
## **Anggota:**
## Haidar Dimas Heryanto - 202210370311088
## Zeedan Mustami Argani - 202210370311104

In [None]:
import os
import pandas as pd
import re
from google.colab import drive
import nltk # Using NLTK for tokenization to count words

In [None]:
# --- Configuration Section ---
# Root project folder; every input/output path below is derived from it.
# NOTE(review): the path points under /content/drive, but nothing visible in
# this file calls drive.mount(); presumably Drive is mounted elsewhere, confirm.
BASE_DRIVE_PATH = "/content/drive/MyDrive/Penalaran Komputer UAS/" # Change to your project folder

# Paths for input data from Notebook 1
PATH_RAW_TEXT_INPUT = os.path.join(BASE_DRIVE_PATH, "data/raw")  # folder of raw text files
PATH_INITIAL_SCRAPER_CSV_INPUT = os.path.join(BASE_DRIVE_PATH, "Scraper_CSVs")  # folder of scraper CSVs

# Path for output processed data; created on the spot if it does not exist yet.
PATH_PROCESSED_OUTPUT = os.path.join(BASE_DRIVE_PATH, "data/processed")
os.makedirs(PATH_PROCESSED_OUTPUT, exist_ok=True)

# --- NLTK Setup (for word counting) ---
# Look up both tokenizer resources; if either lookup raises LookupError,
# download both of them.
try:
    nltk.data.find('tokenizers/punkt')
    # 'punkt_tab' is checked as well and downloaded together with 'punkt' below.
    nltk.data.find('tokenizers/punkt_tab')
except LookupError:
    print("NLTK 'punkt' or 'punkt_tab' not found. Downloading...")
    nltk.download('punkt')
    nltk.download('punkt_tab') # Download the missing resource
    print("NLTK 'punkt' and 'punkt_tab' downloaded.")
except Exception as e:
     # Any other failure is only reported; the script keeps running.
     print(f"An unexpected error occurred during NLTK setup: {e}")

In [None]:
# --- Helper Functions ---

def load_latest_scraper_csv(csv_folder_path):
    """Load the CSV with the newest modification time from a folder.

    Only entries whose name ends with '.csv' are considered. The chosen
    file name is printed before it is read.

    Parameters:
        csv_folder_path (str): Folder to scan for CSV files.

    Returns:
        pandas.DataFrame | None: The parsed CSV, or None when the folder
        holds no CSV files or any error occurs while listing/reading.
    """
    def _modified_at(name):
        # Modification timestamp of one folder entry.
        return os.path.getmtime(os.path.join(csv_folder_path, name))

    try:
        candidates = [
            entry for entry in os.listdir(csv_folder_path) if entry.endswith('.csv')
        ]
        if len(candidates) == 0:
            print(f"No CSV files found in {csv_folder_path}")
            return None

        # Newest file wins; filenames are not parsed for dates.
        newest = max(candidates, key=_modified_at)
        print(f"Loading latest scraper CSV: {newest}")
        newest_path = os.path.join(csv_folder_path, newest)
        return pd.read_csv(newest_path)
    except Exception as e:
        print(f"Error loading latest CSV: {e}")
        return None

def read_raw_text_file(filename, raw_text_folder):
    """Read a raw text file and return its content, or "" when unavailable.

    The function is best-effort: every failure is reported with a printed
    message and turned into an empty string instead of an exception.

    Parameters:
        filename: Name of the file inside ``raw_text_folder``. Anything that
            is not a non-blank string (None, NaN from pandas, numbers, "   ")
            and the placeholder name "A" (any case, surrounding spaces
            ignored) is treated as invalid.
        raw_text_folder (str): Folder that holds the raw text files.

    Returns:
        str: The file content decoded as UTF-8, or "" if the name is invalid,
        the file does not exist, or it cannot be read.
    """
    # A missing CSV cell arrives as NaN (a float), which is truthy and has no
    # .strip(); check the type first so such values are skipped, not crashed on.
    if not isinstance(filename, str):
        print(f"Skipping invalid filename: '{filename}'")
        return ""

    cleaned = filename.strip()
    if not cleaned or cleaned.upper() == "A":
        print(f"Skipping invalid filename: '{filename}'")
        return ""

    filepath = os.path.join(raw_text_folder, filename)

    if not os.path.exists(filepath):
        print(f"Warning: Raw text file does not exist: {filepath}")
        return ""

    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            return f.read()
    except (OSError, ValueError) as e:
        # OSError: unreadable path/directory; ValueError covers decode errors.
        print(f"Error reading raw text file {filepath}: {e}")
        return ""

def count_words(text):
    """Return the number of tokens NLTK's word tokenizer finds in ``text``.

    NaN, None and other falsy values (such as the empty string) count as 0.
    Any other value is converted with ``str()`` before tokenizing.
    """
    is_blank = pd.isna(text) or not text
    return 0 if is_blank else len(nltk.word_tokenize(str(text)))

def extract_section_heuristic(text, keywords_start, keywords_end=None, limit_chars=5000, verbose=False):
    """
    Extract a section of text delimited by start and (optional) end keywords.

    Matching is case-insensitive.
    - Start: the first pattern in ``keywords_start`` (list order, not text
      order) that matches anywhere in the text; the section begins right
      after that match.
    - End: among all ``keywords_end`` patterns that match after the start,
      the one whose match begins earliest; the section stops just before it.
    - Start found but no end found: extract to the end of the text.
    - Start not found (or text empty/NaN): return "".
    The section is stripped of surrounding whitespace and then cut to at
    most ``limit_chars`` characters.

    Parameters:
        text (str): Full document text.
        keywords_start (list): List of regex patterns (strings) for start.
        keywords_end (list): List of regex patterns (strings) for end (optional).
        limit_chars (int): Max character length of extracted segment.
        verbose (bool): If True, print debug information.

    Returns:
        str: Extracted text section, taken from the original (not lowercased) text.
    """
    if pd.isna(text) or not text:
        return ""

    # Search the original text with IGNORECASE instead of a lowercased copy:
    # str.lower() can change the length of a string (e.g. "İ" becomes two
    # characters), so offsets found in the copy may not line up with `text`.
    start_index = -1
    matched_start_keyword = None

    # 1. Find the start keyword
    for kw_pattern in keywords_start:
        match = re.search(kw_pattern, text, re.IGNORECASE)
        if match:
            start_index = match.end()
            matched_start_keyword = kw_pattern
            break

    if start_index == -1:
        if verbose:
            print("[extract_section_heuristic] Start keyword not found.")
        return ""

    # 2. Find the end keyword (optional)
    end_index = len(text)
    matched_end_keyword = None

    if keywords_end:
        # Slice once; every end pattern is searched only after the start match.
        remainder = text[start_index:]
        for kw_pattern in keywords_end:
            match = re.search(kw_pattern, remainder, re.IGNORECASE)
            if match:
                candidate_end = start_index + match.start()
                # Keep the end marker that appears first in the text.
                if candidate_end < end_index:
                    end_index = candidate_end
                    matched_end_keyword = kw_pattern

        if matched_end_keyword is None and verbose:
            print(f"[extract_section_heuristic] End keyword not found. Extracting until limit ({limit_chars}).")

    extracted = text[start_index:end_index].strip()

    if verbose:
        print(f"[extract_section_heuristic] Start match: {matched_start_keyword}")
        print(f"[extract_section_heuristic] End match: {matched_end_keyword}")
        print(f"[extract_section_heuristic] Extracted length: {len(extracted)} chars")

    return extracted[:limit_chars]

In [None]:
# --- Main Processing Logic ---
print("Starting Tahap 2: Case Representation")

# 1. Load initial data (CSV from scraper and raw text files)
print(f"\n[1. Loading Data]")
df_initial = load_latest_scraper_csv(PATH_INITIAL_SCRAPER_CSV_INPUT)

# --- Filter rows: drop rows whose 'nama_file_raw_text' is NaN or matches \bA\b ---
# The loader returns None on failure, so check for None before touching
# `.columns`; otherwise the script dies here with AttributeError and never
# reaches the "could not load" message below.
if df_initial is not None and 'nama_file_raw_text' in df_initial.columns:
    raw_name_col = df_initial['nama_file_raw_text']
    df_initial = df_initial[
        raw_name_col.notna() &  # not NaN
        # astype(str) keeps the .str accessor usable even if the column was
        # parsed as a non-string dtype (e.g. all values missing).
        ~raw_name_col.astype(str).str.contains(r'\bA\b', na=False)  # no standalone letter "A"
    ].copy()  # independent copy: new columns are assigned to this frame below
    print(f"Filtered data, remaining rows after removing invalid 'nama_file_raw_text': {len(df_initial)}")

if df_initial is None or df_initial.empty:
    print("Could not load initial scraper CSV. Please ensure Notebook 1 ran successfully and the CSV exists.")
    # Nothing to process; the rest of the pipeline lives in the else branch.
else:
    print(f"Loaded initial data with {len(df_initial)} records.")
    # Ensure 'nama_file_raw_text' column exists
    if 'nama_file_raw_text' not in df_initial.columns:
        print("ERROR: 'nama_file_raw_text' column not found in the CSV. This column is needed to load full text.")
        # Without the file-name column the full text cannot be loaded.
    else:
        # Load full text for each case
        df_initial['full_text_putusan'] = df_initial['nama_file_raw_text'].apply(
            lambda x: read_raw_text_file(x, PATH_RAW_TEXT_INPUT) if pd.notna(x) else ""
        )
        print("Full text loaded into DataFrame.")

        # Initialize new columns for extracted features
        df_initial['ringkasan_fakta'] = ""
        df_initial['argumen_hukum_utama'] = ""
        df_initial['pasal_digunakan_extracted'] = "" # For refined pasal extraction
        df_initial['pihak_terlibat_extracted'] = ""   # For refined pihak extraction

        # 2. Metadata Extraction & Refinement
        # Most metadata (nomor perkara, tanggal, jenis perkara) is already in df_initial from the scraper.
        # We might want to refine 'pasal_digunakan' and 'nama_pihak' if the initial scrape was basic.
        print(f"\n[2. Extracting/Refining Metadata & Text Features]")

        # Keywords for extracting "Ringkasan Fakta"
        # These are examples and might need adjustment based on actual document structures
        fakta_keywords_start = [
            r"tentang pokok sengketa pengajuan peninjauan kembali",
            r"alasan dan penjelasan permohonan banding",
            r"alasan permohonan banding",
            r"pokok sengketa;",
            r"kronologis sengketa pajak",
            r"menimbang,\s*bahwa\s*terdakwa\s*diajukan\s*ke\s*persidangan",
            r"menimbang,\s*bahwa\s*penggugat\s*dalam\s*surat\s*gugatannya",
            r"duduk\s*perkara:",
            r"fakta-fakta\s*hukum\s*yang\s*terungkap",
            r"menimbang,\s*bahwa\s*untuk\s*membuktikan\s*dalil-dalilnya",
            r"uraian\s*singkat\s*mengenai\s*kejadian",
            r"tentang\s*duduk\s*perkara"
        ]
        fakta_keywords_end = [ # Stop before legal considerations or verdict
            r"pertimbangan hukum",
            r"tentang pertimbangan hukum",
            r"menimbang, bahwa terhadap alasan-alasan peninjauan kembali",
            r"menimbang,\s*bahwa\s*selanjutnya\s*majelis\s*hakim\s*akan\s*mempertimbangkan",
            r"pertimbangan\s*hukum",
            r"tentang\s*pertimbangan\s*hukum",
            r"amar\s*putusan",
            r"mengadili",
            r"permohonan kasasi", r"duduk\s*sengketa", r"pertimbangan\s*permohonan",
            r"permohonan pemohon", r"kejadian perkara", r"uraian kejadian"
        ]

        # Keywords for "Argumen Hukum Utama" (Pertimbangan Hukum)
        argumen_keywords_start = [
            r"pertimbangan hukum", # Main section header
            r"menimbang, bahwa terhadap alasan-alasan peninjauan kembali tersebut, mahkamah agung berpendapat",
            r"menimbang, bahwa alasan-alasan permohonan pemohon peninjauan kembali tidak dapat dibenarkan",
            r"menimbang, bahwa alasan-alasan permohonan pemohon peninjauan kembali dapat dibenarkan",
            r"pertimbangan\s*hukum",
            r"tentang\s*pertimbangan\s*hukum",
            r"menimbang,\s*bahwa\s*terhadap\s*eksepsi", # Start of legal reasoning
            r"menimbang,\s*bahwa\s*majelis\s*hakim\s*berpendapat",
            r"menimbang,\s*bahwa\s*oleh\s*karena\s*itu\s*dengan\s*memperhatikan"
        ]
        argumen_keywords_end = [ # Stop before the final verdict/amar
            r"memperhatikan pasal-pasal dari undang-undang",
            r"mengadili,",
            r"amar\s*putusan",
            r"mengadili",
            r"memutuskan",
            r"menetapkan"
        ]

        # Regex for "Pasal Digunakan" (example, very basic, often complex)
        # Looks for patterns like "Pasal X ayat (Y) UU No. Z Tahun A" or KUHP/KUHAP etc.
        pasal_regex_patterns = [
            r"pasal\s*\d+\s*(ayat\s*\(?\s*\d+\s*\)?\s*)?(huruf\s*[a-z]\s*)?\s*(uu|undang-undang)\s*(nomor|no\.)?\s*\d+\s*tahun\s*\d+",
            r"pasal\s*\d+\s*(ayat\s*\(?\s*\d+\s*\)?\s*)?\s*kuhp(?:idana)?",
            r"pasal\s*\d+\s*(ayat\s*\(?\s*\d+\s*\)?\s*)?\s*kuhperdata",
            r"peraturan pemerintah\s*(nomor|no\.)?\s*\d+\s*tahun\s*\d+",
            r"peraturan menteri keuangan\s*(nomor|no\.)?[\s\w./-]+", # Covers formats such as 78/PMK.03/2010
            r"keputusan direktur jenderal pajak\s*(nomor|no\.)?[\s\w./-]+", # Covers formats such as KEP-539/PJ./2001
            r"surat edaran\s*(direktur jenderal pajak)?\s*(nomor|no\.)?[\s\w./-]+" # Covers formats such as SE-90/PJ/2011
        ]
        # Ringkasan-fakta extraction with a chain of fallbacks
        def extract_ringkasan_fakta_enhanced(full_text, raw_filename, case_id):
            """
            Extract the "ringkasan fakta" section for one case, trying several sources.

            Attempts, in order:
            1. Heuristic extraction from ``full_text``.
            2. Read the raw text file named ``raw_filename`` and run the same
               extraction on its content.
            3. Give up and return the sentinel "TIDAK TERDETEKSI".

            A candidate is accepted only when its stripped length exceeds 50
            characters. Progress messages are printed for every step, and any
            exception raised during step 2 is printed and swallowed.
            """
            min_chars = 50

            def _usable(section):
                # Non-empty and longer than the minimum once whitespace is trimmed.
                return bool(section) and len(section.strip()) > min_chars

            def _extract(source_text):
                return extract_section_heuristic(
                    source_text, fakta_keywords_start, fakta_keywords_end, limit_chars=4000
                )

            print(f"  -> Extracting ringkasan fakta for case_id: {case_id}")

            # Level 1: the text already loaded into the DataFrame
            from_main_text = _extract(full_text)
            if _usable(from_main_text):
                print(f"  -> Ringkasan fakta found in main text (Level 1)")
                return from_main_text

            # Level 2: go back to the raw file on disk
            print(f"  -> Ringkasan fakta not found in main text, trying raw file (Level 2)...")
            try:
                if not (pd.notna(raw_filename) and raw_filename.strip()):
                    print(f"  -> Raw filename is invalid")
                else:
                    raw_content = read_raw_text_file(raw_filename, PATH_RAW_TEXT_INPUT)
                    if not (raw_content and raw_content.strip()):
                        print(f"  -> Raw file is empty or could not be read")
                    else:
                        from_raw_file = _extract(raw_content)
                        if _usable(from_raw_file):
                            print(f"  -> Ringkasan fakta found in raw file (Level 2)")
                            return from_raw_file
                        print(f"  -> Ringkasan fakta not found in raw file either")
            except Exception as e:
                print(f"  -> Error reading raw file: {str(e)}")

            # Level 3: nothing usable anywhere
            print(f"  -> Ringkasan fakta TIDAK TERDETEKSI after all attempts")
            return "TIDAK TERDETEKSI"

        # Compile the patterns once instead of re-resolving them for every row.
        pihak_pattern = re.compile(
            r"(pemohon peninjauan kembali.*?)(?:melawan:|lawan)(.*?)(?:mahkamah agung tersebut;)",
            re.IGNORECASE | re.DOTALL
        )
        compiled_pasal_patterns = [re.compile(pattern) for pattern in pasal_regex_patterns]

        # Per-case extraction: parties, fact summary, legal argument, cited articles.
        for index, row in df_initial.iterrows():
            full_text = str(row['full_text_putusan'])
            full_text_lower = full_text.lower() # Lowercase copy; the pasal patterns are written in lowercase
            raw_filename = row.get('nama_file_raw_text', '')
            case_id = row.get('case_id', f'Index-{index}')

            print(f"Processing case_id: {case_id}...")

            # Extract the parties involved
            # Pattern: PEMOHON... melawan: TERMOHON...
            pihak_match = pihak_pattern.search(full_text)
            if pihak_match:
                # Collapse runs of whitespace/newlines into single spaces.
                pemohon_text = re.sub(r'\s+', ' ', pihak_match.group(1)).strip()
                termohon_text = re.sub(r'\s+', ' ', pihak_match.group(2)).strip()
                df_initial.loc[index, 'pihak_terlibat_extracted'] = f"Pemohon: {pemohon_text} vs Termohon: {termohon_text}"
            else:
                # Fallback when the main pattern is not found: keep the scraper's value
                df_initial.loc[index, 'pihak_terlibat_extracted'] = row.get('nama_pihak', 'N/A')

            # Extract Ringkasan Fakta (enhanced function with fallbacks)
            df_initial.loc[index, 'ringkasan_fakta'] = extract_ringkasan_fakta_enhanced(
                full_text, raw_filename, case_id
            )

            # Extract Argumen Hukum Utama; the sentinel marks "nothing found"
            df_initial.loc[index, 'argumen_hukum_utama'] = extract_section_heuristic(
                full_text, argumen_keywords_start, argumen_keywords_end, limit_chars=5000
            ) or "TIDAK TERDETEKSI"

            # Extract Pasal Digunakan from the whole lowercase text.
            # finditer + group(0) keeps the ENTIRE matched citation. re.findall
            # returns only the capturing groups when a pattern has any, which
            # dropped the article/law numbers and lost matches whose optional
            # groups were empty (e.g. "pasal 362 kuhp").
            found_pasal = set()
            for pattern in compiled_pasal_patterns:
                for pasal_match in pattern.finditer(full_text_lower):
                    normalized_pasal = ' '.join(pasal_match.group(0).split())
                    if normalized_pasal:
                        found_pasal.add(normalized_pasal)

            # Join the unique citations in sorted order; fall back to the scraper's value
            df_initial.loc[index, 'pasal_digunakan_extracted'] = "; ".join(sorted(found_pasal)) if found_pasal else row.get('pasal_digunakan', '')

        # 3. Feature Engineering
        print(f"\n[3. Performing Feature Engineering]")

        # Token counts for the key text fields: new count column -> source text column
        word_count_sources = {
            'jumlah_kata_full_text': 'full_text_putusan',
            'jumlah_kata_ringkasan_fakta': 'ringkasan_fakta',
            'jumlah_kata_argumen_hukum': 'argumen_hukum_utama',
        }
        for count_column, text_column in word_count_sources.items():
            df_initial[count_column] = df_initial[text_column].apply(count_words)
        print("Word counts calculated.")

        # Bag-of-Words is not materialised here: it is left to the TF-IDF step
        # of a later stage, which keeps large sparse matrices out of this
        # intermediate CSV.
        print("Conceptual Bag-of-Words representation will be handled in later stages (e.g., TF-IDF).")

        # Simple QA pairs are an advanced feature that is not built yet; the
        # column is filled with a constant placeholder so the schema already
        # carries it.
        df_initial['qa_pairs_sederhana'] = "NOT_IMPLEMENTED" # Placeholder
        print("QA-pairs (sederhana) marked as NOT_IMPLEMENTED (advanced feature).")

        # 4. Prepare Final DataFrame and Save
        print(f"\n[4. Preparing and Saving Processed Data]")

        # Map scraper/extraction column names onto the target schema:
        # "case_id", "no_perkara", "tanggal", "ringkasan_fakta", "pasal", "pihak", "text_full"
        column_renames = {
            'nomor_perkara': 'no_perkara',
            'tanggal_putusan': 'tanggal', # tanggal_putusan is taken as the main date
            'pasal_digunakan_extracted': 'pasal', # the extracted/refined value
            'pihak_terlibat_extracted': 'pihak',   # the extracted/refined value
            'full_text_putusan': 'text_full' # full text of the decision
        }
        df_processed = df_initial.rename(columns=column_renames)

        # Every required column must exist; absent ones are taken from the
        # un-renamed frame when available, otherwise filled with NA.
        required_cols = ["case_id", "no_perkara", "tanggal", "ringkasan_fakta", "pasal", "pihak", "text_full"]
        absent_required = [name for name in required_cols if name not in df_processed.columns]
        for col in absent_required:
            df_processed[col] = df_initial.get(col, pd.NA)

        # Optional extras: scraper metadata and the engineered features.
        additional_cols_to_keep = [
            'judul_putusan', 'jenis_perkara', 'tingkat_proses', 'kata_kunci',
            'tahun_dokumen', 'tanggal_register', 'lembaga_peradilan', 'amar_putusan',
            'link_sumber', 'link_pdf', 'nama_file_pdf', 'nama_file_raw_text',
            'jumlah_kata_full_text', 'jumlah_kata_ringkasan_fakta', 'jumlah_kata_argumen_hukum',
            'argumen_hukum_utama', # Retain this important feature
            'qa_pairs_sederhana'
        ]

        # Required columns first, then whichever extras are actually present.
        final_columns_ordered = list(required_cols)
        for col in additional_cols_to_keep:
            if col in df_processed.columns and col not in required_cols:
                final_columns_ordered.append(col)
        # Drop duplicates while keeping first-occurrence order.
        final_columns_ordered = list(dict.fromkeys(final_columns_ordered))
        df_processed = df_processed[final_columns_ordered]

        # Save to CSV
        processed_csv_filename = "cases_processed.csv"
        processed_csv_filepath = os.path.join(PATH_PROCESSED_OUTPUT, processed_csv_filename)
        df_processed.to_csv(processed_csv_filepath, index=False, encoding='utf-8')
        print(f"Processed data saved to: {processed_csv_filepath}")

        # Save to JSON as well (one record per row, non-ASCII characters kept as-is)
        processed_json_filename = "cases_processed.json"
        processed_json_filepath = os.path.join(PATH_PROCESSED_OUTPUT, processed_json_filename)
        df_processed.to_json(processed_json_filepath, orient='records', indent=4, force_ascii=False)
        print(f"Processed data also saved to: {processed_json_filepath}")

        # Quick look at the result
        print("\n--- Sample of Processed Data ---")
        display(df_processed.head())
        print(f"\nColumns in processed DataFrame: {df_processed.columns.tolist()}")
        print(f"Shape of processed DataFrame: {df_processed.shape}")

        # Summary statistics for the ringkasan fakta extraction
        print("\n--- Ringkasan Fakta Extraction Summary ---")

        def _detection_label(value):
            # Collapse every real summary into one label; keep the sentinel as-is.
            if value != 'TIDAK TERDETEKSI':
                return 'TERDETEKSI'
            return 'TIDAK TERDETEKSI'

        ringkasan_labels = df_processed['ringkasan_fakta'].apply(_detection_label)
        ringkasan_status = ringkasan_labels.value_counts()
        print(f"Ringkasan Fakta Status:\n{ringkasan_status}")

        # Average character length over the rows where a summary was detected
        detected_mask = df_processed['ringkasan_fakta'] != 'TIDAK TERDETEKSI'
        detected_ringkasan = df_processed[detected_mask]['ringkasan_fakta']
        if not detected_ringkasan.empty:
            avg_length = detected_ringkasan.apply(len).mean()
            print(f"Average length of detected ringkasan fakta: {avg_length:.0f} characters")

# Final status line. It sits at top level, outside the if/else above, so it is
# printed even when the CSV could not be loaded and nothing was processed.
print("\nTahap 2: Case Representation - Complete.")