In [25]:
!pip install pandas requests beautifulsoup4 pdfminer.six lxml > /dev/null 2>&1

In [26]:
import argparse
import io
import os
import re
import time
import urllib
from concurrent.futures import ThreadPoolExecutor, wait
from datetime import date
import pandas as pd
import requests
from bs4 import BeautifulSoup
from pdfminer import high_level # For PDF text extraction
from google.colab import drive

In [27]:
# Mount Google Drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [28]:
# --- Configuration Section ---
# !!! IMPORTANT: YOU NEED TO SET THESE VARIABLES !!!
BASE_DRIVE_PATH = "/content/drive/MyDrive/Semester 6/PK/UAS" # Change to your project folder
# Example: "https://putusan3.mahkamahagung.go.id/search.html?q=narkotika&jenis_perkara=pidana_khusus"
# Go to Mahkamah Agung website, search for your chosen case type, and copy the URL of the search results page.
# This URL should ideally allow pagination (e.g., by changing a 'page=' parameter).
MA_SEARCH_RESULT_URL = "https://putusan3.mahkamahagung.go.id/search.html?q=perceraian" # e.g., "https://putusan3.mahkamahagung.go.id/search.html?q=narkotika"
# A keyword to name output files, can be derived from your search or case type
KEYWORD_FOR_FILENAMING = "perceraian" # e.g., "narkotika" or "wanprestasi"
MIN_DOCUMENTS_TO_SCRAPE = 35 # As per project requirement
MAX_SCRAPING_WORKERS = 4 # For ThreadPoolExecutor

In [29]:
# Define paths using BASE_DRIVE_PATH
PATH_RAW_TEXT_OUTPUT = os.path.join(BASE_DRIVE_PATH, "data/raw")
PATH_PDF_DOWNLOAD = os.path.join(BASE_DRIVE_PATH, "PDFs_Putusan")
PATH_INITIAL_SCRAPER_CSV = os.path.join(BASE_DRIVE_PATH, "Scrap_CSVs")
PATH_LOGS = os.path.join(BASE_DRIVE_PATH, "logs")

# Create directories if they don't exist
os.makedirs(PATH_RAW_TEXT_OUTPUT, exist_ok=True)
os.makedirs(PATH_PDF_DOWNLOAD, exist_ok=True)
os.makedirs(PATH_INITIAL_SCRAPER_CSV, exist_ok=True)
os.makedirs(PATH_LOGS, exist_ok=True)

In [30]:
# Optional: Cleaning log file
CLEANING_LOG_FILE = os.path.join(PATH_LOGS, "cleaning.log")

def log_cleaning_action(message):
    """Appends a message to the cleaning log file."""
    with open(CLEANING_LOG_FILE, "a", encoding="utf-8") as f:
        f.write(f"{time.strftime('%Y-%m-%d %H:%M:%S')} - {message}\n")
    print(message)

In [31]:
# --- Scraper Code (Adapted from your scraping.py) ---

def open_page_bs(link):
    """Opens a page and returns a BeautifulSoup object."""
    count = 0
    while count < 3:
        try:
            response = requests.get(link, timeout=30)
            response.raise_for_status() # Raise an exception for HTTP errors
            return BeautifulSoup(response.text, "lxml")
        except requests.exceptions.RequestException as e:
            log_cleaning_action(f"Error opening page {link}: {e}. Retrying {count+1}/3")
            count += 1
            time.sleep(5)
    return None

def get_detail_from_table(soup, keyword):
    """Extracts detail text from a table cell next to the keyword cell."""
    try:
        text = (
            soup.find(lambda tag: tag.name == "td" and keyword in tag.text)
            .find_next_sibling("td") # More robust than find_next()
            .get_text(separator=" ", strip=True)
        )
        return text
    except AttributeError: # Handles cases where keyword or next cell is not found
        return ""

def download_pdf_from_url(pdf_url, download_path):
    """Downloads a PDF from a URL and saves it."""
    try:
        response = urllib.request.urlopen(pdf_url, timeout=60)
        # Sanitize filename
        original_filename = response.info().get_filename()
        if original_filename:
            # Replace problematic characters for filenames
            safe_filename = re.sub(r'[\\/*?:"<>|]', "_", original_filename)
        else:
            # Create a filename if not provided (e.g., from the URL path)
            safe_filename = pdf_url.split('/')[-1]
            if not safe_filename.lower().endswith(".pdf"):
                 safe_filename += ".pdf"
        safe_filename = safe_filename.replace("/", "_").replace(" ", "_")


        file_content = response.read()
        filepath = os.path.join(download_path, safe_filename)
        with open(filepath, "wb") as out_file:
            out_file.write(file_content)
        log_cleaning_action(f"Successfully downloaded PDF: {safe_filename} to {download_path}")
        return io.BytesIO(file_content), safe_filename, filepath
    except Exception as e:
        log_cleaning_action(f"Failed to download PDF from {pdf_url}: {e}")
        return None, None, None

def basic_text_cleaning_ma(text):
    """Basic text cleaning for Mahkamah Agung documents."""
    if not isinstance(text, str):
        return ""
    # Remove standard MA disclaimers (extend as needed)
    text = text.replace("M a h ka m a h A g u n g R e p u blik In d o n esia\n", "")
    text = text.replace("Disclaimer\n", "")
    text = text.replace("Kepaniteraan Mahkamah Agung Republik Indonesia berusaha untuk selalu mencantumkan informasi paling kini dan akurat sebagai bentuk komitmen Mahkamah Agung untuk pelayanan publik, transparansi dan akuntabilitas\n","")
    text = text.replace("pelaksanaan fungsi peradilan. Namun dalam hal-hal tertentu masih dimungkinkan terjadi permasalahan teknis terkait dengan akurasi dan keterkinian informasi yang kami sajikan, hal mana akan terus kami perbaiki dari waktu kewaktu.\n","")
    text = text.replace("Dalam hal Anda menemukan inakurasi informasi yang termuat pada situs ini atau informasi yang seharusnya ada, namun belum tersedia, maka harap segera hubungi Kepaniteraan Mahkamah Agung RI melalui :\n","")
    text = text.replace("Email : kepaniteraan@mahkamahagung.go.id Telp : 021-384 3348 (ext.318)\n", "") # Corrected typo in original

    # Further cleaning as per Tahap 1.b.ii
    # Remove headers/footers - this is tricky and might need pattern-based removal.
    # For now, we'll rely on pdfminer's extraction and the specific MA disclaimers.
    # More sophisticated header/footer removal might involve regex based on common patterns
    # (e.g., "Putusan Nomor ... / Halaman ... dari ...")

    # Normalize whitespace (replace multiple spaces/newlines with a single one)
    text = re.sub(r'\s+', ' ', text).strip()
    # Lowercase (optional, consider if case is important for later stages)
    # text = text.lower() # As per, making it optional here.
    # Remove punctuation (optional, can be done here or in preprocessing for TF-IDF/BERT)
    # text = re.sub(r'[^\w\s]', '', text) # Example: removes all except word chars and space

    log_cleaning_action("Performed basic text cleaning (MA disclaimers, whitespace).")
    return text

def convert_pdf_to_cleaned_text(pdf_content_stream):
    """Extracts text from PDF and applies cleaning."""
    try:
        raw_text = high_level.extract_text(pdf_content_stream)
        cleaned_text = basic_text_cleaning_ma(raw_text) # Apply MA specific cleaning
        return cleaned_text
    except Exception as e:
        log_cleaning_action(f"Error extracting text from PDF: {e}")
        return ""

# Global list to store all extracted data for the final single CSV
all_scraped_data = []
# Global counter for raw text files
raw_text_file_counter = 1
# Lock for thread-safe operations on shared resources (if any become complex)
# from threading import Lock
# data_lock = Lock()

def extract_decision_data(decision_page_url, unique_id_counter):
    """
    Extracts metadata and text from a single court decision page.
    Saves cleaned full text to a .txt file.
    Returns a dictionary of extracted data.
    """
    global raw_text_file_counter

    log_cleaning_action(f"Processing decision: {decision_page_url}")
    soup = open_page_bs(decision_page_url)
    if not soup:
        log_cleaning_action(f"Failed to open page: {decision_page_url}. Skipping.")
        return None

    # Attempt to find the main table containing decision details
    table = soup.find("table", {"class": "table"}) # Common class for such tables
    if not table:
        # Fallback: Try finding a table with 'Nomor Register' or 'Nomor Putusan'
        table = soup.find(lambda tag: tag.name == "table" and ("Nomor Register" in tag.text or "Nomor Putusan" in tag.text))

    if not table:
        log_cleaning_action(f"Could not find details table on {decision_page_url}. Skipping.")
        return None

    # --- Metadata Extraction (similar to your script, using get_detail_from_table) ---
    #
    judul_putusan = table.find("h2").text.strip() if table.find("h2") else decision_page_url.split('/')[-2] # Use part of URL if no h2
    if table.find("h2"): table.find("h2").decompose() # Remove h2 after extracting

    nomor_perkara = get_detail_from_table(table, "Nomor") or get_detail_from_table(table, "Nomor Register") or get_detail_from_table(table, "Nomor Putusan")
    tanggal_putusan = get_detail_from_table(table, "Tanggal Dibacakan") # Usually the decision date
    if not tanggal_putusan: tanggal_putusan = get_detail_from_table(table, "Tanggal Putusan")
    jenis_perkara = get_detail_from_table(table, "Klasifikasi") # 'Klasifikasi' often holds case type
    if not jenis_perkara: jenis_perkara = get_detail_from_table(table, "Jenis Perkara")

    # Extracting 'Pasal Digunakan' might require parsing the text body or specific sections
    # For now, we'll leave a placeholder or try to find it in metadata if available
    pasal_digunakan = get_detail_from_table(table, "Kaidah") # 'Kaidah' sometimes lists articles
    if not pasal_digunakan : pasal_digunakan = get_detail_from_table(table,"Pasal") # If there's a direct 'Pasal' field

    # Pihak can be complex (Penggugat/Tergugat, Terdakwa/Penuntut).
    # This might need more advanced parsing if not explicitly listed.
    # Simple approach:
    pihak_1_label_td = table.find("td", string=re.compile(r"Pemohon|Penggugat|Penuntut", re.IGNORECASE))
    pihak_1 = pihak_1_label_td.find_next_sibling("td").get_text(strip=True) if pihak_1_label_td else "N/A"

    pihak_2_label_td = table.find("td", string=re.compile(r"Termohon|Tergugat|Terdakwa", re.IGNORECASE))
    pihak_2 = pihak_2_label_td.find_next_sibling("td").get_text(strip=True) if pihak_2_label_td else "N/A"
    nama_pihak = f"{pihak_1} vs {pihak_2}"


    # Other metadata from your script (can be added if available and needed)
    tingkat_proses = get_detail_from_table(table, "Tingkat Proses")
    kata_kunci = get_detail_from_table(table, "Kata Kunci")
    tahun_dokumen = get_detail_from_table(table, "Tahun")
    tanggal_register = get_detail_from_table(table, "Tanggal Register")
    lembaga_peradilan = get_detail_from_table(table, "Lembaga Peradilan")
    amar_putusan = get_detail_from_table(table, "Amar") # Important for Tahap 4

    # --- PDF Processing and Text Extraction ---
    pdf_link_tag = soup.find("a", href=re.compile(r"/pdf/|/content/pdf/|/download/pdf", re.IGNORECASE))
    full_text_putusan = ""
    pdf_filename_ondisk = ""
    pdf_download_url = ""

    if pdf_link_tag:
        pdf_url_relative = pdf_link_tag["href"]
        # Construct absolute URL if relative
        if pdf_url_relative.startswith("/"):
            base_ma_url = "https://putusan3.mahkamahagung.go.id" # Adjust if domain changes
            pdf_download_url = base_ma_url + pdf_url_relative
        else:
            pdf_download_url = pdf_url_relative

        log_cleaning_action(f"Found PDF link: {pdf_download_url}")
        pdf_content_stream, pdf_filename_original, _ = download_pdf_from_url(pdf_download_url, PATH_PDF_DOWNLOAD)

        if pdf_content_stream:
            pdf_filename_ondisk = pdf_filename_original
            extracted_text_from_pdf = convert_pdf_to_cleaned_text(pdf_content_stream) #
            full_text_putusan = extracted_text_from_pdf

            # Further cleaning (Tahap 1.a.ii, 1.b.ii)
            # Normalisasi karakter (spasi sudah di basic_text_cleaning_ma, lowercase can be done later)
            # Tokenisasi will be done in Tahap 2 or 3
            # Hapus header/footer, nomor halaman, watermark (partially done in basic_text_cleaning_ma, advanced needs regex)
            # Example for removing page numbers like "Halaman X dari Y" (very basic)
            full_text_putusan = re.sub(r'Halaman\s+\d+\s+dari\s+\d+', '', full_text_putusan, flags=re.IGNORECASE)
            full_text_putusan = re.sub(r'Putusan Nomor:?\s*[\w\/\.\-]+', '', full_text_putusan, flags=re.IGNORECASE) # Remove "Putusan Nomor..."
            full_text_putusan = full_text_putusan.strip()


            # Validate text integrity (Tahap 1.a.ii - Validasi)  (simple check)
            if len(full_text_putusan) < 500: # Arbitrary threshold for minimal content
                log_cleaning_action(f"WARNING: Extracted text for {nomor_perkara} seems too short ({len(full_text_putusan)} chars). May not be complete.")
            else:
                log_cleaning_action(f"Extracted and cleaned text for {nomor_perkara}. Length: {len(full_text_putusan)} chars.")

            # Save cleaned full text to /data/raw/case_XXX.txt
            # Ensure unique filename using a counter or the decision number if valid for filename
            # Sanitize nomor_perkara for use as a filename
            safe_nomor_perkara = re.sub(r'[\\/*?:"<>|]', "_", nomor_perkara) if nomor_perkara else f"case_{unique_id_counter:03d}"
            # Limit length and avoid issues
            safe_nomor_perkara = (safe_nomor_perkara[:50] + '...') if len(safe_nomor_perkara) > 50 else safe_nomor_perkara

            raw_text_filename = f"case_{safe_nomor_perkara.replace('/', '_')}_{unique_id_counter:03d}.txt"
            raw_text_filepath = os.path.join(PATH_RAW_TEXT_OUTPUT, raw_text_filename)

            try:
                with open(raw_text_filepath, "w", encoding="utf-8") as f:
                    f.write(full_text_putusan)
                log_cleaning_action(f"Saved cleaned text to: {raw_text_filepath}")
            except Exception as e:
                log_cleaning_action(f"Error saving raw text file {raw_text_filepath}: {e}")
                # Fallback filename if nomor_perkara causes issues
                raw_text_filename_fallback = f"case_{unique_id_counter:03d}.txt"
                raw_text_filepath_fallback = os.path.join(PATH_RAW_TEXT_OUTPUT, raw_text_filename_fallback)
                try:
                    with open(raw_text_filepath_fallback, "w", encoding="utf-8") as f:
                        f.write(full_text_putusan)
                    log_cleaning_action(f"Saved cleaned text with fallback name: {raw_text_filepath_fallback}")
                except Exception as e_fallback:
                    log_cleaning_action(f"Critical error saving raw text file even with fallback name: {e_fallback}")

            # Increment counter for next file
            # Handled by the loop that calls this function to ensure unique IDs even with threading issues

    else:
        log_cleaning_action(f"No PDF link found on {decision_page_url}")
        full_text_putusan = "NOT_AVAILABLE (NO_PDF_LINK)"


    # Prepare data for CSV
    decision_data = {
        "case_id": f"case_{unique_id_counter:03d}",
        "judul_putusan": judul_putusan,
        "nomor_perkara": nomor_perkara,
        "tanggal_putusan": tanggal_putusan,
        "jenis_perkara": jenis_perkara, # From 'Klasifikasi'
        "pasal_digunakan": pasal_digunakan, # Placeholder, needs better extraction
        "nama_pihak": nama_pihak, # Placeholder
        "tingkat_proses": tingkat_proses,
        "kata_kunci": kata_kunci,
        "tahun_dokumen": tahun_dokumen,
        "tanggal_register": tanggal_register,
        "lembaga_peradilan": lembaga_peradilan,
        "amar_putusan": amar_putusan,
        "link_sumber": decision_page_url,
        "link_pdf": pdf_download_url,
        "nama_file_pdf": pdf_filename_ondisk,
        "nama_file_raw_text": os.path.basename(raw_text_filepath if 'raw_text_filepath' in locals() and pdf_content_stream else "N/A"),
        "full_text_putusan_preview": full_text_putusan[:200] + "..." # Preview for CSV
        # Full text is in the .txt file. Avoid large text in CSV if possible.
    }
    return decision_data

In [32]:
def run_scraping_process(base_search_url, keyword_for_naming, num_documents_target):
    """
    Manages the scraping of multiple pages and decisions.
    """
    global all_scraped_data # Use the global list
    global raw_text_file_counter # Use the global counter

    if not base_search_url or base_search_url == "YOUR_CHOSEN_SEARCH_URL":
        print("ERROR: MA_SEARCH_RESULT_URL is not set. Please provide a valid search URL.")
        return

    log_cleaning_action(f"Starting scraping process for keyword: {keyword_for_naming}")
    log_cleaning_action(f"Target number of documents: {num_documents_target}")

    collected_decision_links = set()
    current_page = 1
    documents_collected_count = 0

    # --- Part 1: Collect all unique decision links ---
    while documents_collected_count < num_documents_target:
        # Construct page URL (MA specific, may need adjustment based on actual URL structure)
        # Common patterns: &page=NUMBER or ?page=NUMBER or /page/NUMBER
        if "&page=" in base_search_url:
            page_url = re.sub(r'&page=\d+', f'&page={current_page}', base_search_url)
        elif "?page=" in base_search_url:
             page_url = re.sub(r'\?page=\d+', f'?page={current_page}', base_search_url)
        elif not "page=" in base_search_url : # if no page parameter, try adding it
            separator = '&' if '?' in base_search_url else '?'
            page_url = f"{base_search_url}{separator}page={current_page}"
        else: # Default if unsure
            page_url = f"{base_search_url}&page={current_page}" # Assume this structure if others don't match

        # If it's the first page and the base_search_url already implies page 1
        if current_page == 1 and ("&page=1" in base_search_url or "?page=1" in base_search_url):
            page_url = base_search_url


        log_cleaning_action(f"Accessing search results page: {page_url}")
        soup = open_page_bs(page_url)
        if not soup:
            log_cleaning_action(f"Failed to load page {current_page}. Stopping link collection.")
            break

        # Find links to individual decisions (adjust selector if needed)
        # Links are usually within <a> tags, href containing "/direktori/putusan/"
        decision_links_on_page = soup.find_all("a", href=re.compile(r"/direktori/putusan/.+\.html", re.IGNORECASE))

        if not decision_links_on_page:
            log_cleaning_action(f"No more decision links found on page {current_page}. Total links so far: {len(collected_decision_links)}")
            break # No more links, assume end of results

        new_links_found_on_page = 0
        for link_tag in decision_links_on_page:
            relative_url = link_tag["href"]
            # Construct absolute URL
            if relative_url.startswith("/"):
                base_ma_url = "https://putusan3.mahkamahagung.go.id" # Adjust if domain changes
                full_decision_url = base_ma_url + relative_url
            else: # Should ideally always be relative from MA site
                full_decision_url = relative_url

            if full_decision_url not in collected_decision_links:
                collected_decision_links.add(full_decision_url)
                new_links_found_on_page +=1
                # documents_collected_count will be based on successful processing later
            if len(collected_decision_links) >= num_documents_target * 1.5 : # Collect a bit more to account for processing failures
                 break
        log_cleaning_action(f"Page {current_page}: Found {len(decision_links_on_page)} potential links. Added {new_links_found_on_page} new unique links.")

        if len(collected_decision_links) >= num_documents_target * 1.5:
            log_cleaning_action(f"Collected enough links ({len(collected_decision_links)}). Moving to data extraction.")
            break

        current_page += 1
        if current_page > 50 : # Safety break for too many pages (e.g. if num_documents_target is very high)
            log_cleaning_action("Reached page 50, stopping link collection phase.")
            break
        time.sleep(2) # Be respectful to the server

    log_cleaning_action(f"Total unique decision links collected: {len(collected_decision_links)}")
    if not collected_decision_links:
        log_cleaning_action("No decision links found. Please check MA_SEARCH_RESULT_URL and website structure.")
        return

    # --- Part 2: Process collected links using ThreadPoolExecutor ---
    # Ensure raw_text_file_counter is correctly managed if threading is complex
    # For simplicity here, we'll pass a unique ID to extract_decision_data
    # based on enumeration, assuming it's sufficient for unique file naming.
    # A more robust approach might involve a thread-safe counter.

    processed_results = []
    links_to_process = list(collected_decision_links)[:num_documents_target] # Process up to target number

    # Using ThreadPoolExecutor to speed up I/O bound tasks (downloading, page requests)
    with ThreadPoolExecutor(max_workers=MAX_SCRAPING_WORKERS) as executor:
        # Each future will execute extract_decision_data for a link
        # We pass 'i+1' as a unique_id_counter for each task
        futures = [executor.submit(extract_decision_data, link, i + 1) for i, link in enumerate(links_to_process)]

        for future in futures: # Iterate through futures as they complete (or in order)
            try:
                result = future.result(timeout=300) # Timeout for each task
                if result:
                    processed_results.append(result)
                    documents_collected_count +=1 # Count successful extractions
                    log_cleaning_action(f"Successfully processed and got data for case_id: {result.get('case_id', 'N/A')}")
                else:
                    log_cleaning_action("A task returned no result (None).")
            except Exception as e:
                log_cleaning_action(f"A scraping task failed: {e}")

    all_scraped_data.extend(processed_results)
    log_cleaning_action(f"Successfully processed {documents_collected_count} documents.")


    # --- Part 3: Save all collected data to a single CSV ---
    if all_scraped_data:
        df = pd.DataFrame(all_scraped_data)
        # Define CSV filename
        today_date = date.today().strftime("%Y-%m-%d")
        csv_filename = f"putusan_ma_{keyword_for_naming.replace(' ', '_')}_{today_date}.csv"
        csv_filepath = os.path.join(PATH_INITIAL_SCRAPER_CSV, csv_filename)

        df.to_csv(csv_filepath, index=False, encoding="utf-8")
        log_cleaning_action(f"All scraped data saved to: {csv_filepath}")
        log_cleaning_action(f"Total documents in CSV: {len(df)}")
        display(df.head())
    else:
        log_cleaning_action("No data was scraped successfully to save to CSV.")

    log_cleaning_action("Scraping and initial processing complete.")
    log_cleaning_action(f"Cleaned text files saved in: {PATH_RAW_TEXT_OUTPUT}")
    log_cleaning_action(f"PDFs downloaded to: {PATH_PDF_DOWNLOAD}")

In [33]:
# --- Main Execution ---
if __name__ == "__main__":
    print("Starting Tahap 1: Membangun Case Base (Scraping & Cleaning)")

    # Validate essential user-set variables
    if MA_SEARCH_RESULT_URL == "YOUR_CHOSEN_SEARCH_URL" or not MA_SEARCH_RESULT_URL:
        print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
        print("!!! ERROR: Please set 'MA_SEARCH_RESULT_URL' in the Configuration Section. !!!")
        print("!!! This should be the URL from the Mahkamah Agung website after searching for !!!")
        print("!!! your chosen 'jenis perkara'.                                               !!!")
        print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
    elif not KEYWORD_FOR_FILENAMING:
        print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
        print("!!! ERROR: Please set 'KEYWORD_FOR_FILENAMING' in the Configuration Section.   !!!")
        print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
    else:
        run_scraping_process(MA_SEARCH_RESULT_URL, KEYWORD_FOR_FILENAMING, MIN_DOCUMENTS_TO_SCRAPE)
        print(f"Scraping finished. Check the logs and output folders in: {BASE_DRIVE_PATH}")
        print(f"Raw text files should be in: {PATH_RAW_TEXT_OUTPUT}")
        print(f"Optional cleaning log: {CLEANING_LOG_FILE}") #
        print(f"Output CSV from scraper: {PATH_INITIAL_SCRAPER_CSV}")

# Example of how to list created raw text files:
print("\n--- Listing Sample Raw Text Files ---")
try:
    sample_raw_files = os.listdir(PATH_RAW_TEXT_OUTPUT)[:5]
    if sample_raw_files:
        for f_name in sample_raw_files:
            print(os.path.join(PATH_RAW_TEXT_OUTPUT, f_name))
    else:
        print(f"No files found in {PATH_RAW_TEXT_OUTPUT}. Scraping might have failed or yielded no documents.")
except FileNotFoundError:
    print(f"Directory not found: {PATH_RAW_TEXT_OUTPUT}. Please ensure BASE_DRIVE_PATH is correct and scraping ran.")

Starting Tahap 1: Membangun Case Base (Scraping & Cleaning)
Starting scraping process for keyword: perceraian
Target number of documents: 35
Accessing search results page: https://putusan3.mahkamahagung.go.id/search.html?q=perceraian&page=1
Page 1: Found 23 potential links. Added 23 new unique links.
Accessing search results page: https://putusan3.mahkamahagung.go.id/search.html?q=perceraian&page=2
Page 2: Found 23 potential links. Added 20 new unique links.
Accessing search results page: https://putusan3.mahkamahagung.go.id/search.html?q=perceraian&page=3
Page 3: Found 23 potential links. Added 10 new unique links.
Collected enough links (53). Moving to data extraction.
Total unique decision links collected: 53
Processing decision: https://putusan3.mahkamahagung.go.id/direktori/putusan/ad616f008f8d6b544507d50d48fa8e5b.html
Processing decision: https://putusan3.mahkamahagung.go.id/direktori/putusan/zaee8cf23ea3c67e98b0313335373335.html
Processing decision: https://putusan3.mahkamahagun

Unnamed: 0,case_id,judul_putusan,nomor_perkara,tanggal_putusan,jenis_perkara,pasal_digunakan,nama_pihak,tingkat_proses,kata_kunci,tahun_dokumen,tanggal_register,lembaga_peradilan,amar_putusan,link_sumber,link_pdf,nama_file_pdf,nama_file_raw_text,full_text_putusan_preview
0,case_001,Putusan PN KUTAI BARAT Nomor 28/Pdt.G/2020/PN ...,28/Pdt.G/2020/PN Sdw,11 Mei 2020,Perdata Perdata Agama Perceraian,—,N/A vs N/A,Pertama,Perceraian,2020,15 April 2020,PN KUTAI BARAT,Lain-lain,https://putusan3.mahkamahagung.go.id/direktori...,https://putusan3.mahkamahagung.go.id/direktori...,putusan_28_pdt.g_2020_pn_sdw_20250605131926.pdf,case_28_Pdt.G_2020_PN Sdw_001.txt,Direktori Putusan Mahkamah Agung Republik Indo...
1,case_002,Putusan PN NEGARA Nomor 229/Pdt.G/2023/PN Nga ...,229/Pdt.G/2023/PN Nga,24 Nopember 2023,Perdata Perdata Agama Perceraian,—,N/A vs N/A,Pertama,Perceraian,2023,2 Nopember 2023,PN NEGARA,Lain-lain,https://putusan3.mahkamahagung.go.id/direktori...,https://putusan3.mahkamahagung.go.id/direktori...,putusan_229_pdt.g_2023_pn_nga_20250605131936.pdf,case_229_Pdt.G_2023_PN Nga_002.txt,Direktori Putusan Mahkamah Agung Republik Indo...
2,case_003,Putusan PN Cikarang Nomor 40/Pdt.G/2019/PN Ckr...,40/Pdt.G/2019/PN Ckr,25 April 2019,Perdata Perdata Agama Perceraian,—,N/A vs N/A,Pertama,Perceraian,2019,21 Februari 2019,PN Cikarang,Gugur,https://putusan3.mahkamahagung.go.id/direktori...,https://putusan3.mahkamahagung.go.id/direktori...,putusan_40_pdt.g_2019_pn_ckr_20250605131933.pdf,case_40_Pdt.G_2019_PN Ckr_003.txt,Direktori Putusan Mahkamah Agung Republik Indo...
3,case_004,Putusan PA SIDIKALANG Nomor 9/Pdt.G/2025/PA.Sd...,9/Pdt.G/2025/PA.Sdk,3 Juni 2025,Perdata Agama Perdata Agama Perceraian,—,N/A vs N/A,Pertama,Cerai Talak,2025,30 Januari 2025,PA SIDIKALANG,Lain-lain,https://putusan3.mahkamahagung.go.id/direktori...,https://putusan3.mahkamahagung.go.id/direktori...,putusan_9_pdt.g_2025_pa.sdk_20250605132013.pdf,case_9_Pdt.G_2025_PA.Sdk_004.txt,Direktori Putusan Mahkamah Agung Republik Indo...
4,case_006,Putusan PN BANGLI Nomor 64/Pdt.G/2023/PN Bli T...,64/Pdt.G/2023/PN Bli,13 Juni 2023,Perdata Perdata Agama Perceraian,—,N/A vs N/A,Pertama,Perceraian,2023,9 Mei 2023,PN BANGLI,Lain-lain,https://putusan3.mahkamahagung.go.id/direktori...,,,A,NOT_AVAILABLE (NO_PDF_LINK)...


Scraping and initial processing complete.
Cleaned text files saved in: /content/drive/MyDrive/Semester 6/PK/UAS/data/raw
PDFs downloaded to: /content/drive/MyDrive/Semester 6/PK/UAS/PDFs_Putusan
Scraping finished. Check the logs and output folders in: /content/drive/MyDrive/Semester 6/PK/UAS
Raw text files should be in: /content/drive/MyDrive/Semester 6/PK/UAS/data/raw
Optional cleaning log: /content/drive/MyDrive/Semester 6/PK/UAS/logs/cleaning.log
Output CSV from scraper: /content/drive/MyDrive/Semester 6/PK/UAS/Scrap_CSVs

--- Listing Sample Raw Text Files ---
/content/drive/MyDrive/Semester 6/PK/UAS/data/raw/case_28_Pdt.G_2020_PN Sdw_001.txt
/content/drive/MyDrive/Semester 6/PK/UAS/data/raw/case_40_Pdt.G_2019_PN Ckr_003.txt
/content/drive/MyDrive/Semester 6/PK/UAS/data/raw/case_229_Pdt.G_2023_PN Nga_002.txt
/content/drive/MyDrive/Semester 6/PK/UAS/data/raw/case_9_Pdt.G_2025_PA.Sdk_004.txt
/content/drive/MyDrive/Semester 6/PK/UAS/data/raw/case_27_Pdt.G_2023_PN Nga_007.txt
