# Import Libraries

In [None]:
import os
import csv
import re
import json
import pandas as pd

# Load Dataset

In [None]:
!pip install gdown



In [None]:
!gdown "https://drive.google.com/uc?id=1rozvTivcopRIejyknzg_N-LRi8k1UNY9"

Downloading...
From: https://drive.google.com/uc?id=1rozvTivcopRIejyknzg_N-LRi8k1UNY9
To: /content/keywords.txt
  0% 0.00/3.92k [00:00<?, ?B/s]100% 3.92k/3.92k [00:00<00:00, 8.15MB/s]


In [None]:
!gdown "https://drive.google.com/uc?id=1cf4aqDvIBNTB6vVj6ADb3iSdsOJbH4-5"

Downloading...
From: https://drive.google.com/uc?id=1cf4aqDvIBNTB6vVj6ADb3iSdsOJbH4-5
To: /content/putusan.zip
  0% 0.00/4.98M [00:00<?, ?B/s]100% 4.98M/4.98M [00:00<00:00, 242MB/s]


In [None]:
!ls

keywords.txt  putusan.zip  sample_data


In [None]:
# Extract the downloaded ZIP archive into a folder
from zipfile import ZipFile

# Location of the archive and the directory its members are unpacked into
zip_file = "/content/putusan.zip"
output_folder = "/content/putusan"

# Unpack every member of the archive (opened read-only, the default mode)
with ZipFile(zip_file) as archive:
    archive.extractall(path=output_folder)

print(f"File diekstrak ke {output_folder}")

File diekstrak ke /content/putusan


# Verdict Extraction

In [None]:
# Load keywords from a file
def load_keywords_from_file(file_path):
    """Parse the UTF-8 encoded JSON document at *file_path* and return it.

    Args:
        file_path: Path of the keyword file; its content must be valid JSON.

    Returns:
        Whatever object the JSON document decodes to.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        return json.load(handle)

# Extract "No Putusan"
def extract_no_putusan(content):
    """Return the first case number ("Nomor .../.../.../...") in *content*.

    The recognised shape is "Nomor", an optional ":" or "-", then
    <digits>/<letters and dots>/<digits>/<1-7 letters, spaces or hyphens>.

    Args:
        content: Text of a verdict document.

    Returns:
        The matched case number with surrounding whitespace removed, or
        None when nothing matches.
    """
    pattern = r"(Nomor\s*[:\-]?\s*\d+/[A-Za-z.]+/\d+/[A-Za-z\s\-]{1,7})"
    match = re.search(pattern, content)
    # The last segment's character class admits whitespace, so the raw match
    # can end in a space or newline taken from the text that follows it.
    return match.group(1).strip() if match else None

# Extract "Lembaga Peradilan"
def extract_lembaga_peradilan(content):
    """Find the first court name ("Pengadilan Negeri/Agama/Tinggi ...").

    The match covers "Pengadilan", one of "Negeri", "Agama" or "Tinggi",
    an optional "di", and the single word that follows.

    Args:
        content: Text of a verdict document.

    Returns:
        The matched court name, or None when the text contains none.
    """
    court_regex = re.compile(
        r"(Pengadilan\s*(Negeri|Agama|Tinggi)(\s*di)?\s*\w+)(?!\w)"
    )
    hit = court_regex.search(content)
    if hit is None:
        return None
    return hit.group(1)

# Extract data from content using keywords
def extract_data_from_content(content, keywords):
    """Collect the paragraph that starts at each keyword found in *content*.

    Each keyword is searched literally and case-insensitively; only its
    first occurrence is used.  The extracted text runs from the keyword to
    the next blank line (two consecutive newlines) or to the end of the
    content.  While the extracted text holds fewer than five periods, the
    following paragraphs are appended to it (each stripped, with no
    separator) until five periods are reached or the content is exhausted.

    A keyword is skipped when its start or its initial paragraph end falls
    inside a range that was already extracted for an earlier keyword.

    Args:
        content: Text of a verdict document.
        keywords: Iterable of keyword strings to look for.

    Returns:
        The extracted paragraphs joined by a blank line, or None when no
        keyword produced any text.
    """
    extracted_texts = []
    extracted_ranges = []
    content_length = len(content)

    for keyword in keywords:
        match = re.search(re.escape(keyword), content, re.IGNORECASE)
        if not match:
            continue

        start_index = match.start()
        end_index = content.find('\n\n', start_index)
        if end_index == -1:
            end_index = content_length

        # Avoid emitting text that an earlier keyword already covered.
        overlaps = any(
            start <= start_index <= end or start <= end_index <= end
            for start, end in extracted_ranges
        )
        if overlaps:
            continue

        extracted_paragraph = content[start_index:end_index].strip()

        # Extend with following paragraphs until five periods are collected.
        # The end-of-content check is required: without it the loop would
        # spin forever on a document tail that has fewer than five periods.
        while extracted_paragraph.count('.') < 5 and end_index < content_length:
            additional_end_index = content.find('\n\n', end_index + 2)
            if additional_end_index == -1:
                additional_end_index = content_length
            extracted_paragraph += content[end_index:additional_end_index].strip()
            end_index = additional_end_index

        extracted_texts.append(extracted_paragraph)
        extracted_ranges.append((start_index, end_index))

    consolidated_text = "\n\n".join(extracted_texts)
    return consolidated_text if consolidated_text else None

# Extract paragraph related to 'wanprestasi'
def extract_wanprestasi_paragraph(content):
    """Return the paragraph stating that the defendant committed a breach.

    The paragraph starts at "Tergugat telah melakukan [perbuatan]
    wanprestasi" and runs to the next blank line or the end of the content.

    Args:
        content: Text of a verdict document.

    Returns:
        The matched paragraph without trailing whitespace, or None when the
        phrase does not occur.
    """
    # Whitespace is allowed between "melakukan" and the following word (the
    # phrase is normally written with a space there), and DOTALL lets the
    # paragraph body span single line breaks before the terminating blank line.
    pattern = (
        r"(Tergugat\s+telah\s+melakukan\s*(?:perbuatan\s+)?wanprestasi.*?)"
        r"(?:\n\n|\Z)"
    )
    match = re.search(pattern, content, re.DOTALL)
    return match.group(1).rstrip() if match else None

def extract_data_from_file(file_path):
    """Read one UTF-8 verdict text file and extract every configured field.

    The result starts with one None entry per category of the module-level
    KEYWORDS mapping.  Four fields are filled by dedicated logic ("No
    Putusan", "Lembaga Peradilan", "Perihal Gugatan", "Identitas
    Terdakwa"); every other category is filled by a keyword-driven
    paragraph search.

    Args:
        file_path: Path of the text file to process.

    Returns:
        Dict mapping field name to the extracted text (or None).
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        text = handle.read()

    record = dict.fromkeys(KEYWORDS.keys())

    # Fields with dedicated extraction logic
    record["No Putusan"] = extract_no_putusan(text)
    record["Lembaga Peradilan"] = extract_lembaga_peradilan(text)
    record["Perihal Gugatan"] = extract_wanprestasi_paragraph(text)
    record["Identitas Terdakwa"] = extract_data_from_content(text, KEYWORDS["Identitas Terdakwa"])

    # Remaining categories use the generic keyword search
    already_handled = ["No Putusan", "Lembaga Peradilan", "Perihal Gugatan", "Identitas Terdakwa"]
    for category, keywords in KEYWORDS.items():
        if category in already_handled:
            continue
        record[category] = extract_data_from_content(text, keywords)

    return record

# Extract "Penuntut Umum" from descriptive text with clean formatting
def extract_penuntut_umum(logging_csv_path):
    """Extract the prosecutor name from the first column of each CSV row.

    The CSV is read without a header row, so row 0 is the first physical
    line of the file.  For each row the text after "Penuntut Umum :" is
    taken, up to (not including) "Terdakwa" or the end of the text, and its
    comma and period spacing is normalised.

    NOTE(review): the other Logging.csv readers in this file treat the first
    line as a header, so their row indices appear to be shifted by one
    relative to this function -- confirm against the actual CSV layout.

    Args:
        logging_csv_path: Path of the logging CSV file.

    Returns:
        Dict mapping row index to the cleaned name, or "N/A" when the cell
        is not text or contains no "Penuntut Umum" entry.
    """
    logging_data = pd.read_csv(logging_csv_path, header=None)  # No header row is assumed
    penuntut_umum_dict = {}

    for index, row in logging_data.iterrows():
        full_text = row[0]  # All descriptive text is expected in the first column

        # Empty cells are read as NaN (a float); re.search would raise
        # TypeError on them, so only search genuine strings.
        match = None
        if isinstance(full_text, str):
            # Match the name and stop before "Terdakwa" if it exists
            match = re.search(r"Penuntut Umum\s*:\s*([A-Za-z\s,.\-]+?)(?=\s*Terdakwa|$)", full_text)

        if match:
            penuntut_umum_name = match.group(1).strip()
            # Clean the name format
            penuntut_umum_name = re.sub(r"\s*,\s*", ", ", penuntut_umum_name)  # Ensure spacing after comma
            penuntut_umum_name = re.sub(r"\s*\.\s*", ". ", penuntut_umum_name)  # Ensure spacing after period
            # NOTE(review): after the substitution above every period is
            # followed by a space, so the dotted titles below can no longer
            # match and this step looks like a no-op -- confirm the intended
            # title format before changing it.
            penuntut_umum_name = re.sub(r"\b(S\.H|M\.H|Ph\.D|Dr)\b", lambda m: m.group(0).replace(".", ""), penuntut_umum_name)  # Remove dots from titles
            # The period spacing leaves a trailing space after a final period.
            penuntut_umum_name = penuntut_umum_name.strip()
        else:
            penuntut_umum_name = "N/A"

        penuntut_umum_dict[index] = penuntut_umum_name

    return penuntut_umum_dict

# Extract "Tanggal Register" directly from the 'tanggal_register' column in Logging.csv
def extract_tanggal_register(logging_csv_path):
    """Map each CSV row index to its 'tanggal_register' value.

    The CSV is read with its first line as the header row.

    Args:
        logging_csv_path: Path of the logging CSV file.

    Returns:
        Dict of row index to the cell value, with "N/A" substituted for
        missing values.

    Raises:
        ValueError: If the CSV has no 'tanggal_register' column.
    """
    frame = pd.read_csv(logging_csv_path)

    # Fail early with a clear message when the expected column is absent
    if 'tanggal_register' not in frame.columns:
        raise ValueError("The column 'tanggal_register' was not found in the CSV file.")

    registered = {}
    for row_idx, record in frame.iterrows():
        value = record['tanggal_register']
        registered[row_idx] = "N/A" if pd.isna(value) else value
    return registered

# Extract "Tanggal Putusan" from the 'judul' column in Logging.csv (assuming the date follows the word 'Tanggal')
def extract_tanggal_putusan(logging_csv_path):
    """Map each CSV row index to the verdict date found in its 'judul' text.

    The CSV is read with its first line as the header row.  The date is the
    "<1-2 digits> <word> <4 digits>" sequence that follows the word
    "Tanggal" in the title.

    Args:
        logging_csv_path: Path of the logging CSV file.

    Returns:
        Dict of row index to the date string, or "N/A" when the title is
        missing or contains no such date.

    Raises:
        ValueError: If the CSV has no 'judul' column.
    """
    logging_data = pd.read_csv(logging_csv_path)  # Assumes the CSV has column headers

    # Ensure the column name is correctly referenced
    if 'judul' not in logging_data.columns:
        raise ValueError("The column 'judul' was not found in the CSV file.")

    tanggal_putusan_dict = {}

    for index, row in logging_data.iterrows():
        title = row['judul']

        # An empty title cell is read as NaN (a float); searching it would
        # raise TypeError, so treat any non-text title as "no date".
        match = None
        if isinstance(title, str):
            # Match a date pattern following the word "Tanggal"
            match = re.search(r"Tanggal\s+(\d{1,2}\s+[A-Za-z]+\s+\d{4})", title)

        tanggal_putusan_dict[index] = match.group(1) if match else "N/A"

    return tanggal_putusan_dict

# Clean illegal characters for Excel
def clean_illegal_characters(data):
    """Return a copy of *data* whose values are control-character-free text.

    Every value is converted with str() and each character in the ranges
    0x00-0x1F and 0x7F is replaced by a single space.  None values become
    the empty string.

    Args:
        data: Mapping of field name to value.

    Returns:
        New dict with the same keys and sanitised string values.
    """
    control_chars = re.compile(r'[\x00-\x1F\x7F]')
    sanitized = {}
    for key, value in data.items():
        if value is None:
            sanitized[key] = ''
        else:
            sanitized[key] = control_chars.sub(' ', str(value))
    return sanitized

# Save data to an Excel file
def save_to_excel(data, output_file):
    """Write the extracted fields of all files to an Excel workbook.

    Each entry of *data* becomes one row, labelled by its key under the
    index heading "File Name"; the fields become columns.  Values are
    sanitised with clean_illegal_characters first and missing cells are
    written as empty strings.

    Args:
        data: Mapping of file name to that file's field dict.
        output_file: Destination path of the workbook.
    """
    sanitized = {}
    for filename, file_data in data.items():
        sanitized[filename] = clean_illegal_characters(file_data)

    # Files are columns after construction; transpose so each file is a row
    table = pd.DataFrame(sanitized).transpose().fillna("")
    table.to_excel(output_file, index_label="File Name")

# Read files from a folder and extract data to Excel
def read_folder_and_extract_data(folder_path, logging_csv_path, output_file_path, exclude_files=None):
    """Extract data from every .txt file in a folder and save it to Excel.

    Each processed file's extracted fields are combined with the
    "Penuntut Umum", "Tanggal Register" and "Tanggal Putusan" values read
    from the logging CSV, looked up by the file's position in the sorted
    directory listing ("N/A" when the CSV has no such row).

    NOTE(review): the position counts every directory entry, including
    excluded and non-.txt files, exactly as before; whether that lines up
    with the CSV rows cannot be verified from this code -- confirm against
    the real data.

    Args:
        folder_path: Directory containing the verdict text files.
        logging_csv_path: Path of the logging CSV file.
        output_file_path: Destination path of the Excel workbook.
        exclude_files: Optional collection of file names to skip.
    """
    if exclude_files is None:
        exclude_files = []

    # Load data for Penuntut Umum, Tanggal Register, and Tanggal Putusan from logging CSV
    penuntut_umum_data = extract_penuntut_umum(logging_csv_path)
    tanggal_register_data = extract_tanggal_register(logging_csv_path)
    tanggal_putusan_data = extract_tanggal_putusan(logging_csv_path)

    all_extracted_data = {}

    # os.listdir returns entries in arbitrary order; sorting makes the
    # index-based pairing with CSV rows and the output row order repeatable.
    for idx, file_name in enumerate(sorted(os.listdir(folder_path))):
        if file_name in exclude_files:
            continue  # Skip excluded files
        if not file_name.endswith('.txt'):
            continue  # Only text files hold verdicts

        file_path = os.path.join(folder_path, file_name)
        extracted_data = extract_data_from_file(file_path)

        # Attach the CSV-derived metadata for this position
        extracted_data['Penuntut Umum'] = penuntut_umum_data.get(idx, "N/A")
        extracted_data['Tanggal Register'] = tanggal_register_data.get(idx, "N/A")
        extracted_data['Tanggal Putusan'] = tanggal_putusan_data.get(idx, "N/A")

        all_extracted_data[file_name] = extracted_data

    # Save the extracted data to an Excel file
    save_to_excel(all_extracted_data, output_file_path)

# Files in the source folder that must not be processed
exclude_files = ['_2025-01-05.txt']

# Keyword table read by extract_data_from_file through the KEYWORDS global
KEYWORDS = load_keywords_from_file('keywords.txt')

# Input folder, logging CSV and output workbook
source_folder = 'putusan/'
logging_csv = 'putusan/Logging.csv'
destination_file = 'Data.xlsx'

# Run the extraction process
read_folder_and_extract_data(
    folder_path=source_folder,
    logging_csv_path=logging_csv,
    output_file_path=destination_file,
    exclude_files=exclude_files,
)