In [2]:
# Mount Google Drive at /content/drive (Colab-only; needs the google.colab runtime).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
import os
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import glob
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter
import json
import matplotlib.pyplot as plt
from transformers import AutoTokenizer, AutoModel
import torch

In [3]:
# Load the source CSV and replace missing cells with empty strings so the text
# helpers can call string methods on every cell.
# fillna is reassigned instead of using inplace=True: the in-place form is the
# line the recorded warning points at, and reassignment keeps this cell
# idempotent on re-run.
df = pd.read_csv("/content/drive/MyDrive/ProyekA/CSV/putusan_ma__2025-06-24.csv")
df = df.fillna("")

# Metadata extraction
def extract_metadata(row):
    """Collect the metadata fields of one record into a plain dict.

    Parameters
    ----------
    row : mapping-like
        Any object exposing ``.get(key, default)`` (e.g. a ``dict`` or a
        ``pandas.Series``). Keys read: ``nomor``, ``judul``,
        ``tanggal_register``, ``klasifikasi``, ``hakim_ketua``,
        ``hakim_anggota``, ``panitera`` and ``text_pdf``. Missing keys fall
        back to ``""``.

    Returns
    -------
    dict
        ``case_id``, ``judul``, ``tanggal``, ``jenis_perkara``, ``pasal``
        (list of lowercase ``pasal ...`` snippets found in the text),
        ``pihak`` (the three name fields joined with ``", "``) and
        ``text_full`` (the document text, ``""`` when it is not a string).
    """
    # Read the text once. A missing cell may arrive as None/NaN rather than a
    # string; treat anything that is not a string as "no text" instead of
    # failing on .lower().
    raw_text = row.get("text_pdf", "")
    text_full = raw_text if isinstance(raw_text, str) else ""

    # Each snippet starts at "pasal" and runs up to 100 characters, stopping
    # early at the first period or newline.
    pasal_snippets = re.findall(r"pasal[^.\n]{0,100}", text_full.lower())

    return {
        "case_id": row.get("nomor", ""),
        "judul": row.get("judul", ""),
        "tanggal": row.get("tanggal_register", ""),
        "jenis_perkara": row.get("klasifikasi", ""),
        "pasal": pasal_snippets,
        "pihak": f"{row.get('hakim_ketua', '')}, {row.get('hakim_anggota', '')}, {row.get('panitera', '')}",
        "text_full": text_full
    }

# Content extraction
def extract_summary_and_argument(text):
    """Pull the first "barang bukti" phrase and the first ruling phrase from text.

    The text is lowercased before matching, so both results are lowercase.

    Parameters
    ----------
    text : str
        Document text. Anything that is not a string (e.g. None/NaN from a
        missing cell) is treated as empty text.

    Returns
    -------
    tuple[str, str]
        ``(ringkasan, argumen)``.
        ``ringkasan`` starts at the first "barang bukti" and stops before the
        next newline, period or semicolon (terminator excluded).
        ``argumen`` starts at the first "menyatakan", "mengadili" or
        "memutuskan" and runs through the next newline, period or semicolon
        (terminator included).
        Either value is ``""`` when its phrase is absent.
    """
    if not isinstance(text, str):
        return "", ""
    text = text.lower()

    # \Z lets a phrase that runs to the very end of the text still match;
    # without it a final clause lacking a terminator is silently dropped.
    ringkasan = re.search(r"(barang bukti.*?)(?:\n|\.|;|\Z)", text)
    argumen = re.search(r"(menyatakan|mengadili|memutuskan).*?(?:\n|\.|;|\Z)", text)

    ringkasan_text = ringkasan.group(1) if ringkasan else ""
    argumen_text = argumen.group(0) if argumen else ""
    return ringkasan_text, argumen_text

# QA pairs
def generate_qa(text):
    """Build up to three question/answer pairs from regex matches in text.

    The text is lowercased before matching, so answers are lowercase. Each
    answer is the first match of its pattern with surrounding whitespace
    stripped; a question is omitted when its pattern does not match.

    Parameters
    ----------
    text : str
        Document text. Anything that is not a string (e.g. None/NaN from a
        missing cell) yields an empty list.

    Returns
    -------
    list[dict]
        Dicts with ``question`` and ``answer`` keys, in the fixed order:
        evidence ("barang bukti"), ruling ("menyatakan" / "mengadili" /
        "memutuskan"), legal basis ("pasal").
    """
    if not isinstance(text, str):
        return []
    text = text.lower()

    # (question, pattern) in output order. The first two patterns end at the
    # next '.', ':', ';' or newline; \Z additionally accepts end-of-text so a
    # final clause without a terminator is not dropped.
    rules = (
        ("Apa barang bukti dalam perkara ini?",
         r"barang bukti.*?(?:[.:;\n]|\Z)"),
        ("Apa amar putusan yang diputuskan?",
         r"(?:menyatakan|mengadili|memutuskan).*?(?:[.:;\n]|\Z)"),
        ("Pasal apa yang dijadikan dasar?",
         r"pasal[^.\n]{0,100}"),
    )

    qas = []
    for question, pattern in rules:
        if (m := re.search(pattern, text)):
            qas.append({"question": question, "answer": m.group(0).strip()})
    return qas

# Feature engineering: whitespace-token count per document plus a small
# bag-of-words over the 10 most frequent remaining terms.
df['length'] = df['text_pdf'].apply(lambda x: len(str(x).split()))

# The extraction patterns in this notebook target Indonesian phrases, so the
# built-in 'english' stop list removes almost nothing and the 10 feature slots
# would likely go to Indonesian function words. Use an Indonesian list instead.
# NOTE(review): this list is a hand-picked set of common function words, not an
# exhaustive resource - extend it if uninformative terms still show up in bow_*.
INDONESIAN_STOP_WORDS = [
    "ada", "adalah", "agar", "akan", "anda", "antara", "apabila", "atas",
    "atau", "bagi", "bahwa", "dalam", "dan", "dapat", "dari", "dengan", "di",
    "dia", "hingga", "ia", "ini", "itu", "jika", "juga", "kami", "karena",
    "ke", "kemudian", "kepada", "kita", "lalu", "maka", "mereka", "namun",
    "oleh", "pada", "para", "sampai", "saya", "sebagai", "sebelum", "sebuah",
    "secara", "serta", "setelah", "sudah", "telah", "tersebut", "tetapi",
    "tidak", "untuk", "yaitu", "yang",
]

vectorizer = CountVectorizer(max_features=10, stop_words=INDONESIAN_STOP_WORDS)
X = vectorizer.fit_transform(df['text_pdf'])
# One column per selected term, prefixed with "bow_"; rows follow df's row order.
bow_df = pd.DataFrame(X.toarray(), columns=[f"bow_{w}" for w in vectorizer.get_feature_names_out()])

# Process every row into one structured case record.
cases = []
for _, row in df.iterrows():
    meta = extract_metadata(row)
    ringkasan, argumen = extract_summary_and_argument(meta["text_full"])
    qa = generate_qa(meta["text_full"])
    cases.append({
        "case_id": meta["case_id"],
        "no_perkara": meta["case_id"],  # same value as case_id
        "tanggal": meta["tanggal"],
        "jenis_perkara": meta["jenis_perkara"],
        "pasal": "; ".join(meta["pasal"]),  # list of snippets -> one string
        "pihak": meta["pihak"],
        "ringkasan_fakta": ringkasan,
        "argumen_hukum": argumen,
        "text_full": meta["text_full"],
        # Stored as a JSON string so the list survives the flat CSV export.
        "qa_pairs": json.dumps(qa, ensure_ascii=False),
    })

final_df = pd.DataFrame(cases)
final_df["length"] = df["length"]

# The bag-of-words columns are attached by row position, so a stale bow_df
# (e.g. cells re-run out of order) would silently misalign or add NaN rows.
assert len(final_df) == len(bow_df), "bow_df is out of sync with df; re-run the feature cell"
final_df = pd.concat([final_df, bow_df], axis=1)

output_csv = "/content/drive/MyDrive/ProyekA/data/processed/cases.csv"
output_json = "/content/drive/MyDrive/ProyekA/data/processed/cases.json"
os.makedirs(os.path.dirname(output_csv), exist_ok=True)
final_df.to_csv(output_csv, index=False)
# force_ascii=False writes non-ASCII characters as-is instead of \uXXXX
# escapes, matching the ensure_ascii=False used for qa_pairs above.
final_df.to_json(output_json, orient="records", indent=2, force_ascii=False)

print("Berhasil disimpan ke:")
print(output_csv)
print(output_json)

  df.fillna("", inplace=True)


Berhasil disimpan ke:
/content/drive/MyDrive/ProyekA/data/processed/cases.csv
/content/drive/MyDrive/ProyekA/data/processed/cases.json
