# Install and Import Libraries

In [None]:
!pip install pymupdf

Collecting pymupdf
  Downloading pymupdf-1.25.1-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.25.1-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (20.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.0/20.0 MB[0m [31m55.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pymupdf
Successfully installed pymupdf-1.25.1


In [None]:
!pip install pdfminer.six

Collecting pdfminer.six
  Downloading pdfminer.six-20240706-py3-none-any.whl.metadata (4.1 kB)
Downloading pdfminer.six-20240706-py3-none-any.whl (5.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m37.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pdfminer.six
Successfully installed pdfminer.six-20240706


In [None]:
# %%
# Standard library imports
import argparse
from concurrent.futures import ThreadPoolExecutor, wait
from datetime import date
import io
import os
import re
import threading
import time
import urllib
import urllib.error
import urllib.parse
import urllib.request

# Third-party imports
import fitz  # PyMuPDF
import pandas as pd
import requests
from bs4 import BeautifulSoup
from pdfminer import high_level

# Scraping

In [None]:
# Make Directory
# Snapshot of the working directory at the moment this cell runs. It is only used for the
# message printed below; create_path calls os.getcwd() itself rather than reading this value.
CURRENT_DIRECTORY = os.getcwd()

print(f"Direktori kerja saat ini: {CURRENT_DIRECTORY}")

def ensure_directory_exists(folder_name):
    """Resolve ``folder_name`` under the working directory, make sure it exists, and return it.

    Args:
        folder_name: Folder name (or relative path) to resolve against ``os.getcwd()``.

    Returns:
        The full path produced by ``create_path``.
    """
    directory = create_path(folder_name)
    create_directory_if_not_exists(directory)
    return directory

def create_path(folder_name):
    """Return the path of ``folder_name`` under the current working directory, creating it if missing.

    Args:
        folder_name: Folder name (or relative path) joined onto ``os.getcwd()``.

    Returns:
        The joined path as a string. The directory exists when the function returns.
    """
    full_path = os.path.join(os.getcwd(), folder_name)
    if not os.path.exists(full_path):
        # Only announce the path when it is really about to be created; this function is
        # called once per scraped decision and used to print the line every single time.
        print(f"Path yang akan dibuat: {full_path}")
        # exist_ok=True: another worker thread may create the folder between the check
        # above and this call, which previously raised FileExistsError.
        os.makedirs(full_path, exist_ok=True)
    return full_path

def create_directory_if_not_exists(path):
    """Create the directory ``path`` (including parents) unless something already exists there.

    Args:
        path: Directory path to create.
    """
    if not os.path.exists(path):
        # exist_ok=True keeps a concurrent creator (between the check and this call)
        # from turning into a FileExistsError.
        os.makedirs(path, exist_ok=True)

Direktori kerja saat ini: /content


In [None]:
# Catch HTML from URL
# Upper bound on the number of attempts fetch_html_with_retries makes for one URL.
MAX_RETRIES = 3
# Pause after a failed attempt before the next one.
RETRY_DELAY = 15  # delay in seconds


def fetch_html_with_retries(url, timeout=30):
    """Download ``url`` and parse it with BeautifulSoup, retrying on request errors.

    Args:
        url: Page to fetch.
        timeout: Seconds passed to ``requests.get``. Without a timeout a stalled
            connection would block the calling worker indefinitely.

    Returns:
        A ``BeautifulSoup`` object parsed with the ``lxml`` parser, or ``None`` when all
        ``MAX_RETRIES`` attempts failed. Callers must handle the ``None`` case.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    for attempt in range(1, MAX_RETRIES + 1):
        try:
            response = requests.get(url, headers=headers, timeout=timeout)
            response.raise_for_status()  # Treat HTTP 4xx/5xx as a failed attempt
            return BeautifulSoup(response.text, "lxml")
        except requests.RequestException as error:
            print(f"Attempt {attempt}/{MAX_RETRIES} failed for {url}: {error}")
            # Sleeping after the final attempt would only delay the failure.
            if attempt < MAX_RETRIES:
                time.sleep(RETRY_DELAY)
    return None

In [None]:
# Data Extraction
def extract_detail_from_soup(soup, keyword):
    """Return the stripped text of the element following the first ``td`` that mentions ``keyword``.

    Args:
        soup: Object exposing ``find`` (a BeautifulSoup document or tag).
        keyword: Label text searched for inside ``td`` cells.

    Returns:
        The stripped text of the element after the matching cell, or ``""`` when no cell
        matches or an ``AttributeError`` occurs along the way.
    """
    def _is_label_cell(tag):
        return tag.name == "td" and keyword in tag.text

    try:
        label_cell = soup.find(_is_label_cell)
        if not label_cell:
            return ""
        return label_cell.find_next().get_text().strip()
    except AttributeError:
        # e.g. soup is None, or there is no element after the label cell
        return ""

In [None]:
# Download PDF file
def download_pdf(file_url, output_directory, timeout=60):
    """Download ``file_url`` into ``output_directory`` and return its bytes and file name.

    Args:
        file_url: URL of the file to download.
        output_directory: Existing directory the file is written into.
        timeout: Seconds passed to ``urlopen`` so a stalled download cannot hang forever.

    Returns:
        ``(io.BytesIO(content), filename)`` on success, ``(None, None)`` when opening the
        URL or writing the file fails.
    """
    try:
        # The context manager closes the response even when reading fails.
        with urllib.request.urlopen(file_url, timeout=timeout) as response:
            header_name = response.info().get_filename()
            pdf_content = response.read()

        # get_filename() returns None when the server sends no Content-Disposition
        # filename; fall back to the last path segment of the URL instead of crashing.
        filename = header_name or urllib.parse.unquote(
            os.path.basename(urllib.parse.urlparse(file_url).path)
        )
        filename = filename.replace("/", " ") or "download.pdf"

        file_path = os.path.join(output_directory, filename)
        with open(file_path, "wb") as pdf_file:
            pdf_file.write(pdf_content)

        return io.BytesIO(pdf_content), filename

    except (urllib.error.URLError, IOError) as e:
        print(f"Error occurred: {e}")
        return None, None

In [None]:
# Clean Data
# Boilerplate strings (spaced-out watermark, disclaimer, site name) removed from extracted text.
_UNWANTED_TEXTS = (
    "M a h ka m a h A g u n g R e p u blik In d o n esia\n",
    "Disclaimer\n",
    "Kepaniteraan Mahkamah Agung Republik Indonesia berusaha untuk selalu mencantumkan informasi paling kini dan akurat sebagai bentuk komitmen Mahkamah Agung untuk pelayanan publik, transparansi dan akuntabilitas\n",
    "pelaksanaan fungsi peradilan. Namun dalam hal-hal tertentu masih dimungkinkan terjadi permasalahan teknis terkait dengan akurasi dan keterkinian informasi yang kami sajikan, hal mana akan terus kami perbaiki dari waktu kewaktu.\n",
    "Dalam hal Anda menemukan inakurasi informasi yang termuat pada situs ini atau informasi yang seharusnya ada, namun belum tersedia, maka harap segera hubungi Kepaniteraan Mahkamah Agung RI melalui :\n",
    "Email : kepaniteraan@mahkamahagung.go.id    Telp : 021-384 3348 (ext.318)\n",
    "Direktori Putusan Mahkamah Agung Republik Indonesia",
    "putusan.mahkamahagung.go.id",
    "hkama ahkamah Agung Repub ahkamah Agung Republik Indonesia mah Agung Republik Indonesia blik Indonesi",
    "Disclaimer Kepaniteraan Mahkamah Agung Republik Indonesia berusaha untuk selalu mencantumkan informasi paling kini dan akurat sebagai bentuk komitmen Mahkamah Agung untuk pelayanan publik, transparansi dan akuntabilitas pelaksanaan fungsi peradilan. Namun dalam hal-hal tertentu masih dimungkinkan terjadi permasalahan teknis terkait dengan akurasi dan keterkinian informasi yang kami sajikan, hal mana akan terus kami perbaiki dari waktu kewaktu. Dalam hal Anda menemukan inakurasi informasi yang termuat pada situs ini atau informasi yang seharusnya ada, namun belum tersedia, maka harap segera hubungi Kepaniteraan Mahkamah Agung RI melalui : Email : kepaniteraan@mahkamahagung.go.id Telp : 021-384 3348 (ext.318)",
)


def _collapse_whitespace(value):
    """Turn every whitespace run (newlines included) into one space and trim both ends."""
    return ' '.join(value.split())


# The input text is whitespace-collapsed before matching, so the patterns must be collapsed
# the same way; entries containing "\n" or double spaces could otherwise never match.
# Longest first, so the full disclaimer is removed before any of its fragments.
_UNWANTED_PATTERNS = sorted(
    {_collapse_whitespace(entry) for entry in _UNWANTED_TEXTS if entry.strip()},
    key=lambda pattern: (-len(pattern), pattern),
)

# "Halaman | <n> " page footer in its whitespace-collapsed form, for any page number.
_PAGE_FOOTER_RE = re.compile(r'Halaman \| \d+ ?')


def clean_text(text):
    """Strip page markers and known boilerplate from text extracted from a decision PDF.

    Steps, in order:
      1. collapse all whitespace (including newlines) to single spaces;
      2. delete ``Halaman <n> dari <m>`` markers;
      3. replace remaining ``Halaman <n>`` markers with a newline;
      4. delete ``Halaman | <n>`` page footers;
      5. delete every boilerplate string listed in ``_UNWANTED_TEXTS``.

    Args:
        text: Raw extracted text.

    Returns:
        The cleaned text. Removals are not re-collapsed, so doubled spaces may remain
        where a marker or boilerplate string was cut out.
    """
    # remove extra space
    text = _collapse_whitespace(text)

    # Remove 'Halaman {number} dari {number}' patterns
    text = re.sub(r'Halaman \d+ dari \d+', '', text)

    # Replace 'Halaman {number}' with a newline
    text = re.sub(r'Halaman \d+', '\n', text)

    # Remove 'Halaman | {number}' page footers (no longer limited to pages 1-99)
    text = _PAGE_FOOTER_RE.sub('', text)

    for unwanted_text in _UNWANTED_PATTERNS:
        text = text.replace(unwanted_text, "")
    return text

In [None]:
# Extract File and Clean Data
def extract_and_clean_text(pdf_path):
    """Read the text of every page of the PDF at ``pdf_path`` and return it after ``clean_text``."""
    import fitz  # PyMuPDF; function-level import kept as in the original

    with fitz.open(pdf_path) as document:
        # Concatenate the pages in document order, without any separator.
        raw_text = "".join(page.get_text() for page in document)

    return clean_text(raw_text)

In [None]:
# Extracting key data from each verdict page
# Values stored under the "has_pdf" key of each logged record.
PDF_AVAILABLE_TEXT = "Ada PDF"
PDF_NOT_AVAILABLE_TEXT = "Tidak ada PDF"
# File name of the CSV log that save_results writes inside the output folder.
LOGGING_FILE_NAME = "Logging.csv"

def extract_data(url, keyword_url):
    """Scrape one decision page and persist its metadata and PDF text.

    Fetches ``url``, reads the title and the "Tahun", "Tanggal Register" and "Kaidah"
    fields from the page's ``table.table`` element, processes the linked PDF via
    ``process_pdf`` and hands everything to ``save_results``.

    Args:
        url: URL of the decision detail page.
        keyword_url: Keyword or search URL the decision was found through; forwarded to
            ``save_results``.

    Returns:
        None. The page is skipped (with a message) when it cannot be fetched or does not
        contain the expected table, instead of raising ``AttributeError``.
    """
    path_output = create_path("putusan")
    path_pdf = create_path("pdf-putusan")
    current_date = date.today().strftime("%Y-%m-%d")

    soup = fetch_html_with_retries(url)
    if soup is None:
        # fetch_html_with_retries returns None once its retries are exhausted
        print(f"Skipping {url}: page could not be fetched")
        return

    table = soup.find("table", {"class": "table"})
    if table is None:
        print(f"Skipping {url}: detail table not found")
        return

    title_tag = table.find("h2")
    judul = title_tag.text.strip() if title_tag else ""
    tahun = extract_detail_from_soup(table, "Tahun")
    tanggal_register = extract_detail_from_soup(table, "Tanggal Register")
    kaidah = extract_detail_from_soup(table, "Kaidah")

    link_pdf, text_pdf, file_name_pdf, has_pdf = process_pdf(soup, path_pdf)

    data = {
        "judul": judul,
        "tanggal_register": tanggal_register,
        "tahun": tahun,
        "kaidah": kaidah,
        "link": url,
        "link_pdf": link_pdf,
        "pdf_name": file_name_pdf,
        "has_pdf": has_pdf,
    }

    save_results(path_output, keyword_url, current_date, data, file_name_pdf, text_pdf)

def process_pdf(soup, path_pdf):
    """Download the PDF linked from a decision page and extract its cleaned text.

    Args:
        soup: Parsed decision page; the first ``<a>`` whose ``href`` contains ``/pdf/``
            is taken as the PDF link.
        path_pdf: Directory the PDF is downloaded into.

    Returns:
        ``(link_pdf, text_pdf, file_name_pdf, has_pdf)``. On success ``has_pdf`` is
        ``PDF_AVAILABLE_TEXT``; when there is no link, the download fails, or extraction
        raises, the first three items are ``""`` and ``has_pdf`` is
        ``PDF_NOT_AVAILABLE_TEXT``.
    """
    no_pdf = ("", "", "", PDF_NOT_AVAILABLE_TEXT)
    try:
        anchor = soup.find("a", href=re.compile(r"/pdf/"))
        if anchor is None:
            return no_pdf
        link_pdf = anchor["href"]

        _, file_name_pdf = download_pdf(link_pdf, path_pdf)
        if file_name_pdf is None:
            # download_pdf has already printed the reason
            return no_pdf

        pdf_file_path = os.path.join(path_pdf, file_name_pdf)
        # NOTE(review): extract_and_clean_text already applies clean_text; this second pass
        # flattens the newlines the first pass inserts at page markers. Kept as-is to
        # preserve the current output - confirm whether that is intended.
        text_pdf = clean_text(extract_and_clean_text(pdf_file_path))
    except Exception as error:
        # Best effort: a broken PDF must not abort the scrape, but say why it was skipped.
        print(f"PDF processing failed: {error}")
        return no_pdf
    return link_pdf, text_pdf, file_name_pdf, PDF_AVAILABLE_TEXT

# save_results is reached from several ThreadPoolExecutor workers at once (run_scraper submits
# run_process per page); the lock keeps the "does the log exist?" check and the append atomic
# so the header is written exactly once and rows are not interleaved.
_SAVE_RESULTS_LOCK = threading.Lock()


def save_results(path_output, keyword_url, current_date, data, file_name_pdf, text_pdf):
    """Append one record to the CSV log and write the decision text to its own file.

    Args:
        path_output: Directory that holds the CSV log and the text files.
        keyword_url: Keyword or search URL of the run (accepted for compatibility; not
            used when writing).
        current_date: Date string appended to the text file name.
        data: Mapping of column name to value for the CSV row.
        file_name_pdf: Name of the downloaded PDF; ``""`` when there is none.
        text_pdf: Cleaned text extracted from the PDF.

    Returns:
        None.
    """
    result = pd.DataFrame([data])

    destination_csv = os.path.join(path_output, LOGGING_FILE_NAME)
    with _SAVE_RESULTS_LOCK:
        # Append mode also creates the file; the header is only needed the first time.
        write_header = not os.path.isfile(destination_csv)
        result.to_csv(destination_csv, mode="a", header=write_header, index=False)

    if not file_name_pdf:
        # Without a PDF every decision used to (over)write the same empty "_<date>.txt".
        return

    destination_txt = os.path.join(path_output, f"{file_name_pdf}_{current_date}.txt")
    with open(destination_txt, 'w', encoding='utf-8') as file:
        file.write(text_pdf)

In [None]:
def build_search_link(base_url, page_number, sort):
    """Return the search URL for one result page.

    Args:
        base_url: Either a full search URL (anything starting with ``https``) or a bare
            keyword that is placed into the site's ``q`` parameter.
        page_number: Page number added as the ``page`` parameter.
        sort: When truthy, the ``obf``/``obm`` sort parameters are appended.

    Returns:
        The assembled URL string.
    """
    if base_url.startswith("https"):
        link = f"{base_url}&page={page_number}"
    else:
        link = f"https://putusan3.mahkamahagung.go.id/search.html?q={base_url}&page={page_number}"

    if not sort:
        return link
    return link + "&obf=TANGGAL_PUTUS&obm=desc"


def run_process(base_url, page_number, sort_results):
    """Scrape every decision linked from one page of search results.

    Args:
        base_url: Keyword or search URL, as accepted by ``build_search_link``.
        page_number: Result page to process.
        sort_results: Forwarded to ``build_search_link`` as its ``sort`` flag.

    Returns:
        None. A result page that cannot be fetched is skipped, and a failure on one
        decision is reported without aborting the remaining decisions of the page.
    """
    search_link = build_search_link(base_url, page_number, sort_results)
    soup = fetch_html_with_retries(search_link)
    if soup is None:
        print(f"Skipping page {page_number}: {search_link} could not be fetched")
        return

    for decision_link in soup.find_all("a", {"href": re.compile("/direktori/putusan")}):
        href = decision_link["href"]
        try:
            extract_data(href, base_url)
        except Exception as error:
            # This runs inside a worker thread: an uncaught exception would silently
            # drop every remaining decision on this page.
            print(f"Failed to process {href}: {error}")


def run_scraper(keyword=None, url=None, sort_by_date=True, download_pdfs=True):
    """Scrape all result pages for a keyword or a search URL using four worker threads.

    Args:
        keyword: Search keyword; used when ``url`` is not given.
        url: Full search URL; takes precedence over ``keyword``.
        sort_by_date: Forwarded to ``run_process`` as its sort flag.
        download_pdfs: Accepted for compatibility; the current implementation does not
            read it.

    Returns:
        None. Prints a message and returns early when neither ``keyword`` nor ``url`` is
        given or when the first result page cannot be fetched.
    """
    if not keyword and not url:
        print("Please provide a keyword or URL")
        return

    # Create both output folders up front, before any worker thread needs them.
    create_path("putusan")
    create_path("pdf-putusan")
    search_link = url if url else f"https://putusan3.mahkamahagung.go.id/search.html?q={keyword}&page=1"

    soup = fetch_html_with_retries(search_link)
    if soup is None:
        print(f"Could not fetch {search_link}; nothing was scraped")
        return

    page_links = soup.find_all("a", {"class": "page-link"})
    try:
        last_page_number = int(page_links[-1]["data-ci-pagination-page"])
    except (IndexError, KeyError, TypeError, ValueError):
        # No usable pagination link: treat the result as a single page.
        last_page_number = 1

    base_url = url or keyword
    print(
        f"Scraping with {'url' if url else 'keyword'}: {base_url} - {20 * last_page_number} data - {last_page_number} page")

    with ThreadPoolExecutor(max_workers=4) as executor:
        futures = [
            executor.submit(run_process, base_url, page_number, sort_by_date)
            for page_number in range(1, last_page_number + 1)
        ]

    # Leaving the with-block has already waited for every task; surface any exception a
    # worker raised, which would otherwise stay hidden inside its Future.
    for page_number, future in enumerate(futures, start=1):
        error = future.exception()
        if error is not None:
            print(f"Page {page_number} failed: {error}")

In [None]:
# Download PN Denpasar - Pidana Khusus - Narkotika dan Psikotropika
# The argument is a pre-filtered search URL (its query string carries t_put=2024);
# build_search_link appends "&page=<n>" to it for every result page.
run_scraper(url="https://putusan3.mahkamahagung.go.id/search.html?q=&jenis_doc=putusan&cat=3c40e48bbab311301a21c445b3c7fe57&jd=&tp=0&court=099780PN75++++++++++++++++++++++&t_put=2024&t_reg=&t_upl=&t_pr=")

Path yang akan dibuat: /content/putusan
Path yang akan dibuat: /content/pdf-putusan
Scraping with url: https://putusan3.mahkamahagung.go.id/search.html?q=&jenis_doc=putusan&cat=3c40e48bbab311301a21c445b3c7fe57&jd=&tp=0&court=099780PN75++++++++++++++++++++++&t_put=2024&t_reg=&t_upl=&t_pr= - 520 data - 26 page
Path yang akan dibuat: /content/putusan
Path yang akan dibuat: /content/pdf-putusan
Path yang akan dibuat: /content/putusan
Path yang akan dibuat: /content/pdf-putusan
Path yang akan dibuat: /content/putusan
Path yang akan dibuat: /content/pdf-putusan
Path yang akan dibuat: /content/putusan
Path yang akan dibuat: /content/pdf-putusan
Path yang akan dibuat: /content/putusan
Path yang akan dibuat: /content/pdf-putusan
Path yang akan dibuat: /content/putusan
Path yang akan dibuat: /content/pdf-putusan
Path yang akan dibuat: /content/putusan
Path yang akan dibuat: /content/pdf-putusan
Path yang akan dibuat: /content/putusan
Path yang akan dibuat: /content/pdf-putusan
Path yang akan dib

In [None]:
from google.colab import files
import shutil

# Folder whose contents go into the archive (the scraper's PDF output folder under /content)
folder_path = '/content/pdf-putusan'

# Compress the folder into a zip file; make_archive adds the ".zip" extension to the
# base name, so the result is /content/pdf-putusan.zip
shutil.make_archive('/content/pdf-putusan', 'zip', folder_path)

# Download the zip file that has been created
files.download('/content/pdf-putusan.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
from google.colab import files
import shutil

# Folder whose contents go into the archive (the scraper's CSV log and text output folder)
folder_path = '/content/putusan'

# make_archive adds the ".zip" extension to the base name, producing /content/putusan.zip
shutil.make_archive('/content/putusan', 'zip', folder_path)

# Download the zip file that has been created
files.download('/content/putusan.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>