In [41]:
import json
import logging
import os
import time
from datetime import datetime
from pathlib import Path
from urllib.parse import quote

import pandas as pd
import requests
import xmltodict
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException, NoSuchElementException, ElementClickInterceptedException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


In [33]:


def get_eclis():
    """
    Collect the hrefs of all ECLI result links from a rechtspraak.nl search page.

    Opens the hard-coded search URL in Chrome, keeps clicking the
    'Laad meer resultaten' button until it can no longer be found or clicked,
    and then parses the fully expanded page with BeautifulSoup.

    Returns:
        list[str]: the ``href`` of every anchor whose text contains 'ECLI:'.
        The list is returned as scraped: it may contain duplicates and
        relative links.
    """

    chrome_options = Options()
    # Machine-specific Chrome binary; only applied when it actually exists so
    # that Selenium can fall back to its default browser lookup elsewhere.
    chrome_path = "C:/Program Files/Google-OLD/chrome-win64/chrome.exe"
    if os.path.exists(chrome_path):
        chrome_options.binary_location = chrome_path
    # chrome_options.add_argument("--headless")  # enable to run without a visible window
    driver = webdriver.Chrome(options=chrome_options)

    url = 'https://uitspraken.rechtspraak.nl/resultaat?zoekterm=~encrochat%20~sky-ecc%20~ennetcom%20~anom%20~pgp-safe%20~exclu&inhoudsindicatie=zt0&sort=Relevance&publicatiestatus=ps1&rechtsgebied=r3'

    try:
        driver.get(url)

        # Expand the result list. The loop only ends when locating or clicking
        # the button fails (find_element raises when the button is gone).
        while True:
            try:
                load_more_button = driver.find_element(By.ID, "lib-rnl-lib-rnl-laadMeerBtn")
                driver.execute_script("arguments[0].scrollIntoView(true);", load_more_button)
                time.sleep(2)  # let the scroll settle before clicking
                load_more_button.click()
                print("Clicked 'Laad meer resultaten' button.")
                time.sleep(2)  # wait for the page to load more results
            except Exception as e:
                print(f"No more 'Laad meer resultaten' button found.{e}")
                break

        # Grab the HTML while the browser session is still alive
        page_html = driver.page_source
    finally:
        # Always release the browser process, even if navigation failed
        driver.quit()

    # Process the page source with BeautifulSoup
    soup = BeautifulSoup(page_html, 'html.parser')

    # Keep the href of every link whose visible text contains an ECLI
    ecli_links = [a['href'] for a in soup.find_all('a', href=True) if 'ECLI:' in a.text]

    return ecli_links





In [None]:
# Run the scraper: opens Chrome on the search page and returns the hrefs of all links whose text contains 'ECLI:'
eclis = get_eclis()

In [38]:
# Only keep unique ECLI links. dict.fromkeys drops duplicates while keeping the
# first-seen order, so the result (and the checkpoint file written from it) is
# the same on every run; list(set(...)) ordered the strings differently per run.
eclis_uniq = list(dict.fromkeys(eclis))

In [40]:
# Turn every scraped link into an absolute URL
base_url = 'https://uitspraken.rechtspraak.nl'
processed_eclis = []
for link in eclis_uniq:
    # Links that already start with https:// are kept; the rest get the site prefix
    link = link if link.startswith('https://') else f"{base_url}{link}"
    processed_eclis.append(link)

In [42]:
 # Save current links to checkpoint file
# One link per line; the file is overwritten on every run
with open('processed_eclis.txt', 'w') as file:
    for link in processed_eclis:
        print(link, file=file)

## Via API
Not needed, since we already have all the ECLIs, but keyword search via the API is a potential upgrade.

In [8]:
# Logging: INFO level, each record prefixed with timestamp and level name
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Search endpoint; query parameters are appended directly after the trailing '?'
API_BASE_URL = "https://data.rechtspraak.nl/uitspraken/zoeken?"

def check_api(url):
    """Probe ``url`` with a GET request (10 s timeout).

    Returns:
        The HTTP status code when the request succeeds with a non-error
        status, or ``None`` when the request fails or the server answers
        with a 4XX/5XX status (the failure is logged).
    """
    try:
        reply = requests.get(url, timeout=10)
        reply.raise_for_status()  # turns 4XX / 5XX answers into an exception
    except requests.RequestException as exc:
        logging.error(f"Failed to reach the API: {exc}")
        return None
    return reply.status_code

def fetch_data(url):
    """Fetch the XML feed at ``url`` and return its entries as a list of dicts.

    Args:
        url: Full request URL of the search endpoint.

    Returns:
        list: one parsed dict per ``<entry>`` element under ``<feed>``.
        An empty list is returned when the feed has no entries, when the
        request fails, or when the response cannot be parsed (errors are
        logged, not raised).
    """
    try:
        # Same 10 s timeout as check_api, so a stalled server cannot hang the notebook
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        xml_data = xmltodict.parse(response.text)
        feed = xml_data['feed']
        entries = feed['entry'] if 'entry' in feed else []
        if entries is None:
            # An empty <entry/> element is parsed as None
            return []
        if not isinstance(entries, list):
            # xmltodict yields a bare dict (not a list) when there is exactly
            # one <entry>; wrap it so callers always receive a list of records
            entries = [entries]
        return entries
    except requests.RequestException as e:
        logging.error(f"Error fetching data: {e}")
        return []
    except Exception as e:
        logging.error(f"Error parsing XML data: {e}")
        return []

def save_to_csv(data, filename):
    """Write ``data`` to ``data/<filename>.csv``.

    Nothing is written when ``data`` is empty; that case is only logged.
    The ``data`` directory is created if it does not exist yet.
    """
    if not data:
        logging.info("No data to save.")
        return

    frame = pd.DataFrame(data)
    out_dir = Path('data')
    out_dir.mkdir(parents=True, exist_ok=True)
    target = out_dir / f'{filename}.csv'
    frame.to_csv(target, index=False, encoding='utf8')
    logging.info(f"Data saved to CSV file successfully at {target}")

def main(max_cases=100, start_date='1900-01-01', end_date=None, keywords=None, save_file=True):
    """Query the search endpoint and optionally store the returned entries as CSV.

    Args:
        max_cases: Value sent as the ``max`` query parameter.
        start_date: First ``date`` query parameter (``YYYY-MM-DD``).
        end_date: Second ``date`` query parameter; defaults to today.
        keywords: Iterable of search terms appended to the ``keyword`` query
            parameter; ``None`` (the default) means no keywords.
        save_file: When true, the fetched entries are written via ``save_to_csv``.

    Returns:
        None. Progress and failures are reported through ``logging``.
    """
    logging.info("Starting Rechtspraak data extraction...")

    end_date = end_date or datetime.now().strftime("%Y-%m-%d")
    keywords = keywords or []  # avoid a shared mutable default argument

    # quote() percent-encodes the '%' signs again, so every term is sent as
    # '2B%25257e<keyword>'.
    # NOTE(review): this looks double-encoded - confirm what the API expects.
    formatted_keywords = [quote(f"2B%257e{keyword}") for keyword in keywords]
    keyword_query = ''.join(formatted_keywords)
    url = f"{API_BASE_URL}max={max_cases}&date={start_date}&date={end_date}&keyword={keyword_query}"
    print(url)
    if check_api(url) == 200:
        logging.info("API is responsive, proceeding with data fetching...")
        data = fetch_data(url)
        if data:
            if save_file:
                # The time-of-day suffix keeps repeated runs from overwriting each other
                filename = f'rechtspraak_{start_date}_{end_date}_{datetime.now().strftime("%H-%M-%S")}'
                save_to_csv(data, filename)
        else:
            logging.info("No cases found for the given dates.")
    else:
        logging.error("API check failed.")

In [11]:
if __name__ == "__main__":
    # Search terms passed to main() as the keyword list
    keywords = [
        'encrochat',
        'sky-ecc',
        'ennetcom',
        'anom',
        'pgp-safe',
        'exclu',
    ]
    main(
        max_cases=10,
        start_date='2023-01-01',
        keywords=keywords,
        save_file=True,
    )

2024-04-20 19:28:04,306 - INFO - Starting Rechtspraak data extraction...
2024-04-20 19:28:04,428 - INFO - API is responsive, proceeding with data fetching...


https://data.rechtspraak.nl/uitspraken/zoeken?max=10&date=2023-01-01&date=2024-04-20&keyword=2B%25257eencrochat2B%25257esky-ecc2B%25257eennetcom2B%25257eanom2B%25257epgp-safe2B%25257eexclu


2024-04-20 19:28:04,533 - INFO - Data saved to CSV file successfully at data\rechtspraak_2023-01-01_2024-04-20_19-28-04.csv
