## Scraping all PDFs before the API became available

In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.webdriver import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import requests
import os
import time

In [None]:
# Start a Selenium-controlled Chrome session with default browser options
options = Options()
driver = webdriver.Chrome(options=options)

# Destination for the downloaded PDFs: <home>/Downloads/debates_pdf/etc
downloads_folder = os.path.join(os.path.expanduser("~"), "Downloads")
pdf_save_folder = os.path.join(downloads_folder, "debates_pdf")
year_folder = os.path.join(pdf_save_folder, "etc")

# Make sure the destination exists (no error when it is already there)
os.makedirs(year_folder, exist_ok=True)

# Search URL on zoek.officielebekendmakingen.nl, written as adjacent string
# literals (one query clause per line) that concatenate into a single URL.
# The query restricts dt.type to "Handeling" and w.vergaderjaar to "2020-2021".
base_url = (
    "https://zoek.officielebekendmakingen.nl/resultaten"
    "?q=(c.product-area==%22officielepublicaties%22)"
    "and((dt.creator==%22Tweede%20Kamer%20der%20Staten-Generaal%22)"
    "or(dt.creator==%22Tweede%20Kamer%20OCV%20/%20UCV%22))"
    "and(((w.publicatienaam==%22Agenda%22))"
    "or((w.publicatienaam==%22Handelingen%22))"
    "or((w.publicatienaam==%22Kamerstuk%22))"
    "or(((w.publicatienaam==%22Kamervragen%20(Aanhangsel)%22)"
    "or(w.publicatienaam==%22Kamervragen%20zonder%20antwoord%22)))"
    "or((w.publicatienaam==%22Niet-dossierstuk%22)))"
    "%20AND%20dt.type==%22Handeling%22"
    "%20AND%20w.vergaderjaar==%222020-2021%22"
    "&zv=&pg=10&col=AlleParlementaireDocumenten&svel=Kenmerkendedatum&svol=Oplopend"
)

# Load the first results page
driver.get(base_url)

# Fixed pause so the page has time to load before it is queried
time.sleep(3)

# Cap on the number of result pages to scrape, to limit disk usage
page_count = 0
max_pages = 2

# Upper bound (in seconds) on how long a single PDF request may stall;
# without it requests.get can block indefinitely on an unresponsive server
request_timeout = 30

# The try/finally guarantees the browser is closed even when the loop is
# interrupted (e.g. Ctrl+C) or an unexpected error escapes
try:
    # Walk through the result pages until the cap is hit or there is no next page
    while page_count < max_pages:
        # PDF download links collected from the current results page
        pdf_links = []

        try:
            # The element with id "Publicaties" holds the result list
            publicaties_section = driver.find_element(By.ID, "Publicaties")

            # All li descendants of that section. NOTE(review): this also
            # matches the li elements nested inside each result's action list;
            # those presumably end up in the "Skipping an item" branch below.
            li_elements = publicaties_section.find_elements(By.TAG_NAME, "li")

            for li in li_elements:
                try:
                    # Nested element with class "result--actions"
                    actions_ul = li.find_element(By.CLASS_NAME, "result--actions")

                    # First li inside that action list
                    action_li = actions_ul.find_element(By.TAG_NAME, "li")

                    # href of the first a-element, used as the PDF download link
                    pdf_link = action_li.find_element(By.TAG_NAME, "a").get_attribute("href")

                    # get_attribute returns None when the anchor has no href;
                    # such an entry cannot be downloaded, so it is not queued
                    if pdf_link:
                        pdf_links.append(pdf_link)
                    else:
                        print("Skipping an item due to an error: link has no href")

                # Best effort: report the item and continue with the next one
                except Exception as e:
                    print(f"Skipping an item due to an error: {e}")

        # Best effort: report the problem and continue with whatever was collected
        except Exception as e:
            print(f"Error finding elements: {e}")

        # Download every PDF that was found on this page
        for index, pdf_url in enumerate(pdf_links):
            try:
                print(f"Downloading PDF {index + 1}/{len(pdf_links)}: {pdf_url}")

                # Streamed download; the with-block releases the connection
                # whether or not the body was fully consumed
                with requests.get(pdf_url, stream=True, timeout=request_timeout) as response:
                    if response.status_code == 200:
                        # File name is the last path segment of the download URL
                        pdf_filename = os.path.join(year_folder, pdf_url.split("/")[-1])

                        # Write the body to disk in 1 KiB chunks
                        with open(pdf_filename, "wb") as pdf_file:
                            for chunk in response.iter_content(1024):
                                pdf_file.write(chunk)

                        print(f"PDF saved: {pdf_filename}")
                    else:
                        print(f"Failed to download PDF {index + 1}")

                # Short delay before the next download, to not hit quota limits
                time.sleep(1)

            # Best effort: report the failure and continue with the next PDF
            except Exception as e:
                print(f"Error downloading PDF {index + 1}: {e}")

        # If this was the last page allowed, stop here instead of loading a
        # further page that would never be scraped
        if page_count + 1 >= max_pages:
            page_count += 1
            print(f"Reached the maximum of {max_pages} pages. Exiting.")
            break

        # Locate the "Next" link inside the pagination section.
        # find_element raises when nothing matches, so reaching the line after
        # the lookups means the link exists.
        try:
            pagination_section = driver.find_element(By.ID, "paging-results")
            next_button = pagination_section.find_element(By.CLASS_NAME, "next").find_element(By.TAG_NAME, "a")
            next_page_url = next_button.get_attribute("href")
        except Exception:
            print("No 'Next' button found. Exiting loop.")
            break

        # A link without an href gives nowhere to navigate to
        if not next_page_url:
            print("No more pages. Exiting.")
            break

        # Navigate to the next results page
        print(f"Going to next page: {next_page_url}")
        try:
            driver.get(next_page_url)
        except Exception as e:
            print(f"Could not load the next page: {e}")
            break
        page_count += 1
        time.sleep(2)

finally:
    # Closing the browser after scraping
    driver.quit()

#### Making the PDFs smaller, so as not to overload my PC

In [None]:
import os
import subprocess

# Define paths (replace the placeholders before running)
gs_path = r"[put path to your Ghostscript executable here]"
input_folder = r"[put path to the normally downloaded PDFs here]"
output_folder = r"[put path to the desired output folder for the compressed PDFs here]"

# Create output folder if it doesn't exist
os.makedirs(output_folder, exist_ok=True)

# Compression settings: Change "/screen" to "/ebook" or "/printer" for better quality
compression_level = "/screen"

# Run Ghostscript once per PDF found in the input folder
for filename in os.listdir(input_folder):
    # Only files with a .pdf extension (case-insensitive) are processed
    if not filename.lower().endswith(".pdf"):
        continue

    input_file = os.path.join(input_folder, filename)
    output_file = os.path.join(output_folder, f"compressed_{filename}")

    # Argument list (no shell involved), so paths with spaces are passed intact
    command = [
        gs_path, "-sDEVICE=pdfwrite", "-dNOPAUSE", "-dBATCH", "-dSAFER",
        f"-dPDFSETTINGS={compression_level}", f"-sOutputFile={output_file}", input_file
    ]

    # Output is captured rather than echoed; it is only shown when the run fails
    result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

    # Report success only when Ghostscript actually exited cleanly
    if result.returncode == 0:
        print(f"Compressed: {filename} → {output_file}")
    else:
        details = (result.stderr or result.stdout).decode(errors="replace").strip()
        print(f"Failed to compress {filename} (exit code {result.returncode}): {details}")