In [1]:
import requests
import os
import pandas as pd

In [2]:
##########
# This code will take several minutes to run.
# For 2024 it downloads 1448 documents, which take up 369 MB of space.
# To extend the dataset with another year's speeches, change `year` below and
# rename the two output text files written at the end of this cell
# (filepaths2024.txt and filenames2024.txt) to match.
##########

# Initial setup
year = 2024  # the only place the year needs changing for the query and the PDF folder
base_url = "https://gegevensmagazijn.tweedekamer.nl/OData/v4/2.0/Document"
# OData filter: documents of kind 'Stenogram' dated in the selected year.
query_params = f"?$filter=Soort eq 'Stenogram' and year(Datum) eq {year}"
page_size = 250
skip = 0

# Ensure there is a directory to save the downloaded PDFs (one folder per year)
pdf_folder = f'tweede_pdfs{year}'
os.makedirs(pdf_folder, exist_ok=True)

# Function to make HTTP GET requests
def fetch_data(url, timeout=60):
    """Fetch ``url`` with an HTTP GET and return the decoded JSON body.

    Parameters
    ----------
    url : str
        Fully built request URL (including any query string).
    timeout : float or tuple, optional
        Seconds to wait for the server before giving up; passed straight to
        ``requests.get``. Without a timeout a stalled connection would block
        the notebook forever.

    Returns
    -------
    The parsed JSON document, as returned by ``response.json()``.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    requests.Timeout
        If the server does not respond within ``timeout``.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()  # Will stop the loop if an HTTP error occurs
    return response.json()

# Function to download a PDF from a URL
def download_pdf(pdf_url, filename, timeout=60, chunk_size=65536):
    """Download ``pdf_url`` and store the response body at ``filename``.

    The body is streamed to ``<filename>.part`` in chunks and only moved to
    ``filename`` once the transfer has finished, so an interrupted download
    never leaves a truncated file under the final name.

    Parameters
    ----------
    pdf_url : str
        URL of the resource to download.
    filename : str
        Destination path of the finished file.
    timeout : float or tuple, optional
        Seconds to wait for the server; passed to ``requests.get``.
    chunk_size : int, optional
        Number of bytes read from the response per iteration.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status (nothing is written to
        ``filename`` in that case).
    """
    partial_name = f"{filename}.part"
    try:
        # stream=True avoids holding the whole document in memory at once.
        with requests.get(pdf_url, stream=True, timeout=timeout) as response:
            response.raise_for_status()
            with open(partial_name, 'wb') as f:
                for chunk in response.iter_content(chunk_size=chunk_size):
                    f.write(chunk)
        # Move into place only after the full body has been written.
        os.replace(partial_name, filename)
    finally:
        # Only still present when something above failed; clean it up.
        if os.path.exists(partial_name):
            os.remove(partial_name)

# Loop to fetch all pages and download PDFs
file_paths = []
file_names = []
while True:
    full_url = f"{base_url}{query_params}&$skip={skip}"
    data = fetch_data(full_url)

    entries = data.get('value') or []
    if not entries:
        # An empty (or missing) 'value' list means every document has been listed.
        break

    for entry in entries:
        # Construct the URL for the PDF resource
        doc_id = entry['Id']
        pdf_url = f"{base_url}/{doc_id}/resource"
        pdf_filename = os.path.join(pdf_folder, f"{doc_id}.pdf")

        # Re-running the cell (e.g. after an interrupted run) should not fetch
        # hundreds of MB again: keep any non-empty file that is already on disk.
        # Delete a file by hand to force it to be downloaded again.
        already_present = os.path.isfile(pdf_filename) and os.path.getsize(pdf_filename) > 0
        if not already_present:
            download_pdf(pdf_url, pdf_filename)

        # Record the document only once its PDF is actually available locally.
        file_names.append(doc_id + '.pdf')
        file_paths.append(pdf_filename)

    # Advance by what the server really returned. The request never tells the
    # server how many entries to send, so assuming a fixed page length here
    # could silently skip or duplicate documents.
    skip += len(entries)

print(f"Downloaded all PDFs to {pdf_folder}")

# Persist both lists (one entry per line) so a later session can reload them
# without downloading the PDFs again.
# Note: when downloading a different year, rename these two files to match.
for list_filename, entries_to_save in (
    ('filepaths2024.txt', file_paths),
    ('filenames2024.txt', file_names),
):
    with open(list_filename, 'w') as out:
        out.writelines(entry_text + '\n' for entry_text in entries_to_save)

Downloaded all PDFs to tweede_pdfs2024


In [None]:
""" 
uses the saved text file to recreate the list
"""
# Optional cell: rebuilds `file_paths` and `file_names` from the two text files
# written at the end of the download cell (one entry per line), so the PDFs do
# not have to be downloaded again in a fresh session.
# Uncomment the blocks below to use it. If the text files were renamed for
# another year, adjust the names here as well.

# with open('filepaths2024.txt', 'r') as file:
#     file_paths = file.read().splitlines()
# print(file_paths)

# with open('filenames2024.txt', 'r') as file:
#     file_names = file.read().splitlines()
# print(file_names)