In [1]:
import os
import time
import requests
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [None]:
# Scrape the search-result pages of zoek.officielebekendmakingen.nl with Selenium
# and download every PDF that is linked from them into `output_dir`.

# Configuration: search URL, output directory, number of pages and HTTP timeout
start_url = "https://zoek.officielebekendmakingen.nl/resultaten?q=(c.product-area==%22officielepublicaties%22)and((dt.subject=%22basisonderwijs%22)or(dt.subject=%22beroepsonderwijs%22)or(dt.subject=%22kenniseconomie%22)or(dt.subject=%22Onderwijs%20en%20wetenschap%22)or(dt.subject=%22onderwijsvoorzieningen%22))and((w.publicatienaam==%22Staatsblad%22))&zv=&pg=50&col=Staatsblad&svel=Publicatiedatum&svol=Aflopend"
output_dir = "officielebekendmakingen"
max_pages = 31  # Total number of pages to scrape
request_timeout = 30  # Seconds; requests waits forever when no timeout is given
os.makedirs(output_dir, exist_ok=True)

# Setup Selenium
options = webdriver.ChromeOptions()
options.add_argument("--headless")
options.add_argument("--disable-gpu")
options.add_argument("--no-sandbox")
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

# The try/finally guarantees the browser is closed even when the cell is
# interrupted or a page load raises; otherwise the Chrome process is leaked.
try:
    # Loop over the result pages, downloading every PDF that is found
    for page in range(1, max_pages + 1):
        print(f"Scraping page {page}...")  # tells which page you are on

        if page == 1:
            url = start_url
        else:
            url = f"{start_url}&pagina={page}"

        driver.get(url)
        time.sleep(2)

        # Find all PDF links on the current page
        links = driver.find_elements(By.XPATH, "//a[contains(@href, '.pdf')]")
        print(f"Found {len(links)} PDFs")  # prints number of pdfs found

        for link in links:
            pdf_url = link.get_attribute("href")
            if not pdf_url:
                continue

            # Drop any query string / fragment so it does not end up in the file name
            pdf_path = pdf_url.split("?", 1)[0].split("#", 1)[0]
            filename = os.path.join(output_dir, os.path.basename(pdf_path))

            if os.path.exists(filename):
                print(f"Already downloaded: {filename}")
                continue

            print(f"Downloading: {pdf_url}")
            # Write to a temporary name first and rename once complete, so an
            # interrupted download is never mistaken for a finished file later.
            tmp_filename = filename + ".part"
            try:
                r = requests.get(pdf_url, timeout=request_timeout)
                # Without this an HTTP error page would be saved as a ".pdf"
                r.raise_for_status()
                with open(tmp_filename, "wb") as f:
                    f.write(r.content)
                os.replace(tmp_filename, filename)
            except Exception as e:
                # Best effort: report the failure and carry on with the next PDF
                print(f"Failed to download {pdf_url}: {e}")
finally:
    driver.quit()

print("All PDFS found from number of pages provided are downloaded")

Scraping page 1...
Found 50 PDFs
Downloading: https://zoek.officielebekendmakingen.nl/stb-2025-157.pdf
Downloading: https://zoek.officielebekendmakingen.nl/stb-2025-30.pdf
Downloading: https://zoek.officielebekendmakingen.nl/stb-2024-408.pdf
Downloading: https://zoek.officielebekendmakingen.nl/stb-2024-345.pdf
Downloading: https://zoek.officielebekendmakingen.nl/stb-2024-316.pdf
Downloading: https://zoek.officielebekendmakingen.nl/stb-2024-314.pdf
Downloading: https://zoek.officielebekendmakingen.nl/stb-2024-310.pdf
Downloading: https://zoek.officielebekendmakingen.nl/stb-2024-290.pdf
Downloading: https://zoek.officielebekendmakingen.nl/stb-2024-279.pdf
Downloading: https://zoek.officielebekendmakingen.nl/stb-2024-268.pdf
Downloading: https://zoek.officielebekendmakingen.nl/stb-2024-258.pdf
Downloading: https://zoek.officielebekendmakingen.nl/stb-2024-257.pdf


KeyboardInterrupt: 

In [4]:
# Delpher has the PDF files behind each article link, so Selenium collects the
# article links on every search-results page, opens each article and downloads
# the PDF from there.

# Configuration: output directory, number of pages, HTTP timeout and the XPaths used
output_dir = "Nederlandschestaatscourantaaa"
max_pages = 78  # Adjust this to how many pages you want to scrape
request_timeout = 30  # Seconds; requests waits forever when no timeout is given
article_xpath = "//a[contains(@href, '/nl/kranten/view?')]"
pdf_xpath = "//a[contains(@href, 'type=pdf') and contains(@href, '/api/resource')]"
os.makedirs(output_dir, exist_ok=True)

# Setup of Selenium
options = webdriver.ChromeOptions()
options.add_argument("--headless")
options.add_argument("--disable-gpu")
options.add_argument("--no-sandbox")
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

# The try/finally guarantees the browser is closed even when the cell is
# interrupted or a page load raises; otherwise the Chrome process is leaked.
try:
    # Loop through all search result pages
    for page in range(1, max_pages + 1):
        print(f"Scraping page {page}...")

        url = f"https://www.delpher.nl/nl/kranten/results?query=staatsblad&page={page}&maxperpage=50&cql%5B%5D=(date+_gte_+%2201-01-1946%22)&cql%5B%5D=(date+_lte_+%2231-12-2005%22)&cql%5B%5D=ppn+any+(400915472+OR+830850090)&coll=ddd"
        driver.get(url)

        try:
            # Wait for results to load
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.XPATH, article_xpath))
            )
            time.sleep(1)  # Optional fallback wait
        except Exception:
            # `except Exception` (not a bare `except`) so that Ctrl-C still stops
            # the cell instead of being reported as an empty page.
            print(f"No articles loaded on page {page}")
            continue

        # Collect unique article URLs, keeping the order in which they appear.
        # The hrefs are read before navigating away, while the elements are still valid.
        article_link_elems = driver.find_elements(By.XPATH, article_xpath)
        hrefs = (elem.get_attribute("href") for elem in article_link_elems)
        article_urls = list(dict.fromkeys(href for href in hrefs if href))

        print(f"Found {len(article_urls)} articles")

        # Visit & download each article from the urls
        for article_url in article_urls:
            print(f"Opening article: {article_url}")
            driver.get(article_url)
            time.sleep(2)

            try:
                # Wait for the PDF download link; until() returns the located element
                pdf_link_elem = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.XPATH, pdf_xpath))
                )
                pdf_url = pdf_link_elem.get_attribute("href")
                if not pdf_url:
                    continue

                # File name is the "identifier" query value with ":" replaced by "_"
                identifier = pdf_url.split("identifier=")[1].split("&")[0].replace(":", "_")
                filename = os.path.join(output_dir, f"{identifier}.pdf")

                if os.path.exists(filename):
                    print(f"Already downloaded: {filename}")
                    continue

                print(f"Downloading PDF: {pdf_url}")
                r = requests.get(pdf_url, timeout=request_timeout)
                r.raise_for_status()
                # Write to a temporary name first and rename once complete, so an
                # interrupted download is never mistaken for a finished file later.
                tmp_filename = filename + ".part"
                with open(tmp_filename, "wb") as f:
                    f.write(r.content)
                os.replace(tmp_filename, filename)
                print(f"Saved: {filename}")

            except Exception as e:
                # Best effort: report the failure and carry on with the next article
                print(f"Failed to download from {article_url}: {e}")
finally:
    # Close browser
    driver.quit()

print("Done downloading all PDFs.")

Scraping page 1...
Found 50 articles
Opening article: https://www.delpher.nl/nl/kranten/view?query=staatsblad&page=1&maxperpage=50&cql%5B%5D=%28date+_gte_+%2201-01-1946%22%29&cql%5B%5D=%28date+_lte_+%2231-12-2005%22%29&cql%5B%5D=ppn+any+%28400915472+OR+830850090%29&coll=ddd&redirect=true&identifier=MMKB08:000166939:mpeg21:a0003&resultsidentifier=MMKB08:000166939:mpeg21:a0003&rowid=20
Downloading PDF: https://www.delpher.nl/nl/api/resource?identifier=MMKB08:000166939:mpeg21:a0003&coll=ddd&operation=download&type=pdf
Saved: Nederlandschestaatscourantaaa\MMKB08_000166939_mpeg21_a0003.pdf
Opening article: https://www.delpher.nl/nl/kranten/view?query=staatsblad&page=1&maxperpage=50&cql%5B%5D=%28date+_gte_+%2201-01-1946%22%29&cql%5B%5D=%28date+_lte_+%2231-12-2005%22%29&cql%5B%5D=ppn+any+%28400915472+OR+830850090%29&coll=ddd&redirect=true&identifier=MMKB08:000166676:mpeg21:a0002&resultsidentifier=MMKB08:000166676:mpeg21:a0002&rowid=7
Downloading PDF: https://www.delpher.nl/nl/api/resource?ide

KeyboardInterrupt: 