### Purpose of this notebook

Fetch data from internetconsultatie.nl and make it more usable

Note that this is a lot of data.

In [1]:
import collections
import re
import urllib.parse
from importlib import reload

import bs4

from wetsuite.helpers import etree, net, localdata, notebook

In [2]:
def _url_keyed_store(filename):
    """Return a localdata.LocalKV for `filename` with str keys (URLs) and bytes values (what was fetched)."""
    return localdata.LocalKV( filename, key_type=str, value_type=bytes )


# URL -> HTML page bytestring.
# This one holds a mix of page types; the code below keeps track of what belongs to a single fetch, and how.
fetch_store = _url_keyed_store( 'internetconsultaties_fetched.db' )

# Per case: the paginated index of its reactions.
reactiepage_store = _url_keyed_store( 'internetconsultaties_reactiepages.db' )

# Per reaction: its own page, and the attached PDF if there is one.
reacties_store = _url_keyed_store( 'internetconsultaties_reacties.db' )
reacties_store_pdf = _url_keyed_store( 'internetconsultaties_reacties_pdf.db' )

# The last three are assumed not to change (because of where we started),
#   so we can be nice to the server and cache them forever.

### Fetch pagination - list of cases only

Start at a URL like https://www.internetconsultatie.nl/geslotenconsultaties/1/10

Fetch:
- all pagination pages like it, discovered by following the page links,
- each case's basic detail page, which looks like https://www.internetconsultatie.nl/klimaatmaatregelenfinancielesector/b1

This step should take a few minutes, in part due to the pagination not being the fastest to serve.

In [3]:
# How many cases to ask for per pagination page.  The options offered on the page are 10, 25, 100.
# Arbitrary numbers seem to work, but this site is slow even with small numbers, and slower with larger numbers.
PERPAGE = 100

# Pagination links look like /geslotenconsultaties/PAGENUM/PERPAGE  (group 1: page number, group 2: page size)
_RE_CASE_PAGINATION = re.compile(r'/geslotenconsultaties/([0-9]+)/([0-9]+)')
# class of the <a> that carries a case's title and links to its detail page
_RE_RESULT_TITLE    = re.compile(r'\bresult--title\b')


# result page URLs we have fetched, and have still to fetch
pages_to_fetch       = set( [f'https://www.internetconsultatie.nl/geslotenconsultaties/1/{PERPAGE}'] )
pages_fetched        = set() # URLs of pagination pages we have fetched
detail_pages_fetched = set() # URLs of case detail pages we have fetched


# We can't easily show a progress bar because we only discover how much there is while crawling
#    ...though we could extract a perfectly serviceable estimate from   <p class="active">Er zijn in totaal 2566 consultaties...

while len(pages_to_fetch)>0:
    fetching_page_url = pages_to_fetch.pop()
    print( f' ========== PAGE: {fetching_page_url} ============ ')

    # pages are not cached, the pages will change with each new case
    pagebytes = net.download( fetching_page_url, timeout=30 ) # needs a moderately high timeout
    pages_fetched.add( fetching_page_url )

    soup = bs4.BeautifulSoup(pagebytes, features='lxml')

    ### find all page links to closed cases, add them to the "to fetch" if we didn't fetch them before
    for page_a in soup.find_all('a', attrs={'href':_RE_CASE_PAGINATION}):
        page_href = page_a.get('href')
        # Only follow links that use our own page size.  Links that switch to another page size
        #   (presumably the 10/25/100 options) would make us crawl the same list of cases once per size.
        if _RE_CASE_PAGINATION.search( page_href ).group(2) != str(PERPAGE):
            continue
        abs_linked_page_url = urllib.parse.urljoin( fetching_page_url, page_href )  # resolve href relative to the page it's on
        if abs_linked_page_url not in pages_fetched:
            pages_to_fetch.add( abs_linked_page_url )

    ### extract all case links, fetch the HTML page they point to
    for case_li in soup.select("div[class*='result--list'] > ul > li"): # CSS-style selector is succinct here
        case_a = case_li.find('a', attrs={'class':_RE_RESULT_TITLE})
        if case_a is None or not case_a.get('href'):
            # a list item without a title link is not something we know how to handle; don't let it stop the crawl
            print( '       (skipping list item without a case title link)' )
            continue
        abs_casedetail_url = urllib.parse.urljoin( fetching_page_url, case_a.get('href') )
        print( '      ', abs_casedetail_url )

        # Stores the detail page in fetch_store (there is no delay between requests here).
        # We could parse the detail page right away, but let's separate parts of this notebook a little more.
        localdata.cached_fetch( fetch_store, abs_casedetail_url )
        detail_pages_fetched.add( abs_casedetail_url )

       https://www.internetconsultatie.nl/rijksmediabijdrage/b1
       https://www.internetconsultatie.nl/emissiehandel/b1
       https://www.internetconsultatie.nl/schorsendublin/b1
       https://www.internetconsultatie.nl/productierechten/b1
       https://www.internetconsultatie.nl/poliovirus/b1
       https://www.internetconsultatie.nl/grensgemeente/b1
       https://www.internetconsultatie.nl/gewasbeschermingsmiddelenbesluit/b1
       https://www.internetconsultatie.nl/registratieregeling/b1
       https://www.internetconsultatie.nl/rouwverlof/b1
       https://www.internetconsultatie.nl/msnp/b1
       https://www.internetconsultatie.nl/financieringsstructuurhr/b1
       https://www.internetconsultatie.nl/contourenbriefverbruiksbelasting/b1
       https://www.internetconsultatie.nl/bbft2025/b1
       https://www.internetconsultatie.nl/regelingtweedetranchelokaleaanpak/b1
       https://www.internetconsultatie.nl/besluitcasusoverleggen/b1
       https://www.internetconsultatie.nl/

### Fetch details of each case

In [4]:
verbose = 0   # when truthy, fetch_case() prints the URLs it handles


# To fish out the (relative) URLs on the page
# - link to one individual reaction:                 .../reactie/UUID
_RE_REACTIE_UUID  = re.compile( r'.*/reactie/[0-9A-Fa-f]{8}-[0-9A-Fa-f]{4}-[0-9A-Fa-f]{4}-[0-9A-Fa-f]{4}-[0-9A-Fa-f]{12}$' )
# - link to the start of a case's list of reactions: .../reacties
_RE_REACTIE_START = re.compile( r'.*/reacties$' )
# - link to a numbered page of that list:            .../reacties/datum/PAGENUM
_RE_REACTIE_DATUM = re.compile( r'.*/reacties/datum/[0-9]+$' )

# Running totals, shown in the progress bar description
count_fetched = 0   # documents that had to be fetched
count_cached  = 0   # documents that came from a local store
 
def count(from_cache):
    """Add one to count_cached if from_cache is truthy, otherwise add one to count_fetched."""
    global count_fetched, count_cached
    if not from_cache:
        count_fetched += 1
        return
    count_cached += 1
    

# Progress bar sized to the number of case detail pages; fetch_case() increments its .value and rewrites its .description
pb = notebook.progress_bar( len(detail_pages_fetched), description='Cases: ' )

def _update_progress_description():
    """Show the current fetched/cached totals in the progress bar's description."""
    pb.description = f'{count_fetched} docs fetched, {count_cached} docs from cache;  cases:'


def _linked_urls(soup, base_url, href_re):
    """Return the absolute URLs of all <a> elements in `soup` whose href matches `href_re`.

    hrefs are resolved relative to `base_url`, which should be the URL of the page `soup` was parsed from.
    The result is a list in document order, without duplicates.
    """
    abs_urls = (
        urllib.parse.urljoin( base_url, a.get('href') )
        for a in soup.find_all('a', attrs={'href':href_re})
    )
    return list( dict.fromkeys( abs_urls ) )


def _fetch_reactie(reactie_abs_url, label):
    """Fetch one reaction's page into reacties_store, and any PDF attachment it links to into reacties_store_pdf.

    `label` only affects what is printed when `verbose` is set.
    """
    if verbose:
        print('      '+label, reactie_abs_url)
    reactie_bytes, from_cache = localdata.cached_fetch( reacties_store, reactie_abs_url )
    count(from_cache)

    # see if there is a PDF attachment, because it, rather than the page, will probably include the response text
    reactie_soup = bs4.BeautifulSoup( reactie_bytes, features='lxml' )
    for pdf_a in reactie_soup.select( "#content ul[class*='result--actions'] a[class*='icon--download']" ):
        pdf_href = pdf_a.get('href')
        if not pdf_href: # urljoin() with an empty href would hand back the reaction page's own URL
            continue
        pdf_abs_url = urllib.parse.urljoin( reactie_abs_url, pdf_href )
        if verbose:
            print('      '+label+'_PDF', pdf_abs_url)
        _, from_cache = localdata.cached_fetch( reacties_store_pdf, pdf_abs_url )
        count(from_cache)

    _update_progress_description() # update bar after every reactie


def fetch_case(case_page_url):
    """Fetch everything we want for one case: its reaction listing pages, each reaction, and reaction PDFs.

    case_page_url: absolute URL of the case's detail page (read via fetch_store).

    Listing pages go into reactiepage_store, individual reaction pages into reacties_store,
    and PDF attachments into reacties_store_pdf.  All fetches go through localdata.cached_fetch().
    Advances the progress bar `pb` by one, and updates the global fetched/cached totals via count().

    Returns None.  Exceptions raised by fetching or parsing are not caught here.
    """
    pb.value +=1
    _update_progress_description()

    # The case detail page has a few different possible layouts
    #   One has
    #   - a button-styled <a> with
    #     - id="mainContentPlaceHolder_alleReactiesHyperLink"
    #     - text: "Bekijk alle reacties"
    #   - a few entries
    #     ...which is sometimes all reactions on one page, but frequently also has pagination that looks like:
    #       /CASENAME/reacties/datum/PAGENUM
    #   each reaction is on its own page, linked like:
    #       /CASENAME/reactie/UUID

    if verbose:
        print( f'CASE: {case_page_url}' )
    case_page_bytedata, from_cache = localdata.cached_fetch( fetch_store, case_page_url ) # should all be cached due to the above
    count(from_cache)
    case_page_soup = bs4.BeautifulSoup(case_page_bytedata, features='lxml')

    reacties_done = set() # reaction URLs already handled for this case, so that each is processed once

    # The detail page may link to some reactions
    # - in some cases this is the complete list (if it's a few) and there is no separate pagination of reactions
    # - in other cases the pagination has the complete list and this will be redundant with that
    for reactie_abs_url in _linked_urls( case_page_soup, case_page_url, _RE_REACTIE_UUID ):
        _fetch_reactie( reactie_abs_url, 'CASEPAGE_REACTION' )
        reacties_done.add( reactie_abs_url )

    # If there is a link (button at the bottom, or item in the menu on the left)
    # that seems to view/paginate the reactions, go for that (if not, assume that the above reactions were all (?) )
    reactiepages_to_fetch = set( _linked_urls( case_page_soup, case_page_url, _RE_REACTIE_START ) )
    reactiepages_to_fetch.update( _linked_urls( case_page_soup, case_page_url, _RE_REACTIE_DATUM ) )
    reactiepages_fetched  = set()

    while len(reactiepages_to_fetch)>0: # while there is reactie paging we haven't visited yet
        reactiepage_url = reactiepages_to_fetch.pop()
        if verbose:
            print( f"   REACTIE PAGING: {reactiepage_url}")
        # Listing pages have their own store, which keeps reacties_store limited to individual reaction pages.
        reactiepaging_bytedata, from_cache = localdata.cached_fetch( reactiepage_store, reactiepage_url )
        count(from_cache)
        reactiepages_fetched.add( reactiepage_url )

        reactiepaging_soup = bs4.BeautifulSoup(reactiepaging_bytedata, features='lxml')

        # Add previously unvisited pagination page links.
        # Deliberately only tested against what this call visited, not against what is in the store:
        #   a listing page that is already stored still has to be walked to reach the pages and reactions it links to
        #   (e.g. after an interrupted run), and cached_fetch() avoids re-downloading it anyway.
        for reactie_page_abshref in _linked_urls( reactiepaging_soup, reactiepage_url, _RE_REACTIE_DATUM ):
            if reactie_page_abshref not in reactiepages_fetched:
                reactiepages_to_fetch.add( reactie_page_abshref )

        # process content of the page: links to actual reactie details
        for reactie_abs_url in _linked_urls( reactiepaging_soup, reactiepage_url, _RE_REACTIE_UUID ):
            if reactie_abs_url in reacties_done:
                continue
            _fetch_reactie( reactie_abs_url, 'PAGINATED_REACTION' )
            reacties_done.add( reactie_abs_url )


# Fetch every case.  This is best-effort: a failure in one case is reported and remembered,
#   and does not stop the remaining cases from being fetched.
failed_cases = {}   # case URL -> the exception that fetching it raised

for case_page_url in sorted(detail_pages_fetched):
    try:
        fetch_case( case_page_url )
    except Exception as e:
        failed_cases[case_page_url] = e
        print( f'ERROR in case {case_page_url}: {type(e).__name__}: {e}' )

Cases:   0%|          | 0/2721 [00:00<?, ?it/s]

In [12]:
# Sanity check: how many case detail pages did we find, and what do their URLs look like?
# Only a sample is shown, to keep thousands of URLs out of the notebook output.
print( f'{len(detail_pages_fetched)} case detail pages' )
sorted( detail_pages_fetched )[:20]

['https://www.internetconsultatie.nl/%22essenti%C3%ABle-informatiedocument%22',
 'https://www.internetconsultatie.nl/%22vestigingsplaatsen%22%20',
 'https://www.internetconsultatie.nl/113_suicidepreventie',
 'https://www.internetconsultatie.nl/1289',
 'https://www.internetconsultatie.nl/1310',
 'https://www.internetconsultatie.nl/140a/b1',
 'https://www.internetconsultatie.nl/1413',
 'https://www.internetconsultatie.nl/1500',
 'https://www.internetconsultatie.nl/18xy',
 'https://www.internetconsultatie.nl/2099',
 'https://www.internetconsultatie.nl/2138',
 'https://www.internetconsultatie.nl/2198',
 'https://www.internetconsultatie.nl/2todrive',
 'https://www.internetconsultatie.nl/3629/b1',
 'https://www.internetconsultatie.nl/aanbestedingswetdefensieveiligheid',
 'https://www.internetconsultatie.nl/aandachtsgroepenregeling',
 'https://www.internetconsultatie.nl/aandeelhoudersbetrokkenheid',
 'https://www.internetconsultatie.nl/aandelenoptieregeling',
 'https://www.internetconsultatie

In [5]:
# Exploration: look at a handful of stored reaction pages, and note which case each one belongs to.
PREVIEW_CHARS = 1000   # how much of each parsed document to print

cases = collections.defaultdict( dict )   # case identifier -> dict of what we know about that case

for reactie_url in list(reacties_store.keys())[:10]:
    # The case identifier is the first path segment:  https://www.internetconsultatie.nl/CASENAME/reactie/UUID
    # It is kept percent-encoded, exactly as it appears in the URL.  TODO: decide whether to percent-decode it.
    # TODO: check that idn corresponds to a case URL in detail_pages_fetched, and complain if it does not.
    idn = reactie_url.split('/')[3]
    cases[idn]['idn'] = idn

    print()
    print( reactie_url )
    docbytes = reacties_store.get( reactie_url )
    # name the parser explicitly, like the rest of this notebook, rather than letting bs4 guess one
    soup = bs4.BeautifulSoup(docbytes, features='lxml')
    print( str(soup)[:PREVIEW_CHARS] )

cases


https://www.internetconsultatie.nl/%22essenti%C3%ABle-informatiedocument%22/reactie/2937d339-2046-4c61-ba7e-0bf5f143cbf0
<!DOCTYPE html>
<html lang="nl">
<head id="head1"><meta charset="utf-8"/><meta content="IE=edge" http-equiv="X-UA-Compatible"/><meta content="width=device-width,initial-scale=1" name="viewport"/><link href="http://purl.org/dc/terms/" rel="schema.dcterms"/><meta content="nl-NL" name="dcterms.language"/><meta content="text/html" name="dcterms.format"/><meta content="Overheid.nl is de kortste weg naar alle informatie van de Nederlandse overheid op internet, zoals nationale en lokale wet- en regelgeving, officiële publicaties, bekendmakingen en consultaties. Overheid.nl geeft ook toegang tot internetsites en informatiebronnen van andere overheidsorganisaties." name="dcterms.description"/>
<meta content="noindex, nofollow" name="robots"/>
<!--[if IE 6]><link href="/resources/css/ie6.css" rel="stylesheet" type="text/css" /><![endif]-->
<!--[if IE 7]><link href="/resources

defaultdict(dict,
            {'%22essenti%C3%ABle-informatiedocument%22': {'idn': '%22essenti%C3%ABle-informatiedocument%22'},
             '%22vestigingsplaatsen%22%20': {'idn': '%22vestigingsplaatsen%22%20'},
             '113_suicidepreventie': {'idn': '113_suicidepreventie'}})

### Process into something more useful