<a href="https://colab.research.google.com/github/knobs-dials/wetsuite-datacollect/blob/main/koop_bwb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Purpose of this notebook

This notebook shows how we fetch data from the BWB (Basiswettenbestand) repository, and how we turn those downloads into the corresponding datasets.

**TODO:** finish this notebook; for now it is a straight copy-paste from a script.

## Fetching

In [1]:
import re
import collections
import datetime
import pprint
import random

import wetsuite.helpers.etree
import wetsuite.helpers.notebook
import wetsuite.helpers.date
import wetsuite.helpers.localdata
import wetsuite.helpers.koop_parse
import wetsuite.datacollect.koop_repositories 
import wetsuite.datasets

In [2]:
# Local store of everything downloaded so far, keyed by the URL it was fetched from:
# toestand XML, manifest files, and WTI files all end up in this one store.
bwb_fetched = wetsuite.helpers.localdata.LocalKV(
    'bwb_fetched.db',
    str,     # presumably the key type (URLs)     - confirm against LocalKV
    bytes,   # presumably the value type (bodies) - confirm against LocalKV
)

In [3]:
def bwb_search_callback( search_record_node ):
    ''' Handle one BWB search result record: make sure its toestand, manifest and WTI are in the bwb_fetched store.

        BWB records follow http://standaarden.overheid.nl/sru/gzd.xsd

        The record is flattened into a single dict by bwb_searchresult_meta(),
            which loses some structure (on top of the namespaces that were already removed)
            but is easier to work with.

        Debugging hint: print( wetsuite.helpers.etree.debug_pretty( search_record_node ) )
            shows the full record, in case you want to extract more from it.

        Returns nothing; prints a line when anything was newly fetched.
    '''
    record_meta = wetsuite.helpers.koop_parse.bwb_searchresult_meta( search_record_node )
    fetch       = wetsuite.helpers.localdata.cached_fetch

    # The toestand XML is only downloaded when we do not have it yet.
    # The second item returned by cached_fetch is used as a 'this came from the cache' flag.
    _, toestand_was_cached = fetch( bwb_fetched,  record_meta['locatie_toestand'],  force_refetch=False )

    # A toestand we did not have before suggests the manifest and WTI changed too,
    # so in that case those two are refetched even when cached copies exist.
    refetch_companions = not toestand_was_cached
    _, manifest_was_cached = fetch( bwb_fetched,  record_meta['locatie_manifest'],  force_refetch=refetch_companions )
    _, wti_was_cached      = fetch( bwb_fetched,  record_meta['locatie_wti'],       force_refetch=refetch_companions )

    # Nothing new? Then stay quiet.
    if toestand_was_cached and manifest_was_cached and wti_was_cached:
        return

    print( "FETCHED new data for %s - %r"%( record_meta['identifier'], record_meta ) )

In [4]:
# Incremental update: only ask for records modified in the last six weeks
# (earlier runs did a lot more fetching).
sru_bwb = wetsuite.datacollect.koop_repositories.BWB( verbose=True )

modified_since = wetsuite.helpers.date.date_weeks_ago(6).strftime('%Y-%m-%d')
search_query   = 'dcterms.modified >= %s'%modified_since

# The return value is not used; the work happens in bwb_search_callback.
_ = sru_bwb.search_retrieve_many(
    search_query,
    up_to=20000,
    at_a_time=500,
    callback=bwb_search_callback,
)

[SRU searchRetrieve] fetching 'http://zoekservice.overheid.nl/sru/Search?&version=1.2&x-connection=BWB&operation=searchRetrieve&startRecord=1&maximumRecords=500&query=dcterms.modified%20%3E%3D%202024-03-16'
[SRU searchRetrieve] fetching 'http://zoekservice.overheid.nl/sru/Search?&version=1.2&x-connection=BWB&operation=searchRetrieve&startRecord=501&maximumRecords=500&query=dcterms.modified%20%3E%3D%202024-03-16'
[SRU searchRetrieve] fetching 'http://zoekservice.overheid.nl/sru/Search?&version=1.2&x-connection=BWB&operation=searchRetrieve&startRecord=1001&maximumRecords=500&query=dcterms.modified%20%3E%3D%202024-03-16'
[SRU searchRetrieve] fetching 'http://zoekservice.overheid.nl/sru/Search?&version=1.2&x-connection=BWB&operation=searchRetrieve&startRecord=1501&maximumRecords=500&query=dcterms.modified%20%3E%3D%202024-03-16'
FETCHED new data for BWBR0012288 - {'identifier': 'BWBR0012288', 'title': 'Vreemdelingencirculaire 2000 (C)', 'type': 'circulaire', 'language': 'nl', 'authority': '

## Take that downloaded store, extract useful things into datasets

CONSIDER: starting with a smaller subset, e.g. just 2023.

In [4]:
# Go through all fetched URLs and group, per BWB-id:
# - the manifest URL
# - the WTI URL
# - the URLs of all toestanden
# and then pick the latest toestand within each group.
# We assume the URL structure is consistent, which it seems to be.

bwbr_groups = collections.defaultdict(dict)  #  bwbr -> { toestanden:   latest_toestand_url:    wti_url:    manifest_url:  }

print("Grouping relevant URLs")

for url in wetsuite.helpers.notebook.ProgressBar( bwb_fetched.keys() ):

    # This both filters for basic URLs we care about at all (in case other things got dropped in),
    # and filters for URLs with BWBR  - which implies skipping BWBV (verdragen/treaties), BWBW (?)
    # (the matching here and below is a little hacky, though, clean up?)
    bwbr_match = re.search('/bwb/(BWBR[0-9]{7})', url)
    if bwbr_match is None:
        continue
    bwbr = bwbr_match.group(1) # the BWBR-and-number text

    if url.endswith('manifest.xml'): # e.g. https://repository.officiele-overheidspublicaties.nl/bwb/BWBR0019805/manifest.xml
        bwbr_groups[bwbr]['manifest_url'] = url
        continue

    if url.endswith('.WTI'):         # e.g.  https://repository.officiele-overheidspublicaties.nl/bwb/BWBR0016700/BWBR0016700.WTI
        bwbr_groups[bwbr]['wti_url'] = url
        continue

    # e.g. https://repository.officiele-overheidspublicaties.nl/bwb/BWBR0001840/2002-03-21_0/xml/BWBR0001840_2002-03-21_0.xml
    toestand_match = re.search('/bwb/(BWBR[0-9]{7})(/[0-9].*[.]xml)', url)
    if toestand_match is not None:
        # The second group is something like '/1998-01-01_0/xml/BWBR0001821_1998-01-01_0.xml';
        # it is used as the sort key, on the assumption that the date in it sorts lexically.
        # NOTE(review): a version suffix of two or more digits (e.g. _10 versus _9) would not sort numerically - confirm whether that occurs.
        sortname = toestand_match.group(2)
        bwbr_groups[bwbr].setdefault('toestanden', []).append( (sortname,url) )
        continue

    print( "SKIP / LOOKAT   %s"%url )


print( 'We have %d Unique BWB-id groups'%len(bwbr_groups) )


print( "Finding latest versions of each" )
# Iterate over a copy of the items, because incomplete groups are removed along the way.
for bwbr, details in wetsuite.helpers.notebook.ProgressBar( list( bwbr_groups.items() ) ): # within each BWB-id
    toestanden = details.get('toestanden')
    if not toestanden:
        # Only a manifest and/or WTI was fetched for this id. Indexing details['toestanden'] here
        # used to raise KeyError and abort the whole cell. Everything later needs a toestand,
        # so report the group and leave it out instead.
        print( "SKIP, no toestand fetched for %s"%bwbr )
        del bwbr_groups[bwbr]
        continue
    # The largest (sortname, url) pair is the same one that sorting in reverse would put first.
    _, latest_url = max( toestanden )
    details['latest_toestand_url'] = latest_url

Grouping relevant URLs


  0%|          | 0/208791 [00:00<?, ?it/s]

We have 38024 Unique BWB-id groups
Finding latest versions of each


  0%|          | 0/38024 [00:00<?, ?it/s]

In [5]:
# Create the three output dataset stores and attach a short and a long description to each.
# ...keep in mind that all the extraction could use some refinement.

# Store 1: bwbr -> raw XML bytes of the latest toestand
bwb_latestonly_xml = wetsuite.helpers.localdata.LocalKV( 'bwb-mostrecent-xml.db', str, bytes ) # bwbr -> xmlbytes
bwb_latestonly_xml._put_meta('description_short',
                             'Raw XML for the latest revision from each BWB-id')
bwb_latestonly_xml._put_meta('description','''
Maps from the BWB-id to the XML file as a bytestring, e.g. 

'BWBR0019090' -> b'<?xml version="1.0" encoding="UTF-8"?><toestand xmlns...'

'''+wetsuite.datasets.generated_today_text())

# Store 2: bwbr -> plain text of the latest toestand
bwb_latestonly_text = wetsuite.helpers.localdata.LocalKV( 'bwb-mostrecent-text.db', str, str )
bwb_latestonly_text._put_meta('description_short',
                             'Plain text for the latest revision from each BWB-id')
bwb_latestonly_text._put_meta('description','''
Maps from the BWB-id to plain text without any of the structure.
                               
'BWBR0025942': 'De bij dit besluit gevoegde ‘ selectielijst voor de neerslag...'
                              
'''+wetsuite.datasets.generated_today_text())

# Store 3: bwbr -> metadata structure (a MsgpackKV rather than a LocalKV, since the values are nested dicts/lists, not str or bytes)
bwb_latestonly_meta = wetsuite.helpers.localdata.MsgpackKV( 'bwb-mostrecent-meta-struc.db', str, None )
bwb_latestonly_meta._put_meta('description_short',
                             'Metadata structure text for the latest revision from each BWB-id')
bwb_latestonly_meta._put_meta('description','''
Maps from the BWB-id to metadata,
where that metadata comes from the toestand itself, from the manifest file, and from the Wetstechnische informatie (WTI) file. 

For example:                              
                              
'BWBR0017744':{
  'bwb-id': 'BWBR0017744',
  'intitule': 'Regeling inzake de bijdragen van de gebruikers in de kosten van de landelijk raadpleegbare deelverzameling GBA (Bijdragenregeling LRD)',
  'citeertitel': 'Bijdragenregeling LRD ',
  'soort': 'ministeriele-regeling',
  'inwerkingtredingsdatum': '2005-01-01',
  'latest_toestand_url': 'https://repository.officiele-overheidspublicaties.nl/bwb/BWBR0017744/2005-01-01_0/xml/BWBR0017744_2005-01-01_0.xml',
  'wti_url': 'https://repository.officiele-overheidspublicaties.nl/bwb/BWBR0017744/BWBR0017744.WTI',
  'wti': {'algemene_informatie': {'citeertitels_withdate': [['2005-01-01',
      '9999-12-31',
      'Bijdragenregeling LRD ']],
    'citeertitels_distinct': ['Bijdragenregeling LRD '],
    'eerstverantwoordelijke': 'Binnenlandse Zaken en Koninkrijksrelaties',
    'identificatienummer': 'BWBR0017744',
    'rechtsgebieden': [['Openbare orde en veiligheidsrecht', None]],
    'overheidsdomeinen': ['Openbare orde en veiligheid']},
   'related': [['grondslagen',
     'BWBR0006933',
     'jci1.3:c:BWBR0006933&artikel=6',
     'Artikel 6, achtste lid'],
  ...

'''+wetsuite.datasets.generated_today_text())


print("Writing latest-toestand-XML dataset")

# Copy the raw bytes of each group's latest toestand from the download store into the XML dataset.
# Committing once after the loop, rather than on every put, makes this much faster.
for bwbr, details in wetsuite.helpers.notebook.ProgressBar( bwbr_groups.items() ): # within each BWB-id
    latest_xml_bytes = bwb_fetched.get( details['latest_toestand_url'] )
    bwb_latestonly_xml.put( bwbr, latest_xml_bytes, commit=False )

bwb_latestonly_xml.commit()


print("Parsing further metadata, writing meta and text datasets")

# For each BWB-id: parse the latest toestand, take plain text and metadata from it,
# add what the WTI and manifest files offer (when we have them), and store the results.

#for bwbr, details in wetsuite.helpers.notebook.ProgressBar( random.sample( list(bwbr_groups.items()), 100) ): # debug: test on a few
for bwbr, details in wetsuite.helpers.notebook.ProgressBar( bwbr_groups.items() ): # within each BWB-id

    toestand_tree = wetsuite.helpers.etree.fromstring( bwb_fetched.get( details['latest_toestand_url'] ) )
    text          = wetsuite.helpers.koop_parse.bwb_toestand_text(toestand_tree)

    meta_dict     = wetsuite.helpers.koop_parse.bwb_toestand_usefuls(toestand_tree)

    meta_dict['latest_toestand_url'] = details['latest_toestand_url']

    # A group only has 'wti_url' / 'manifest_url' when such a URL was actually fetched.
    # .get() lets the "is not None" checks do their job; indexing raised KeyError for a missing key.
    wti_url       = details.get('wti_url')
    if wti_url is not None:
        meta_dict['wti_url']       = wti_url
        wti_tree                   = wetsuite.helpers.etree.fromstring( bwb_fetched.get( wti_url ) )
        meta_dict['wti']           = wetsuite.helpers.koop_parse.bwb_wti_usefuls(wti_tree)

    manifest_url  = details.get('manifest_url')
    if manifest_url is not None:
        meta_dict['manifest_url']  = manifest_url
        manifest_tree              = wetsuite.helpers.etree.fromstring( bwb_fetched.get( manifest_url ) )
        meta_dict['manifest']      = wetsuite.helpers.koop_parse.bwb_manifest_usefuls(manifest_tree)

        # redundant, but sometimes nice to have more accessible
        version_dates = []
        for expression in manifest_tree.findall('expression'):
            date_node = expression.find('metadata/datum_inwerkingtreding')
            if date_node is not None: # an expression without that element is skipped rather than raising AttributeError
                version_dates.append( date_node.text )
        # Assigned once, after the loop, so the key is also present (as an empty list) when there are no expressions.
        meta_dict['version_dates'] = version_dates

    bwb_latestonly_text.put( bwbr, text      )
    bwb_latestonly_meta.put( bwbr, meta_dict )

Writing latest-toestand-XML dataset


  0%|          | 0/38024 [00:00<?, ?it/s]

Parsing further metadata, writing meta and text datasets


  0%|          | 0/38024 [00:00<?, ?it/s]