<a href="https://colab.research.google.com/github/WetSuiteLeiden/data-collection/blob/master/koop_cvdr.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Purpose of this notebook

This notebook shows how we fetch data from the CVDR repository, and how that data is used to create our corresponding datasets.

## Fetching

In [2]:
import collections
import datetime
import random
import pprint
import time

import wetsuite.helpers.notebook
import wetsuite.helpers.localdata
import wetsuite.datacollect.koop_sru 
import wetsuite.helpers.date
import wetsuite.helpers.etree
import wetsuite.helpers.koop_parse

In [3]:
# Store to put downloads into.
#   Keys are str and values are bytes; the fetch cell further down stores each downloaded URL's content in here.
cvdr_fetched = wetsuite.helpers.localdata.LocalKV( 'cvdr_fetched.db', str, bytes )

# Out of interest, uncomment to summarize the store
#   (can take a few seconds once it's large, because get_num_items walks through everything)
#cvdr_fetched.summary(get_num_items=True)

In [4]:
# Build the list of search queries to post. Nothing fancy: each query is just a range of modification dates.
queries = []

if 0:
    # Complete-history variant (disabled):
    #   split many years into shorter spans, to do many fetches in smaller chunks
    #   (for reference, there are usually 20 to 250 items per day)
    date_spans = wetsuite.helpers.date.date_ranges(
        from_date       = datetime.date( 2000, 1, 1 ),
        to_date         = datetime.date.today(),
        increment_days  = 50,
        strftime_format = "%Y-%m-%d",
    )
    for from_date, to_date in date_spans:
        # TODO: check whether there is a better field than modified
        queries.append( f'dcterms.modified>={from_date} and dcterms.modified<={to_date}' )

else:
    # Recent-changes variant (active):
    #   (note: we treat this as "fetch documents that were mentioned",
    #          not as a "re-fetch things that were changed" )
    some_time_ago = datetime.date.today() - datetime.timedelta( days=62 )
    since_text    = some_time_ago.strftime("%Y-%m-%d")
    queries.append( f'dcterms.modified >= {since_text}' )

print( queries )

['dcterms.modified >= 2024-03-16']


In [5]:
# Post those queries, and fetch any referenced documents we didn't already have

sru_cvdr = wetsuite.datacollect.koop_sru.CVDR( )

for query in queries:
    print( f'Search: {query}' )

    # This first search is done purely to learn the number of records, which itself only feeds the progress bar
    sru_cvdr.search_retrieve( query )
    numrecs = sru_cvdr.num_records()
    pbar    = wetsuite.helpers.notebook.progress_bar( numrecs, description='fetching' )

    # counters are reset for each query; cvdr_callback updates them via `global`
    count_cached = count_fetched = count_error = 0

    def cvdr_callback( record_node ):
        ''' Handle a single search result record: pick out the document URLs it mentions,
            and fetch each of them into the cvdr_fetched store (unless already there).

            Defined inside the loop, and updating the counters via global,
            because we keep those counts per query.
        '''
        #print( wetsuite.helpers.etree.debug_pretty( record_node ) ) # for later reference, if you want to extract more out of these search records
        global count_cached, count_fetched, count_error

        # using flatten is a little creative for something that needs to be a precise value (see cvdr_meta's docstring) but in current use it is valid.
        record_meta = wetsuite.helpers.koop_parse.cvdr_meta( record_node, flatten=True )
        #pprint.pprint( record_meta )

        wanted_resources = (
            ('XML',  'publicatieurl_xml'),
            ('HTML', 'publicatieurl_xhtml'),
        )
        for resource_name, resource_key in wanted_resources:
            if resource_key not in record_meta:
                print('SKIP: no %r in %r'%(resource_key, record_meta))
                continue

            try:
                _, came_from_cache = wetsuite.helpers.localdata.cached_fetch( cvdr_fetched, record_meta[ resource_key ] )
                if came_from_cache:
                    count_cached += 1
                else:
                    count_fetched += 1
                    time.sleep( 2 ) # be somewhat nice to the servers
            except ValueError as e: # mainly expecting 404, 500
                count_error += 1
                print( "ERROR downloading %s: %s  for %r"%(resource_name, e, record_meta[resource_key]))
                time.sleep( 10 ) # be somewhat nicer to the servers

        # one search record handled (however many of its resources were fetched)
        pbar.value       += 1
        pbar.description  = f'{count_fetched} fetched, {count_cached} cached' # , {count_error} errors

    try:
        sru_cvdr.search_retrieve_many( query, at_a_time=500, up_to=50000, callback=cvdr_callback)
    except ValueError as e:
        count_error += 1
        print( "ERROR querying %s: %s"%(query, e) )

Search: dcterms.modified >= 2024-03-16
[SRU searchRetrieve] fetching 'http://zoekservice.overheid.nl/sru/Search?&version=1.2&x-connection=cvdr&operation=searchRetrieve&query=dcterms.modified%20%3E%3D%202024-03-16'


fetching:   0%|          | 0/3060 [00:00<?, ?it/s]

[SRU searchRetrieve] fetching 'http://zoekservice.overheid.nl/sru/Search?&version=1.2&x-connection=cvdr&operation=searchRetrieve&startRecord=1&maximumRecords=500&query=dcterms.modified%20%3E%3D%202024-03-16'
SKIP: no 'publicatieurl_xhtml' in {'organisatietype': 'Gemeente', 'publicatieurl_xml': 'https://repository.officiele-overheidspublicaties.nl/cvdr/CVDR696164/1/xml/CVDR696164_1.xml', 'preferred_url': 'https://lokaleregelgeving.overheid.nl/CVDR696164/1', 'identifier': 'CVDR696164_1', 'title': 'Omgevingsplan gemeente almere', 'language': 'nl', 'type': 'regeling (overheid:Informatietype)', 'creator': 'Almere (overheid:Gemeente)', 'modified': '2024-04-25+02:00', 'isFormatOf': 'gmb-2024-183602 (https://zoek.officielebekendmakingen.nl/gmb-2024-183602.html),  gmb-2024-104067 (https://zoek.officielebekendmakingen.nl/gmb-2024-104067.html),  stcrt-2023-35432 (https://zoek.officielebekendmakingen.nl/stcrt-2023-35432.html),  prb-2023-14836 (https://zoek.officielebekendmakingen.nl/prb-2023-14836

## Creating dataset

We'll spare you the full contents of that store,
because it contains most versions of most things, 
is even more overcomplete than that because of past experiments,
and probably not something you want to fetch yourself for the sheer size of it.

Mostly for our own reference, it contains keys that are URLs like:
- https://repository.officiele-overheidspublicaties.nl/CVDR/100078/1/html/100078_1.html
- https://repository.officiele-overheidspublicaties.nl/CVDR/100078/1/xml/100078_1.xml

The values are the corresponding files, as bytestrings.

Right now we care more about parseable data than readable pages,
so we focus on the XML (also in the parsing helper functions), 
but also extract HTML for those that prefer it.
We ignore anything else it might contain.

Also, it seems that KOOP search results expose some variation in capitalisation, which led to duplicate URLs in the above, e.g. 
- https://repository.officiele-overheidspublicaties.nl/CVDR/100078/1/xml/100078_1.xml
- https://repository.officiele-overheidspublicaties.nl/cvdr/100078/1/xml/100078_1.xml

...so we also ensure we pick just one.

In [7]:
# Case insensitive choice: of URLs that differ only in capitalisation, keep just one.
# (We previously had some tests that their contents are identical, and indeed found no difference)

casededup_xml   = collections.defaultdict(list)  # lowercased version of URL -> actual URLs
casededup_html  = collections.defaultdict(list)  # lowercased version of URL -> actual URLs
ignore_list     = []                             # URLs that end in neither .xml nor .html

for url in cvdr_fetched:
    if url.endswith('.xml'):
        casededup_xml[ url.lower() ].append( url )
    elif url.endswith('.html'):
        casededup_html[ url.lower() ].append( url )
    else:
        ignore_list.append( url )

# Within each group of case variants, pick the one that sorts first.
#   Which one we pick should not matter, but this keeps the choice consistent between runs.
unique_xml_urls  = [ min( variants )  for variants in casededup_xml.values()  ]
unique_html_urls = [ min( variants )  for variants in casededup_html.values() ]

# report
#   Ignored URLs are not case duplicates, so they are left out of that estimate.
num_case_duplicates = len(cvdr_fetched) - len(ignore_list) - ( len( unique_xml_urls ) + len( unique_html_urls ) )
print( f"The store had {len(cvdr_fetched)} items, of which {len(ignore_list)} not immediately relevant" )
print( f"  Of the relevant ones, {len( unique_xml_urls )} are XMLs, {len( unique_html_urls )} are (X)HMTLs. " )
print( f"  (so approx {num_case_duplicates} seem to be case duplicates?)" )
if ignore_list:
    print("some URLs are ignored include:")
    # random.sample raises ValueError when asked for more items than there are, so cap the sample size
    for url in random.sample( ignore_list, min( 10, len(ignore_list) ) ):
        print( f'   {url}' )

The store had 884010 items, of which 0 not immediately relevant
  Of the relevant ones, 283946 are XMLs, 282605 are (X)HMTLs. 
  (so approx 317459 seem to be case duplicates?)


In [9]:
# Group expressions by their work ID. More specifically, we want to create a dict like:
#   work_id -> expression_id [ dict with version, xml_url, html_url), ... ]
# We spend some extra time to be able to deal with the absence of html (but not xml)
# ...and _then_ pick just the last

def work_expression_in_url(url):
    ''' Fish the identifiers out of a document URL such as
          'https://repository.officiele-overheidspublicaties.nl/CVDR/100078/1/xml/100078_1.xml'

        Returns a 3-tuple (work_id, expression_id, version_int), which for that example would be
          ('100078', '100078_1', 1)
        The version is given as an integer mainly so that sorting on it is numeric.
    '''
    filename   = url.rsplit('/', 1)[1]        # last path component,    e.g. '100078_1.xml'
    identifier = filename.rsplit('.', 1)[0]   # without file extension, e.g. '100078_1'
    work_id, expression_id = wetsuite.helpers.koop_parse.cvdr_parse_identifier( identifier )
    version_text = expression_id.split('_', 1)[1]
    return work_id, expression_id, int( version_text, 10 )


# work_id -> { expression_id: {'xml': url, 'version': int, 'html': url} }
#   'xml' and 'version' are set only for expressions we have an XML URL for,
#   'html' only for expressions we have an HTML URL for.
group_collect = collections.defaultdict( lambda: collections.defaultdict(dict) )

for url in unique_xml_urls:
    work_id, expression_id, version_int = work_expression_in_url( url )
    group_collect[work_id][expression_id]['xml']     = url
    group_collect[work_id][expression_id]['version'] = version_int

for url in unique_html_urls:
    work_id, expression_id, version_int = work_expression_in_url( url )
    group_collect[work_id][expression_id]['html'] = url

# Now we can actually do that choice of the last from each work.
#   work_id -> (version, expression_id, xml_url, html_url or None)
lasts_only        = {}
works_without_xml = []
for work_id, expressions in group_collect.items():
    # Expressions known only through their HTML URL have no 'version' (and no XML to store later),
    #   so they cannot take part in the choice; sorting them used to raise a KeyError.
    with_xml = [ (expr_id, details)  for expr_id, details in expressions.items()  if 'xml' in details ]
    if len(with_xml) == 0:
        works_without_xml.append( work_id )
        continue
    choice_key, choice_dict = sorted( with_xml, key=lambda item: item[1]['version'] )[-1] # details of last version
    lasts_only[work_id] = (choice_dict['version'], choice_key, choice_dict['xml'], choice_dict.get('html') )

if len(works_without_xml) > 0:
    print( f'Skipped {len(works_without_xml)} works for which we have HTML but no XML' )

Start a store that is intended to contain just the most recent expression's XML for each work,
and another store that does the same for HTML.

In [10]:
# Takes a minute or two just to write that much data  (order of a few GB)

# One store for the latest XML of each work, one for the latest HTML. Both are keyed by work_id.
cvdr_latestonly_xml  = wetsuite.helpers.localdata.LocalKV( 'cvdr-mostrecent-xml.db',  str, bytes )
cvdr_latestonly_html = wetsuite.helpers.localdata.LocalKV( 'cvdr-mostrecent-html.db', str, bytes )

for store, short_description in (
    (cvdr_latestonly_xml,  'Raw XML for the latest expression within each CVDR work set'),
    (cvdr_latestonly_html, 'Raw HTML for the latest expression within each CVDR work set'),
):
    store._put_meta('description_short',  short_description)
    store._put_meta('description',''' ''')

# Copy the chosen documents out of the big fetch store.
#   commit=False on each put, with a single commit per store once the loop is done.
for work_id, latest in wetsuite.helpers.notebook.ProgressBar( lasts_only.items() ):
    version, expr_id, xml_url, html_url = latest
    cvdr_latestonly_xml.put( work_id, cvdr_fetched.get( xml_url ), commit=False )
    if html_url is None: # we may not have HTML for this expression
        continue
    cvdr_latestonly_html.put( work_id, cvdr_fetched.get( html_url ), commit=False )

cvdr_latestonly_xml.commit()
cvdr_latestonly_html.commit()

  0%|          | 0/236967 [00:00<?, ?it/s]

In [73]:
# Summarize the XML store we just wrote.
#   NOTE(review): the positional True is presumably get_num_items (the keyword used in the commented-out summary call on cvdr_fetched) - confirm
cvdr_latestonly_xml.summary(True)

{'size_bytes': 9371648,
 'size_readable': '9.4M',
 'num_items': 1,
 'avgsize_bytes': 9371648}

...and stores that contain the plain text, and the metadata, for the same latest expressions. 

These three stores should have exactly the same keys (unless maybe we forgot to clean up the latest leftovers between reruns of this).

In [11]:
# Store for the plain text of each latest expression, keyed by work_id
cvdr_latestonly_text = wetsuite.helpers.localdata.LocalKV( 'cvdr-mostrecent-text.db', str, str )
cvdr_latestonly_text._put_meta('description_short','Flattened plain text for the latest expression within each CVDR work set') 
cvdr_latestonly_text._put_meta('description',''' ''') 

# Store for the metadata of each latest expression, keyed by work_id
cvdr_latestonly_meta = wetsuite.helpers.localdata.MsgpackKV( 'cvdr-mostrecent-meta-struc.db', str, None)
cvdr_latestonly_meta._put_meta('description_short','Metadata for the latest expression within each CVDR work set') 
cvdr_latestonly_meta._put_meta('description',''' ''') 


# Counts failed extractions. A document that fails for both metadata and text is counted twice.
unknown_xml = 0
for work_id, xml_bytes in wetsuite.helpers.notebook.ProgressBar( cvdr_latestonly_xml.items() ):
    need_meta = work_id not in cvdr_latestonly_meta
    need_text = work_id not in cvdr_latestonly_text
    if not (need_meta or need_text):
        continue # both were already stored (e.g. by an earlier run), so don't spend time parsing the XML

    tree = wetsuite.helpers.etree.fromstring( xml_bytes )

    if need_meta:
        try:
            meta = wetsuite.helpers.koop_parse.cvdr_meta(tree, flatten=True)
            cvdr_latestonly_meta.put(work_id, meta, commit=False)
        except ValueError: # probably us noticing we don't know a variant of XML
            unknown_xml += 1

    if need_text:
        try:
            text = wetsuite.helpers.koop_parse.cvdr_text(tree)
            cvdr_latestonly_text.put(work_id, text, commit=False)
        except AttributeError:
            unknown_xml += 1

# puts above were done with commit=False, so commit once at the end
cvdr_latestonly_meta.commit()
cvdr_latestonly_text.commit()

unknown_xml

  0%|          | 0/236967 [00:00<?, ?it/s]

In [81]:
# Examples of the metadata: show three randomly chosen items from the metadata store
cvdr_latestonly_meta.random_sample(3)

[('119373',
  {'identifier': 'CVDR119373_1',
   'title': 'Besluit voorzieningen maatschappelijke ondersteuning gemeente Vlagtwedde 2011',
   'language': 'nl',
   'type': 'regeling (overheid:Informatietype)',
   'creator': 'Vlagtwedde (overheid:Gemeente)',
   'modified': '2017-09-05',
   'spatial': 'Vlagtwedde (overheid:Gemeente)',
   'isFormatOf': 'Ter Apeler Courant d.d. 24 augustus 2011 ()',
   'alternative': 'Besluit voorzieningen maatschappelijke ondersteuning gemeente Vlagtwedde 2011',
   'source': 'Geen ()',
   'isRatifiedBy': 'college van burgemeester en wethouders (overheid:BestuursorgaanGemeente)',
   'subject': 'maatschappelijke zorg en welzijn',
   'issued': '2011-08-16',
   'rights': 'De tekst in dit document is vrij van auteursrecht en\n                    databankrecht',
   'inwerkingtredingDatum': '2011-08-25',
   'uitwerkingtredingDatum': '2011-12-31',
   'betreft': 'Onbekend',
   'kenmerk': 'ZA.11-12573',
   'gedelegeerdeRegelgeving': 'Geen',
   'redactioneleToevoeging

In [None]:
# TODO: See if there is anything useful in the below that should go above
#
# NOTE(review): this cell looks like leftover scratch code rather than a runnable step:
#   - parse_date is not defined or imported anywhere in the visible notebook
#   - meta, tree, url and expression_id are whatever earlier cells happened to leave behind
#   - meta values are indexed here as lists of dicts with 'text' (and 'attr') keys,
#     which presumably matches cvdr_meta output without flatten=True - TODO confirm


# Date of entry into force: take the text of the first entry, if any, and parse it
indat  = meta.get('inwerkingtredingDatum')
if indat is not None:
    indat = indat[0]['text']
if indat is not None:
    #print(indat)
    indat = parse_date( indat )

# Date of expiry: same treatment
uitdat = meta.get('uitwerkingtredingDatum')
if uitdat is not None: 
    uitdat = uitdat[0]['text']
if uitdat is not None: 
    #print(uitdat)
    uitdat = parse_date( uitdat )

###  
# collect things into a dict
doc = {
    'xml_url':url, 
    'web_url':'https://lokaleregelgeving.overheid.nl/CVDR%s'%( expression_id.replace('_','/') ) # presumably?
}

doc['title']      = meta.get('title')[0]['text'] # assumes there's always exactly one

# Fields copied as a plain list of their non-None text values
for fetch_as_list in (
        'alternative', 'subject', 'issued', 'modified', 'onderwerp','betreft',
        'inwerkingtredingDatum', 'uitwerkingtredingDatum', 
        'kenmerk', 'redactioneleToevoeging',
    ):
    dict_list = meta.get(fetch_as_list)
    if dict_list is not None:
        doc[fetch_as_list] = []
        for d in dict_list:
            dtext = d.get('text')
            if dtext is not None:
                doc[fetch_as_list].append( dtext )

# Fields copied as a list of (attribute value, text) tuples.
#   Each pair below is (metadata key, name of the attribute to pick out).
#   Entries without text, or without that attribute, are left out.
for fetch_as_list_with_attr in ( 
        ('creator', 'scheme'),
        ('spatial', 'scheme'),
        ('isRatifiedBy', 'scheme'),
        ('source', 'resourceIdentifier'),
        ('isFormatOf', 'resourceIdentifier'),
    ):
    want_key, want_attrkey = fetch_as_list_with_attr
    dict_list = meta.get(want_key)
    if dict_list is not None:
        doc[want_key] = []
        for d in dict_list:
            dtext = d.get('text')
            if dtext is not None:
                attr  = d.get('attr')
                if want_attrkey in attr:
                    doc[want_key].append( (attr.get(want_attrkey), dtext) )


# for 'print what haven't I handled yet' purposes:
#   remove the keys we know about from meta, so that whatever remains in it is what is still unhandled
for rem in ['title', 'alternative', 'subject', 'issued', 'modified',
            'language', 'format', 'rights', 'identifier', 'type',
            'creator', 'spatial', 'isRatifiedBy', 'source', 'isFormatOf',
            'onderwerp','betreft', 'kenmerk', 'redactioneleToevoeging',
            'inwerkingtredingDatum', 'uitwerkingtredingDatum', 
            ]:
    if rem in meta:
        meta.pop(rem)

# the document's plain text goes into the same dict
text = wetsuite.helpers.koop_parse.cvdr_text( tree )
doc['text']       = text

#pprint.pprint( doc )

def tuple_or_none(val):
    ' Return the items of val as a tuple, or None when val is None. '
    if val is None:
        return None
    return tuple(val)

def get_tuple_or_none(key, join_if_sequence=' '):
    ''' Look up key in the module-level dict called doc, and return its items as a list.

        Despite the name, the result is always a list:
        empty when the key is absent or its value is None.
        Items that are exactly a list or tuple are joined into one string using join_if_sequence,
        any other item is passed through as-is.
    '''
    found = doc.get(key)
    if found is None:
        return []
    return [
        join_if_sequence.join(item)  if type(item) in (list, tuple)  else item
        for item in found
    ]

#print( inwerkingtreding )

# Gather the column values for the insert below.
#   get_tuple_or_none() reads from doc and gives a list (empty if the key is absent),
#   tuple_or_none() gives a tuple, or None if the key is absent.
alternative            = get_tuple_or_none( 'alternative' )
inwerkingtredingDatum  = tuple_or_none( doc.get( 'inwerkingtredingDatum'  ) )
uitwerkingtredingDatum = tuple_or_none( doc.get( 'uitwerkingtredingDatum' ) )
issued                 = get_tuple_or_none( 'issued' )
subject                = get_tuple_or_none( 'subject' )
creator                = get_tuple_or_none( 'creator' )
spatial                = get_tuple_or_none( 'spatial' )

# Insert one row for this document, values passed as query parameters.
#   NOTE(review): curs2 and conn2 are not created anywhere in the visible notebook;
#   presumably a database cursor and its connection - confirm before reusing this
curs2.execute('''INSERT INTO cvdr  (work_id, expression_id, title, alternative, inwerkingtreding, uitwerkingtreding, 
                                    issued, subject, creator, spatial, web_url, xml_url, plaintext)
                    VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)''', (
    work_id, expression_id,  doc['title'], alternative, inwerkingtredingDatum, uitwerkingtredingDatum,
    issued, subject, creator, spatial, doc['web_url'], doc['xml_url'], text,
) )

# Commit (and garbage-collect) once every 1000 documents.
#   NOTE(review): count is never initialised and gc is not imported in the visible notebook
count+=1
if count%1000==0:
    conn2.commit()
    gc.collect()

conn2.commit()


#with open('cvdr.json','w', encoding='utf8') as f:
#    f.write( json.dumps(dataset) )


# Everything below is disabled (if 0), and kept only as notes for possible later work.
if 0:

    # Idea: recognise the different forms that isFormatOf values take.
    #   Every branch is still a no-op; the prefixes only document which forms were seen.
    if 0:
        for resource_name, dds in meta.items():
            print( resource_name, dds)
            for dd in dds:
                text = dd.get('text')
                if text is None:
                    print('text is None for %r'%dd)
                    continue
                tlow = text.lower()

                #attr = dd.get('attr')
                #def normalize_isformatof(text):

                if resource_name in ('isFormatOf',):
                    if tlow.startswith('wsb-'):
                        pass
                    elif tlow.startswith('gmb-'):
                        pass
                    elif tlow.startswith('prb-'):
                        pass
                    elif tlow.startswith('bgr-'):
                        pass
                    elif tlow.startswith('stcrt-'):
                        pass
                    elif tlow.startswith('gemeenteblad'):
                        pass # TODO: parse 
                    elif tlow.startswith('digitaal gemeenteblad'):
                        pass # TODO: parse 
                    elif tlow.startswith('elektronisch gemeenteblad'):
                        pass # TODO: parse 
                    elif tlow.startswith('waterschapsblad'):
                        pass # TODO: parse 
                    elif tlow.startswith('provinciaal blad'):
                        pass # TODO: parse 
                    #else:
                    #    print("TODO: handle isFormatOf %r"%text)

    # Idea: collect the document's source references into doc['refs'],
    #   as (bwb, params, reftext) tuples, only for references of type 'BWB'.
    if 0:
        refs = wetsuite.helpers.koop_parse.cvdr_sourcerefs( tree )
        if len(refs)>0:
            doc['refs'] = []
            for typ, raw, bwb, params, reftext in refs:
                #print( [typ, raw,bwb,params,reftext] )
                if typ=='BWB':
                    # build a short readable reference: the BWB id plus whichever of hoofdstuk / artikel / lid are present
                    shortref = bwb
                    #print(params)
                    if 'hoofdstuk' in params:
                        shortref += ' hoofdstuk '+params['hoofdstuk'][0]
                    if 'artikel' in params:
                        shortref += ' artikel '+params['artikel'][0]
                    if 'lid' in params:
                        shortref += ' lid '+params['lid'][0]

                    if 1:
                        print('RAW:      %s'%raw)
                        print('BWB-ID:   %s'%bwb)
                        print('PARAMS:   %s'%params)
                        print('SHORTREF: %s'%shortref)
                        print('TEXT:     %s'%reftext)
                        print('')
                    # keep only the first value of each parameter (this alters params in place)
                    for k in params:
                        params[k]=params[k][0] # probably usually good enough
                    doc['refs'].append( (bwb, params, reftext) )