<a href="https://colab.research.google.com/github/WetSuiteLeiden/example-notebooks/blob/main/datasets/dataset_intro_by_doing__bwb__(definitions_example).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# (only) in colab, run this first to install wetsuite from (the most recent) source. 
#    (this should soon simplify to something like   !pip3 install --upgrade wetsuite)
# For your own setup, see wetsuite's install guidelines.
!pip3 install -U --no-cache-dir --quiet https://github.com/WetSuiteLeiden/wetsuite-core/archive/refs/heads/main.zip

## Purpose of this notebook

Explore what is in the Basis WettenBestand dataset(s), and what you could easily do with it. 

You may also want to find the [extras_diagnose_koop_bwb_docstructure](zextras_diagnose_koop_bwb_docstructure.ipynb) notebook 
which is some introduction to the structure of the varied documents in here. 

Any real research question is probably going to be fairly specific,
so let's start with something relatively dumb - looking for definition lists.
 


In [1]:
import collections
import random
import pprint

import wetsuite.helpers.etree
import wetsuite.helpers.koop_parse
import wetsuite.helpers.net
from wetsuite.helpers import lazy
import wetsuite.datasets

In [3]:
bwb_text = wetsuite.datasets.load('bwb-mostrecent-text')
bwb_xml  = wetsuite.datasets.load('bwb-mostrecent-xml')
bwb_meta = wetsuite.datasets.load('bwb-mostrecent-meta-struc')  # we don't end up using this

In [None]:
#print( bwb_text.data.get('BWBR0034320') )
print( wetsuite.helpers.etree.debug_pretty( bwb_xml.data.get('BWBR0034320')) )

In [41]:
# Figure out which laws have a definition list.

# There are a few different ways. You could e.g. reach into specific parts of the XML, 
#   and e.g. look for an artikel/kop/titel with text 'Definities' or 'Begripsbepalingen'
#
# But even simplwer would be to look for the following text is present, 
#   and anywhere in the document is probably good enough of a test
#   It happens to often be used literally -- but chances are we're missing something
#   We could likely catch more if we allowed more variants, but for a quick test this is plenty.
wordt_verstaan = 'deze regeling wordt verstaan'

bwbids_with_verstaan = set()
for bwbid, text in bwb_text.data.items():
    if wordt_verstaan in text:
        bwbids_with_verstaan.add( bwbid )
len( bwbids_with_verstaan )

5344

In [52]:
# fetch the XML for each of those BWB-ids
# and fish out just the definitions list -- by looking around the element that contains that same text


definitions = collections.defaultdict(list)

def_header_titles = collections.Counter() # the name of the header of this section, to see how consistent it is


for test_bwbid in bwbids_with_verstaan:

    xmlbytes = bwb_xml.data.get( test_bwbid )
    etree = lazy.etree( xmlbytes )
    
    # Ask for something like "the parent (if it is an artikel) of a node that contains 'wordt verstaan' as text" 
    # (via XPath, it's fewer lines than a bunch of node navigation)
    for node in list( etree.xpath( "//al[contains(text(),'%s')]/parent::artikel"%wordt_verstaan ) ):

        # this is just for the "what is the header this is in called?",
        kop = node.find('kop')
        titel = kop.find('titel')
        if titel is not None:
            def_header_titles.update( [titel.text] )
        # the rest is picking up the definitions:

        # From looking at some of these documents, most look like:
        #   <al><nadruk type="cur">de minister:</nadruk>de Minister van Binnenlandse Zaken en Koninkrijksrelaties;</al>
        # note: a serious investigation would try for completeness, this is just a proof of concept.
        #       looking for nadruk will later prove too approximate, but it's simple for an example
        for al in node.xpath('//nadruk/parent::al'):
            al_before = wetsuite.helpers.etree.debug_pretty(al)

            nadruk = al.find('nadruk')
            defined_thing = nadruk.text
            if defined_thing is not None  and  len(defined_thing.strip()) >= 2: # skip some empty nodes, and single letters
                defined_thing = defined_thing.rstrip(': ')

                # the further text is often the etree-.tail of the nadruk node, but let's assume there can be markup in there,
                # We can use our own text extractor function on the whole thing 
                #   ...if we remove the term we are defining from the in-memory document 
                #   (specifically nadruk; we just copied it to `what`) before doing so, to avoid it showing up twice
                nadruk.text = ''

                rest_text = (  ' '.join( wetsuite.helpers.etree.all_text_fragments(al) )  ).strip('; ')

                if len(rest_text)==0: # only nadruk, no other text in the alinea -- this is probably wrong and skippable.
                    pass
                    #print('CONFUSED about:')
                    #print( al_before )
                else:
                    definitions[defined_thing].append( (test_bwbid, rest_text) )  # add the BWB-id to signal where it came from

In [57]:
# Count and list the name of the section we just picked these out of
dht = list( def_header_titles.items() )
dht.sort( key=lambda x:x[1], reverse=True) # most used on top
for header, count in dht:
    if count >=2: # show only those used more than once
        print( '%5s %s'%(count, header) )
# Turns out there's some variation.

 1230 Begripsbepalingen
  382 Definities
  173 Begripsbepaling
   66 Begripsomschrijvingen
   23 (begripsbepalingen)
   21 Begrippen
   17 Definitiebepalingen
   15 begripsbepalingen
   14 (definities)
   11 Definitiebepaling
   10 Begripsomschrijving
    9 Definitie
    7 (Begripsbepalingen)
    6 Algemene bepalingen
    6 (begripsomschrijving)
    4 definities
    3 (begripsomschrijvingen)
    3 Algemene begripsbepalingen
    3 
      
    2 Begrippen en definities
    2 – definities
    2 Begripsbepalingen 
    2 – Definities –
    2 - Begripsbepalingen
    2  Begripsbepalingen


In [66]:
# CONSIDER: using a case insensitive merge on defined_thing
import wetsuite.extras.word_cloud # has a function that counts identical strings

defdata = list( definitions.items() )  
defdata.sort( key=lambda x:len(x[1]), reverse=True)

# items like:  ('Lucky Bamboo', [('BWBR0025197', 'sierplant met de wetenschappelijke naam Dracaena sanderiana')])
# The below throws away that origin ID for brevity,
# yet you might well want that when digging deeper

for defined_thing, definitions_list in defdata:
    # we'd expect at least things like 'minister' and 'wet' to be "this particular one within this document"-style definitions, not general definitions
    if defined_thing.lower() in ('minister', 'de minister', 'wet','de wet', 'besluit'):
        continue # so ignore them

    # simpler:  print everything
    if len(definitions_list)>0:
        print( defined_thing )
        for origin_bwb, definition in definitions_list:
           print(f'  In {origin_bwb}: {definition}')

    # more complex: only things that appear more often
    counts = wetsuite.extras.word_cloud.count_normalized( 
        list(definition     for origin_bwbid, definition  in definitions_list ),  
        min_count = 2 # show only definitions used more than once
    )
    if len(counts)>0:
        print( defined_thing )
        pprint.pprint(counts)

    print()

school
  In BWBR0047083: school als bedoeld in artikel 1 van de wet
  In BWBR0038536: bekostigde school als bedoeld in de WPO of een bekostigde school of instelling als bedoeld in de WEC .
  In BWBR0035590: bekostigde school als bedoeld in de Wet op het primair onderwijs of een bekostigde school of instelling als bedoeld in de Wet op de expertisecentra .
  In BWBR0041307: school als bedoeld in artikel 1 WVO
  In BWBR0025837: een uit ’s Rijks kas bekostigde school als bedoeld in artikel 1 van de Wet op het primair onderwijs , artikel 1 van de Wet op de expertisecentra , artikel 1 van de Wet op het voortgezet onderwijs of een instelling als bedoeld in artikelen 1.1.1, onder b , 12.3.8 of 12.3.9 van de Wet educatie en beroepsonderwijs
  In BWBR0042461: uit ’s Rijks kas bekostigde school als bedoeld in artikel 1 van de Wet op het primair onderwijs , artikel 1.1 van de Wet voortgezet onderwijs 2020 , dan wel instelling als bedoeld in artikel 1.1.1 van de Wet educatie en beroepsonderwijs
  I