# Creating a smaller raw-data datasets

We would like to make a dataset with a smallish selection, of the raw data, probably just XML.

In [3]:
import collections

import wetsuite.helpers.notebook
import wetsuite.helpers.localdata
import wetsuite.helpers.koop_parse
import wetsuite.helpers.date
import wetsuite.helpers.format

In [4]:
# previously fetched
op_fetched             = wetsuite.helpers.localdata.LocalKV( 'op_fetched.db', str, bytes )

From this set, the most interesting might be:
- **kamerstukken**, which belong to dossiers
- **handelingen**, which may relate to multiple dossiers, or none (most seem _not_ to belong to dossiers), consider e.g.
  - https://repository.overheid.nl/frbr/officielepublicaties/h-tk/20162017/h-tk-20162017-55-37/1/metadata/metadata.xml
  - https://repository.overheid.nl/frbr/officielepublicaties/h-tk/19981999/h-tk-19981999-2741-2768/1/metadata/metadata.xml
- **aanhangsels (bij de handelingen)**, which do not belong to dossiers

Also, it'll be a mix of tweede kamer, eerste kamer, and verenigde vergaderingen, which brings a few more bordercases.

### Considering dossiers

Just sampling recent documents by date would break up kamerstuk-dossiers,
and there are probably various questions that would care about complete dossiers.

So we would probably care to select, say,
- kamerstukdossier that have had recent activity 
  - ...though note that because these are often the more frequently updated, will also be many of the largest
- handelingen that are recent and/or belong to that chosen set - which presumably has strong overlap
- aanhangsels that are recent

We can later choose to throw them in the same pile, or not.

In this process we will probably dig into the metadata more than once, so let's make that easier to read out - we 
- create a map from id to a metadata dict
- create a map from id to the URL to the content URL (which is something we can fetch from op_fetched) (also, we basically already did that above)

In [5]:
# takes all URLs fetched into the store,
#   groups metadata and corresponding content item,   like  { 'kst-29692-26' -> {'metadata':'anurl', 'xml':'anurl'} } 
docmeta_groups        = collections.defaultdict( dict )

for fetched_url in op_fetched.keys():
    try:
        parsed_url = wetsuite.helpers.koop_parse.parse_repo_url( fetched_url ) # take useful things out of that URL
        doc_id   = parsed_url['doc_id']    # e.g. 'kst-29692-26'
        exprtype = parsed_url['exprtype']  # e.g. 'pdf' or 'xml'

        docmeta_groups[doc_id][exprtype] = fetched_url   # and put it in that data structure as mentioned
    except ValueError as ve:
        print('IGNORE: ', ve)

IGNORE:  ERROR understanding 'https://repository.overheid.nl/frbr/officielepublicaties/gmb/2018/gmb-2018-118956/1/html/gmb-2018-118956..html'


In [None]:
# Decisions for each ID will dig into more detailed metadata, so should probably parse _all_ metadata
#   ...which, note, is note: parsing five million stored files may take maybe twenty minutes)
# Note that we are ignoring PDFs for now, hoping there is XML data for most of these, 
#   ...which for these areas happens to be a workable assumption

# More concretely, we fill:
import wetsuite.helpers.etree


id_meta    = collections.defaultdict(dict) # id -> parsed metadata dict
id_xmlurl  = {}                            # id -> URL of the data XML

for doc_id in wetsuite.helpers.notebook.ProgressBar(docmeta_groups): # progress bar to get an idea of how long it will take
    if 'metadata' in docmeta_groups[doc_id]  and  'xml' in docmeta_groups[doc_id]: # ignore cases without metadata and XML
        try:
            meta_url = docmeta_groups[doc_id]['metadata']
            meta_bytes = op_fetched.get(meta_url)
            meta_list = wetsuite.helpers.koop_parse.parse_op_metafile( meta_bytes )
            id_xmlurl[doc_id] = docmeta_groups[doc_id]['xml']
            for metakey,schema,value in meta_list:
                id_meta[doc_id][metakey] = value # WARNING: for any duplicate keys this _will_ overwrite
        except wetsuite.helpers.etree.lxml.etree.XMLSyntaxError as xse:
            print( 'PARSE ERROR', meta_url, meta_bytes )

#### kst

What next? 

Let's address the part with the most work first: kamerstukken and their dossiers.

Since we want to select a smallish set, and probably prefer recent content, let's figure out dates and sizes of documents within each dossier.

In [8]:
doc_ids = list( id_meta.keys() )

dossier_dates     = collections.defaultdict( list )
dossier_bytesizes = collections.defaultdict( list )
dossier_doc_ids   = collections.defaultdict( list )

for doc_id in doc_ids:
    meta = id_meta[doc_id]
    if 'dossiernummer' in meta: # will be true for kamerstukken -- but also blg, h-ek, h-tk (VERIFY), so:
        if doc_id.startswith('kst-'):
            for dossiernummer in meta['dossiernummer'].split(';'): # split because that can actually be a list
                if 'available' in meta:
                    dossier_dates[     dossiernummer ].append( wetsuite.helpers.date.parse( meta['available'] ) )
                    dossier_bytesizes[ dossiernummer ].append( len( op_fetched.get( id_xmlurl[doc_id] )) )
                    dossier_doc_ids[   dossiernummer ].append( doc_id )

In [None]:
dossier_dates        # probably run only if you care about checking intermediates

In [None]:
dossier_bytesizes    # probably run only if you care about checking intermediates

In [None]:
# sort dossier identifiers, by the date of the most recent document in it
#   ...with the intent of later choosing the most recently added-to dossiers 
#      until we've reaced a particular size
dossier_maxdate = list( (dn, max(dates))  for dn, dates in dossier_dates.items() )
dossier_maxdate.sort(key = lambda p:p[1], reverse=True )

In [20]:
dossier_maxdate      # probably run only if you care about checking intermediates

[('31839', datetime.datetime(2025, 3, 28, 0, 0)),
 ('25295', datetime.datetime(2025, 3, 28, 0, 0)),
 ('31066', datetime.datetime(2025, 3, 28, 0, 0)),
 ('21501-08', datetime.datetime(2025, 3, 28, 0, 0)),
 ('33037', datetime.datetime(2025, 3, 28, 0, 0)),
 ('30252', datetime.datetime(2025, 3, 28, 0, 0)),
 ('32140', datetime.datetime(2025, 3, 28, 0, 0)),
 ('36546', datetime.datetime(2025, 3, 28, 0, 0)),
 ('32847', datetime.datetime(2025, 3, 27, 0, 0)),
 ('28676', datetime.datetime(2025, 3, 27, 0, 0)),
 ('21501-28', datetime.datetime(2025, 3, 27, 0, 0)),
 ('34293', datetime.datetime(2025, 3, 27, 0, 0)),
 ('36600-VIII', datetime.datetime(2025, 3, 27, 0, 0)),
 ('36626', datetime.datetime(2025, 3, 27, 0, 0)),
 ('27625', datetime.datetime(2025, 3, 26, 0, 0)),
 ('29237', datetime.datetime(2025, 3, 26, 0, 0)),
 ('24587', datetime.datetime(2025, 3, 26, 0, 0)),
 ('32317', datetime.datetime(2025, 3, 26, 0, 0)),
 ('22112', datetime.datetime(2025, 3, 26, 0, 0)),
 ('21501-20', datetime.datetime(2025, 3

In [18]:
# build a list of identifiers; we will pick out the documents later
chosen_dossiers = []  

total_size = 0
for dossier_id, _ in dossier_maxdate:
    dossier_size = sum( dossier_bytesizes[dossier_id] )
    chosen_dossiers.append( dossier_id )
    print("Chose dossier  %-15r  (%5d docs totaling %5s)"%(dossier_id, len(dossier_bytesizes[dossier_id]), wetsuite.helpers.format.kmgtp(dossier_size)))
    total_size += dossier_size

    # Once we selected a bunch of data, we can stop selecting
    if total_size > 700*1024*1024: # 700MB
        break

print('Chose %s (%5s) out of %s dossiers'%(len(chosen_dossiers), wetsuite.helpers.format.kmgtp(total_size), len(dossier_maxdate)))

Chose dossier  '31839'          ( 1183 docs totaling   25M)
Chose dossier  '25295'          ( 2944 docs totaling   56M)
Chose dossier  '31066'          ( 1593 docs totaling   40M)
Chose dossier  '21501-08'       ( 1039 docs totaling   21M)
Chose dossier  '33037'          (  659 docs totaling 12.4M)
Chose dossier  '30252'          (  225 docs totaling  3.4M)
Chose dossier  '32140'          (  280 docs totaling 10.6M)
Chose dossier  '36546'          (   41 docs totaling  2.2M)
Chose dossier  '32847'          ( 1412 docs totaling   29M)
Chose dossier  '28676'          (  528 docs totaling 13.8M)
Chose dossier  '21501-28'       (  288 docs totaling  8.5M)
Chose dossier  '34293'          (  158 docs totaling  3.1M)
Chose dossier  '36600-VIII'     (  190 docs totaling  9.4M)
Chose dossier  '36626'          (    8 docs totaling  488K)
Chose dossier  '27625'          (  798 docs totaling 12.3M)
Chose dossier  '29237'          (  228 docs totaling  5.1M)
Chose dossier  '24587'          ( 1245 d

#### h (handelingen) and ah (their aanhangsels )

Okay, we've dealt with `kst`. What about these two, and their relation?

So, from the counting code below, there seem to be:
* ~56K `h-` documents totaling 2.2GByte of XML
* ~84K `ah-`documents totaling 700MByte of XML
* if restricting that to 5 years:
  * ~14K `h-` documents totaling 600MByte of XML
  * ~18K `ah-`documents totaling 200Byte of XML

We _could_ consider just including **all** of both these sets,
that way we don't have to worry yet about fishing out all required relations, different types of handelingen, etc.

In [None]:
h_amount,  total_h_size = 0, 0
ah_amount, total_ah_size = 0, 0

hah_chosen_ids = set()

# go through all identifiers's metadata, decide what to select
# roughly speaking: 
for doc_id in id_meta.keys(): 
    
    meta = id_meta[doc_id]

    # note: not used yet; only used in combination
    meta_date = wetsuite.helpers.date.parse( meta['available'] )
    is_old = False 
    if meta_date.date() < wetsuite.helpers.date.date_months_ago( 12*5 ): # not in last 5 years?
        is_old = True

    if doc_id.startswith('ah-'): 
        # anything from the last five years
        if is_old:
            continue
        ah_amount += 1
        total_ah_size += len(op_fetched.get( id_xmlurl[doc_id] ))
        hah_chosen_ids.add(doc_id)
        continue

    elif doc_id.startswith('h-'):
        # h(andelingen) may refer to kamerstukdossiers. While we could skip these,
        #   yet let's try to select them even if they are older _if_ they seem to refer to a dossier we selected.
        # So our logic becomes 'anything from the last five years  OR  referring to a dossier we selected'
        refers_to_chosen_kst = False
        if 'behandeldDossier' in meta:
            behandeldDossier = meta['behandeldDossier'].split(';')[0] # split because it points to a specific number in that dossier
            if behandeldDossier in chosen_dossiers:
                refers_to_chosen_kst = True

        if is_old and not refers_to_chosen_kst:
            continue
        else:
            h_amount += 1
            total_h_size += len(op_fetched.get( id_xmlurl[doc_id] ))
            hah_chosen_ids.add(doc_id)
            continue

    # implied else: type is kst or other

h_amount, total_h_size,  ah_amount, total_ah_size

(13410, 601661806, 18206, 217210592)

### Finally write it out

In [21]:
import wetsuite.datasets
op_selection = wetsuite.helpers.localdata.LocalKV( 'parliament-sample.db', str, bytes )
op_selection.truncate()

op_selection._put_meta('description_short', 'A moderate-sized collection of kamerstukken, handelingen, and aanhangsels')
op_selection._put_meta('description','''
                       
A moderate-sized collection of kamerstukken (~700MB of the most recently touched dossiers), handelingen (up to five years), and aanhangsels (up to five years)
                       
For each conceptual item, e.g. kst-27625-351, it contains
- a metadata XML, e.g. https://repository.overheid.nl/frbr/officielepublicaties/kst/27625/kst-27625-351/1/metadata/metadata.xml
- an content XML, e.g. https://repository.overheid.nl/frbr/officielepublicaties/kst/27625/kst-27625-351/1/xml/kst-27625-351.xml                       

The metadata file can be parsed with wetsuite.helpers.koop_parse.parse_op_metafile()

TODO: more elaboration
                                              
Dataset generated on:                        
'''+wetsuite.datasets.generated_today_text())

In [None]:
# adding kst- (chosen subset)
for dossier_id in chosen_dossiers:
    for doc_id in dossier_doc_ids[ dossier_id ]:
        xml_url  = id_xmlurl[doc_id]
        meta_url = docmeta_groups[doc_id]['metadata']
        op_selection.put( xml_url,  op_fetched.get( xml_url ) , commit=False )
        op_selection.put( meta_url, op_fetched.get( meta_url ), commit=False )
        #print( xml_url, meta_url )
        #break
    #break

op_selection.commit()

op_selection.summary( True )

{'size_bytes': 880533504,
 'size_readable': '840MiB',
 'num_items': 64698,
 'avgsize_bytes': 13610,
 'avgsize_readable': '13.3KiB'}

In [None]:
# adding h- ahd ah- (chosen subset)
for doc_id in hah_chosen_ids:
    xml_url  = id_xmlurl[doc_id]
    meta_url = docmeta_groups[doc_id]['metadata']
    op_selection.put( xml_url,  op_fetched.get( xml_url ),     commit=False)
    op_selection.put( meta_url, op_fetched.get( meta_url ),    commit=False)

op_selection.commit()
op_selection.summary( True )

{'size_bytes': 1846579200,
 'size_readable': '1.7GiB',
 'num_items': 127930,
 'avgsize_bytes': 14434,
 'avgsize_readable': '14.1KiB'}