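### Purpose of this notebook

Crawl the search-result pages of tuchtrecht.overheid.nl, store every linked case (detail) page in a local cache, then parse those cached pages into plain text plus the basic metadata table and store the result as a dataset.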

In [1]:
import time, urllib.parse

import bs4

import wetsuite.datasets
from wetsuite.helpers import net, localdata, notebook, etree

In [2]:
# Key-value store in 'tuchtrecht_detailpages.db' with str keys and bytes values.
# The crawl below fills it through localdata.cached_fetch, using each detail page's URL as key.
detail_pages = localdata.LocalKV(
    'tuchtrecht_detailpages.db',
    key_type=str,
    value_type=bytes,
)

In [4]:
# Bookkeeping for this particular crawl: which result (pagination) page URLs
# are still to be fetched, and which ones have been dealt with already.

# our TODO list
pagination_pages_to_fetch = set()

# URLs of pagination pages we have fetched and handled, and shouldn't add to fetch again
pagination_pages_fetched = set()

In [None]:
## seed with the first page of a search
# You can use the browser URL after any search, e.g. Scheepvaart is a small example with only a few dozen items:
pagination_pages_to_fetch.add( 'https://tuchtrecht.overheid.nl/zoeken/resultaat?ftsscope=uitspraak&domein=Scheepvaart&datumtype=uitspraak' )

# or, if you just want a _lot_ of text, everything:
#pagination_pages_to_fetch.add( 'https://tuchtrecht.overheid.nl/zoeken/resultaat?ftsscope=uitspraak&datumtype=uitspraak' )

# If you're fetching everything, and being nice to the server, expect this to take on the order of days.


# (we _could_ do a progress bar, because that first page also mentions the highest-numbered page)
while len(pagination_pages_to_fetch) > 0: # we keep adding pages on the way, and will eventually exhaust them

    ## pick and fetch another page.
    # pagination pages are not cached, the pages will change with each new case
    fetching_page_url = pagination_pages_to_fetch.pop()
    print( f' ========== PAGE: {fetching_page_url} ============ ')
    try:
        pagebytes = net.download( fetching_page_url )
    except ValueError:
        print("HACKY POSTPONE", fetching_page_url)
        # Nothing was downloaded, so there is nothing to parse for this URL.
        # Falling through would use an undefined `pagebytes` on the first iteration,
        # or silently re-parse the previous page on later ones.
        # The URL is deliberately not marked as fetched, so it gets queued again
        # whenever another pagination page links to it.
        continue
    pagination_pages_fetched.add( fetching_page_url )

    ## parse the pagination HTML page so we can find the cases and further pagination
    soup = bs4.BeautifulSoup(pagebytes, features='lxml')

    # extract all links to other pagination pages (part of this crawl)
    for page_link_a in soup.select("div[class*='pagination'] a[href*='&page=']"):
        href = page_link_a.get('href')
        page_abs_href = urllib.parse.urljoin( fetching_page_url, href)
        if page_abs_href not in pagination_pages_fetched:
            pagination_pages_to_fetch.add( page_abs_href )

    # extract all links to detail pages (part of fetched data)
    for detail_link in soup.select("div[class*='column'] a[href*='/uitspraak/']"):
        href = detail_link.get('href')
        detail_abs_href = urllib.parse.urljoin( fetching_page_url, href)
        try:
            # the page content itself is not needed here, only whether it was already in the store
            _, fromcache = localdata.cached_fetch( detail_pages, detail_abs_href )

            if not fromcache:
                print( 'FETCHED', detail_abs_href )
                time.sleep(5) # be somewhat nice to the server
            #else:
            #    print( 'CACHED', detail_abs_href )
        except ValueError:
            # this detail page is skipped for now; it is not in the store,
            # so a later crawl over the same result pages will try it again
            print("HACKY POSTPONE", detail_abs_href)

In [3]:
# Show size and item-count statistics for the store of fetched detail pages.
# NOTE(review): the True argument presumably asks for the human-readable size fields as well - confirm against LocalKV.summary.
detail_pages.summary(True)

{'size_bytes': 1499602944,
 'size_readable': '1.4GiB',
 'num_items': 43475,
 'avgsize_bytes': 34493,
 'avgsize_readable': '34KiB'}

In [4]:
def parse(htmlbytes):
    """ Takes one downloaded detail page and splits it into metadata and plain text.

        Walks the direct children of each `div.column` under `div#content`, skipping the sidebar column:
          - rows of the 'table__data-overview' table that have exactly two cells become metadata entries
            (first cell's text is the key, second cell's extracted text is the value)
          - headings, other tables, p, blockquote, pre are added as text followed by a blank line
          - div, strong, em, ul, ol are added as text followed by a newline
          - br and hr become a newline
          - bare text between elements is added as-is, unless it is only whitespace
          - HTML comments are skipped
          - 'align-right' divs (probably page links) are skipped
          - any other element is reported with print() and its text is NOT added

        Element text is extracted with wetsuite.helpers.etree.html_text.

        @param htmlbytes: the HTML of one detail page, as handed to BeautifulSoup
        @return: a tuple (the metadata in the table as a dict, the plaintext as a single string)
    """
    # elements whose text is followed by a blank line, respectively by a single newline
    blank_line_after = ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'table', 'p', 'blockquote', 'pre')
    newline_after    = ('div', 'strong', 'em', 'ul', 'ol')

    meta  = {}
    plain = []
    soup = bs4.BeautifulSoup(htmlbytes, features='lxml')
    for column in soup.select('div#content div.column'):
        if 'column--sidebar' in column.get('class'):
            continue

        for child in column.children:
            if isinstance(child, bs4.element.NavigableString):
                # Comment is a NavigableString subclass; without this check the contents
                # of HTML comments would end up in the plaintext
                if isinstance(child, bs4.element.Comment):
                    continue
                text = str(child)
                # whitespace-only strings are dropped, anything else is kept unstripped
                if len( text.strip() ) > 0:
                    plain.append( text )
                continue

            # for a tag, 'class' is a list of class names (or absent)
            classes = child.get('class', '')

            if child.name == 'table'  and  'table__data-overview' in classes:
                # the metadata table: only two-cell rows are taken as key/value
                for tr in child.select('tr'):
                    rowcells = list(tr.find_all(['th','td']))
                    if len(rowcells) == 2:
                        meta[ rowcells[0].text ] = wetsuite.helpers.etree.html_text( str(rowcells[1]) )
            elif child.name == 'div'  and  'align-right' in classes: # probably page links
                pass
            elif child.name in blank_line_after:
                plain.append( wetsuite.helpers.etree.html_text( str(child) ) )
                plain.append('\n\n')
            elif child.name in newline_after:
                plain.append( wetsuite.helpers.etree.html_text( str(child) ) )
                plain.append('\n')
            elif child.name in ('br', 'hr'):
                plain.append('\n')
            else:
                # unhandled element type: report it so the cases above can be extended
                print( 'N', child.name, child.get('class') )
    return meta, ''.join(plain)

In [None]:
# Spot check: run parse() on a few randomly picked stored pages and print what comes out.
for url, htmlbytes in detail_pages.random_sample(3):
    print( url )
    meta, plain = parse( htmlbytes )
    for field, value in meta.items():
        # field name right-aligned in 25 columns, value shown as its repr
        print( f'{field!s:>25}   {value!r}' )
    print( plain )

In [None]:
# Real parse and store - this might take ten minutes
# The output store; truncate() is called right after opening it
# (presumably so that every run starts from an empty store - confirm against MsgpackKV.truncate).
tuchtrecht_struc = localdata.MsgpackKV('tuchtrecht-struc.db', str, None)
tuchtrecht_struc.truncate()

# Short and long dataset descriptions, stored as metadata in the store itself.
description_short = '''The text and basic metadata shown in the website cases at tuchtrecht.overheid.nl'''
tuchtrecht_struc._put_meta('description_short', description_short )

# The long description includes an example item, and ends with the text from generated_today_text().
description_long = '''The text and basic metadata shown in the website cases at tuchtrecht.overheid.nl.

A case would look something like:                                    
{'meta': {'Beslissingen:': '',
          'Datum publicatie:': '20-09-2013',
          'Datum uitspraak:': '24-03-1994',
          'ECLI:': 'ECLI:NL:TDIVBC:1994:1',
          'Inhoudsindicatie:': 'Behandeling hond',
          'Onderwerp:': 'Honden',
          'Zaaknummer(s):': 'VB 1993-02'},
 'plaintext': "ECLI:NL:TDIVBC:1994:1 Veterinair Beroepscollege 's-Gravenhage "
              '(text omitted for brevity)\n'
              '\n',
 'url': 'https://tuchtrecht.overheid.nl/zoeken/resultaat/uitspraak/1994/ECLI_NL_TDIVBC_1994_1'}

''' + wetsuite.datasets.generated_today_text()
tuchtrecht_struc._put_meta('description', description_long )

# Parse every stored detail page and store the result under the same URL.
for url, htmlbytes in notebook.ProgressBar( detail_pages.items() ):
    meta, text = parse( htmlbytes )
    # each item carries its own URL, the metadata dict and the plain text
    tuchtrecht_struc.put( url, {
        'url':       url,
        'meta':      meta,
        'plaintext': text,
    } )

In [None]:
# Show size and item-count statistics for the parsed-output store.
# NOTE(review): the True argument presumably asks for the human-readable size fields as well - confirm against the store's summary().
tuchtrecht_struc.summary(True)

{'size_bytes': 601862144,
 'size_readable': '574MiB',
 'num_items': 43475,
 'avgsize_bytes': 13844,
 'avgsize_readable': '13.5KiB'}