In [None]:
import collections
import pprint
import random
import re

import wetsuite.datasets
import wetsuite.helpers.etree
import wetsuite.helpers.meta
import wetsuite.helpers.patterns

In [2]:
# The three stores searched below, loaded in the same order as they are unpacked.
DATASET_NAMES = (
    'bwb-mostrecent-xml',
    'cvdr-mostrecent-xml',
    'rechtspraaknl-struc',
)
bwb, cvdr, rsnl = [
    wetsuite.datasets.load(dataset_name).data
    for dataset_name in DATASET_NAMES
]
# TODO: even more free-flowing text

### Find things that look ECLI-like, to create test data

...test data for the ECLI parsing code, that is.

In [None]:
# NOTE(review): mostrefs / mostrefs_url are not used anywhere in this cell -- presumably leftovers; confirm before removing
mostrefs     = 0
mostrefs_url = ''

# Deliberately loose pattern for "things that look like an ECLI":
# - not directly preceded by ':'
# - a word starting with e, E, C or c, followed by ':'
# - two to seven repetitions of (1..10 non-whitespace characters, then ':')
# - then everything up to the next whitespace  (intentionally overaccepts at the end)
# This awkwardly tries to accept more, without too much nonsense  (definitely _some_ nonsense, though).
# Compiled once here, rather than handing the pattern string to re.finditer() for every document.
ECLI_LIKE_RE = re.compile(r'(?<!:)([eECc][a-zA-Z]+[:](?:[^\s]{1,10}[:]){2,7}[^\s]+)')

# create a text file where each line contains
# - ECLI identifier text with possible muck after it
# - tab character
# - URL of the document this was found in   (CONSIDER: also adding offset)
# NOTE(review): the file is opened with the platform's default text encoding;
#   whatever reads it back has to use the same encoding.
with open('eclitest.txt','w') as eclitestfile:

    # this goes through a few gigabytes of text, will take order of ten minutes
    for store in ( #easier to skip one with comments this way
        bwb,
        cvdr,
        rsnl,
    ):
        for url, data in store.items():
            # One of the datasets gives us python dicts, and the other two XML bytestrings.
            #   note that both of these cases can be made cleaner (at some speed cost)
            if isinstance(data, dict): # assume rechtspraak
                doc_text = str(data)   # quick and dirty way to make that a string to search in
            else: # assume it's a bytes object
                doc_text = data.decode('utf8') # we don't parse these

            candidates = [ match_object.group(0)  for match_object in ECLI_LIKE_RE.finditer(doc_text) ]

            # NOTE(review): documents with exactly one match are skipped as well as those with none,
            #   and the count is taken before the 'jci' filter below -- confirm that is intended.
            if len(candidates) <= 1:
                continue

            for txt in candidates:
                if 'jci' in txt: # leave out any match that contains 'jci'
                    continue
                eclitestfile.write( f'{txt}\t{url}\n' )

#### Get a subset of that ECLI test data 

Try to parse each ECLI to see if it seems valid, to sort this into probably-good and probably-bad ECLIs.

We also have _loads_ of good examples and don't need that many, 
so we create shorter test lists via a random sample.

In [None]:
# Split the collected candidates into those that parse_ecli() accepts and those it rejects.
# Each entry in either list is a (text, url) tuple.
#count_str = collections.defaultdict(int)
#count_bad     = 0

good = []
bad  = []

with open('eclitest.txt','r') as eclitestfile:
    for raw_line in eclitestfile:
        # each line is  candidate text, tab, URL;  split on the last tab
        text, url = raw_line.rstrip('\n').rsplit('\t', 1)

        try:
            parsed = wetsuite.helpers.meta.parse_ecli(text)
        except ValueError as parse_error:
            # a ValueError from the parser is what sorts a candidate into the probably-bad list
            bad.append( (text, url) )
            #count_bad += 1
            #count_str[text.split(':')[0]] += 1
            print(text, parse_error, url) # currently spits out ~25K complaints, so consider commenting this out
        else:
            #if parsed['country_code'].upper()=='NL':
            #    cc = parsed['court_code']
            #    if wetsuite.extras.gerechtcodes.case_insensitive_lookup(cc) is None:
            #        count_str[cc] += 1
            good.append( (text, url) )

In [8]:
# number of candidates that parsed, then number that did not
print( f'{len(good)} {len(bad)}' )

974016 25843


In [11]:
# Upper bound on the number of examples written to each output file.
SAMPLE_SIZE = 20000
# Fixed seed, so that re-running this cell on the same input regenerates the same test files.
SAMPLE_SEED = 0


def write_ecli_sample(path, pairs, rng, max_count=SAMPLE_SIZE):
    """Write a sorted random sample of (text, url) pairs to a file.

    Each pair becomes one line: text, a tab character, url.

    path:      name of the file to (over)write
    pairs:     list of (text, url) tuples to sample from
    rng:       random.Random instance used to draw the sample
    max_count: sample at most this many pairs; when `pairs` is shorter, all of it is written
               (random.sample raises ValueError when asked for more items than exist)
    """
    count = min(max_count, len(pairs))
    with open(path, 'w') as sample_file:
        for text, url in sorted(rng.sample(pairs, count)):
            sample_file.write(f'{text}\t{url}\n')


# A private generator, so that seeding does not touch the global random state.
sample_rng = random.Random(SAMPLE_SEED)
write_ecli_sample('ecli_good.txt', good, sample_rng)
write_ecli_sample('ecli_bad.txt',  bad,  sample_rng)