In [None]:
from lxml import etree
from glob import glob
import re
from collections import Counter, defaultdict
import pandas as pd
import multiprocessing
import itertools
from collections import Counter
from matplotlib import pyplot as plt
import random
import hashlib
import os

In [None]:
# Every year covered by the analysis, 1998 through 2019 inclusive.
years = [*range(1998, 2019 + 1)]

# General functions

In [None]:
def get_text_and_keys(tree):
    """
    Yield a ``(text, key)`` pair for each ``<text>`` element found in ``tree``.

    The text is the element serialized with ``method='text'`` and decoded as
    UTF-8; the key is the ``key`` attribute of the element's parent.
    Elements whose serialized text is empty are skipped.
    """
    for text_element in tree.xpath('//text'):
        serialized = etree.tostring(text_element, method='text', encoding='utf-8')
        content = serialized.decode('utf-8')
        if not content:
            continue
        yield content, text_element.getparent().attrib['key']

In [None]:
def replace_reference_elements(tree):
    """
    Replace every ``<reference>`` element in ``tree`` with a ``<placeholder>``.

    The tree is modified in place. Each placeholder has the text ``'{ref}'``
    and takes over the tail text of the reference it replaces.
    """
    for reference in tree.xpath('//reference'):
        container = reference.getparent()
        # The placeholder is first created as a child of the same parent;
        # replace() below then swaps it in for the reference.
        stand_in = etree.SubElement(container, 'placeholder')
        stand_in.text = '{ref}'
        stand_in.tail = reference.tail
        container.replace(reference, stand_in)

In [None]:
def get_first_word(text, reverse=False):
    """
    Return the space-delimited word at one end of ``text``, with adjoining spaces.

    With ``reverse=False`` the result is any leading spaces of ``text``
    followed by the first word. With ``reverse=True`` it is the last word
    followed by any trailing spaces. Keeping the spaces lets the caller
    concatenate the result directly with a neighbouring match.

    Only the space character separates words; tabs and newlines are treated
    as part of a word. An empty ``text`` yields ``''``.

    Unlike a plain ``text.split(' ')``, runs of several spaces are handled:
    ``get_first_word('  foo bar')`` returns ``'  foo'`` rather than ``' '``.
    """
    if reverse:
        core = text.rstrip(' ')
        trailing_spaces = text[len(core):]
        # rsplit always returns at least one item, so [-1] is safe even for ''.
        return core.rsplit(' ', 1)[-1] + trailing_spaces

    core = text.lstrip(' ')
    leading_spaces = text[:len(text) - len(core)]
    # split always returns at least one item, so [0] is safe even for ''.
    return leading_spaces + core.split(' ', 1)[0]

In [None]:
def get_regex_matches(item, pattern):
    """
    Collect one record per match of ``pattern`` in a ``(text, key)`` item.

    Each record is a dict with the fields ``title`` and ``year`` (first and
    second ``'_'``-separated parts of the key), ``key``, ``match`` (the
    matched text), ``match_long`` (the match extended by the adjacent word on
    each side), ``context`` (the full text), and the ``start``/``end``
    offsets of the match in the text.

    Returns a list of records, empty when nothing matches.
    """
    text, key = item

    def build_record(found):
        begin, stop = found.span()
        matched = found[0]
        word_after = get_first_word(text[stop:])
        word_before = get_first_word(text[:begin], reverse=True)
        return {
            'title': key.split('_')[0],
            'year': key.split('_')[1],
            'key': key,
            'match': matched,
            'match_long': word_before + matched + word_after,
            'context': text,
            'start': begin,
            'end': stop,
        }

    return [build_record(found) for found in pattern.finditer(text)]

In [None]:
def run_pattern(pattern, year):
    """
    Apply ``pattern`` to all XML files of one year and return the match records.

    Files are taken from ``../../legal-networks-data/us/2_xml/`` with names
    ending in ``_{year}.xml``, in sorted order. In each file the
    ``<reference>`` elements are replaced by placeholders, the
    ``(text, key)`` items are extracted, and ``get_regex_matches`` is run on
    every item in a worker pool.

    Returns a flat list of match dicts, ordered by file and then by item.
    The list is empty when no file matches the year.
    """
    # Without recursive=True, '**' in a glob pattern behaves like '*'.
    files = sorted(glob(f'../../legal-networks-data/us/2_xml/**_{year}.xml'))
    if not files:
        return []

    res = []
    context = multiprocessing.get_context('fork')
    # One pool serves all files of the year; forking a fresh pool per file
    # only adds start-up and tear-down cost.
    with context.Pool() as p:
        for f in files:
            tree = etree.parse(f)
            replace_reference_elements(tree)

            items = list(get_text_and_keys(tree))
            matches_nested = p.starmap(get_regex_matches, [(i, pattern) for i in items])

            res.extend(itertools.chain.from_iterable(matches_nested))
    return res

In [None]:
def save_txt_inspection(res, filename, prefix_len=100, suffix_len=100):
    """
    Write match records to a text file for manual inspection.

    One line is written per record, in the form
    ``key - prefix|||match|||suffix``. ``prefix`` is up to ``prefix_len``
    characters of context before the match, right-justified to exactly
    ``prefix_len`` characters so the matches line up. ``suffix`` is up to
    ``suffix_len`` characters of context after the match.

    ``res`` is an iterable of dicts with the fields ``key``, ``context``,
    ``start`` and ``end``. The file is overwritten if it exists.
    """
    # Explicit UTF-8: the contexts can contain non-ASCII characters, and the
    # platform default encoding may be unable to encode them.
    with open(filename, 'w', encoding='utf-8') as f:
        for r in res:
            context, start, end = r['context'], r['start'], r['end']
            prefix = context[max(0, start - prefix_len):start].rjust(prefix_len)
            suffix = context[end:end + suffix_len]
            text = context[start:end]
            f.write(r['key'] + ' - ' + prefix + '|||' + text + '|||' + suffix + '\n')

In [None]:
def str_to_int_hash(text):
    """
    Map ``text`` to a deterministic non-negative integer.

    The value is the MD5 digest of the UTF-8 encoded text, read as a
    big-endian integer. It is the same in every Python process.
    """
    digest = hashlib.md5(text.encode('utf8')).digest()
    return int.from_bytes(digest, 'big')


def run_analysis(pattern, pattern_name, random_sample=True):
    """
    Run ``pattern`` over every year in ``years`` and save the results.

    For each year the following files are written to ``../data/patterns/``
    (the directory is created if missing):

    - ``{pattern_name}_{year}.csv``: all match records as a table.
    - ``{pattern_name}_{year}.txt``: all matches with 100 characters of
      context on each side, for manual inspection.
    - ``{pattern_name}_{year}_random_sample.txt``: the same view for a random
      draw of at most 100 matches. Written only when ``random_sample`` is true.

    The random draw is seeded from the year and an MD5-based hash of
    ``pattern_name``, so repeated runs select the same sample.
    """
    os.makedirs('../data/patterns/', exist_ok=True)
    for year in years:
        res = run_pattern(pattern, year)
        df = pd.DataFrame(res)
        df.to_csv(f'../data/patterns/{pattern_name}_{year}.csv', index=False)
        save_txt_inspection(res, f'../data/patterns/{pattern_name}_{year}.txt', prefix_len=100, suffix_len=100)

        # Honour the flag: callers passing random_sample=False get no sample file.
        if not random_sample:
            continue

        random.seed(year + str_to_int_hash(pattern_name))
        random_draw = random.sample(res, min(100, len(res)))
        save_txt_inspection(
            random_draw,
            f'../data/patterns/{pattern_name}_{year}_random_sample.txt',
            prefix_len=100,
            suffix_len=100
        )

# Run sensitivity analysis

In [None]:
def get_and_or_pattern(max_distance):
    """
    Compile a regex matching "and" and "or" close together, in either order.

    The two words must be separated by 1 to ``max_distance`` whitespace or
    word characters, so punctuation between them prevents a match.
    """
    gap = rf'[\s\w]{{1,{max_distance}}}'
    return re.compile(rf'\band\b{gap}\bor\b|\bor\b{gap}\band\b')

# Sweep the maximum and/or distance from 10 to 90 in steps of 10.
for max_distance in range(10, 100, 10):
    run_analysis(
        pattern=get_and_or_pattern(max_distance),
        pattern_name=f'and_or_pattern_{max_distance}',
        random_sample=False,
    )
    print('Completed', max_distance)

# "and", "or" in same text fragment

In [None]:
# "and" and "or", in either order, separated by 1-50 whitespace/word characters.
and_or_pattern = re.compile(r'\band\b[\s\w]{1,50}\bor\b|\bor\b[\s\w]{1,50}\band\b')
# Try the pattern on one sample (text, key) item; the cell shows the match records.
get_regex_matches(
    item=(
        'The circumstances and conditions under which the list or manifest.',
        '080_1994_001657',
    ),
    pattern=and_or_pattern,
)

In [None]:
# Run the per-year analysis for the and/or pattern and write its output files.
run_analysis(pattern=and_or_pattern, pattern_name='and_or_pattern')

# "or", "or" in same text fragment

In [None]:
# Two occurrences of "or" separated by 1-50 whitespace/word characters.
or_or_pattern = re.compile(r'\bor\b[\s\w]{1,50}\bor\b')
# Try the pattern on one sample (text, key) item; the cell shows the match records.
get_regex_matches(
    item=(
        'The circumstances or conditions under which the list or manifest.',
        '080_1994_001657',
    ),
    pattern=or_or_pattern,
)

In [None]:
# Run the per-year analysis for the or/or pattern and write its output files.
run_analysis(pattern=or_or_pattern, pattern_name='or_or_pattern')

# "and", "and" in same text fragment

In [None]:
# Two occurrences of "and" separated by 1-50 whitespace/word characters.
and_and_pattern = re.compile(r'\band\b[\s\w]{1,50}\band\b')
# Try the pattern on one sample (text, key) item; the cell shows the match records.
get_regex_matches(
    item=(
        'The circumstances and conditions under which the list and manifest.',
        '080_1994_001657',
    ),
    pattern=and_and_pattern,
)

In [None]:
# Run the per-year analysis for the and/and pattern and write its output files.
run_analysis(pattern=and_and_pattern, pattern_name='and_and_pattern')

# "and/or"

In [None]:
# Literal "and/or", allowing one optional whitespace character on each side of the slash.
and_slash_or_pattern = re.compile(r'\band\s?/\s?or\b')
# Try the pattern on one sample (text, key) item; the cell shows the match records.
get_regex_matches(
    item=(
        'The circumstances and/or conditions under which the list or manifest.',
        '080_1994_001657',
    ),
    pattern=and_slash_or_pattern,
)

In [None]:
# Run the per-year analysis for the "and/or" pattern and write its output files.
run_analysis(pattern=and_slash_or_pattern, pattern_name='and_slash_or_pattern')

# "or both"

In [None]:
# "or", then as few as possible of 1-50 arbitrary characters, then " or both".
or_both_pattern = re.compile(r'\bor\b.{1,50}?\sor\sboth\b')
# Try the pattern on one sample (text, key) item; the cell shows the match records.
get_regex_matches(
    item=(
        'Conditions under which the list or manifest, or both.',
        '080_1994_001657',
    ),
    pattern=or_both_pattern,
)

In [None]:
# Run the per-year analysis for the "or both" pattern and write its output files.
run_analysis(pattern=or_both_pattern, pattern_name='or_both_pattern')

# No ... {and,or}

In [None]:
# "no" followed, after 1-50 arbitrary characters (greedy), by "and" or "or".
no_and_or_pattern = re.compile(r'\bno\b.{1,50}\b(and|or)\b')
# Try the pattern on one sample (text, key) item; the cell shows the match records.
get_regex_matches(
    item=(
        'The no circumstances and conditions under which the list or manifest.',
        '080_1994_001657',
    ),
    pattern=no_and_or_pattern,
)

In [None]:
# Run the per-year analysis for the "no ... and/or" pattern and write its output files.
run_analysis(pattern=no_and_or_pattern, pattern_name='no_and_or_pattern')

# Not ... {and,or}

In [None]:
# "not" followed, after 1-50 arbitrary characters (greedy), by "and" or "or".
not_and_or_pattern = re.compile(r'\bnot\b.{1,50}\b(and|or)\b')
# Try the pattern on one sample (text, key) item; the cell shows the match records.
get_regex_matches(
    item=(
        'The not circumstances and conditions under which the list or manifest.',
        '080_1994_001657',
    ),
    pattern=not_and_or_pattern,
)

In [None]:
# Run the per-year analysis for the "not ... and/or" pattern and write its output files.
run_analysis(pattern=not_and_or_pattern, pattern_name='not_and_or_pattern')

# {or,and} ... unless

In [None]:
# "or" or "and" followed, after 3-50 arbitrary characters, by "unless".
unless_pattern = re.compile(r'\b(or|and)\b.{3,50}\bunless\b')
# Try the pattern on one sample (text, key) item; the cell shows the match records.
get_regex_matches(
    item=(
        'The not circumstances and conditions unless it is less than',
        '080_1994_001657',
    ),
    pattern=unless_pattern,
)

In [None]:
# Run the per-year analysis for the "unless" pattern and write its output files.
run_analysis(pattern=unless_pattern, pattern_name='unless_pattern')

# {or,and} ... except

In [None]:
# "or" or "and" followed, after 3-50 arbitrary characters, by "except".
except_pattern = re.compile(r'\b(or|and)\b.{3,50}\bexcept\b')
# Try the pattern on one sample (text, key) item; the cell shows the match records.
get_regex_matches(
    item=(
        'The not circumstances and conditions except it is less than',
        '080_1994_001657',
    ),
    pattern=except_pattern,
)

In [None]:
# Run the per-year analysis for the "except" pattern and write its output files.
run_analysis(pattern=except_pattern, pattern_name='except_pattern')

# {or,and} ... but not

In [None]:
# "or" or "and" followed, after 3-50 arbitrary characters, by "but not".
but_not_pattern = re.compile(r'\b(or|and)\b.{3,50}\bbut\snot\b')
# Try the pattern on one sample (text, key) item; the cell shows the match records.
get_regex_matches(
    item=(
        'The not circumstances or conditions but not it is less than',
        '080_1994_001657',
    ),
    pattern=but_not_pattern,
)

In [None]:
# Run the per-year analysis for the "but not" pattern and write its output files.
run_analysis(pattern=but_not_pattern, pattern_name='but_not_pattern')

# {or,and} ... notwithstanding

In [None]:
# "or" or "and" followed, after 3-50 arbitrary characters, by "notwithstanding".
notwithstanding_pattern = re.compile(r'\b(or|and)\b.{3,50}\bnotwithstanding\b')
# Try the pattern on one sample (text, key) item; the cell shows the match records.
get_regex_matches(
    item=(
        'The not circumstances or conditions notwithstanding it is less than',
        '080_1994_001657',
    ),
    pattern=notwithstanding_pattern,
)

In [None]:
# Run the per-year analysis for the "notwithstanding" pattern and write its output files.
run_analysis(pattern=notwithstanding_pattern, pattern_name='notwithstanding_pattern')