In [317]:
import os

from regex import regex
from collections import namedtuple

In [237]:
data_dir = 'data'

In [238]:
def read_file(path):
    with open(path, 'r') as f:
        lines = f.readlines()
        content = ''.join(lines)
        return content

In [448]:
def resolve_title(bill_content):
    try:
        title_pattern = r'U\s*S\s*T\s*A\s*W\s*A[\s\d\p{L}\p{P}]+?(?=Rozdział|Art)'
        title = regex.search(title_pattern, bill_content).group(0)
        cleansed = regex.sub('\s+', ' ', title)
        cleansed = regex.sub('U\s*S\s*T\s*A\s*W\s*A', 'Ustawa', cleansed).strip(' ')
        return cleansed
    except:
        return None

In [449]:
class BillFile:
    def __init__(self, file_name, content):
        self.file_name = file_name
        self.content = content
        self.title = resolve_title(content)

def read_files():
    files = {}
    for file_name in os.listdir(data_dir):
        path = os.path.join(data_dir, file_name)
        content = read_file(path)
        files[file_name] = BillFile(file_name, content)
    return files

In [450]:
class Bill:
    def __init__(self, position, number, year):
        self.position = position
        self.number = number
        self.year = year
    
    def __repr__(self):
        return '{0}-{1}-{2}'.format(self.position, self.number, self.year)

In [451]:
files = read_files()

from_1993 = { k: v for k, v in files.items() if k.startswith('1993') }

## 1. External references
---

In [469]:
def journal_matches(text):
    """
    Matches the part: Dz. U. Nr 183 (...)
    
    In journal content like: 
    
    Art. 2.
    W ustawie z dnia
    29 lipca 2005 r. o obrocie instrumentami finansowymi (Dz. U. Nr 183, poz. 1538,
    z późn. zm.[3]))
    w art. 70 w ust. 2 pkt 1 otrzymuje brzmienie:
    „1) art. 69

    """
    
    in_paren_pattern = r'\(Dz.\s?U.[^\)]*\)'
    in_quotes_pattern = r'"Dz.\s?U.[^\)]*"' 
    
    in_paren = list(regex.finditer(in_paren_pattern, text))
    in_quotes = list(regex.finditer(in_quotes_pattern, text))
    
    return in_paren + in_quotes

In [468]:
def foot_note_matches(text):
    """
     Matches the part: Dz. U. z 2004 r. (...)
     
     In foot notes like:
     
    [4]) Zmiany tekstu jednolitego wymienionej ustawy zostały
    ogłoszone w Dz. U. z 2004 r. Nr 273, poz. 2703, z 2005 r. Nr 155,
    poz. 1297 i Nr 172, poz. 1440, z 2006 r. Nr 12, poz. 61, z 2007 r. Nr 23, poz.
    136 i Nr 99, poz. 666, z 2008 r. Nr 218, poz. 1391 oraz z 2009 r. Nr 3, poz.
    11, Nr 19, poz. 101, Nr 86, poz. 720, Nr 105, poz. 877, Nr 115, poz. 966,
    Nr 143, poz. 1164 i Nr 157, poz. 1241. 
    """
    
    foot_note_pattern = r'\[\d*\]([\s\p{L}\p{P}]*(?=Dz\.\s*U\.))(Dz\.\s*U\.\s*(z\s*\d{4}\s*r\.\s*|Nr\s*\d*[,\.]\s*|poz\.\s*\d*[,\.]?\s*|i\s*\d*,?\s*|oraz\s*)*)'
    return [match.group(2) for match in regex.finditer(foot_note_pattern, text)]

In [471]:
def match_bills(register_match):
    """
    It takes something like "Dz. U z 2004 r. (...)" as an input 
    matches positions and numbers to years and returns bill references.
    """
    
    year_matches = list(regex.finditer(r'z\s(\d{4})\sr.\s(Nr (\d*)[,\s]*poz\.\s(\d*)[,i\s]*)*', register_match))
    
    # TODO: handle hanging
    
    bills = []
    for year_match in year_matches:
        year = year_match.group(1)
        bill_matches = regex.findall(r'Nr\s(\d*),?\spoz\.?\s(\d*)', year_match.group(0))
        for bill_match in bill_matches:
            position = bill_match[1]
            number = bill_match[0]
            bills.append(Bill(position, number, year))
    return bills

In [472]:
def search_for_bills(text): 
    bills = []
    for journal_match in journal_matches(text):
        journal_text = journal_match.group(0)
        cleansed = regex.sub('\s+', ' ', journal_text).strip('(")')
        bills += match_bills(cleansed)
    return bills

In [473]:
search_for_bills(test)

[253-60-1991,
 320-73-1991,
 442-100-1991,
 85-21-1992,
 279-60-1993,
 598-129-1993,
 416-90-1993]

In [476]:
# for name, bill in from_1993.items():
#     print(name)
#     for bill in search_for_bills(bill.content):
#         print('     ' + str(bill))
        

## 2. Internal references
---

## 3. Count occurrences of "ustawa"
---

In [245]:
def match_bill(files):
    results = {}
    
    flexes = ['ustawa', 'ustawy', 'ustawy', 'ustaw', 'ustawie', 
              'ustawom',  'ustawę', 'ustawy', 'ustawą', 'ustawami',
              'ustawie', 'ustawach', 'ustawo', 'ustawy'] 
    
    alternatives = '|'.join(flexes)

    pattern = r'(?i)\b' + alternatives + '\b'
    
    for file_name, content in files.items():
        result = regex.findall((pattern), content)
        results[file_name] = len(result)
        
    return results

In [246]:
def summary(results):
    least_n, least_v = None, None
    most_n, most_v = None, None
    total = 0
    for file_name, result in results.items():
        if least_v is None or least_v > result: 
            least_n, least_v = file_name, result
        if most_v is None or most_v < result:
            most_n, most_v = file_name, result
        total += result
        
    print('---------- SUMMARY ----------')
    print('least: {0:15} {1}'.format(least_n, least_v))
    print('most:  {0:15} {1}'.format(most_n, most_v))
    print('total: {0:21}'.format(total))
    print('-----------------------------')

In [247]:
results = match_bill(files)
summary(results)

---------- SUMMARY ----------
least: 1996_400.txt    0
most:  2000_696.txt    304
total:                 25940
-----------------------------


In [453]:
# weirdness: Dz.U. z 1996 r. Nr 19, poz. 87  (USTAWA BUDŻETOWA)

In [340]:
example = "Dz.U. Nr 75, poz. 445, z 1991 r. Nr 60, poz. 253, Nr 73, poz. 320 i Nr 100, poz. 442, z 1992 r. Nr 21, poz. 85 oraz z 1993 r. Nr 60, poz. 279 i Nr 129, poz. 598"

# 445-75-... - hanging
# 253-60-1991
# 320-73-1991
# 442-100-1991
# 85-21-1992
# 279-60-1993
# 598-128-1993