## Assignment 1 - Regular Expressions

Łukasz Kaźmierczak

---

In [1]:
import os
import pandas as pd 

from regex import regex
from collections import Counter

In [2]:
data_dir = 'data'

In [3]:
def read_file(path):
    with open(path, 'r') as f:
        lines = f.readlines()
        content = ''.join(lines)
        return content

In [4]:
def resolve_title(bill_content):
    """
    Extract the title from the beginning of a document.
    """
    
    try:
        title_pattern = r'U\s*S\s*T\s*A\s*W\s*A[\s\d\p{L}\p{P}]+?(?=Rozdział|Art)'
        title = regex.search(title_pattern, bill_content).group(0)
        cleansed = regex.sub('\s+', ' ', title)
        cleansed = regex.sub('U\s*S\s*T\s*A\s*W\s*A', 'Ustawa', cleansed).strip(' ')
        return cleansed
    except:
        return None

In [5]:
class BillFile:
    def __init__(self, file_name, content):
        self.file_name = file_name
        self.content = content
        self.title = resolve_title(content)

def read_files():
    files = {}
    for file_name in os.listdir(data_dir):
        path = os.path.join(data_dir, file_name)
        content = read_file(path)
        files[file_name] = BillFile(file_name, content)
    return files

files = read_files()

In [6]:
class BillReference:
    def __init__(self, position, number, year, title=None):
        self.position = position
        self.number = number
        self.year = year
        self.title = title
        
    def pretty(self):
        title = '' if self.title is None else ', title: {0}'.format(self.title)
        clean_title = regex.sub('\s+', ' ', title).strip(' ').strip('-')
        return 'pos: {0}, nr: {1}, year: {2}{3}'.format(self.position, self.number, self.year, clean_title)
    
    def __repr__(self):
        return '{0}-{1}-{2}'.format(self.position, self.number, self.year)
    
    def __eq__(self, other):
        return self.position == other.position and self.year == other.year
    
    def __hash__(self):
        return self.__repr__().__hash__()

## 1. External references
---

Find all external references to bills, e.g. ustawie z dnia 4 marca 1994 r. o zakładowym funduszu świadczeń socjalnych (Dz. U. z 2012 r. poz. 592). The result should be aggregated by bill ID (year and position) and sorted by descending number of reference counts. The reference format should include:
- the title of the regulation (if present)
- the year of the regulation
- the number of the Journal of Laws of the Republic of Poland (Dziennik Ustaw) - if applicable
- the position of the regulation


---

In [178]:
def positions(text):
    """
    Given something like:
        z roku 2016 r. Nr 5, poz. 3 i 4, Nr 7 poz. 2
        
    Return:
        [(3, 5),
         (4, 5),
         (2, 7)]
    """
    
    nr_parts_pattern = r'Nr\s*(\d*)\,(\s*[^N]*)'
    nr_parts_matches = list(regex.finditer(nr_parts_pattern, text))
    
    results = []
    
    for nr_match in nr_parts_matches:
        nr = nr_match.group(1)
        rest = nr_match.group(2)
        
        position_pattern = r'\s+(\d+)(?!-)'
        position_matches = list(regex.finditer(position_pattern, rest))
        
        position_range_pattern = r'(\d+)\-(\d+)'
        position_range_matches = regex.finditer(position_range_pattern, rest)
        
        for position_match in position_matches:
            results.append((position_match.group(1), nr))
            
        for position_range_match in position_range_matches:
            start = int(position_range_match.group(1))
            end = int(position_range_match.group(2))
            
            for pos in range(start, end+1):
                results.append((pos, nr))
            
    return results

In [179]:
def references_in_year_groups(text):
    references = []
    
    year_group_pattern = r'z\s*(\d{4})\s*r.\s*((Nr\s*(\d*),\s*poz\.\s*(\d*-?)[,\si\d(?!r\.)]*)*)'
    year_group_matches = list(regex.finditer(year_group_pattern, text))

    for year_group_match in year_group_matches:
        year = year_group_match.group(1)
        rest = year_group_match.group(2)
        
        for pos, nr in positions(rest):
            references.append(BillReference(pos, nr, year))
    return references

In [180]:
def handle_b(year, title, text):
    """
    Given something like:
      year = 2017
      title = o zmianie czegośtam
      text = Dz. U. Nr 5, pos 13 i 14
      
    Return:
      [BillReference(13, 5, 2017, 'o zmianie ustawy'),
       BillReference(14, 5, 2017, 'o zmianie ustawy')]
    """
    references = []
    
    for pos, nr in positions(text):
        references.append(BillReference(pos, nr, year, title))
        
    return references

In [181]:
def external_references(year, title, rest_match):
    """
    rest_match   - something like "Dz. U. z 2004 r. (...)"
    year & title - year and title that were match befor the rest_match 
                   like in:  (...) 29 lipca 2017 r. o zmianie ustawy (Dz. U. (...))
                   in which case year = 2017, title = "o zmianie ustawy", rest_match = "Dz. U. (...)"
    
    there are two cases:
    a) full reference
       when rest_match start's with a year: Dz. U. z 2016 r. poz. 1510 i 2074
       in which case we take "2016" as a year
    b) partial reference
       when rest_match start's with a Nr:  Dz. U. Nr 183, poz. 1538 
       in which case the year was somewhere before
    """

    references = []
    
    is_b = len(list(regex.finditer(r'Dz\.\s*U\.\s*z\s*\d{4}', rest_match))) == 0
    
    if is_b:
        year_matches = list(regex.finditer(r'z\s*\d{4}\s*r\.', rest_match))
        if len(year_matches) == 0:
            references += handle_b(year, title, rest_match)
        else:
            start = year_matches[0].start()
            references += handle_b(year, title, rest_match[:start])
            references += references_in_year_groups(rest_match[start:])
    else:
        references += references_in_year_groups(rest_match)
    
    return references

In [182]:
def external_journal_matches(text):
    """    
    Given someting like:    
        Art. 2.
        W ustawie z dnia
        29 lipca 2005 r. o obrocie instrumentami finansowymi (Dz. U. Nr 183, poz. 1538,
        z późn. zm.[3]))
        w art. 70 w ust. 2 pkt 1 otrzymuje brzmienie:
        „1) art. 69
    
    It matches:
    - "2005" as a year
    - "o obrocie instrumentami finansowymi" as a title
    - (Dz. U (...)) - as something that will be processed later
    """
    
    references = []
    
    pattern = r'(?<=(\d{4})\sr\.)([\p{L}\p{P}\s]*)(\(Dz.\s?U.[^\)]*\))'
    matches = list(regex.finditer(pattern, text))
     
    for match in matches:
        year = match.group(1)
        title = match.group(2).strip(' ')
        rest = match.group(3).strip('()')
        
        references += external_references(year, title, rest)
        
    return references 

In [183]:
def external_footnote_matches(text):
    """
     Matches the part: Dz. U. z 2004 r. (...)
     
     In foot notes like:
     
    [4]) Zmiany tekstu jednolitego wymienionej ustawy zostały
    ogłoszone w Dz. U. z 2004 r. Nr 273, poz. 2703, z 2005 r. Nr 155,
    poz. 1297 i Nr 172, poz. 1440, z 2006 r. Nr 12, poz. 61, z 2007 r. Nr 23, poz.
    136 i Nr 99, poz. 666, z 2008 r. Nr 218, poz. 1391 oraz z 2009 r. Nr 3, poz.
    11, Nr 19, poz. 101, Nr 86, poz. 720, Nr 105, poz. 877, Nr 115, poz. 966,
    Nr 143, poz. 1164 i Nr 157, poz. 1241.
    """
    
    references = []
    
    footnote_pattern = r'\[\d*\]([\s\p{L}\p{P}]*(?=Dz\.\s*U\.))(Dz\.\s*U\.\s*(z\s*\d{4}\s*r\.\s*|Nr\s*\d*[,\.]\s*|poz\.\s*\d*[,\.]?\s*|i\s*\d*,?\s*|oraz\s*)*)'
    footnote_matches = regex.finditer(footnote_pattern, text)

    for match in footnote_matches:
        references += references_in_year_groups(match.group(2))
    
    return references

In [184]:
def all_external_references(bill_content): 
    return external_journal_matches(bill_content) + external_footnote_matches(bill_content)

In [185]:
def print_external_references_in(bill):
    """
    For a given bill summarize the references to other bills
    """
    references = all_external_references(bill.content)
    reference_count = sorted(list(Counter(references).items()), key=lambda x: x[1], reverse=True)
    
    print('{0} - {1}...'.format(bill.file_name, bill.title[:50]))
    for ref, count in reference_count:
        print('\t{0:15}: {1}'.format(str(ref), count))

In [186]:
print_external_references_in(files['2001_44.txt'])

2001_44.txt - Ustawa z dnia 21 grudnia 2000 r. o jakości handlow...
	489-43-2000    : 2
	293-23-2000    : 2
	584-124-1996   : 2
	783-124-1997   : 1
	928-82-1999    : 1
	136-12-2000    : 1
	550-48-2000    : 1
	718-62-2000    : 1
	816-70-2000    : 1
	852-73-2000    : 1


In [187]:
def title_map(references):
    ref_map = {}
    for ref in references:
        if ref_map.get(ref) is None and ref.title is not None:
            clean_title = regex.sub('\s+', ' ', ref.title).strip('- ')
            ref_map[ref] = clean_title
    return ref_map

In [188]:
def summary(bills):
    references = []
    for bill in bills:
        references += all_external_references(bill.content)
    reference_count = sorted(list(Counter(references).items()), key=lambda x: x[1], reverse=True)
    
    t_map = title_map(references)
    results = []
    for ref, count in reference_count:
        results.append([count, ref.position, ref.number, ref.year, t_map.get(ref)])
        
    return pd.DataFrame(results, columns=['Count', 'Position', 'Number', 'Year', 'Title'])

### Results
---

In [189]:
summary(files.values())

Unnamed: 0,Count,Position,Number,Year,Title
0,732,668,106,1998,o zmianie niektórych ustaw określających kompe...
1,506,496,106,1996,o Służbie Więziennej
2,454,770,121,1997,Kodeks celny
3,361,136,12,2000,o zmianie niektórych ustaw związanych z funkcj...
4,297,554,88,1997,Przepisy wprowadzające Kodeks karny
5,285,153,28,1997,o powszechnym ubezpieczeniu zdrowotnym
6,248,198,34,1990,o podziale zadań i kompetencji określonych w u...
7,237,1118,162,1998,o systemie ubezpieczeń społecznych
8,226,1268,120,2000,
9,198,1126,162,1998,o systemie ubezpieczeń społecznych


## 2. Internal references
---
Find all internal references to regulations, e.g. art. 5 ust. 2, art. 5 ust. 7, etc. The result should exclude the internal numbering of the bill (e.g. Art. 1. W ustawie ...). The result should be aggregated by regulation ID (as described below) and sorted by descending number of reference counts inside particular bill. The bills should be sorted by descending number of internal references. The reference format should include all elements necessary to identify the regulation, e.g.:
- art. 1, ust. 2 - if an article inside the regulation is referenced,
- ust. 2 - if a paragraph inside the same article is referenced,
- etc.

---

In [190]:
class ArticleReference:
    def __init__(self, paragraph, article, same_article):
        self.article = article
        self.paragraph = paragraph
        self.same_article = same_article
        
    def __eq__(self, other):
        return self.paragraph == other.paragraph and self.article == other.article
    
    def __hash__(self):
        return self.__repr__().__hash__()
        
    def __repr__(self):
        return '{0}-{1}'.format(self.paragraph, self.article)

In [231]:
def paragraphs(text, article, same_article):
    """
    Given something like "art. 343 ust 2, 3 i 4" or "art. 5 ust 2-4"
    Return:
        [ArticleRef(2, 343),
         ArticleRef(3, 343),
         ArticleRef(4, 343)]
    """
    references = []
    
    paragraph_pattern = r'(?<!-)\d+\w?\w?(?!-)'
    paragraph_matches = regex.finditer(paragraph_pattern, text)

    paragraph_range_pattern = r'(\d+)\-(\d+)'
    paragraph_range_matches = regex.finditer(paragraph_range_pattern, text)

    for paragraph_match in paragraph_matches:
        paragraph = paragraph_match.group(0)
        references.append(ArticleReference(paragraph, article, same_article))

    for paragraph_range_match in paragraph_range_matches:
        start = int(paragraph_range_match.group(1))
        end = int(paragraph_range_match.group(2))

        for paragraph in range(start, end+1):
            references.append(ArticleReference(paragraph, article, same_article))
            
    return references

In [226]:
def search_for_internal(bill):
    references = []
    
    # split the text into paragraphs
    articles_pattern = r'Art\.\s*(\d+)\.\s*[\p{L}\p{P}\d\s]*?(?=Art|\Z)'
    articles_matches = regex.finditer(articles_pattern, bill.content)
    
    for article_match in articles_matches:
        
        # match parts like: "art. 4 ust. 1-3", "art. 343 ust. 2, 3 i 5"...
        different_article_pattern = 'art\.\s*(\d+)\s*ust\.\s*(\d*\w?-?\w?(\s*(i|,|oraz)*\s*\d*)*\w)' 
        different_article_matches = regex.finditer(different_article_pattern, article_match.group(0))
        
        same_article_pattern = r'(?<!art\.)ust\.\s*(\d*\w?-?\w?(\s*(i|,|oraz)*\s*\d*)*\w?)'
        same_article_matches = regex.finditer(same_article_pattern, article_match.group(0))
        
        for different_article_match in different_article_matches:
            art_pattern = "art\.\s*(\d+)\s*([\s\p{P}\p{L}\d]*)"
            art_matches = regex.finditer(art_pattern, different_article_match.group(0))
            
            for art_match in art_matches:
                article = art_match.group(1)
                rest = art_match.group(2)
                
                references += paragraphs(rest, article, same_article = False)
            
        for same_article_match in same_article_matches:
            rest = same_article_match.group(0)
            references += paragraphs(rest, article_match.group(1), same_article = True)
            
    return references

In [227]:
class BillInternalSummary:
    def __init__(self, internal_total, count_per_ref):
        self.internal_total = internal_total
        self.count_per_ref = count_per_ref
         
    def __repr__(self):
        return '''
            total_references = '{0}'
            count_per_ref = 
                {1}
            '''.format(self.internal_total, self.count_per_ref)
    
class RefCount:
    def __init__(self, ref, total, same, diff):
        self.ref = ref
        self.total = total
        self.same = 0 if same is None else same
        self.diff = 0 if diff is None else diff
        
    def __repr__(self):
        return '{0} | {1} ({2}+{3})'.format(self.ref, self.total, self.same, self.diff)

def bill_summary(bill):
    references = search_for_internal(bill)
    same_article = [ref for ref in references if ref.same_article]
    diff_article = [ref for ref in references if not ref.same_article]
    internal_total = len(references)
    
    count_total = sorted(list(Counter(references).items()), key=lambda x: x[1], reverse=True)
    count_per_same = Counter(same_article)
    count_per_diff = Counter(diff_article)
    
    count_per_ref = []
    for ref, count in count_total:
        count_per_ref.append(RefCount(ref, count, count_per_same.get(ref), count_per_diff.get(ref)))
        
    return BillInternalSummary(internal_total, count_per_ref)

In [228]:
def all_internal_references(bills): 
    return [(bill, bill_summary(bill)) for bill in bills]

In [229]:
def print_internal_summary(bills):
    bills_summary = sorted(all_internal_references(bills), key=lambda x: x[1].internal_total, reverse=True)
    
    for bill, summary in bills_summary:
        header = '{0} - {1}...'.format(bill.file_name, '' if bill is None else bill.title[:50])
        print('{0:70} | total: {1}'.format(header, summary.internal_total))
        
        for ref_count in summary.count_per_ref:
            ref = ref_count.ref
            header = 'art. {0} ust. {1}'.format(ref.article, ref.paragraph)
            print('\t{0:15} | {1} ({2}+{3})'.format(header, ref_count.total, ref_count.same, ref_count.diff))

In [230]:
print_internal_summary(files.values())

2000_696.txt - Ustawa z dnia 24 lipca 1998 r. o zmianie niektóryc...   | total: 1905
	art. 36 ust. 1  | 26 (24+2)
	art. 36 ust. 2  | 22 (20+2)
	art. 23 ust. 1  | 21 (19+2)
	art. 127 ust. 1 | 20 (20+0)
	art. 56 ust. 1  | 17 (17+0)
	art. 36 ust. 3  | 16 (16+0)
	art. 9 ust. 1   | 15 (15+0)
	art. 23 ust. 2  | 15 (11+4)
	art. 127 ust. 2 | 15 (15+0)
	art. 86 ust. 1  | 14 (14+0)
	art. 126 ust. 1 | 14 (14+0)
	art. 131 ust. 1 | 14 (14+0)
	art. 137 ust. 1 | 14 (14+0)
	art. 8 ust. 1   | 13 (7+6)
	art. 131 ust. 3 | 13 (13+0)
	art. 22 ust. 2  | 12 (11+1)
	art. 17 ust. 1  | 12 (11+1)
	art. 6 ust. 1   | 12 (7+5)
	art. 13 ust. 3  | 12 (8+4)
	art. 9 ust. 2   | 12 (11+1)
	art. 74 ust. 1  | 12 (12+0)
	art. 86 ust. 2  | 12 (11+1)
	art. 126 ust. 2 | 12 (12+0)
	art. 4 ust. 1   | 11 (10+1)
	art. 10 ust. 2  | 11 (3+8)
	art. 45 ust. 1  | 11 (9+2)
	art. 14 ust. 1  | 11 (10+1)
	art. 56 ust. 4  | 11 (11+0)
	art. 23 ust. 3  | 10 (8+2)
	art. 17 ust. 3  | 10 (10+0)
	art. 12 ust. 1  | 10 (10+0)
	art. 74 ust. 2  | 10 

	art. 18 ust. 1  | 2 (2+0)
	art. 19 ust. 1  | 2 (2+0)
	art. 22 ust. 1  | 2 (2+0)
	art. 26 ust. 1  | 2 (2+0)
	art. 30 ust. 3  | 2 (2+0)
	art. 31 ust. 1  | 2 (2+0)
	art. 31 ust. 4  | 2 (2+0)
	art. 35 ust. 6  | 2 (0+2)
	art. 34 ust. 2  | 2 (0+2)
	art. 74 ust. 2  | 2 (1+1)
	art. 47 ust. 1  | 2 (1+1)
	art. 50 ust. 2  | 2 (2+0)
	art. 50 ust. 3  | 2 (2+0)
	art. 56 ust. 1  | 2 (2+0)
	art. 59 ust. 1  | 2 (2+0)
	art. 67 ust. 2  | 2 (2+0)
	art. 73 ust. 6  | 2 (1+1)
	art. 75 ust. 1  | 2 (2+0)
	art. 76 ust. 4  | 2 (2+0)
	art. 82 ust. 3  | 2 (1+1)
	art. 83 ust. 3  | 2 (2+0)
	art. 91 ust. 1  | 2 (2+0)
	art. 97 ust. 2  | 2 (2+0)
	art. 98 ust. 3  | 2 (2+0)
	art. 101 ust. 2 | 2 (2+0)
	art. 102 ust. 2 | 2 (2+0)
	art. 106 ust. 2 | 2 (2+0)
	art. 106 ust. 3 | 2 (1+1)
	art. 110 ust. 2 | 2 (1+1)
	art. 111 ust. 1 | 2 (1+1)
	art. 122 ust. 5 | 2 (1+1)
	art. 126 ust. 1 | 2 (2+0)
	art. 112 ust. 1 | 2 (0+2)
	art. 129 ust. 2 | 2 (2+0)
	art. 133 ust. 1 | 2 (0+2)
	art. 142 ust. 1 | 2 (1+1)
	art. 6 ust. 1   | 2 (0+2)
	

	art. 170 ust. 2 | 3 (3+0)
	art. 185 ust. 1 | 3 (3+0)
	art. 188 ust. 1 | 3 (3+0)
	art. 271 ust. 3 | 3 (1+2)
	art. 265 ust. 3 | 3 (3+0)
	art. 305 ust. 1 | 3 (1+2)
	art. 7 ust. 1   | 2 (1+1)
	art. 13 ust. 3  | 2 (2+0)
	art. 15 ust. 1  | 2 (2+0)
	art. 15 ust. 2  | 2 (2+0)
	art. 37 ust. 2  | 2 (1+1)
	art. 41 ust. 4  | 2 (2+0)
	art. 41 ust. 5  | 2 (2+0)
	art. 43 ust. 2  | 2 (2+0)
	art. 49 ust. 2  | 2 (1+1)
	art. 49 ust. 1  | 2 (1+1)
	art. 59 ust. 1  | 2 (2+0)
	art. 68 ust. 1  | 2 (2+0)
	art. 69 ust. 1  | 2 (2+0)
	art. 82 ust. 1  | 2 (1+1)
	art. 90 ust. 1  | 2 (1+1)
	art. 108 ust. 2 | 2 (2+0)
	art. 118 ust. 2 | 2 (2+0)
	art. 120 ust. 1 | 2 (1+1)
	art. 122 ust. 1 | 2 (2+0)
	art. 125 ust. 1 | 2 (2+0)
	art. 125 ust. 2 | 2 (2+0)
	art. 31 ust. 4  | 2 (0+2)
	art. 153 ust. 3 | 2 (2+0)
	art. 153 ust. 5 | 2 (2+0)
	art. 155 ust. 1 | 2 (2+0)
	art. 166 ust. 1 | 2 (2+0)
	art. 168 ust. 1 | 2 (2+0)
	art. 170 ust. 1 | 2 (2+0)
	art. 176 ust. 2 | 2 (2+0)
	art. 192 ust. 1 | 2 (1+1)
	art. 192 ust. 5 | 2 (1+1)
	

	art. 81 ust. 1  | 1 (1+0)
	art. 84 ust. 1  | 1 (1+0)
2004_177.txt - Ustawa z dnia 29 stycznia 2004 r. Prawo zamówień p...   | total: 388
	art. 122 ust. 1 | 9 (3+6)
	art. 3 ust. 1   | 8 (1+7)
	art. 36 ust. 1  | 6 (2+4)
	art. 227 ust. 3 | 6 (6+0)
	art. 67 ust. 1  | 5 (0+5)
	art. 100 ust. 1 | 5 (1+4)
	art. 179 ust. 2 | 5 (3+2)
	art. 227 ust. 4 | 5 (5+0)
	art. 40 ust. 3  | 4 (3+1)
	art. 67 ust. 2  | 4 (1+3)
	art. 136 ust. 1 | 4 (0+4)
	art. 121 ust. 1 | 4 (4+0)
	art. 129 ust. 1 | 4 (4+0)
	art. 130 ust. 1 | 4 (4+0)
	art. 227 ust. 2 | 4 (4+0)
	art. 23 ust. 1  | 3 (2+1)
	art. 24 ust. 1  | 3 (1+2)
	art. 52 ust. 2  | 3 (2+1)
	art. 62 ust. 1  | 3 (1+2)
	art. 102 ust. 1 | 3 (3+0)
	art. 127 ust. 1 | 3 (3+0)
	art. 134 ust. 1 | 3 (3+0)
	art. 167 ust. 3 | 3 (0+3)
	art. 137 ust. 3 | 3 (3+0)
	art. 138 ust. 2 | 3 (3+0)
	art. 142 ust. 2 | 3 (2+1)
	art. 146 ust. 1 | 3 (2+1)
	art. 28 ust. 4  | 3 (0+3)
	art. 226 ust. 3 | 3 (3+0)
	art. 227 ust. 1 | 3 (3+0)
	art. 5 ust. 1   | 2 (2+0)
	art. 11 ust. 6  | 2 (1+1

	art. 6 ust. 10  | 1 (1+0)
	art. 10 ust. 7  | 1 (1+0)
	art. 20 ust. 1  | 1 (0+1)
	art. 22 ust. 2  | 1 (0+1)
	art. 11 ust. 13 | 1 (1+0)
	art. 29 ust. 3  | 1 (0+1)
	art. 26 ust. 5  | 1 (1+0)
	art. 26 ust. 2a | 1 (1+0)
	art. 26 ust. 3  | 1 (1+0)
	art. 26 ust. 2  | 1 (1+0)
	art. 42 ust. 7  | 1 (0+1)
	art. 42 ust. 4a | 1 (0+1)
	art. 35 ust. 3  | 1 (0+1)
	art. 30 ust. 7  | 1 (1+0)
	art. 30 ust. 4a | 1 (1+0)
	art. 30 ust. 9  | 1 (1+0)
	art. 32 ust. 2  | 1 (1+0)
	art. 32 ust. 1  | 1 (1+0)
	art. 32 ust. 3  | 1 (1+0)
	art. 34 ust. 3  | 1 (1+0)
	art. 34 ust. 4  | 1 (1+0)
	art. 39 ust. 1  | 1 (0+1)
	art. 36 ust. 4  | 1 (1+0)
	art. 36 ust. 5a | 1 (1+0)
	art. 36 ust. 5b | 1 (1+0)
	art. 36 ust. 7a | 1 (1+0)
	art. 51 ust. 3  | 1 (0+1)
	art. 30 ust. 6  | 1 (0+1)
	art. 49 ust. 1a | 1 (1+0)
	art. 49 ust. 3a | 1 (1+0)
	art. 49 ust. 5  | 1 (1+0)
	art. 49 ust. 6  | 1 (1+0)
	art. 49 ust. 8  | 1 (1+0)
	art. 67 ust. 3  | 1 (0+1)
	art. 68 ust. 2  | 1 (0+1)
	art. 79 ust. 1  | 1 (0+1)
	art. 61 ust. 1a | 1 (1+0)
	

	art. 70 ust. 2b | 1 (1+0)
	art. 70 ust. 3b | 1 (1+0)
	art. 70 ust. 5b | 1 (1+0)
	art. 70 ust. 6  | 1 (1+0)
	art. 70 ust. 7  | 1 (1+0)
	art. 70 ust. 8  | 1 (1+0)
	art. 70 ust. 1a | 1 (1+0)
	art. 2 ust. 4   | 1 (1+0)
	art. 2 ust. 5   | 1 (1+0)
	art. 10 ust. 5  | 1 (0+1)
	art. 10 ust. 9  | 1 (0+1)
	art. 3 ust. 13  | 1 (1+0)
	art. 3 ust. 410 | 1 (1+0)
	art. 3 ust. 10  | 1 (1+0)
	art. 3 ust. 1b  | 1 (1+0)
	art. 3 ust. 6a  | 1 (1+0)
	art. 3 ust. 8a  | 1 (1+0)
	art. 3 ust. 9   | 1 (1+0)
	art. 3 ust. 10  | 1 (1+0)
	art. 3 ust. 11  | 1 (1+0)
	art. 3 ust. 7b  | 1 (1+0)
	art. 4 ust. 1   | 1 (1+0)
	art. 4 ust. 2   | 1 (1+0)
	art. 6 ust. 6   | 1 (1+0)
	art. 6 ust. 1a  | 1 (1+0)
	art. 8 ust. 3   | 1 (1+0)
	art. 8 ust. 4   | 1 (1+0)
	art. 8 ust. 2   | 1 (1+0)
	art. 10 ust. 1  | 1 (1+0)
	art. 97 ust. 2  | 1 (0+1)
	art. 12 ust. 2  | 1 (1+0)
	art. 13 ust. 2a | 1 (1+0)
	art. 14 ust. 1  | 1 (1+0)
	art. 15 ust. 1  | 1 (1+0)
	art. 15 ust. 2  | 1 (1+0)
	art. 17 ust. 4  | 1 (1+0)
	art. 18 ust. 3  | 1 (1+0)
	

	art. 4 ust. 3   | 1 (1+0)
	art. 26 ust. 4  | 1 (0+1)
	art. 32 ust. 1  | 1 (0+1)
	art. 25 ust. 2  | 1 (1+0)
	art. 25 ust. 4  | 1 (1+0)
	art. 25 ust. 1  | 1 (1+0)
	art. 40 ust. 1  | 1 (0+1)
	art. 36 ust. 4a | 1 (1+0)
	art. 42 ust. 2  | 1 (1+0)
	art. 5 ust. 1   | 1 (1+0)
	art. 17 ust. 2  | 1 (0+1)
	art. 6 ust. 4   | 1 (1+0)
	art. 6 ust. 5   | 1 (1+0)
	art. 6 ust. 2   | 1 (1+0)
	art. 21 ust. 8  | 1 (0+1)
	art. 19 ust. 3  | 1 (0+1)
	art. 14 ust. 1  | 1 (0+1)
	art. 14 ust. 2  | 1 (0+1)
	art. 14 ust. 3  | 1 (0+1)
	art. 14 ust. 4  | 1 (0+1)
	art. 14 ust. 5  | 1 (0+1)
	art. 14 ust. 6  | 1 (0+1)
	art. 14 ust. 7  | 1 (0+1)
	art. 29 ust. 1  | 1 (0+1)
	art. 7 ust. 4   | 1 (1+0)
	art. 7 ust. 1a  | 1 (1+0)
	art. 7 ust. 8   | 1 (1+0)
	art. 7 ust. 1   | 1 (1+0)
	art. 7 ust. 2   | 1 (1+0)
	art. 7 ust. 3   | 1 (1+0)
	art. 7 ust. 4   | 1 (1+0)
	art. 7 ust. 5   | 1 (1+0)
	art. 7 ust. 6   | 1 (1+0)
	art. 7 ust. 7   | 1 (1+0)
	art. 7 ust. 2b  | 1 (1+0)
	art. 110 ust. 1 | 1 (0+1)
	art. 110 ust. 3 | 1 (0+1)
	

TypeError: 'NoneType' object is not subscriptable

## 3. Count occurrences of "ustawa"
---

Count all occurrences of the word ustawa in all inflected forms (ustawa, ustawie, ustawę, etc.), and all spelling forms (ustawa, Ustawa, USTAWA), excluding other words with the same prefix (e.g. ustawić).

---

In [436]:
def match_in_bills(bills):
    results = {}
    
    flexes = ['ustawa', 'ustawy', 'ustaw', 'ustawie', 
              'ustawom',  'ustawę', 'ustawą', 'ustawami',
              'ustawach', 'ustawo'] 
    
    flexes_with_opt_spaces = ['\s*'.join(list(flex)) for flex in flexes]
    
    alternatives = '|'.join(flexes_with_opt_spaces)

    pattern = r'(?i)\b' + alternatives + '\b'
    
    for file_name, bill in bills.items():
        result = regex.findall(pattern, bill.content)
        results[file_name] = len(result)
        
    return results

In [437]:
def summary(results):
    least_n, least_v = None, None
    most_n, most_v = None, None
    total = 0
    for file_name, result in results.items():
        if least_v is None or least_v > result: 
            least_n, least_v = file_name, result
        if most_v is None or most_v < result:
            most_n, most_v = file_name, result
        total += result
        
    print('---------- SUMMARY ----------')
    print('least: {0:15} {1}'.format(least_n, least_v))
    print('most:  {0:15} {1}'.format(most_n, most_v))
    print('total: {0:21}'.format(total))
    print('-----------------------------')

In [438]:
results = match_in_bills(files)
summary(results)

---------- SUMMARY ----------
least: 1996_400.txt    0
most:  2000_696.txt    304
total:                 26138
-----------------------------
