## Assignment 1 - Regular Expressions

Łukasz Kaźmierczak

---

In [4]:
import os

from regex import regex
from collections import Counter

In [5]:
data_dir = 'data'

In [6]:
def read_file(path):
    with open(path, 'r') as f:
        lines = f.readlines()
        content = ''.join(lines)
        return content

In [25]:
def resolve_title(bill_content):
    """
    Extract the title from the beginning of a document.
    """
    
    try:
        title_pattern = r'U\s*S\s*T\s*A\s*W\s*A[\s\d\p{L}\p{P}]+?(?=Rozdział|Art)'
        title = regex.search(title_pattern, bill_content).group(0)
        cleansed = regex.sub('\s+', ' ', title)
        cleansed = regex.sub('U\s*S\s*T\s*A\s*W\s*A', 'Ustawa', cleansed).strip(' ')
        return cleansed
    except:
        return None

In [8]:
class BillFile:
    def __init__(self, file_name, content):
        self.file_name = file_name
        self.content = content
        self.title = resolve_title(content)

def read_files():
    files = {}
    for file_name in os.listdir(data_dir):
        path = os.path.join(data_dir, file_name)
        content = read_file(path)
        files[file_name] = BillFile(file_name, content)
    return files

In [240]:
class BillReference:
    def __init__(self, position, number, year, title=None):
        self.position = position
        self.number = number
        self.year = year
        self.title = title
        
    def pretty(self):
        title = '' if self.title is None else ', title: {0}'.format(self.title)
        clean_title = regex.sub('\s+', ' ', title).strip(' ').strip('-')
        return 'pos: {0}, nr: {1}, year: {2}{3}'.format(self.position, self.number, self.year, clean_title)
    
    def __repr__(self):
        return '{0}-{1}-{2}'.format(self.position, self.number, self.year)
    
    def __eq__(self, other):
        return self.position == other.position and self.year == other.year
    
    def __hash__(self):
        return self.__repr__().__hash__()

## 1. External references
---

Find all external references to bills, e.g. ustawie z dnia 4 marca 1994 r. o zakładowym funduszu świadczeń socjalnych (Dz. U. z 2012 r. poz. 592). The result should be aggregated by bill ID (year and position) and sorted by descending number of reference counts. The reference format should include:
- the title of the regulation (if present)
- the year of the regulation
- the number of the Journal of Laws of the Republic of Poland (Dziennik Ustaw) - if applicable
- the position of the regulation


---

In [234]:
def positions(text):
    """
    Given something like:
        z roku 2016 r. Nr 5, poz. 3 i 4, Nr 7 poz. 2
        
    Return:
        [(3, 5),
         (4, 5),
         (2, 7)]
    """
    
    nr_parts_pattern = r'Nr\s*(\d*)\,(\s*[^N]*)'
    nr_parts_matches = list(regex.finditer(nr_parts_pattern, text))
    
    results = []
    
    for nr_match in nr_parts_matches:
        nr = nr_match.group(1)
        rest = nr_match.group(2)
        
        position_pattern = r'\d+'
        positions = regex.findall(position_pattern, rest)
        
        for pos in positions:
            results.append((pos, nr))
            
    return results

In [169]:
def references_in_year_groups(text):
    references = []
    
    year_group_pattern = r'z\s(\d{4})\sr.\s((Nr\s*(\d*),\s*poz\.\s*(\d*)[,\si\d]*)*)'
    year_group_matches = list(regex.finditer(year_group_pattern, text))

    for year_group_match in year_group_matches:
        year = year_group_match.group(1)
        rest = year_group_match.group(2)
        
        for pos, nr in positions(rest):
            references.append(BillReference(pos, nr, year))
    return references

In [233]:
def handle_b(year, title, text):
    """
    Given something like:
      year = 2017
      title = o zmianie czegośtam
      text = Dz. U. Nr 5, pos 13 i 14
      
    Return:
      [BillReference(13, 5, 2017, 'o zmianie ustawy'),
       BillReference(14, 5, 2017, 'o zmianie ustawy')]
    """
    references = []
    
    for pos, nr in positions(text):
        references.append(BillReference(pos, nr, year, title))
        
    return references

In [217]:
def external_references(year, title, rest_match):
    """
    rest_match   - something like "Dz. U. z 2004 r. (...)"
    year & title - year and title that were match befor the rest_match 
                   like in:  (...) 29 lipca 2017 r. o zmianie ustawy (Dz. U. (...))
                   in which case year = 2017, title = "o zmianie ustawy", rest_match = "Dz. U. (...)"
    
    there are two cases:
    a) full reference
       when rest_match start's with a year: Dz. U. z 2016 r. poz. 1510 i 2074
       in which case we take "2016" as a year
    b) partial reference
       when rest_match start's with a Nr:  Dz. U. Nr 183, poz. 1538 
       in which case the year was somewhere before
    """

    references = []
    
    is_b = len(list(regex.finditer(r'Dz\.\s*U\.\s*z\s*\d{4}', rest_match))) == 0
    
    if is_b:
        year_matches = list(regex.finditer(r'z\s*\d{4}\s*r\.', rest_match))
        if len(year_matches) == 0:
            references += handle_b(year, title, rest_match)
        else:
            start = year_matches[0].start()
            references += handle_b(year, title, rest_match[:start])
            references += references_in_year_groups(rest_match[start:])
    else:
        references += references_in_year_groups(rest_match)
    
    return references

In [235]:
def external_journal_matches(text):
    """    
    Given someting like:    
        Art. 2.
        W ustawie z dnia
        29 lipca 2005 r. o obrocie instrumentami finansowymi (Dz. U. Nr 183, poz. 1538,
        z późn. zm.[3]))
        w art. 70 w ust. 2 pkt 1 otrzymuje brzmienie:
        „1) art. 69
    
    It matches:
    - "2005" as a year
    - "o obrocie instrumentami finansowymi" as a title
    - (Dz. U (...)) - as something that will be processed later
    """
    
    references = []
    
    pattern = r'(?<=(\d{4})\sr\.)([\p{L}\p{P}\s]*)(\(Dz.\s?U.[^\)]*\))'
    matches = list(regex.finditer(pattern, text))
     
    for match in matches:
        year = match.group(1)
        title = match.group(2).strip(' ')
        rest = match.group(3).strip('()')
        
        references += external_references(year, title, rest)
        
    return references 

In [173]:
def external_footnote_matches(text):
    """
     Matches the part: Dz. U. z 2004 r. (...)
     
     In foot notes like:
     
    [4]) Zmiany tekstu jednolitego wymienionej ustawy zostały
    ogłoszone w Dz. U. z 2004 r. Nr 273, poz. 2703, z 2005 r. Nr 155,
    poz. 1297 i Nr 172, poz. 1440, z 2006 r. Nr 12, poz. 61, z 2007 r. Nr 23, poz.
    136 i Nr 99, poz. 666, z 2008 r. Nr 218, poz. 1391 oraz z 2009 r. Nr 3, poz.
    11, Nr 19, poz. 101, Nr 86, poz. 720, Nr 105, poz. 877, Nr 115, poz. 966,
    Nr 143, poz. 1164 i Nr 157, poz. 1241.
    """
    
    references = []
    
    footnote_pattern = r'\[\d*\]([\s\p{L}\p{P}]*(?=Dz\.\s*U\.))(Dz\.\s*U\.\s*(z\s*\d{4}\s*r\.\s*|Nr\s*\d*[,\.]\s*|poz\.\s*\d*[,\.]?\s*|i\s*\d*,?\s*|oraz\s*)*)'
    footnote_matches = regex.finditer(footnote_pattern, text)

    for match in footnote_matches:
        references += references_in_year_groups(match.group(2))
    
    return references

In [174]:
def all_external_references(bill_content): 
    return external_journal_matches(bill_content) + external_footnote_matches(bill_content)

In [175]:
def print_external_references_in(bill):
    """
    For a given bill summarize the references to other bills
    """
    references = all_external_references(bill.content)
    reference_count = sorted(list(Counter(references).items()), key=lambda x: x[1], reverse=True)
    
    print('{0} - {1}...'.format(bill.file_name, bill.title[:30]))
    for ref, count in reference_count:
        print('\t{0:15}: {1}'.format(str(ref), count))

In [230]:
print_external_references_in(files['2004_962.txt'])

2004_962.txt - Ustawa z dnia 20 listopada 200...
	2571-256-2004  : 2
	2703-273-2004  : 2
	1297-155-2005  : 2
	1440-172-2005  : 2
	61-12-2006     : 2
	136-23-2007    : 2
	666-99-2007    : 2
	1391-218-2008  : 2
	11-3-2009      : 2
	101-19-2009    : 2
	720-86-2009    : 2
	877-105-2009   : 2
	966-115-2009   : 2
	1164-143-2009  : 2
	1241-157-2009  : 2
	1538-183-2005  : 1
	3-183-2005     : 1
	708-104-2006   : 1
	1119-157-2006  : 1
	1056-171-2008  : 1
	69-13-2009     : 1
	341-42-2009    : 1
	649-77-2009    : 1
	659-78-2009    : 1
	1316-165-2009  : 1
	1317-166-2009  : 1
	1323-168-2009  : 1


In [238]:
def summary(bills):
    references = []
    for bill in bills:
        references += all_external_references(bill.content)
    reference_count = sorted(list(Counter(references).items()), key=lambda x: x[1], reverse=True)
    for ref, count in reference_count:
        print('{0:5} | {1}'.format(count, ref.pretty()))


### Results
---

In [241]:
summary(files.values())

  677 | pos: 668, nr: 106, year: 1998
  459 | pos: 496, nr: 106, year: 1996
  427 | pos: 770, nr: 121, year: 1997
  352 | pos: 136, nr: 12, year: 2000
  275 | pos: 554, nr: 88, year: 1997
  263 | pos: 153, nr: 28, year: 1997
  238 | pos: 198, nr: 34, year: 1990
  222 | pos: 1118, nr: 162, year: 1998, title: o systemie ubezpieczeń społecznych
  202 | pos: 1268, nr: 120, year: 2000
  191 | pos: 1126, nr: 162, year: 1998, title: o systemie ubezpieczeń społecznych
  180 | pos: 943, nr: 141, year: 1997
  161 | pos: 926, nr: 137, year: 1997
  159 | pos: 254, nr: 54, year: 1992
  152 | pos: 708, nr: 104, year: 2006
  147 | pos: 550, nr: 48, year: 2000
  140 | pos: 110, nr: 24, year: 1996
  138 | pos: 769, nr: 121, year: 1997
  134 | pos: 509, nr: 105, year: 1994
  133 | pos: 756, nr: 117, year: 1998
  132 | pos: 17, nr: 4, year: 1995
  130 | pos: 192, nr: 35, year: 1989
  129 | pos: 1255, nr: 110, year: 1999
  129 | pos: 676, nr: 74, year: 2002
  128 | pos: 1178, nr: 101, year: 1999
  128 | p

    2 | pos: 474, nr: 75, year: 2010
    2 | pos: 804, nr: 119, year: 2010
    2 | pos: 279, nr: 50, year: 1945, title: o własności i użytkowaniu gruntów na obszarze m.st. Warszawy
    2 | pos: 1, nr: 1, year: 2000
    2 | pos: 881, nr: 147, year: 2011
    2 | pos: 1281, nr: 217, year: 2011
    2 | pos: 371, nr: 83, year: 1991
    2 | pos: 874, nr: 125, year: 2007
    2 | pos: 1044, nr: 92, year: 1999
    2 | pos: 291, nr: 50, year: 2008
    2 | pos: 1186, nr: 168, year: 2007
    2 | pos: 189, nr: 34, year: 2010
    2 | pos: 676, nr: 117, year: 2011
    2 | pos: 66, nr: 11, year: 2010
    2 | pos: 1447, nr: 220, year: 2010
    2 | pos: 938, nr: 112, year: 2005
    2 | pos: 626, nr: 108, year: 2011
    2 | pos: 434, nr: 53, year: 2009
    2 | pos: 531, nr: 92, year: 2011
    2 | pos: 1111, nr: 187, year: 2011
    2 | pos: 1045, nr: 127, year: 2009
    2 | pos: 388, nr: 72, year: 2011
    2 | pos: 715, nr: 126, year: 2011
    2 | pos: 984, nr: 165, year: 2011
    2 | pos: 619, nr: 72, ye

    1 | pos: 7, nr: 3, year: 1946, title: o przejęciu na własność Państwa podstawowych gałęzi gospodarki narodowej
    1 | pos: 270, nr: 58, year: 1956
    1 | pos: 154, nr: 54, year: 1992
    1 | pos: 41, nr: 5, year: 2001
    1 | pos: 2020, nr: 238, year: 2002
    1 | pos: 1192, nr: 110, year: 2001
    1 | pos: 1310, nr: 122, year: 2000, title: o objęciu poręczeniami Skarbu Państwa spłaty niektórych kredytów mieszkaniowych
    1 | pos: 1196, nr: 111, year: 2001, title: o restrukturyzacji hutnictwa żelaza i stali
    1 | pos: 495, nr: 56, year: 2003
    1 | pos: 8, nr: 110, year: 1999, title: o kształtowaniu wynagrodzeń w państwowej sferze budżetowej oraz o zmianie niektórych ustaw
    1 | pos: 218, nr: 170, year: 2006, title: o służbie cywilnej
    1 | pos: 431, nr: 64, year: 2007
    1 | pos: 2, nr: 68, year: 2004, title: – Prawo celne
    1 | pos: 1965, nr: 231, year: 2005
    1 | pos: 4, nr: 29, year: 1989, title: o stosunku Państwa do Kościoła Katolickiego w Rzeczypospolitej Pols

## 2. Internal references
---
Find all internal references to regulations, e.g. art. 5 ust. 2, art. 5 ust. 7, etc. The result should exclude the internal numbering of the bill (e.g. Art. 1. W ustawie ...). The result should be aggregated by regulation ID (as described below) and sorted by descending number of reference counts inside particular bill. The bills should be sorted by descending number of internal references. The reference format should include all elements necessary to identify the regulation, e.g.:
- art. 1, ust. 2 - if an article inside the regulation is referenced,
- ust. 2 - if a paragraph inside the same article is referenced,
- etc.

---

In [105]:
class ArticleReference:
    def __init__(self, paragraph, article, same_article):
        self.article = article
        self.paragraph = paragraph
        self.same_article = same_article

### Different corner cases

From 2004_964.txt:

ust. 1-3  
w art. 343 ust. 2, 3 i 5  
art. 185 ust. 1a 

In [36]:
test = files['2004_962.txt']

In [45]:
def search_for_internal(bill):
    articles_pattern = r'Art\.\s*\d*\.\s*[\p{L}\p{P}\d\s]*?(?=Art|\Z|\n+)'
    articles_matches = list(regex.finditer(articles_pattern, bill.content))
    
    for article in articles_matches:
        article_pattern =  r'art\.\s*(\d*)\s*ust\.\s*(\d*\w?(\s*(i|,|oraz)*\s*\d*)*)'
    
    return articles_matches

In [43]:
len(search_for_internal(test))

3

In [46]:
# (?<=art\.)\s*(\d*)[\s\p{L}\p{P}]*(ust\.\s*(\d)*)

## 3. Count occurrences of "ustawa"
---

Count all occurrences of the word ustawa in all inflected forms (ustawa, ustawie, ustawę, etc.), and all spelling forms (ustawa, Ustawa, USTAWA), excluding other words with the same prefix (e.g. ustawić).

---

In [21]:
def match_in_bills(bills):
    results = {}
    
    flexes = ['ustawa', 'ustawy', 'ustawy', 'ustaw', 'ustawie', 
              'ustawom',  'ustawę', 'ustawy', 'ustawą', 'ustawami',
              'ustawie', 'ustawach', 'ustawo', 'ustawy'] 
    
    alternatives = '|'.join(flexes)

    pattern = r'(?i)\b' + alternatives + '\b'
    
    for file_name, bill in bills.items():
        result = regex.findall(pattern, bill.content)
        results[file_name] = len(result)
        
    return results

In [22]:
def summary(results):
    least_n, least_v = None, None
    most_n, most_v = None, None
    total = 0
    for file_name, result in results.items():
        if least_v is None or least_v > result: 
            least_n, least_v = file_name, result
        if most_v is None or most_v < result:
            most_n, most_v = file_name, result
        total += result
        
    print('---------- SUMMARY ----------')
    print('least: {0:15} {1}'.format(least_n, least_v))
    print('most:  {0:15} {1}'.format(most_n, most_v))
    print('total: {0:21}'.format(total))
    print('-----------------------------')

In [23]:
results = match_in_bills(files)
summary(results)

---------- SUMMARY ----------
least: 1996_400.txt    0
most:  2000_696.txt    304
total:                 25940
-----------------------------
