## Assignment 1 - Regular Expressions

Łukasz Kaźmierczak

---

In [266]:
import os
import pandas as pd 

from regex import regex
from collections import Counter

In [5]:
data_dir = 'data'

In [6]:
def read_file(path):
    with open(path, 'r') as f:
        lines = f.readlines()
        content = ''.join(lines)
        return content

In [25]:
def resolve_title(bill_content):
    """
    Extract the title from the beginning of a document.
    """
    
    try:
        title_pattern = r'U\s*S\s*T\s*A\s*W\s*A[\s\d\p{L}\p{P}]+?(?=Rozdział|Art)'
        title = regex.search(title_pattern, bill_content).group(0)
        cleansed = regex.sub('\s+', ' ', title)
        cleansed = regex.sub('U\s*S\s*T\s*A\s*W\s*A', 'Ustawa', cleansed).strip(' ')
        return cleansed
    except:
        return None

In [8]:
class BillFile:
    def __init__(self, file_name, content):
        self.file_name = file_name
        self.content = content
        self.title = resolve_title(content)

def read_files():
    files = {}
    for file_name in os.listdir(data_dir):
        path = os.path.join(data_dir, file_name)
        content = read_file(path)
        files[file_name] = BillFile(file_name, content)
    return files

In [273]:
class BillReference:
    def __init__(self, position, number, year, title=None):
        self.position = position
        self.number = number
        self.year = year
        self.title = title
        
    def pretty(self):
        title = '' if self.title is None else ', title: {0}'.format(self.title)
        clean_title = regex.sub('\s+', ' ', title).strip(' ').strip('-')
        return 'pos: {0}, nr: {1}, year: {2}{3}'.format(self.position, self.number, self.year, clean_title)
    
    def __repr__(self):
        return '{0}-{1}-{2}'.format(self.position, self.number, self.year)
    
    def __eq__(self, other):
        return self.position == other.position and self.year == other.year
    
    def __hash__(self):
        return self.__repr__().__hash__()

## 1. External references
---

Find all external references to bills, e.g. ustawie z dnia 4 marca 1994 r. o zakładowym funduszu świadczeń socjalnych (Dz. U. z 2012 r. poz. 592). The result should be aggregated by bill ID (year and position) and sorted by descending number of reference counts. The reference format should include:
- the title of the regulation (if present)
- the year of the regulation
- the number of the Journal of Laws of the Republic of Poland (Dziennik Ustaw) - if applicable
- the position of the regulation


---

In [234]:
def positions(text):
    """
    Given something like:
        z roku 2016 r. Nr 5, poz. 3 i 4, Nr 7 poz. 2
        
    Return:
        [(3, 5),
         (4, 5),
         (2, 7)]
    """
    
    nr_parts_pattern = r'Nr\s*(\d*)\,(\s*[^N]*)'
    nr_parts_matches = list(regex.finditer(nr_parts_pattern, text))
    
    results = []
    
    for nr_match in nr_parts_matches:
        nr = nr_match.group(1)
        rest = nr_match.group(2)
        
        position_pattern = r'\d+'
        positions = regex.findall(position_pattern, rest)
        
        for pos in positions:
            results.append((pos, nr))
            
    return results

In [169]:
def references_in_year_groups(text):
    references = []
    
    year_group_pattern = r'z\s(\d{4})\sr.\s((Nr\s*(\d*),\s*poz\.\s*(\d*)[,\si\d]*)*)'
    year_group_matches = list(regex.finditer(year_group_pattern, text))

    for year_group_match in year_group_matches:
        year = year_group_match.group(1)
        rest = year_group_match.group(2)
        
        for pos, nr in positions(rest):
            references.append(BillReference(pos, nr, year))
    return references

In [233]:
def handle_b(year, title, text):
    """
    Given something like:
      year = 2017
      title = o zmianie czegośtam
      text = Dz. U. Nr 5, pos 13 i 14
      
    Return:
      [BillReference(13, 5, 2017, 'o zmianie ustawy'),
       BillReference(14, 5, 2017, 'o zmianie ustawy')]
    """
    references = []
    
    for pos, nr in positions(text):
        references.append(BillReference(pos, nr, year, title))
        
    return references

In [217]:
def external_references(year, title, rest_match):
    """
    rest_match   - something like "Dz. U. z 2004 r. (...)"
    year & title - year and title that were match befor the rest_match 
                   like in:  (...) 29 lipca 2017 r. o zmianie ustawy (Dz. U. (...))
                   in which case year = 2017, title = "o zmianie ustawy", rest_match = "Dz. U. (...)"
    
    there are two cases:
    a) full reference
       when rest_match start's with a year: Dz. U. z 2016 r. poz. 1510 i 2074
       in which case we take "2016" as a year
    b) partial reference
       when rest_match start's with a Nr:  Dz. U. Nr 183, poz. 1538 
       in which case the year was somewhere before
    """

    references = []
    
    is_b = len(list(regex.finditer(r'Dz\.\s*U\.\s*z\s*\d{4}', rest_match))) == 0
    
    if is_b:
        year_matches = list(regex.finditer(r'z\s*\d{4}\s*r\.', rest_match))
        if len(year_matches) == 0:
            references += handle_b(year, title, rest_match)
        else:
            start = year_matches[0].start()
            references += handle_b(year, title, rest_match[:start])
            references += references_in_year_groups(rest_match[start:])
    else:
        references += references_in_year_groups(rest_match)
    
    return references

In [235]:
def external_journal_matches(text):
    """    
    Given someting like:    
        Art. 2.
        W ustawie z dnia
        29 lipca 2005 r. o obrocie instrumentami finansowymi (Dz. U. Nr 183, poz. 1538,
        z późn. zm.[3]))
        w art. 70 w ust. 2 pkt 1 otrzymuje brzmienie:
        „1) art. 69
    
    It matches:
    - "2005" as a year
    - "o obrocie instrumentami finansowymi" as a title
    - (Dz. U (...)) - as something that will be processed later
    """
    
    references = []
    
    pattern = r'(?<=(\d{4})\sr\.)([\p{L}\p{P}\s]*)(\(Dz.\s?U.[^\)]*\))'
    matches = list(regex.finditer(pattern, text))
     
    for match in matches:
        year = match.group(1)
        title = match.group(2).strip(' ')
        rest = match.group(3).strip('()')
        
        references += external_references(year, title, rest)
        
    return references 

In [173]:
def external_footnote_matches(text):
    """
     Matches the part: Dz. U. z 2004 r. (...)
     
     In foot notes like:
     
    [4]) Zmiany tekstu jednolitego wymienionej ustawy zostały
    ogłoszone w Dz. U. z 2004 r. Nr 273, poz. 2703, z 2005 r. Nr 155,
    poz. 1297 i Nr 172, poz. 1440, z 2006 r. Nr 12, poz. 61, z 2007 r. Nr 23, poz.
    136 i Nr 99, poz. 666, z 2008 r. Nr 218, poz. 1391 oraz z 2009 r. Nr 3, poz.
    11, Nr 19, poz. 101, Nr 86, poz. 720, Nr 105, poz. 877, Nr 115, poz. 966,
    Nr 143, poz. 1164 i Nr 157, poz. 1241.
    """
    
    references = []
    
    footnote_pattern = r'\[\d*\]([\s\p{L}\p{P}]*(?=Dz\.\s*U\.))(Dz\.\s*U\.\s*(z\s*\d{4}\s*r\.\s*|Nr\s*\d*[,\.]\s*|poz\.\s*\d*[,\.]?\s*|i\s*\d*,?\s*|oraz\s*)*)'
    footnote_matches = regex.finditer(footnote_pattern, text)

    for match in footnote_matches:
        references += references_in_year_groups(match.group(2))
    
    return references

In [252]:
test = '''
Dz. U. z 1993 r. Nr 90, poz. 416 i Nr 134, poz.
  646, z 1994 r. Nr 43, poz. 163, Nr 90, poz. 419, Nr 113, poz. 547,
  Nr 123, poz. 602 i Nr 126, poz. 626, z 1995 r. Nr 5, poz. 25 i Nr
  133, poz. 654, z 1996 r. Nr 25, poz. 113, Nr 87, poz. 395, Nr 137,
  poz. 638, Nr 147, poz. 686 i Nr 156, poz. 776, z 1997 r. Nr 28, poz.
  153, Nr 30, poz. 164, Nr 71, poz. 449, Nr 85, poz. 538, Nr 96, poz.
  592, Nr 121, poz. 770, Nr 123, poz. 776, Nr 137, poz. 926, Nr 139,
  poz. 932-934 i Nr 141, poz. 943 i 945 oraz z 1998 r. Nr 66, poz.
  430, Nr 74, poz. 471, Nr 108, poz. 685, Nr 117, poz. 756 i Nr 137,
  poz. 887
'''

references_in_year_groups(test)

[416-90-1993,
 646-134-1993,
 163-43-1994,
 419-90-1994,
 547-113-1994,
 602-123-1994,
 626-126-1994,
 25-5-1995,
 654-133-1995,
 113-25-1996,
 395-87-1996,
 638-137-1996,
 686-147-1996,
 776-156-1996,
 153-28-1997,
 164-30-1997,
 449-71-1997,
 538-85-1997,
 592-96-1997,
 770-121-1997,
 776-123-1997,
 926-137-1997,
 932-139-1997,
 430-66-1998,
 471-74-1998,
 685-108-1998,
 756-117-1998,
 887-137-1998]

In [248]:
def all_external_references(bill_content): 
    return external_journal_matches(bill_content) + external_footnote_matches(bill_content)

In [175]:
def print_external_references_in(bill):
    """
    For a given bill summarize the references to other bills
    """
    references = all_external_references(bill.content)
    reference_count = sorted(list(Counter(references).items()), key=lambda x: x[1], reverse=True)
    
    print('{0} - {1}...'.format(bill.file_name, bill.title[:30]))
    for ref, count in reference_count:
        print('\t{0:15}: {1}'.format(str(ref), count))

In [263]:
def title_map(references):
    ref_map = {}
    for ref in references:
        if ref_map.get(ref) is None and ref.title is not None:
            ref_map[ref] = ref.title
    return ref_map

In [230]:
print_external_references_in(files['2004_962.txt'])

2004_962.txt - Ustawa z dnia 20 listopada 200...
	2571-256-2004  : 2
	2703-273-2004  : 2
	1297-155-2005  : 2
	1440-172-2005  : 2
	61-12-2006     : 2
	136-23-2007    : 2
	666-99-2007    : 2
	1391-218-2008  : 2
	11-3-2009      : 2
	101-19-2009    : 2
	720-86-2009    : 2
	877-105-2009   : 2
	966-115-2009   : 2
	1164-143-2009  : 2
	1241-157-2009  : 2
	1538-183-2005  : 1
	3-183-2005     : 1
	708-104-2006   : 1
	1119-157-2006  : 1
	1056-171-2008  : 1
	69-13-2009     : 1
	341-42-2009    : 1
	649-77-2009    : 1
	659-78-2009    : 1
	1316-165-2009  : 1
	1317-166-2009  : 1
	1323-168-2009  : 1


In [271]:
def summary(bills):
    references = []
    for bill in bills:
        references += all_external_references(bill.content)
    reference_count = sorted(list(Counter(references).items()), key=lambda x: x[1], reverse=True)
    
    t_map = title_map(references)
    results = []
    for ref, count in reference_count:
        results.append([count, ref.position, ref.number, ref.year, t_map.get(ref)])
        
    return pd.DataFrame(results, columns=['Count', 'Position', 'Number', 'Year', 'Title'])

### Results
---

In [272]:
summary(files.values())

Unnamed: 0,Count,Position,Number,Year,Title
0,677,668,106,1998,o zmianie niektórych ustaw określających\nkomp...
1,459,496,106,1996,o Służbie Więziennej
2,427,770,121,1997,- Kodeks celny
3,352,136,12,2000,o zmianie niektórych ustaw związanych z\nfunkc...
4,275,554,88,1997,- Przepisy\n wprowadzające Kodeks karny
5,263,153,28,1997,o powszechnym\n ubezpieczeniu zdrowotnym
6,238,198,34,1990,o podziale zadań i kompetencji określonych \nw...
7,222,1118,162,1998,o systemie ubezpieczeń społecznych
8,202,1268,120,2000,
9,191,1126,162,1998,o systemie ubezpieczeń społecznych


## 2. Internal references
---
Find all internal references to regulations, e.g. art. 5 ust. 2, art. 5 ust. 7, etc. The result should exclude the internal numbering of the bill (e.g. Art. 1. W ustawie ...). The result should be aggregated by regulation ID (as described below) and sorted by descending number of reference counts inside particular bill. The bills should be sorted by descending number of internal references. The reference format should include all elements necessary to identify the regulation, e.g.:
- art. 1, ust. 2 - if an article inside the regulation is referenced,
- ust. 2 - if a paragraph inside the same article is referenced,
- etc.

---

In [105]:
class ArticleReference:
    def __init__(self, paragraph, article, same_article):
        self.article = article
        self.paragraph = paragraph
        self.same_article = same_article

### Different corner cases

From 2004_964.txt:

ust. 1-3  
w art. 343 ust. 2, 3 i 5  
art. 185 ust. 1a 

In [36]:
test = files['2004_962.txt']

In [45]:
def search_for_internal(bill):
    articles_pattern = r'Art\.\s*\d*\.\s*[\p{L}\p{P}\d\s]*?(?=Art|\Z|\n+)'
    articles_matches = list(regex.finditer(articles_pattern, bill.content))
    
    for article in articles_matches:
        article_pattern =  r'art\.\s*(\d*)\s*ust\.\s*(\d*\w?(\s*(i|,|oraz)*\s*\d*)*)'
    
    return articles_matches

In [43]:
len(search_for_internal(test))

3

In [46]:
# (?<=art\.)\s*(\d*)[\s\p{L}\p{P}]*(ust\.\s*(\d)*)

## 3. Count occurrences of "ustawa"
---

Count all occurrences of the word ustawa in all inflected forms (ustawa, ustawie, ustawę, etc.), and all spelling forms (ustawa, Ustawa, USTAWA), excluding other words with the same prefix (e.g. ustawić).

---

In [21]:
def match_in_bills(bills):
    results = {}
    
    flexes = ['ustawa', 'ustawy', 'ustawy', 'ustaw', 'ustawie', 
              'ustawom',  'ustawę', 'ustawy', 'ustawą', 'ustawami',
              'ustawie', 'ustawach', 'ustawo', 'ustawy'] 
    
    alternatives = '|'.join(flexes)

    pattern = r'(?i)\b' + alternatives + '\b'
    
    for file_name, bill in bills.items():
        result = regex.findall(pattern, bill.content)
        results[file_name] = len(result)
        
    return results

In [22]:
def summary(results):
    least_n, least_v = None, None
    most_n, most_v = None, None
    total = 0
    for file_name, result in results.items():
        if least_v is None or least_v > result: 
            least_n, least_v = file_name, result
        if most_v is None or most_v < result:
            most_n, most_v = file_name, result
        total += result
        
    print('---------- SUMMARY ----------')
    print('least: {0:15} {1}'.format(least_n, least_v))
    print('most:  {0:15} {1}'.format(most_n, most_v))
    print('total: {0:21}'.format(total))
    print('-----------------------------')

In [23]:
results = match_in_bills(files)
summary(results)

---------- SUMMARY ----------
least: 1996_400.txt    0
most:  2000_696.txt    304
total:                 25940
-----------------------------
