In [1]:
import utils
import requests

text_by_file = utils.load_files('./ustawy/*')

for file,text in text_by_file.items():
    text_by_file[file] = ' '.join(text.split())

1. Read the classification of [Named Entities](https://clarin-pl.eu/dspace/bitstream/handle/11321/294/WytyczneKPWr-jednostkiidentyfikacyjne.pdf).
1. Read the [API of NER](https://wiki.clarin-pl.eu/pl/nlpws/services/liner2) in [Clarin](https://ws.clarin-pl.eu/ner.shtml).
1. Read the [documentation of CCL format](https://wiki.clarin-pl.eu/pl/nlpws/services/ccl) or [more tourough documentation of CCL format](http://nlp.pwr.wroc.pl/redmine/projects/corpus2/wiki/CCL_format).
1. Sort bills according to their size and take top 50 (largest) bills.

In [4]:
text_sorted = {k: v for k, v in sorted(text_by_file.items(), key=lambda item: len(item[1]), reverse=True)[:50]}
text_sorted.keys()

dict_keys(['./ustawy/2000_696.txt', './ustawy/2001_627.txt', './ustawy/1996_465.txt', './ustawy/2002_1689.txt', './ustawy/1997_555.txt', './ustawy/1998_1118.txt', './ustawy/2000_1186.txt', './ustawy/2003_2277.txt', './ustawy/2003_1750.txt', './ustawy/2001_1070.txt', './ustawy/2001_499.txt', './ustawy/1997_117.txt', './ustawy/2004_2065.txt', './ustawy/2001_1368.txt', './ustawy/2004_1693.txt', './ustawy/2001_1229.txt', './ustawy/2000_1268.txt', './ustawy/1997_926.txt', './ustawy/1994_195.txt', './ustawy/2001_1545.txt', './ustawy/2004_880.txt', './ustawy/2004_177.txt', './ustawy/2003_423.txt', './ustawy/2000_1104.txt', './ustawy/1997_714.txt', './ustawy/1997_553.txt', './ustawy/1997_349.txt', './ustawy/1994_591.txt', './ustawy/2004_2533.txt', './ustawy/2001_1381.txt', './ustawy/1999_930.txt', './ustawy/2000_1315.txt', './ustawy/2000_136.txt', './ustawy/1999_95.txt', './ustawy/1996_460.txt', './ustawy/2003_2256.txt', './ustawy/1997_557.txt', './ustawy/1996_110.txt', './ustawy/1996_561.txt'

In [69]:
import shutil

for file in text_sorted.keys():
    shutil.copy(file, 'ustawy_top_50')


5. Use the lemmatized and sentence split documents (from ex. 5) to identify the expressions that consist of consecutive
   words starting with a capital letter (you will have to look at the inflected form of the word to check its
   capitalization) that do not occupy the first position in a sentence. E.g. the sentence:
   ```
   Wczoraj w Krakowie miało miejsce spotkanie prezydentów Polski i Stanów Zjednoczonych.
   ```
   should yield the following entries: `Kraków`, `Polska`, `Stan Zjednoczony`.

In [12]:
lem_files = ['./ustawy_lem/' + '%'.join(f.split('/')[1:]) for f in text_sorted.keys()]

In [29]:
import xml.etree.ElementTree as ET
import glob

def parse_xml(path: str) -> list[str]:
    tree = ET.parse(path)
    root = tree.getroot()

    sentences = []
    
    for chunk in root.iter('chunk'):
        sentences.append(list(chunk.iter('sentence')))
    
    return sentences

def parse_files(paths: list[str]) -> dict[str, str]:
    files = []

    for file in paths:
        files.append(parse_xml(file))

    return files

files_xml = parse_files(lem_files)

In [66]:
def find_ne_in_sentence(sentence):
    nes = []
    ne = ''
    toks = list(sentence.iter('tok'))
    for tok in toks[1:]:
        orth = tok.find('orth').text
        base = tok.find('lex').find('base').text
        if (orth[0].isupper() or (ne != '' and orth == '.')) and orth not in ['Dz','U','Art','Poz']: # Dz.U. is technically a NE
            ne += base + ' '
        elif ne != '':
            nes.append(ne)
            ne = ''
    return nes


In [67]:
nes = [ne for chunk in files_xml for sentences in chunk for sentence in sentences for ne in find_ne_in_sentence(sentence)]
nes[:10]

['minister ',
 'nr ',
 'minister sprawa wewnętrzny ',
 'minister ',
 'minister sprawa wewnętrzny ',
 'minister ',
 'prawa ',
 'nr ',
 'minister sprawa wewnętrzny ',
 'administracja ']

6. Compute the frequency of each identified expression and print 50 results with the largest number of occurrences.

In [68]:
from collections import Counter

nes_freq = Counter(nes)

a = sorted(nes_freq.items(), key=lambda item: item[1], reverse=True)[:50]
print(a)

[('nr ', 4778), ('rzeczpospolita polski ', 587), ('kodeks ', 522), ('policja ', 384), ('skarb państwo ', 320), ('prawo ', 298), ('unia europejski ', 291), ('kasa chora ', 261), ('straż graniczny ', 258), ('minister ', 255), ('prezes urząd ', 242), ('zmiana ', 205), ('państwowy straż pożarny ', 201), ('zakład ', 196), ('pozostały ', 179), ('rad minister ', 170), ('fundusz ', 165), ('sprawiedliwość ', 163), ('państwowy komisja wyborczy ', 161), ('rad ', 157), ('obrona narodowy ', 156), ('w ', 152), ('urząd patentowy ', 152), ('EFTA ', 151), ('minister sprawiedliwość ', 144), ('europejski porozumienie ', 141), ('Wolny handel ', 140), ('tkanina ', 138), ('minister obrona narodowy ', 136), ('europejski obszar gospodarczy ', 128), ('finanse ', 115), ('SKW ', 115), ('i ', 113), ('urząd ', 110), ('przepis ', 102), ('komisja ', 101), ('biuro ', 98), ('inspektor nadzór wewnętrzny ', 93), ('damski ', 91), ('SWW ', 88), ('ordynacja ', 86), ('opieka społeczny ', 84), ('urząd ochrona państwo ', 81),

'rzeczpospolita polski' i 'Rzeczpospolita Polska' - z jakiegoś powodu w zależności od przypadku 'Polski' we frazie jest ona przez Clarin traktowana inaczej: 

'Rzeczpospolitej Polskiej' -> 'rzeczpospolita polski'

'Rzeczpospolita Polska', 'Rzeczpospolitą Polską' -> 'Rzeczpospolita Polska'

przykład [`ustawy%2004_963.txt`](./ustawy_lem/ustawy%252004_963.txt)

7. Apply the NER algorithm to identify the named entities in the same set of documents (not lemmatized) using the `n82` model.

In [None]:
from lpmn_client import download_file, upload_file
from lpmn_client import Task

# not working, use web page instead
task = Task(lpmn='any2txt|wcrft2|liner2({"model":"n82"})')
task.email = "pawel.kopel2@gmail.com"  # change e-mail

file_id = upload_file("./ustawy_top_50.zip")
output_file_id = task.run(file_id)
download_file(output_file_id, "./ustawy_top_50_nes")

8. Plot the frequency (histogram) of the coarse-grained classes (e.g. `nam_adj`, `nam_eve`, `nam_fac`).

9. Display 10 most frequent Named Entities for each coarse-grained type.

10. Display 50 most frequent Named Entities including their count and fine-grained type.

11. Display 5 sentences containing at least 2 recognized named entities with different types. Highlight the recognized spans with color.
   (For demo application [Streamlit](https://streamlit.io/) might be useful for displaying NER results).

12. Answer the following questions:
      1. Which of the method (counting expressions with capital letters vs. NER) worked better for the task concerned with
         identification of the proper names?
      1. What are the drawbacks of the method based on capital letters?
      1. What are the drawbacks of the method based on NER?
      1. Which of the coarse-grained NER groups has the best and which has the worst results? Try to justify this
         observation.
      1. Do you think NER is sufficient for identifying different occurrences of the same entity (i.e. consider "USA" and
         "Stany Zjednoczone" and "Stany Zjednoczone Ameryki Północnej")? If not, can you suggest an algorithm or a tool that
         would be able to group such names together?
      1. Can you think of a real world problem that would benefit the most from application of Named Entity Recognition
         algorithm?