#### 1. Use SpaCy tokenizer API to tokenize the text from the law corpus.

In [25]:
import glob
import pandas as pd
import regex
import spacy
from spacy.tokenizer import Tokenizer
from spacy.lang.pl import Polish
from spacy.tokens.doc import Doc as SpacyDoc
from spacy.tokens.span import Span as SpacySpan

nlp = spacy.load("pl_core_news_sm")
tokenizer = nlp.tokenizer

In [26]:
from typing import Dict

In [27]:
def load_bills(
    path,
    extension= "txt"
):
    paths = glob.glob(f"{path}/*.{extension}")
    for file_path in paths:
        with open(file_path) as f:
            yield (
                file_path.split("/")[-1],
                f.read(),
            )

bills = list(load_bills(path="/Users/mateusz/nlp/bills"))


def normalize(content: str):
    return regex.sub(r"\s+", " ", content.strip().lower())


for idx in range(len(bills)):
    filename, content = bills[idx]
    normalized_content = normalize(content)
    tokenized_content = tokenizer(normalized_content)
    bills[idx] = (filename, tokenized_content)

In [28]:
print(type(bills[0][1][6]))
print(type(bills[0][1][6].text))
print(type(bills[0][1]))

<class 'spacy.tokens.token.Token'>
<class 'str'>
<class 'spacy.tokens.doc.Doc'>


#### 2. Compute bigram counts of downcased tokens.

In [29]:
type(bills[0][1][1:3])

spacy.tokens.span.Span

In [30]:
def ngram_moving_window(n: int, tokenized_content: SpacyDoc) -> SpacySpan:
    ngrams = []
    for idx in range(len(tokenized_content) - n + 1):
        ngram = tokenized_content[idx:idx+n]
        ngrams.append(str(ngram).rstrip())

    return ngrams

In [31]:
ngrams = []
for (filename, tokenized_bill_content) in bills:
    ngrams.extend(ngram_moving_window(2, tokenized_bill_content))

In [32]:
len(ngrams)

5222177

In [33]:
ngrams[0:10]

['dz.u.',
 '. z',
 'z 2001',
 '2001 r',
 'r.',
 '. nr',
 'nr 81',
 '81,',
 ', poz',
 'poz.']

In [34]:
from collections import Counter
ngram_counts = Counter(ngrams)

In [35]:
ngram_counts.most_common(15)

[('art.', 83778),
 ('ust.', 53552),
 ('poz.', 45198),
 (', poz', 43188),
 ('. 1', 39543),
 ('--', 36541),
 ('r.', 33008),
 ('w art', 32042),
 (', o', 29919),
 ('mowa w', 28471),
 ('. 2', 26563),
 ('w ust', 23557),
 ('. art', 22922),
 (', w', 22477),
 ('. nr', 21425)]

#### 3. Discard bigrams containing characters other than letters. Make sure that you discard the invalid entries after computing the bigram counts.

In [36]:
ngram_counts_valid = {}

def is_valid_ngram(ngram: str) -> bool: 
    return True if regex.search(r"[^a-zA-Z\s]", ngram) is None else False

for ngram_count in ngram_counts.items():
    ngram, count = ngram_count
    if is_valid_ngram(ngram):
        ngram_counts_valid[ngram] = count

ngram_counts_valid = sorted(ngram_counts_valid.items(), key=lambda x: x[1], reverse=True)

In [37]:
df_ngram_counts = pd.DataFrame(ngram_counts_valid, columns=['ngram', 'count'])

In [38]:
df_ngram_counts.head(15)

Unnamed: 0,ngram,count
0,w art,32042
1,mowa w,28471
2,w ust,23557
3,otrzymuje brzmienie,9553
4,z dnia,9527
5,do spraw,8715
6,i nr,8435
7,w brzmieniu,7280
8,w drodze,7127
9,na podstawie,6674


#### 4. Use pointwise mutual information to compute the measure for all pairs of words.

Skorzystanie z poprzedniego laboratorium.
Określenie liczby wystąpień każdego tokenu (spełniajacego odpowiednie warunki) w korpusie.

In [39]:
def reject(token_str):
    search_res = regex.search(r"[\W\d_]", token_str)

    if search_res is None:
        return False
    return True

def get_frequencies(tokenized_content: str):
    counts = {}
    for token in tokenized_content:
        token = token.text
        if token not in counts:
            counts[token] = 1
        else:
            counts[token] += 1

    tmp = dict(sorted(
        counts.items(), key=lambda pair: pair[1], reverse=True
    ))

    return dict(filter(lambda pair: reject(pair[0]) == False, tmp.items()))

frequencies = []

for _, tokenized_content in bills:
    frequencies.append(
        get_frequencies(tokenized_content)
    )

global_frequencies = {}

for d_frequency in frequencies:
    for token, value in d_frequency.items():
        if token not in global_frequencies:
            global_frequencies[token] = 0
        global_frequencies[token] += value

global_frequencies = dict(sorted(global_frequencies.items(), key=lambda pair: pair[1], reverse=True))

word_count = pd.DataFrame({
    'word': list(global_frequencies.keys()),
    'count': list(global_frequencies.values())
})

In [40]:
word_count.head()

Unnamed: 0,word,count
0,w,201199
1,i,90006
2,art,83804
3,z,82438
4,o,64776


In [41]:
total_words_occurences = word_count['count'].sum(axis=0)
total_bigrams_occurences = df_ngram_counts['count'].sum(axis=0)

In [42]:
counted_words: Dict[str, int] = word_count.set_index('word').to_dict()['count']
counted_words_probabilities: Dict[str, float] = dict(map(lambda kv: (kv[0], kv[1]/total_words_occurences), counted_words.items()))
counted_bigrams: Dict[str, int] = df_ngram_counts.set_index('ngram').to_dict()['count']
counted_bigrams_probabilities: Dict[str, float] = dict(map(lambda kv: (kv[0], kv[1]/total_bigrams_occurences), counted_bigrams.items()))

In [43]:
def calculate_bigram_pmi(bigram: str):
    import math
    word1, word2 = bigram.split(" ")
    p_bigram = counted_bigrams_probabilities[bigram]
    p_word1 = counted_words_probabilities[word1]
    p_word2 = counted_words_probabilities[word2]

    return math.log2(p_bigram / (p_word1*p_word2))

In [44]:
df_ngram_counts.columns = ['bigram', 'bigram_count']

In [45]:
df_ngram_counts.head(3)

Unnamed: 0,bigram,bigram_count
0,w art,32042
1,mowa w,28471
2,w ust,23557


In [46]:
df_ngram_counts['bigram_pmi'] = df_ngram_counts['bigram'].apply(calculate_bigram_pmi)

#### 5. Sort the word pairs according to that measure in the descending order and determine top 10 entries.

In [47]:
df_ngram_counts.sort_values('bigram_pmi', ascending=False).head(10)

Unnamed: 0,bigram,bigram_count,bigram_pmi
191346,polichlorkiem winylu,1,23.246572
189095,peganum harmala,1,23.246572
189082,normorfina demetylomorfina,1,23.246572
166759,agenci ubezpieczeniowi,1,23.246572
189085,nymphaea caerulea,1,23.246572
189092,oksykodon eukodal,1,23.246572
102198,studzianki pancerne,1,23.246572
154492,wyczuwalne dotykiem,1,23.246572
117342,przedemery talne,1,23.246572
148126,radzieckim wodzem,1,23.246572


#### 6. Filter bigrams with number of occurrences lower than 5. Determine top 10 entries for the remaining dataset (>=5 occurrences).

In [48]:
df_ngram_counts[df_ngram_counts['bigram_count'] >= 5].head(10)

Unnamed: 0,bigram,bigram_count,bigram_pmi
0,w art,32042,4.241254
1,mowa w,28471,5.612585
2,w ust,23557,4.441263
3,otrzymuje brzmienie,9553,9.836095
4,z dnia,9527,6.001339
5,do spraw,8715,7.154949
6,i nr,8435,4.375235
7,w brzmieniu,7280,5.625339
8,w drodze,7127,5.617821
9,na podstawie,6674,7.581156


#### 7. Use KRNNT or Clarin-PL API(https://ws.clarin-pl.eu/tager.shtml) to tag and lemmatize the corpus.

In [2]:
from xml.etree.ElementTree import Element
from typing import Dict, List

In [3]:
def calculate_clarin_tokens_metadata(root: Element):
    """Iterate over analalyzed bill content from Clarin API.
    Assosciate each token with its base form and morfologic tag.
    Return list with all tokens - tokenized bill content
    and dictionary with tokens metadata that associate each
    unique token in the content with its metadata.
    """
    tokens_metadata = {}
    all_tokens = []

    for token_tag in root.iter(tag='tok'):
        token = None
        for orth_tag in token_tag.iter(tag='orth'):
            token = orth_tag.text
            all_tokens.append(token)

        base_forms = []
        for base_tag in token_tag.iter(tag='base'):
            base_forms.append(base_tag.text)

        ctags = []
        for ctag_tag in token_tag.iter(tag='ctag'):
            ctags.append(ctag_tag.text)

        
        if token not in tokens_metadata.keys():
            tokens_metadata[token] = {
                "base": base_forms[0].lower(),
                "tag": ctags[0].lower().split(':')[0],
            }

    return all_tokens, tokens_metadata

Kluczami w słowniku tokens_metadata będą tokeny danej ustawy, wartością dla danego tokenu będzie słownik zawierający formę podstawową słowa oraz odpowiednią część tagu morfosyntaktycznego słowa. Lista all_tokens zawiera wszystkie dokeny w tanej ustawie w kolejności zgodnej z treścią ustawy.

In [4]:
import xml.etree.ElementTree as ET

clarin_bills = {}
paths = glob.glob(f"./bills_clarin/*.txt")
for file_path in paths:
    with open(file_path) as f:
        bill_content = f.read()
        root = ET.fromstring(bill_content)
        all_tokens, tokens_metadata = calculate_clarin_tokens_metadata(root)
        clarin_bills[file_path.split("/")[-1]] = {
            "tokenized_content": all_tokens,
            "tokens_metadata": tokens_metadata,
        }

In [5]:
print("Straży ->", clarin_bills['2001_874.txt']['tokens_metadata']['Straży'])
print("Państwowej ->", clarin_bills['2001_874.txt']['tokens_metadata']['Państwowej'])

Straży -> {'base': 'straż', 'tag': 'subst'}
Państwowej -> {'base': 'państwowy', 'tag': 'adj'}


In [6]:
len(clarin_bills.keys())

1178

In [7]:
# How to access the tokenized content of the bill?
len(clarin_bills['2001_974.txt']['tokenized_content'])

612

#### 8. Using the tagged corpus compute bigram statistic for the tokens containing: a. lemmatized, downcased word b. morphosyntactic category of the word (subst, fin, adj, etc.)

In [8]:
def ngram_clarin(n: int, clarin_tokenized_content: List[str], tokens_metadata: Dict[str, Dict[str, str]]):
    ngrams = []

    for idx in range(len(clarin_tokenized_content) - n + 1):
        ngram = clarin_tokenized_content[idx:idx+n]
        ngrams_representation = []
        for token in ngram:
            base_form = tokens_metadata[token]['base']
            tag = tokens_metadata[token]['tag']
            ngrams_representation.append(f"{base_form}:{tag}")
        ngrams.append(" ".join(ngrams_representation))

    return ngrams

Example

In [9]:
sample_bill_tokenized_content = clarin_bills['2001_974.txt']['tokenized_content']
sample_bill_tokens_metadata = clarin_bills['2001_974.txt']['tokens_metadata']

ngram_clarin(2, sample_bill_tokenized_content, sample_bill_tokens_metadata)[15:25]

['z:prep dzień:subst',
 'dzień:subst 20:num',
 '20:num lipiec:subst',
 'lipiec:subst 2001:num',
 '2001:num r:ign',
 'r:ign .:interp',
 '.:interp o:prep',
 'o:prep świadczenie:subst',
 'świadczenie:subst dla:prep',
 'dla:prep cywilny:adj']

For each bill' tokens_metadata, obtain list of ngrams and filter out valid ngram according to the rule and order from 3rd point which is "Discard bigrams containing characters other than letters. Make sure that you discard the invalid entries after computing the bigram counts."

Get clarin bigrams from the entire corpus.

In [10]:
clarin_bigrams = []
for bill_filename in clarin_bills.keys():
    tokenized_content = clarin_bills[bill_filename]['tokenized_content']
    tokens_metadata = clarin_bills[bill_filename]['tokens_metadata']

    clarin_bigrams.extend(
        ngram_clarin(2, tokenized_content, tokens_metadata)
    )

In [11]:
len(clarin_bigrams)

5272789

Count bigrams

In [12]:
from collections import Counter

clarin_bigrams_counts = Counter(clarin_bigrams)

Filter out invalid bigrams

In [13]:
clarin_bigrams_counts_valid = {}

def is_valid_clarin_bigram(bigram: str) -> bool:
    token1_meta, token2_meta = bigram.split(" ")

    if ':interp' in token1_meta or ':interp' in token2_meta:
        return False

    token_1 = token1_meta.split(":")[0]
    token_2 = token2_meta.split(":")[0]
    bigram = token_1 + " " + token_2
    return True if regex.search(r"[^a-zA-Z\s\p{L}]", bigram) is None else False

for clarin_bigram_count in clarin_bigrams_counts.items():
    clarin_bigram, count = clarin_bigram_count
    if is_valid_clarin_bigram(clarin_bigram):
        clarin_bigrams_counts_valid[clarin_bigram] = count

In [14]:
clarin_bigrams_counts_valid = sorted(clarin_bigrams_counts_valid.items(), key=lambda x: x[1], reverse=True)

In [15]:
df_clarin_ngram_counts = pd.DataFrame(
    clarin_bigrams_counts_valid, columns=['bigram', 'bigram_count'])

In [16]:
df_clarin_ngram_counts.head(20)

Unnamed: 0,bigram,bigram_count
0,w:prep art:ign,32044
1,o:prep który:adj,28656
2,który:adj mowa:subst,28538
3,mowa:subst w:prep,28473
4,w:prep usta:subst,23557
5,z:prep dzień:subst,11360
6,otrzymywać:fin brzmienie:subst,10536
7,określony:adj w:prep,10240
8,do:prep sprawa:subst,8718
9,ustawa:subst z:prep,8625


#### 10. Compute the same statistics as for the non-lemmatized words (i.e. PMI) and print top-10 entries with at least 5 occurrences.

Count the number of lemmatized tokens in the lemmatized corpus

In [17]:
clarin_lemmatized_tokens = []
for bill_filename in clarin_bills.keys():
    tokenized_content = clarin_bills[bill_filename]['tokenized_content']
    tokens_metadata = clarin_bills[bill_filename]['tokens_metadata']
    lematized_tokens = [tokens_metadata[token]['base'] for token in tokenized_content]

    clarin_lemmatized_tokens.extend(lematized_tokens)

In [20]:
clarin_all_lematized_tokens_frequencies = Counter(clarin_lemmatized_tokens)
clarin_lemmatized_tokens_frequencies = {}

for lemmatized_token, count in clarin_all_lematized_tokens_frequencies.items():
    if not reject(lemmatized_token):
        clarin_lemmatized_tokens_frequencies[lemmatized_token] = count

In [21]:
lemmatized_token_count = pd.DataFrame({
    'lemmatized_token': list(clarin_lemmatized_tokens_frequencies.keys()),
    'count': list(clarin_lemmatized_tokens_frequencies.values())
})

lemmatized_token_count.sort_values('count', ascending=False).reset_index(drop=True).head(10)

Unnamed: 0,lemmatized_token,count
0,w,202950
1,i,90044
2,z,87991
3,art,83805
4,o,64809
5,do,60768
6,usta,53641
7,na,50657
8,który,49382
9,się,45887


In [49]:
counted_lemmatized_tokens: Dict[str, int] = (
    lemmatized_token_count.set_index('lemmatized_token').to_dict()['count'])

total_lemmatized_tokens_occurences = lemmatized_token_count['count'].sum(axis=0)

counted_lemmatized_tokens_probabilities: Dict[str, float] = dict(
    map(lambda kv: (kv[0], kv[1]/total_lemmatized_tokens_occurences), counted_lemmatized_tokens.items()))

total_clarin_bigrams_occurences = df_clarin_ngram_counts['bigram_count'].sum(axis=0)

counted_clarin_bigrams: Dict[str, int] = (
    df_clarin_ngram_counts.set_index('bigram').to_dict()['bigram_count'])

counted_clarin_bigrams_probabilities: Dict[str, float] = dict(
    map(lambda kv: (kv[0], kv[1]/total_bigrams_occurences), counted_clarin_bigrams.items()))

In [50]:
def calculate_clarin_bigram_pmi(clarin_bigram: str):
    import math
    token1_meta, token2_meta = clarin_bigram.split(" ")
    token1 = token1_meta.split(":")[0]
    token2 = token2_meta.split(":")[0]

    p_clarin_bigram = counted_clarin_bigrams_probabilities[clarin_bigram]
    p_token1 = counted_lemmatized_tokens_probabilities[token1]
    p_token2 = counted_lemmatized_tokens_probabilities[token2]

    return math.log2(p_clarin_bigram / (p_token1*p_token2))

In [51]:
df_clarin_ngram_counts['pmi'] = df_clarin_ngram_counts['bigram'].apply(calculate_clarin_bigram_pmi)

In [52]:
df_clarin_ngram_counts.head(5)

Unnamed: 0,bigram,bigram_count,pmi
0,w:prep art:ign,32044,4.23774
1,o:prep który:adj,28656,6.486433
2,który:adj mowa:subst,28538,7.650354
3,mowa:subst w:prep,28473,5.607998
4,w:prep usta:subst,23557,4.437542


#### BIGRAMS - RESULTS

Compare the results for copora without and with tagging and lemmatization for bigrams.

In [53]:
df_ngram_counts.head(10)

Unnamed: 0,bigram,bigram_count,bigram_pmi
0,w art,32042,4.241254
1,mowa w,28471,5.612585
2,w ust,23557,4.441263
3,otrzymuje brzmienie,9553,9.836095
4,z dnia,9527,6.001339
5,do spraw,8715,7.154949
6,i nr,8435,4.375235
7,w brzmieniu,7280,5.625339
8,w drodze,7127,5.617821
9,na podstawie,6674,7.581156


In [54]:
df_clarin_ngram_counts.head(10)

Unnamed: 0,bigram,bigram_count,pmi
0,w:prep art:ign,32044,4.23774
1,o:prep który:adj,28656,6.486433
2,który:adj mowa:subst,28538,7.650354
3,mowa:subst w:prep,28473,5.607998
4,w:prep usta:subst,23557,4.437542
5,z:prep dzień:subst,11360,5.581957
6,otrzymywać:fin brzmienie:subst,10536,9.026412
7,określony:adj w:prep,10240,5.23692
8,do:prep sprawa:subst,8718,6.439875
9,ustawa:subst z:prep,8625,5.298901


#### 11 & 12 Ad1. - trigrams without tagging and lemmatization (SpaCy)

In [55]:
trigrams = []
for (filename, tokenized_bill_content) in bills:
    trigrams.extend(ngram_moving_window(3, tokenized_bill_content))

In [56]:
trigram_counts = Counter(trigrams)

In [57]:
trigram_counts_valid = {}

for trigram_count in trigram_counts.items():
    trigram, count = trigram_count
    if is_valid_ngram(trigram):
        trigram_counts_valid[trigram] = count

trigram_counts_valid = sorted(trigram_counts_valid.items(), key=lambda x: x[1], reverse=True)

In [58]:
df_trigram_counts = pd.DataFrame(trigram_counts_valid, columns=['trigram', 'count'])
df_trigram_counts.head(10)

Unnamed: 0,trigram,count
0,mowa w ust,13474
1,mowa w art,12311
2,ustawie z dnia,3649
3,w ustawie z,3645
4,ustawy z dnia,3053
5,dni od dnia,2070
6,w porozumieniu z,1652
7,na podstawie art,1495
8,porozumieniu z ministrem,1334
9,terytorium rzeczypospolitej polskiej,1218


In [59]:
counted_trigrams: Dict[str, int] = df_trigram_counts.set_index('trigram').to_dict()['count']
total_trigrams_occurences = df_trigram_counts['count'].sum(axis=0)
counted_trigram_probabilities: Dict[str, float] = dict(map(lambda kv: (kv[0], kv[1]/total_trigrams_occurences), counted_trigrams.items()))

In [60]:
def calculate_trigram_pmi(trigram: str):
    import math
    token1, token2, token3 = trigram.split(" ")
    p_trigram = counted_trigram_probabilities[trigram]

    p_token1 = counted_words_probabilities[token1]
    p_token2 = counted_words_probabilities[token2]
    p_token3 = counted_words_probabilities[token3]

    return math.log2(p_trigram / (p_token1*p_token2*p_token3))

In [61]:
df_trigram_counts['pmi'] = df_trigram_counts['trigram'].apply(calculate_trigram_pmi)

In [62]:
df_trigram_counts.sort_values('pmi', ascending=False).head(10)

Unnamed: 0,trigram,count,pmi
270115,nieskarmelizowanym sokiem winogronowym,1,45.914519
246044,virtus et fraternitas,1,45.914519
246122,benzimidazol leonotis leonurus,1,45.914519
249483,clavibacter michiganensis ssp,1,45.914519
246202,metylotioamfetamina etycyklidyna pce,1,45.914519
232141,agregatach pralniczych szczenia,1,45.914519
243184,mink virus enteritis,1,45.914519
155780,implantacji stymulatora nerwu,1,45.914519
187385,jewish restitution organisation,1,45.914519
246140,hostilis mitragyna speciosa,1,45.914519


In [63]:
df_trigram_counts[df_trigram_counts['count'] >= 5].sort_values('pmi', ascending=False).head(10)

Unnamed: 0,trigram,count,pmi
4645,profilem zaufanym epuap,13,38.513639
18236,centralnemu biuru antykorupcyjnemu,5,38.06277
6503,turnieju mistrzostw europy,10,37.989112
4644,potwierdzonym profilem zaufanym,13,37.966151
9338,kurtki anorak etc,8,36.870124
16807,najnowszych zdobyczy techniki,5,36.59259
1758,socjalistycznych republik radzieckich,27,36.352276
16320,terminalu regazyfikacyjnego skroplonego,5,36.159631
14805,bankowemu funduszowi gwarancyjnemu,5,36.129884
14733,drewna tartacznego iglastego,5,36.112325


#### 11 & 12 Ad1. - trigrams with tagging and lemmatization (Clarin)

In [64]:
clarin_trigrams = []
for bill_filename in clarin_bills.keys():
    tokenized_content = clarin_bills[bill_filename]['tokenized_content']
    tokens_metadata = clarin_bills[bill_filename]['tokens_metadata']

    clarin_trigrams.extend(
        ngram_clarin(3, tokenized_content, tokens_metadata)
    )

In [65]:
clarin_trigrams_counts = Counter(clarin_trigrams)

In [68]:
clarin_trigrams_counts_valid = {}

def is_valid_clarin_trigram(trigram: str) -> bool:
    token1_meta, token2_meta, token3_meta = trigram.split(" ")

    if ':interp' in token1_meta or ':interp' in token2_meta or ':interp' in token3_meta:
        return False

    token_1 = token1_meta.split(":")[0]
    token_2 = token2_meta.split(":")[0]
    token_3 = token3_meta.split(":")[0]

    trigram = token_1 + " " + token_2 + " " + token_3
    return True if regex.search(r"[^a-zA-Z\s\p{L}]", trigram) is None else False

for clarin_trigram_count in clarin_trigrams_counts.items():
    clarin_trigram, count = clarin_trigram_count
    if is_valid_clarin_trigram(clarin_trigram):
        clarin_trigrams_counts_valid[clarin_trigram] = count

clarin_trigrams_counts_valid = sorted(clarin_trigrams_counts_valid.items(), key=lambda x: x[1], reverse=True)

In [69]:
df_clarin_trigram_counts = pd.DataFrame(
    clarin_trigrams_counts_valid, columns=['trigram', 'trigram_count'])

In [70]:
total_clarin_trigrams_occurences = df_clarin_trigram_counts['trigram_count'].sum(axis=0)

counted_clarin_trigrams: Dict[str, int] = (
    df_clarin_trigram_counts.set_index('trigram').to_dict()['trigram_count'])

counted_clarin_trigrams_probabilities: Dict[str, float] = dict(
    map(lambda kv: (kv[0], kv[1]/total_trigrams_occurences), counted_clarin_trigrams.items()))

In [71]:
def calculate_clarin_trigram_pmi(clarin_trigram: str):
    import math
    token1_meta, token2_meta, token3_meta = clarin_trigram.split(" ")

    token1 = token1_meta.split(":")[0]
    token2 = token2_meta.split(":")[0]
    token3 = token3_meta.split(":")[0]

    p_clarin_trigram = counted_clarin_trigrams_probabilities[clarin_trigram]

    p_token1 = counted_lemmatized_tokens_probabilities[token1]
    p_token2 = counted_lemmatized_tokens_probabilities[token2]
    p_token3 = counted_lemmatized_tokens_probabilities[token3]

    return math.log2(p_clarin_trigram / (p_token1*p_token2*p_token3))

In [72]:
df_clarin_trigram_counts['pmi'] = df_clarin_trigram_counts['trigram'].apply(calculate_clarin_trigram_pmi)

In [73]:
df_clarin_trigram_counts[df_clarin_trigram_counts['trigram_count'] >= 5].sort_values('pmi', ascending=False).head(10)

Unnamed: 0,trigram,trigram_count,pmi
58987,porcelanowy:adj młyn:subst kulowy:adj,5,39.798608
38111,wymiennik:subst przeponowy:adj rurowy:adj,7,38.927891
58434,reakcja:subst łańcuchowa:subst rozszczepienie:...,5,38.095001
23922,piłka:subst nożny:adj uefa:subst,10,37.169001
42208,stany:subst zjednoczyć:ppas ameryka:subst,6,36.927891
23919,finałowy:adj turniej:subst mistrzostwa:subst,10,36.886231
26870,przedwczesny:adj wyrąb:subst drzewostan:subst,9,36.713571
33680,kurtka:subst anorak:subst etc:ign,8,36.570339
59358,mecz:subst piłka:subst nożny:adj,5,36.432036
17223,profil:subst zaufany:adj epuap:ign,13,36.344808


#### TRIGRAMS - RESULTS

Compare the results for copora without and with tagging and lemmatization for bigrams.

In [74]:
df_trigram_counts.head(10)

Unnamed: 0,trigram,count,pmi
0,mowa w ust,13474,11.490302
1,mowa w art,12311,10.716254
2,ustawie z dnia,3649,14.865331
3,w ustawie z,3645,11.377503
4,ustawy z dnia,3053,13.350322
5,dni od dnia,2070,16.892624
6,w porozumieniu z,1652,11.844659
7,na podstawie art,1495,11.735964
8,porozumieniu z ministrem,1334,18.210488
9,terytorium rzeczypospolitej polskiej,1218,21.534617


In [75]:
df_clarin_trigram_counts.head(10)

Unnamed: 0,trigram,trigram_count,pmi
0,o:prep który:adj mowa:subst,28535,14.3387
1,który:adj mowa:subst w:prep,28442,12.687132
2,mowa:subst w:prep usta:subst,13474,11.489936
3,mowa:subst w:prep art:ign,12311,10.716006
4,ustawa:subst z:prep dzień:subst,8589,13.245133
5,właściwy:adj do:prep sprawa:subst,7966,15.206835
6,minister:subst właściwy:adj do:prep,7888,15.257547
7,w:prep droga:subst rozporządzenie:subst,4751,14.845896
8,zastępować:fin się:qub wyraz:subst,3653,16.449759
9,w:prep ustawa:subst z:prep,3646,9.09833
