In [1]:
import pandas as pd
import regex as re

import pickle
import spacy

from tqdm.notebook import tqdm
from spacy.tokenizer import Tokenizer
from spacy.lang.pl import Polish
from math import log2
from dataclasses import dataclass, field
from collections import defaultdict
from functools import cached_property

In [2]:
text = pd.read_json('../../data/corpus.jsonl', lines=True)
text.head()

Unnamed: 0,_id,title,text,metadata
0,3,,"Nie mówię, że nie podoba mi się też pomysł szk...",{}
1,31,,Tak więc nic nie zapobiega fałszywym ocenom po...,{}
2,56,,Nigdy nie możesz korzystać z FSA dla indywidua...,{}
3,59,,Samsung stworzył LCD i inne technologie płaski...,{}
4,63,,Oto wymagania SEC: Federalne przepisy dotycząc...,{}


In [3]:
corpus = text.set_index('_id')['text'].tolist()
corpus = [text.lower() for text in corpus]

Turns out later, that for some reason we have multiple spaces in some documents, causing huge problems in bigrams. Due to that, we will trim spaces in this step

In [4]:
corpus = [re.sub(r"\s+", " ", text) for text in corpus]

# 1. Use SpaCy tokenizer API to tokenize the text from the PiQA corpus.

In [5]:
nlp = spacy.load("pl_core_news_sm")
tokenizer = Tokenizer(nlp.vocab)

In [6]:
# piped_data = list(tqdm(nlp.pipe(corpus), total=len(corpus)))

In [7]:
# with open('piped_data.pickle', 'wb') as f:
#     pickle.dump(piped_data, f)

# 2. Compute bigram counts of downcased tokens. Given the sentence: "The quick brown fox jumps over the lazy dog.", the bigram counts are as follows:

In [8]:
@dataclass
class NGramData:
    unigram_counter: dict = field(default_factory=lambda: defaultdict(lambda: 0))
    bigram_counter: dict = field(default_factory=lambda: defaultdict(lambda: 0))

    @cached_property
    def total_unigrams(self) -> int:
        return sum(self.unigram_counter.values())

    @cached_property
    def total_bigrams(self) -> int:
        return sum(self.bigram_counter.values())
    
    def __pmi(self, bigram):
        unigram_x, unigram_y = bigram
        p_x = self.unigram_counter[unigram_x] / self.total_unigrams
        p_y = self.unigram_counter[unigram_y] / self.total_unigrams
        p_xy = self.bigram_counter[bigram] / self.total_bigrams
    
        ratio = p_xy / (p_x * p_y)
    
        return log2(ratio)
    def pmi(self, min_occurrences=None):
        return {
            k: self.__pmi(k)
            for k, v in self.bigram_counter.items()
            if min_occurrences is None or min_occurrences < v
        }

    def count_unigram(self, unigram) -> None:
        self.unigram_counter[unigram] += 1

    def count_bigram(self, bigram) -> None:
        self.bigram_counter[bigram] += 1
        
    def sanitize_bigrams(self, blacklist=r"[^a-zA-Z\s]"):
        bigrams_to_drop = [
            key
            for key in self.bigram_counter.keys()
            if re.search(blacklist, " ".join(key)) is not None
        ]
        for bigram in bigrams_to_drop:
            del self.bigram_counter[bigram]

def calculate_ngrams(corpus, unigram_processor=lambda x: str(x.text), unigram_validator=lambda x: True):
    data = NGramData()
    
    with open('piped_data.pickle', 'rb') as f:
        piped_data = pickle.load(f)
    
    for doc in tqdm(piped_data):
        last_text = None
        for token in doc:
            text = unigram_processor(token)
            
            if not unigram_validator(token):
                last_text = None
                continue

            data.count_unigram(text)
            
            if last_text is not None:
                bigram = (last_text, text)
                data.count_bigram(bigram)

            last_text = text
    return data

In [9]:
ngram_data = calculate_ngrams(corpus) 

  0%|          | 0/57638 [00:00<?, ?it/s]

# 3. Discard bigrams containing characters other than letters. Make sure that you discard the invalid entries after computing the bigram counts.

In [10]:
ngram_data.sanitize_bigrams()

# 4. Use pointwise mutual information to compute the measure for all pairs of words.

In [11]:
bigrams_pmi = ngram_data.pmi()

# 5. Sort the word pairs according to that measure in the descending order and determine top 10 entries.

In [12]:
def get_best_results(pmi_values, limit=None):
    order = sorted([(v, k) for k, v in pmi_values.items()], reverse=True)
    if limit is not None:
        return order[:limit]
    return order

In [13]:
get_best_results(bigrams_pmi, limit=10)

[(24.824523043569048, ('zygmunt', 'freud')),
 (24.824523043569048, ('zwaartekracht', 'pijnlijk')),
 (24.824523043569048, ('zure', 'bikotearekin')),
 (24.824523043569048, ('zur', 'tagespflege')),
 (24.824523043569048, ('zum', 'einsatz')),
 (24.824523043569048, ('zszywacz', 'pneumatyczny')),
 (24.824523043569048, ('zowel', 'anker')),
 (24.824523043569048, ('zorgeloze', 'spevener')),
 (24.824523043569048, ('zonas', 'rurales')),
 (24.824523043569048, ('zoho', 'invoice'))]

# Filter bigrams with number of occurrences lower than 5. Determine top 10 entries for the remaining dataset (>=5 occurrences).

In [14]:
filtered_bigrams_pmi = ngram_data.pmi(min_occurrences=5)

In [15]:
get_best_results(filtered_bigrams_pmi, limit=10)

[(22.23956054284789, ('seair', 'exim')),
 (22.23956054284789, ('sameer', 'thakar')),
 (22.23956054284789, ('gone', 'fishin')),
 (22.23956054284789, ('electro', 'plating')),
 (22.23956054284789, ('deming', 'electro')),
 (22.017168121511443, ('stwardnienia', 'rozsianego')),
 (22.017168121511443, ('kuala', 'lumpur')),
 (22.017168121511443, ('autot', 'ldr')),
 (21.794775700174995, ('metalami', 'szlachetnymi')),
 (21.794775700174995, ('limo', 'mia'))]

# 7/8/9. Use SpaCy to lemmatize and tag the sentences in the corpus. Using the tagged corpus compute bigram statistic for the tokens containing: a. lemmatized, downcased word b. morphosyntactic category of the word (subst, fin, adj, etc.)

In [16]:
[f"{x.lemma_}:{x.tag_}" 
 for z in nlp.pipe(['To jest testowe zdanie'])
 for x in z]

['to:PRED', 'być:FIN', 'testowy:ADJ', 'zdanie:ADJ']

In [17]:
ngram_data_lemma = calculate_ngrams(None, unigram_processor=lambda x: f"{x.lemma_}:{x.tag_}", unigram_validator=lambda x: len(str(x.tag_)) > 0)

  0%|          | 0/57638 [00:00<?, ?it/s]

# 10. Compute the same statistics as for the non-lemmatized words (i.e. PMI) and print top-10 entries with at least 5 occurrences.

In [18]:
ngram_data_lemma.sanitize_bigrams(blacklist="[^a-zA-Z:\s]")

In [19]:
get_best_results(ngram_data_lemma.pmi(), limit=10)

[(25.19152377725881, ('zygmunt:SUBST', 'freud:SUBST')),
 (25.19152377725881, ('zwei:XXX', 'investmentfonds:XXX')),
 (25.19152377725881, ('zwei:SUBST', 'anlagen:SUBST')),
 (25.19152377725881, ('zwaartekracht:SUBST', 'pijnlijk:SUBST')),
 (25.19152377725881, ('zure:SUBST', 'bikotearekin:SUBST')),
 (25.19152377725881, ('zur:SUBST', 'tagespflege:ADJ')),
 (25.19152377725881, ('zum:SUBST', 'einsatz:SUBST')),
 (25.19152377725881, ('zszywacz:SUBST', 'pneumatyczny:SUBST')),
 (25.19152377725881, ('zredukowany:ADJ', 'kompresoer:SUBST')),
 (25.19152377725881, ('zowel:XXX', 'anker:XXX'))]

In [20]:
get_best_results(ngram_data_lemma.pmi(min_occurrences=5), limit=10)

[(22.606561276537654, ('seair:SUBST', 'exim:SUBST')),
 (22.606561276537654, ('sameer:SUBST', 'thakar:SUBST')),
 (22.606561276537654, ('gone:PPAS', 'fishin:SUBST')),
 (22.606561276537654, ('electro:SUBST', 'plating:SUBST')),
 (22.384168855201207, ('autot:SUBST', 'ldr:SUBST')),
 (22.021598775816496, ('agenzija:SUBST', 'sedqa:SUBST')),
 (21.79920635448005, ('mia:SUBST', 'gt:SUBST')),
 (21.776486277979966, ('dreamz:SUBST', 'infr:SUBST')),
 (21.732092158621512, ('seeking:SUBST', 'alpha:ADJ')),
 (21.732092158621512, ('document:SUBST', 'shredding:SUBST'))]

# 11. Group the bigrams by morphosyntactic tag, i.e. a pair of words belongs to a given group if all pairs have the same syntactic category for the first and the second word. E.g. one group would be words with subst as the first words and adj as the second word.

In [39]:
@dataclass
class NGramDataGrouped:
    unigram_counter: dict = field(default_factory=lambda: defaultdict(lambda: 0))
    bigram_counter: dict = field(default_factory=lambda: defaultdict(lambda: 0))
    unigram_tag_counter: dict = field(default_factory=lambda: defaultdict(lambda: 0))
    bigram_tag_counter: dict = field(default_factory=lambda: defaultdict(lambda: 0))
    bigram_mapping: dict = field(default_factory=lambda: defaultdict(lambda: []))

    @cached_property
    def total_unigrams(self) -> int:
        return sum(self.unigram_counter.values())

    @cached_property
    def total_bigrams(self) -> int:
        return sum(self.bigram_counter.values())
    
    @cached_property
    def total_tag_unigrams(self) -> int:
        return sum(self.unigram_tag_counter.values())
    
    @cached_property
    def total_tag_bigrams(self) -> int:
        return sum(self.bigram_tag_counter.values())

    def __pmi(self, bigram, bg_counter, ug_counter, bg_total, ug_total):
        unigram_x, unigram_y = bigram
        p_x = ug_counter[unigram_x] / ug_total
        p_y = ug_counter[unigram_y] / ug_total
        p_xy = bg_counter[bigram] / bg_total

        ratio = p_xy / (p_x * p_y)

        return log2(ratio)

    def pmi(self, tags=None, min_occurrences=None):
        bigram_source = self.bigram_counter.items() if tags is None else self.bigram_mapping[tags]
        return {
            k: self.__pmi(k, self.bigram_counter, self.unigram_counter, self.total_bigrams, self.total_unigrams)
            for k in bigram_source
            if min_occurrences is None or min_occurrences < self.bigram_counter[k]
        }
    
    def pmi_tag(self, min_occurrences=None):
        return {
            k: self.__pmi(k, self.bigram_tag_counter, self.unigram_tag_counter, self.total_tag_bigrams, self.total_tag_unigrams)
            for k, v in self.bigram_tag_counter.items()
            if min_occurrences is None or min_occurrences < v
        }

    def count_unigram(self, unigram, tag) -> None:
        self.unigram_counter[unigram] += 1
        self.unigram_tag_counter[tag] += 1

    def count_bigram(self, bigram, tag) -> None:
        self.bigram_counter[bigram] += 1
        self.bigram_tag_counter[tag] += 1
        self.bigram_mapping[tag].append(bigram)

    def sanitize_bigrams(self, blacklist=r"[^a-zA-Z\s]"):
        bigrams_to_drop = [
            key
            for key in self.bigram_counter.keys()
            if re.search(blacklist, " ".join(key)) is not None
        ]
        for bigram in bigrams_to_drop:
            del self.bigram_counter[bigram]

def calculate_grouped_ngrams():
    data = NGramDataGrouped()

    with open('piped_data.pickle', 'rb') as f:
        piped_data = pickle.load(f)

    for doc in tqdm(piped_data):
        last_text = None
        last_tag = None
        for token in doc:
            text = f"{token.lemma_}"
            tag = str(token.tag_)

            if len(str(tag)) == 0 or \
                    re.search("[^a-zA-Z]", text) is not None:
                last_text, last_tag = None, None
                continue

            data.count_unigram(text, tag)

            if last_text is not None:
                bigram = (last_text, text)
                bigram_tag = (last_tag, tag)
                data.count_bigram(bigram, bigram_tag)

            last_text, last_tag = text, tag
    return data

In [40]:
res = calculate_grouped_ngrams()

  0%|          | 0/57638 [00:00<?, ?it/s]

# 12. Print top-10 categories (sort them by total count of bigrams) and print top-5 pairs for each category.

In [44]:
best_10_tags = get_best_results(res.pmi_tag(min_occurrences=50), limit=10)
best_10_tags

[(9.352670381004948, ('XXX', 'XXX')),
 (5.169646017394302, ('COMP', 'INF')),
 (3.8302810165623566, ('ADJP', 'IMPT')),
 (3.5414485128731608, ('PREP', 'ADJP')),
 (3.523885000324015, ('PREP', 'BURK')),
 (3.23781305841335, ('BURK', 'PREP')),
 (3.1574339469857073, ('IMPT', 'SIEBIE')),
 (2.644106911747437, ('QUB', 'WINIEN')),
 (2.6364093908207247, ('ADJP', 'FIN')),
 (2.339366374336545, ('PRED', '_SP'))]

In [47]:
for score, tag in best_10_tags:
    print("=" * 50)
    print(f" ===> TAGS {tag}")
    
    best_5_examples = get_best_results(res.pmi(tags=tag, min_occurrences=5), limit=5)
    for _score, example in best_5_examples:
        print(f"{_score} => {example}")

 ===> TAGS ('XXX', 'XXX')
18.704033921165102 => ('thiet', 'ke')
18.366998933887533 => ('ding', 'ding')
17.97468151110877 => ('pbs', 'frontline')
17.97468151110877 => ('make', 'sure')
17.907567315250233 => ('vous', 'devez')
 ===> TAGS ('COMP', 'INF')
7.1064877186191175 => ('aby', 'zapobiec')
4.460683257054871 => ('aby', 'uciec')
 ===> TAGS ('ADJP', 'IMPT')
5.5040952601909146 => ('prosty', 'zapytaj')
4.75754361719551 => ('prosty', 'przeczytaj')
4.595068920238405 => ('prosty', 'mi')
4.480248518236547 => ('prosty', 'wybierz')
4.049919367005113 => ('prosty', 'zacznij')
 ===> TAGS ('PREP', 'ADJP')
8.015027622404938 => ('po', 'prosty')
7.888004665541665 => ('po', 'trzydziestka')
7.343684149317855 => ('po', 'cichy')
7.121877823185555 => ('co', 'gorsza')
6.765952301593563 => ('od', 'dawny')
 ===> TAGS ('PREP', 'BURK')
8.303042164820509 => ('po', 'trochu')
8.015027622404938 => ('po', 'prosty')
6.765952301593563 => ('od', 'dawny')
6.2561925248912535 => ('z', 'dala')
6.091553375484033 => ('na', 'j

# 13. Create a table comparing the results for copora without and with tagging and lemmatization.

In [24]:
res_df = pd.DataFrame(
    data={
        'no_lemma_no_limit': [" ".join(x[1]) for x in get_best_results(ngram_data.pmi(), limit=10)],
        'no_lemma_limit': [" ".join(x[1]) for x in get_best_results(ngram_data.pmi(min_occurrences=5), limit=10)],
        'lemma_no_limit': [" ".join(x[1]) for x in get_best_results(ngram_data_lemma.pmi(), limit=10)],
        'lemma_limit': [" ".join(x[1]) for x  in get_best_results(ngram_data_lemma.pmi(min_occurrences=5), limit=10)]
    }
)
res_df

Unnamed: 0,no_lemma_no_limit,no_lemma_limit,lemma_no_limit,lemma_limit
0,zygmunt freud,seair exim,zygmunt:SUBST freud:SUBST,seair:SUBST exim:SUBST
1,zwaartekracht pijnlijk,sameer thakar,zwei:XXX investmentfonds:XXX,sameer:SUBST thakar:SUBST
2,zure bikotearekin,gone fishin,zwei:SUBST anlagen:SUBST,gone:PPAS fishin:SUBST
3,zur tagespflege,electro plating,zwaartekracht:SUBST pijnlijk:SUBST,electro:SUBST plating:SUBST
4,zum einsatz,deming electro,zure:SUBST bikotearekin:SUBST,autot:SUBST ldr:SUBST
5,zszywacz pneumatyczny,stwardnienia rozsianego,zur:SUBST tagespflege:ADJ,agenzija:SUBST sedqa:SUBST
6,zowel anker,kuala lumpur,zum:SUBST einsatz:SUBST,mia:SUBST gt:SUBST
7,zorgeloze spevener,autot ldr,zszywacz:SUBST pneumatyczny:SUBST,dreamz:SUBST infr:SUBST
8,zonas rurales,metalami szlachetnymi,zredukowany:ADJ kompresoer:SUBST,seeking:SUBST alpha:ADJ
9,zoho invoice,limo mia,zowel:XXX anker:XXX,document:SUBST shredding:SUBST


# 14. Answer the following questions

### Why do we have to filter the bigrams, rather than the token sequence?

If I understand correctly (token sequence as filtering the text during processing and traversing it), we wouldn't be able to remove some bigrams without affecting the others. For example:

`I like cats`

If we would like to drop the sequence `I like` during the processing, we would also lose other bigram - `like cats`.

### What types of expressions are discovered by the methods.

In case of general **PMI** this method finds mostly proper nouns, such as people's names (Zygmunt Freud) or company name (especially with min ocurrences), such as Seair Exim or Gone Fishin.