In [76]:
import pandas as pd
import regex as re

from tqdm.notebook import tqdm
from spacy.tokenizer import Tokenizer
from spacy.lang.pl import Polish
from math import log2
from dataclasses import dataclass, field
from collections import defaultdict

In [72]:
text = pd.read_json('../../data/piqa/train.jsonl', lines=True)
text.head()

Unnamed: 0,id,goal,sol1,sol2
0,f6be5fcc-d686-4549-8207-7904068693d7,"When boiling butter, when it's ready, you can",Pour it onto a plate,Pour it into a jar
1,ee9783b5-76a7-4beb-bbbb-9b179b11c43e,"To permanently attach metal legs to a chair, y...",Weld the metal together to get it to stay firm...,Nail the metal together to get it to stay firm...
2,7230f9f4-06f7-4eb3-9994-762957427a96,how do you indent something?,leave a space before starting the writing,press the spacebar
3,e3304ee5-cdca-4830-b04d-a3a7cf77f6a9,how do you shake something?,move it up and down and side to side quickly.,stir it very quickly.
4,b316c350-d435-4d35-a101-92ed4c9fc14a,Clean tires,"Pour water, cape off caked on dirt. Use speed...","Pour water, scrape off caked on dirt. Use a st..."


In [73]:
corpus = text.melt(id_vars=['id'], value_vars=['goal', 'sol1', 'sol2'])
corpus = corpus.set_index('id')['value'].tolist()
corpus = [text.lower() for text in corpus]

Turns out later, that for some reason we have multiple spaces in some documents, causing huge problems in bigrams. Due to that, we will trim spaces in this step

In [74]:
corpus = [re.sub(r"\s+", " ", text) for text in corpus]

# 1. Use SpaCy tokenizer API to tokenize the text from the PiQA corpus.

In [75]:
nlp = Polish()
tokenizer = Tokenizer(nlp.vocab)

# 2. Compute bigram counts of downcased tokens. Given the sentence: "The quick brown fox jumps over the lazy dog.", the bigram counts are as follows:

In [77]:
@dataclass
class NGramData:
    unigram_counter: dict[str, int] = field(default_factory=lambda: defaultdict(lambda: 0))
    bigram_counter: dict[str, int] = field(default_factory=lambda: defaultdict(lambda: 0))

    @property
    def total_unigrams(self) -> int:
        return sum(self.unigram_counter)

    @property
    def total_bigrams(self) -> int:
        return sum(self.bigram_counter)

    def count_unigram(self, unigram) -> None:
        self.unigram_counter[unigram] += 1

    def count_bigram(self, bigram) -> None:
        self.bigram_counter[bigram] += 1

def calculate_ngrams(corpus, unigram_processor=lambda x: x.text):
    data = NGramData()
    for doc in tqdm(tokenizer.pipe(corpus), total=len(corpus)):
        last_text = None
        for token in doc:
            text = unigram_processor(token)
            unigram_counting[text] = unigram_counting.get(text, 0) + 1

            if last_text is not None:
                bigram = f"{last_text} {text}"
                bigram_counting[bigram] = bigram_counting.get(bigram, 0) + 1

            last_text = token

In [51]:
unigram_counting = {}
bigram_counting = {}

In [52]:
for doc in tqdm(tokenizer.pipe(corpus), total=len(corpus)):
    last_text = None
    for token in doc:
        text = token.text
        unigram_counting[text] = unigram_counting.get(text, 0) + 1

        if last_text is not None:
            bigram = f"{last_text} {text}"
            bigram_counting[bigram] = bigram_counting.get(bigram, 0) + 1

        last_text = token

  0%|          | 0/48339 [00:00<?, ?it/s]

# 3. Discard bigrams containing characters other than letters. Make sure that you discard the invalid entries after computing the bigram counts.

In [53]:
bigrams_to_drop = [
    key
    for key in bigram_counting.keys()
    if re.search("[^a-zA-Z\s]", key) is not None
]

In [54]:
bigrams_to_drop[:10]

['boiling butter,',
 'butter, when',
 "when it's",
 "it's ready,",
 'ready, you',
 'a chair,',
 'chair, you',
 'indent something?',
 'shake something?',
 'taste something?']

In [55]:
for item in bigrams_to_drop:
    del bigram_counting[item]

# 4. Use pointwise mutual information to compute the measure for all pairs of words.

In [56]:
total_unigrams = sum(unigram_counting.values())
total_unigrams

719983

In [57]:
total_bigrams = sum(bigram_counting.values())
total_bigrams

530056

In [58]:
def pmi(bigram):
    bigrams = bigram.split(' ')
    if len(bigrams) != 2:
        print(f"!!! {bigram}/{bigrams}")
        return None
    unigram_x, unigram_y = bigrams
    p_x = unigram_counting[unigram_x] / total_unigrams
    p_y = unigram_counting[unigram_y] / total_unigrams
    p_xy = bigram_counting[bigram] / total_bigrams

    ratio = p_xy / (p_x * p_y)

    return log2(ratio)

In [59]:
bigrams_pmi = {
    bigram: pmi(bigram)
    for bigram in bigram_counting.keys()
}

!!!   is/['', '', 'is']


# 5. Sort the word pairs according to that measure in the descending order and determine top 10 entries.

In [60]:
invalid_pmis = [k for k, v in bigrams_pmi.items() if v is None]

In [61]:
for k in invalid_pmis:
    del bigrams_pmi[k]

In [62]:
def get_best_results(pmi_values, limit=None):
    order = sorted([(v, k) for k, v in pmi_values.items()], reverse=True)
    if limit is not None:
        return order[:limit]
    return order

In [63]:
get_best_results(bigrams_pmi, limit=10)

[(19.899421372150968, 'zelda ii'),
 (19.899421372150968, 'wholesale distributor'),
 (19.899421372150968, 'vics vapo'),
 (19.899421372150968, 'tyrannosaurus rex'),
 (19.899421372150968, 'strontium chloride'),
 (19.899421372150968, 'storebought sheetcake'),
 (19.899421372150968, 'stinging nettle'),
 (19.899421372150968, 'stemless wineglass'),
 (19.899421372150968, 'soba noddles'),
 (19.899421372150968, 'shonda rhimes')]

# Filter bigrams with number of occurrences lower than 5. Determine top 10 entries for the remaining dataset (>=5 occurrences).

In [64]:
bigrams_to_filter = [k for k, v in bigram_counting.items() if v < 5]

filtered_bigrams_pmi = bigrams_pmi.copy()

for k in bigrams_to_filter:
    if k not in filtered_bigrams_pmi:
        continue
    del filtered_bigrams_pmi[k]

In [65]:
get_best_results(filtered_bigrams_pmi, limit=10)

[(17.092066450093366, 'chow mein'),
 (17.092066450093363, 'ping pong'),
 (17.092066450093363, 'cheeseburger copycat'),
 (16.899421372150968, 'lazy susan'),
 (16.729496370708656, 'tic tac'),
 (16.577493277263606, 'guinea pig'),
 (16.36692629132395, 'mod podge'),
 (16.092066450093366, 'girl scout'),
 (15.99253077654245, 'melon baller'),
 (15.869674028756917, 'raspberry cloud')]

# 7/8/9. Use SpaCy to lemmatize and tag the sentences in the corpus. Using the tagged corpus compute bigram statistic for the tokens containing: a. lemmatized, downcased word b. morphosyntactic category of the word (subst, fin, adj, etc.)

In [69]:
unigram_counting = {}
bigram_counting = {}

In [70]:
for doc in tqdm(tokenizer.pipe(corpus), total=len(corpus)):
    last_text = None
    for token in doc:
        text = f"{token.lemma_}:{token.tag_}"
        unigram_counting[text] = unigram_counting.get(text, 0) + 1

        if last_text is not None:
            bigram = f"{last_text} {text}"
            bigram_counting[bigram] = bigram_counting.get(bigram, 0) + 1

        last_text = token

  0%|          | 0/48339 [00:00<?, ?it/s]