In [1]:
import sys
import os
import re
import time
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm 
from collections import OrderedDict 
from operator import itemgetter
from collections import Counter
import itertools
from typing import *
from scipy.stats import mode
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

%config IPCompleter.greedy=True

# Load data

In [2]:
polish_corpora_path = '../../../polish_corpora.txt'
poleval2_path = '../../../poleval_2grams.txt'
poleval3_path = '../../../poleval_3grams.txt'
supertags_path = 'supertags.txt'

In [4]:
unigrams: Mapping[bytes, int] = {}  # word -> number of occurrences

with open(polish_corpora_path, encoding="utf8") as f:
    for line in tqdm(f, desc='Loading data...', position=0, leave=True, total=23011601):
        line = line.strip().lower().split()
        for word in line:
            _word = bytes(bytearray(word, 'UTF-8'))
            if _word in unigrams:
                unigrams[_word] += 1
            else:
                unigrams[_word] = 1

Loading data...:  67%|██████▋   | 15343657/23011601 [08:51<04:25, 28850.95it/s]


KeyboardInterrupt: 

In [3]:
bigrams: Mapping[Tuple[bytes, bytes], int] = {}
    
with open(poleval2_path, encoding="utf8") as f:
    for line in tqdm(f, desc='Loading data...', position=0, leave=True, total=59134224):
        line = line.strip().lower().split()
        
        key: Tuple[bytes, bytes] = (bytes(bytearray(line[1], 'UTF-8')), bytes(bytearray(line[2], 'UTF-8')))
        value: int =  int(line[0])
            
        if key in bigrams:
            bigrams[key].append(value)
        else:
            bigrams[key] = [value]

Loading data...: 100%|██████████| 59134224/59134224 [07:08<00:00, 138036.14it/s]


In [None]:
trigrams: Mapping[Tuple[bytes, bytes], Tuple[bytes, int]] = {} 
total_iters, skipped1, skipped2 = 0, 0, 0

with open(poleval3_path, encoding="utf8") as f:
    for line in tqdm(f, desc='Loading data...', position=0, leave=True):
        total_iters += 1
        line = line.strip().lower().split()

        if len(line) != 4:
            skipped1 += 1
            continue
            
        if int(line[0]) == 1:
            skipped2 += 1
            continue
            
        key: Tuple[bytes, bytes] = (bytes(bytearray(line[1], 'UTF-8')),  bytes(bytearray(line[2], 'UTF-8')))
        value: Tuple[bytes, int] = (bytes(bytearray(line[3], 'UTF-8')), int(line[0]))
 
        if key in trigrams:
            trigrams[key].append(value)
        else:
            trigrams[key] = [value]

In [5]:
%%time
bigram_tags: Mapping[bytes, Mapping[bytes, int]] = {} 

for k, v in tqdm(bigrams.items()):
    if k[0] not in bigram_tags:
        bigram_tags[k[0]] = {k[1]: v}
    else:
        bigram_tags[k[0]][k[1]] = v

100%|██████████| 59134224/59134224 [02:09<00:00, 457620.29it/s] 

CPU times: user 2min 1s, sys: 2.92 s, total: 2min 4s
Wall time: 2min 9s





In [6]:
%%time
tags: Mapping[bytes, bytes] = {} 

with open(supertags_path, encoding='UTF-8') as f:
    for line in tqdm(f, position=0, leave=True, total=1781995):
        word, tag = line.strip().split(' ')
        tags[bytes(bytearray(word, 'UTF-8'))] = bytes(bytearray(tag, 'UTF-8'))

100%|█████████▉| 1781994/1781995 [00:07<00:00, 234885.60it/s]

CPU times: user 6.77 s, sys: 312 ms, total: 7.08 s
Wall time: 7.59 s





# Task 2
![alt text](images/task2.png "Title")

In [7]:
sentences = []
with open('sentences.txt', encoding='utf8') as f:
    for line in f:
        line = line.strip().lower().split()
        sentences.append(line)
        
print(len(sentences))

68


In [8]:
def simple_pbb_bigrams(w1: bytes, w2: bytes) -> int:
    """ Calculate number of occurrences of bigram (w1, w2) """
    assert type(w1) == bytes and type(w2) == bytes, f'Given type: {type(w1)} | {type(w2)}'
    return bigrams[(w1, w2)][0] if (w1, w2) in bigrams else 0


def simple_score(sentence: List[bytes], cond_pbb_func) -> float:
    pbb: float = 0.0
    n: int = len(sentence)
    for i in range(1, n):
        value = cond_pbb_func(w1=sentence[i - 1], w2=sentence[i]) + 1
        pbb += np.log(value)
    return pbb


def simple_score_with_tags(sentence: List[bytes], cond_pbb_func, alpha=0.1) -> float:
    pbb: float = 0.0
    n: int = len(sentence)
    for i in range(1, n):
        value = cond_pbb_func(w1=sentence[i - 1], w2=sentence[i]) + 1
        value_tag = 1
        
        if sentence[i - 1] in tags and sentence[i] in tags:
            t0, t1 = tags[sentence[i - 1]], tags[sentence[i]]
            if t0 in bigram_tags and t1 in bigram_tags[t0]:
                value_tag = bigram_tags[t0][t1]
                
        pbb += np.log(alpha * value + (1 - alpha) * value_tag)
    return pbb

In [9]:
def generate_permutations(sentence: List[bytes]) -> list:
    return list(itertools.permutations(sentence))

def order_permutations(permutations: List[bytes], scoring_function, cond_pbb_func) -> List[bytes]:
    scores = [scoring_function(x, cond_pbb_func) for x in permutations]
    return sorted(zip(scores, permutations), reverse=True)

def score_permuatations(valid_sentence: List[bytes], scoring_function, cond_pbb_func) -> Tuple[float, str]:
    permutations: List[bytes] = generate_permutations(sentence=valid_sentence)
    permutations = order_permutations(
        permutations=permutations, scoring_function=scoring_function, cond_pbb_func=cond_pbb_func
    )
    permutations = [y for x, y in permutations]
    best_decoded = list(map(lambda x: x.decode('utf8'), permutations[0]))
    for i, sentence in enumerate(permutations):
        if tuple(sentence) == tuple(valid_sentence):
            return 1 / (i + 1), best_decoded
    
    return 0.0, best_decoded

def final_score(cond_pbb_func, scoring_function) -> Tuple[float, int]:
    total_score: float = 0.0
    n_skipped: int = 0
    perfect_score: int = 0
    for i, sentence in enumerate(sentences):
        time.sleep(1)
        if len(sentence) > 7:
            n_skipped += 1
            print(f'{i+1} / {len(sentences)} | {sentence} ===> ??? / 1.0')
            print('Skipped because sentence is too long.')
            continue
        sentence_bytes = list(map(lambda x: bytes(bytearray(x, encoding='UTF-8')), sentence))
        score, best_sentence = score_permuatations(
            valid_sentence=sentence_bytes, 
            scoring_function=scoring_function,
            cond_pbb_func=cond_pbb_func
        )
        total_score += score
        if score == 1:
            perfect_score += 1
        print(f'{i+1} / {len(sentences)} | {sentence} ===> {score:.3f} / 1.0')
        print('My best: ', *best_sentence)
        
    return total_score / (len(sentences) - n_skipped), perfect_score

### Only bigrams

In [10]:
%%time
f_score = final_score(
    cond_pbb_func=simple_pbb_bigrams, 
    scoring_function=simple_score
)
print(f'Final score: {f_score}')

1 / 68 | ['judyta', 'dała', 'wczoraj', 'stefanowi', 'czekoladki'] ===> 0.111 / 1.0
My best:  stefanowi judyta dała wczoraj czekoladki
2 / 68 | ['babuleńka', 'miała', 'dwa', 'rogate', 'koziołki'] ===> 0.091 / 1.0
My best:  rogate miała dwa koziołki babuleńka
3 / 68 | ['wczoraj', 'wieczorem', 'spotkałem', 'pewną', 'piękną', 'kobietę'] ===> 0.250 / 1.0
My best:  spotkałem wczoraj wieczorem pewną piękną kobietę
4 / 68 | ['nasz', 'przedmiot', 'to', 'przetwarzanie', 'języka', 'naturalnego'] ===> 0.020 / 1.0
My best:  przetwarzanie języka naturalnego to nasz przedmiot
5 / 68 | ['jedną', 'grupę', 'prowadzi', 'jakub', 'michaliszyn'] ===> 0.091 / 1.0
My best:  prowadzi jedną grupę michaliszyn jakub
6 / 68 | ['wykładowcą', 'jest', 'paweł', 'rychlikowski'] ===> 0.333 / 1.0
My best:  rychlikowski paweł jest wykładowcą
7 / 68 | ['pan', 'paweł', 'prowadzi', 'również', 'dwie', 'pozostałe', 'grupy'] ===> 0.050 / 1.0
My best:  pozostałe dwie grupy prowadzi również pan paweł
8 / 68 | ['na', 'stronie', 'w

### Bigrams + tags

In [30]:
%%time
f_score = final_score(
    cond_pbb_func=simple_pbb_bigrams, 
    scoring_function=lambda x, y: simple_score_with_tags(x, y, alpha=0.8)
)
print(f'Final score: {f_score}')

1 / 68 | ['judyta', 'dała', 'wczoraj', 'stefanowi', 'czekoladki'] ===> 0.111 / 1.0
My best:  stefanowi judyta dała wczoraj czekoladki
2 / 68 | ['babuleńka', 'miała', 'dwa', 'rogate', 'koziołki'] ===> 0.091 / 1.0
My best:  rogate miała dwa koziołki babuleńka
3 / 68 | ['wczoraj', 'wieczorem', 'spotkałem', 'pewną', 'piękną', 'kobietę'] ===> 0.333 / 1.0
My best:  spotkałem wczoraj wieczorem pewną piękną kobietę
4 / 68 | ['nasz', 'przedmiot', 'to', 'przetwarzanie', 'języka', 'naturalnego'] ===> 0.018 / 1.0
My best:  przetwarzanie języka naturalnego to nasz przedmiot
5 / 68 | ['jedną', 'grupę', 'prowadzi', 'jakub', 'michaliszyn'] ===> 0.091 / 1.0
My best:  prowadzi jedną grupę michaliszyn jakub
6 / 68 | ['wykładowcą', 'jest', 'paweł', 'rychlikowski'] ===> 0.333 / 1.0
My best:  rychlikowski paweł jest wykładowcą
7 / 68 | ['pan', 'paweł', 'prowadzi', 'również', 'dwie', 'pozostałe', 'grupy'] ===> 0.053 / 1.0
My best:  pozostałe dwie grupy prowadzi również pan paweł
8 / 68 | ['na', 'stronie', 'w

# Task 3
![alt text](images/task3.png "Title")


In [14]:
%%time

word_to_tag, sufix_to_tag, tag_to_word = {}, {}, {}

with open('./supertags.txt','r',encoding='UTF-8') as f:
    for line in f:
        word, tag = line.strip().split(' ')
        word_to_tag[word] = tag
        
        if word[-3 :] in sufix_to_tag:
            sufix_to_tag[word[-3 :]].append(tag)
        else:
            sufix_to_tag[word[-3 :]] = [tag]
            
        if tag in tag_to_word:
            tag_to_word[tag].append(word)
        else:
            tag_to_word[tag] = [word]

            
for x in sufix_to_tag.keys():
    sufix_to_tag[x] = max(set(sufix_to_tag[x]), key=sufix_to_tag[x].count) 
    
len(word_to_tag), len(sufix_to_tag), len(tag_to_word)

CPU times: user 9.1 s, sys: 212 ms, total: 9.31 s
Wall time: 9.32 s


(1781994, 5824, 4034)

In [15]:
def generate_sentence(base_sentence: str) -> str:
    generated_sentence = ''

    for i, w in enumerate(base_sentence.lower().split(' ')):
        if w in word_to_tag:
            next_words = tag_to_word[
                word_to_tag[w]
            ]
        elif w[-3:] in sufix_to_tag:
            next_words = tag_to_word[
                sufix_to_tag[w[-3 :]]
            ]
        else:
            next_words = tag_to_word[
                np.random.choice(list(tag_to_word.keys()))
            ]
        next_word = np.random.choice(next_words)
        if i == 0:
            next_word = next_word.capitalize()
        generated_sentence += next_word + ' '
        
    return generated_sentence.strip() + '.'

In [4]:
generate_sentence(
    base_sentence='Mały Piotruś spotkał w niewielkiej restauracyjce wczoraj poznaną koleżankę.'
)

'Formalistyczny refrenista zaśnieżył pod niewybaczalnej rundzie pilno zmilitaryzowaną handicapa.'

In [5]:
generate_sentence(
    base_sentence='Gruby Stefan przeczytał we wczorajszej gazecie starannie przygotowaną analizę.'
)

'Fermowy legniczanin przystrzelał bazgrała przyjemniejszej calówce wodno pozalepianą zastaliby.'

In [27]:
generate_sentence(
    base_sentence='Zgodność gramatyczną sprawdzamy za pomocą tagów z pliku supertags.'
)

'Artystyczność ichtiolową gawrujemy za egerią kolejków z spisku popadło.'

# Task 4

In [137]:
def get_bigram_value(w1: str, w2: str):
    w1, w2 = bytes(bytearray(w1, 'UTF-8')), bytes(bytearray(w2, 'UTF-8'))
    if (w1, w2) in bigrams:
        return bigrams[(w1, w2)][0]
    return 0

def generate_sentence_bigrams(base_sentence: str) -> str:
    generated_sentence = [' ']

    for i, w in enumerate(base_sentence.lower().split(' ')):
        if w in word_to_tag:
            next_words = tag_to_word[
                word_to_tag[w]
            ]
        elif w[-3:] in sufix_to_tag:
            next_words = tag_to_word[
                sufix_to_tag[w[-3 :]]
            ]
        else:
            next_words = tag_to_word[
                np.random.choice(list(tag_to_word.keys()))
            ]
            
        pbb = np.array([get_bigram_value(w1=generated_sentence[-1], w2=w) for w in next_words])
        if pbb.sum() > 0:
            pbb = pbb / pbb.sum()
        else:
            pbb = np.ones(pbb.shape) / len(pbb)
            generated_sentence.append(' | ')

        next_word = np.random.choice(next_words, p=pbb)
        generated_sentence.append(next_word)
        
    return ' '.join(generated_sentence).strip()

In [138]:
generate_sentence_bigrams(
    base_sentence='Mały Piotruś spotkał w niewielkiej restauracyjce wczoraj poznaną koleżankę.'
)

'|  drewnopochodny  |  władymir  |  zapieprzył  |  na niezbędnej ochronie szczególnie wyspecjalizowaną  |  vide'

# Task 5