In [1]:
import sys
import os
import re
import time
import pickle
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm 
from collections import OrderedDict 
from operator import itemgetter
from collections import Counter
import itertools
from typing import *
from scipy.stats import mode
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

%config IPCompleter.greedy=True

In [2]:
polish_corpora_path = '../../../polish_corpora.txt'
poleval2_path = '../../../poleval_2grams.txt'
poleval3_path = '../../../poleval_3grams.txt'
supertags_path = 'supertags.txt'

In [3]:
def regex(text):
    return re.sub(r'[^\w\s]', '', text).rstrip().lower().split()

In [4]:
unigrams: Mapping[bytes, int] = {}  # word -> number of occurrences

with open(polish_corpora_path, encoding="utf8") as f:
    for line in tqdm(f, desc='Loading data...', position=0, leave=True, total=23011601):
        for word in regex(text=line):
            _word = bytes(bytearray(word, 'UTF-8'))
            if _word in unigrams:
                unigrams[_word] += 1
            else:
                unigrams[_word] = 1

Loading data...: 100%|██████████| 23011601/23011601 [06:39<00:00, 57549.97it/s]


In [8]:
def change_polish_letters(word: str) -> str:
    assert type(word) == str, 'Wrong type!'
    polish = 'ąćęłńóśźż'
    tokenized = 'acelnoszz'
    translator = str.maketrans(polish, tokenized)
    return word.translate(translator)

In [14]:
word_to_tokenize: Mapping[bytes, Set[bytes]] = {}

with open(polish_corpora_path, encoding="utf8") as f:
    for line in tqdm(f, desc='Loading data...', position=0, leave=True, total=23011601):
        for word in regex(text=line):
            tokenized = change_polish_letters(word)
            _word = bytes(bytearray(word, 'UTF-8'))
            _tokenized = bytes(bytearray(tokenized, 'UTF-8'))
            if _tokenized in word_to_tokenize:
                word_to_tokenize[_tokenized].add(_word)
            else:
                word_to_tokenize[_tokenized] = set([_word])

Loading data...: 100%|██████████| 23011601/23011601 [17:48<00:00, 21537.92it/s]


In [65]:
def reconstruct_tokenized_text(text: str) -> str:
    """ Asserts that text is tokenized using:
            1. regex
            2. change_polish_letters
    """
    reconstructed = []
    for word in text.strip().split():
        word = bytes(bytearray(word, 'UTF-8'))
        if word in word_to_tokenize:
            possible_reconstructions: Set = word_to_tokenize[word]
            reconstructions_with_scores: List[Tuple[bytes, int]] \
                = [(x, unigrams[x]) for x in possible_reconstructions]
            best_word = max(reconstructions_with_scores, key=lambda x: x[1])
            reconstructed.append(best_word[0].decode("UTF-8") )
        else:
            reconstructed.append('?')
    reconstructed[0] = reconstructed[0].capitalize()
    reconstructed[-1] += '.'
    return ' '.join(reconstructed)

In [68]:
reconstruct_tokenized_text(text='moze bede robic cos smiesznego')

'Może będę robić coś śmiesznego.'