In [1]:
import re
import numpy as np

In [2]:
def regex(text):
    """Split ``text`` into lowercase tokens with punctuation removed.

    Every character that is neither a word character nor whitespace is
    deleted (not replaced by a space), so punctuation inside a token joins
    its two halves. The remainder is lowercased and split on whitespace.

    Args:
        text: The string to tokenize.

    Returns:
        A list of lowercase tokens; empty if ``text`` contains no word
        characters.
    """
    # Delete anything that is not a word character or whitespace.
    without_punctuation = re.sub(r'[^\w\s]', '', text)
    normalized = without_punctuation.rstrip().lower()
    # split() with no arguments splits on runs of whitespace.
    return normalized.split()

In [3]:
# Map each normalized corpus token to the number of times it occurs.
unigram_dict = {}

with open('polish_corpora.txt', 'r', encoding='utf8') as corpus:
    for raw_line in corpus:
        for token in regex(raw_line):
            # get() yields 0 the first time a token is seen.
            unigram_dict[token] = unigram_dict.get(token, 0) + 1

In [6]:
# Translation table mapping every Polish diacritic letter (both cases) to its
# plain ASCII base letter. Built once here rather than on each call, because
# remove_pol_chars() is invoked for every token of the corpus.
_POLISH_TO_ASCII = str.maketrans('ąćęłńóśźżĄĆĘŁŃÓŚŹŻ', 'acelnoszzACELNOSZZ')


def remove_pol_chars(text):
    """Replace Polish diacritic letters in ``text`` with ASCII base letters.

    Both ``ź`` and ``ż`` map to ``z`` (and ``Ź``/``Ż`` to ``Z``); all other
    characters are left unchanged.

    Args:
        text: The string to fold.

    Returns:
        A new string of the same length with the diacritics replaced.
    """
    return text.translate(_POLISH_TO_ASCII)

In [11]:
# Map each ASCII-folded token to the distinct original spellings seen in the
# corpus, in order of first appearance.
word_dict = {}

with open('polish_corpora.txt', 'r', encoding='utf8') as corpus:
    for raw_line in corpus:
        for token in regex(raw_line):
            # setdefault() creates the empty list on the first sighting of a key.
            spellings = word_dict.setdefault(remove_pol_chars(token), [])
            if token not in spellings:
                spellings.append(token)

In [214]:
def reconstruct(text, variants=None, counts=None):
    """Restore diacritics in a sequence of ASCII-folded tokens.

    Each token that is a key of ``variants`` is replaced by whichever of its
    candidate spellings has the highest count in ``counts``; on a tie the
    candidate listed first wins. Tokens that are not keys are kept as they
    are. The first output token is passed through ``str.capitalize()``, which
    upper-cases its first character and lower-cases the rest.

    Args:
        text: Iterable of tokens to restore.
        variants: Mapping from a folded token to a non-empty list of candidate
            spellings. Defaults to the notebook-level ``word_dict``.
        counts: Mapping from a spelling to its frequency. Defaults to the
            notebook-level ``unigram_dict``.

    Returns:
        The restored tokens joined by single spaces, with surrounding
        whitespace stripped; an empty string for empty input.

    Raises:
        KeyError: If a candidate spelling is missing from ``counts``.
    """
    # Fall back to the notebook-level tables so existing one-argument calls
    # keep working; passing them explicitly avoids relying on kernel state.
    if variants is None:
        variants = word_dict
    if counts is None:
        counts = unigram_dict

    restored = []
    for token in text:
        if token in variants:
            # max() returns the first maximal candidate, matching the
            # first-index-of-maximum choice made on ties.
            restored.append(max(variants[token], key=lambda form: counts[form]))
        else:
            restored.append(token)

    if restored:
        restored[0] = restored[0].capitalize()
    return ' '.join(restored).strip()

In [215]:
def measure(t1, t2):
    """Return the fraction of items in ``t1`` matched position-wise by ``t2``.

    Items are compared pairwise with ``zip``, so positions beyond the shorter
    sequence never match; the denominator is always ``len(t1)``.

    Args:
        t1: Reference sequence (must support ``len``).
        t2: Candidate sequence compared against ``t1``.

    Returns:
        A float in ``[0, 1]``. When ``t1`` is empty the result is ``1.0`` if
        ``t2`` is empty too and ``0.0`` otherwise, instead of dividing by zero.
    """
    if len(t1) == 0:
        # Nothing to get wrong: perfect only if the candidate is empty as well.
        return 1.0 if len(t2) == 0 else 0.0
    matches = sum(1 for expected, actual in zip(t1, t2) if expected == actual)
    return matches / len(t1)

In [225]:
# Score the reconstruction on at most the first N lines of the corpus.
# Per line: fold the lowercased tokens to ASCII, reconstruct them, and take the
# geometric mean of the case-insensitive and case-sensitive token accuracies.
# NOTE(review): unigram_dict and word_dict are built from this same file, so
# this score is measured on text the lookup tables have already seen.
N = 1000000
results = []
with open('polish_corpora.txt', 'r', encoding='utf8') as f:
    for line_no, line in enumerate(f):
        if line_no >= N:
            break
        full_acc = line.rstrip().split()
        if not full_acc:
            # Blank line: there are no tokens to score.
            continue
        polish_acc = line.rstrip().lower().split()
        trans = [remove_pol_chars(x) for x in polish_acc]
        result = reconstruct(trans).split()
        low_result = [x.lower() for x in result]
        results.append(np.sqrt(measure(polish_acc, low_result) * measure(full_acc, result)))
print(np.mean(results))

0.93445759798372


In [223]:
# Pre-tokenized, ASCII-folded sample sentence used to try out reconstruct().
example = [
    'wymyslilem',
    'zatrwazajaco',
    'interesujacy',
    'przyklad',
    'tekstu',
    'wejsciowego',
    '!',
]

In [224]:
# Show the sample sentence with its diacritics restored.
print(reconstruct(text=example))

Wymyśliłem zatrważająco interesujący przykład tekstu wejściowego !
