In [1]:
import itertools

import pandas as pd
from nltk import ngrams
from nltk.metrics.distance import edit_distance
from tqdm.auto import tqdm

In [2]:
def preprocess(word):
    """Normalize a word before n-gram and edit-distance comparison.

    Three steps are applied in order:

    1. the digraphs "rz" and "ch" are folded to "ż" and "h";
    2. the letters ó, ę, ą, ć, ź, ż are replaced by o, e, a, c, z, z;
    3. every run of identical adjacent characters is collapsed to one.

    NOTE(review): ł, ń and ś are not in the replacement table and pass
    through unchanged - confirm this is intended.

    Args:
        word: the string to normalize.

    Returns:
        The normalized string (empty input gives an empty string).
    """
    digraphs = (("rz", "ż"), ("ch", "h"))
    diacritics = str.maketrans("óęąćźż", "oeaczz")

    folded = word
    for pair, single in digraphs:
        folded = folded.replace(pair, single)

    plain = folded.translate(diacritics)

    # groupby yields one (key, run) pair per run of equal characters, so
    # keeping only the keys removes adjacent duplicates.
    return "".join(letter for letter, _run in itertools.groupby(plain))

In [3]:
# Parse the dictionary file once and reuse the frame for both columns
# (previously the same CSV was read and parsed twice).
_WORDS_FRAME = pd.read_csv('../List_3/data/words.txt', sep=";", header=None)

# read_csv represents empty fields as NaN (a float); preprocess() calls len()
# on its argument, so non-string entries are dropped before they reach it.
WORDS = set(_WORDS_FRAME[0].dropna())      # column 0: primary dictionary
WORDS_EXT = set(_WORDS_FRAME[1].dropna())  # column 1: extended dictionary

In [4]:
def _bigram_index(vocabulary):
    """Map each word of `vocabulary` to the set of character bigrams of its preprocessed form."""
    index = {}
    for entry in vocabulary:
        normalized = preprocess(entry)
        index[entry] = {"".join(pair) for pair in ngrams(normalized, 2)}
    return index


# Bigram look-up tables for the primary and the extended dictionary.
N_GRAMS = _bigram_index(WORDS)
N_GRAMS_EXT = _bigram_index(WORDS_EXT)

In [5]:
# Evaluation set: each row is read as two space-separated fields, named
# 'correct' (expected word) and 'error' (misspelled form).
ERRORS = pd.read_csv(
    '../List_3/data/literowki1.txt',
    sep=" ",
    header=None,
    names=['correct', 'error'],
)

In [6]:
def _closest_candidates(processed_word, word_ngrams, ngram_index, overlap_ratio):
    """Return (words, distance): the entries of `ngram_index` nearest to `processed_word`.

    Only entries sharing more than `overlap_ratio * len(word_ngrams)` bigrams
    with the query are scored. `words` holds every entry tied at the minimum
    edit distance, in index iteration order; when nothing passes the bigram
    filter the result is ([], inf).
    """
    min_shared = overlap_ratio * len(word_ngrams)
    smallest = float("inf")
    closest = []
    for candidate, candidate_ngrams in ngram_index.items():
        if len(word_ngrams & candidate_ngrams) <= min_shared:
            continue
        distance = edit_distance(preprocess(candidate), processed_word, transpositions=True)
        if distance < smallest:
            smallest, closest = distance, [candidate]
        elif distance == smallest:
            closest.append(candidate)
    return closest, smallest


def correct_error(word, overlap_ratio=0.30, max_distance=3):
    """Suggest dictionary words for a possibly misspelled `word`.

    A word already present in WORDS is returned as-is. Otherwise candidates
    from N_GRAMS are pre-filtered by bigram overlap and ranked by edit
    distance (with transpositions) between preprocessed forms. If the best
    primary match is farther than `max_distance`, the search is repeated on
    N_GRAMS_EXT.

    Args:
        word: the word to correct.
        overlap_ratio: a candidate must share more than this fraction of the
            query's bigrams to be scored.
        max_distance: largest edit distance accepted from the primary
            dictionary before falling back to the extended one.

    Returns:
        A list of the candidates tied at the minimum edit distance; empty
        only when neither dictionary has a candidate passing the bigram
        filter.
    """
    if word in WORDS:
        return [word]

    processed_word = preprocess(word)
    word_ngrams = set(map("".join, ngrams(processed_word, 2)))

    best_word, min_editdist = _closest_candidates(
        processed_word, word_ngrams, N_GRAMS, overlap_ratio
    )
    if min_editdist > max_distance:
        ext_best, _ = _closest_candidates(
            processed_word, word_ngrams, N_GRAMS_EXT, overlap_ratio
        )
        # Keep the (distant) primary matches when the extended dictionary has
        # nothing to offer: previously they were thrown away and an empty
        # list was returned, which callers indexing the result cannot handle.
        if ext_best:
            best_word = ext_best
    return best_word

In [9]:
def _edge_score(candidate, target, positions):
    """Score how well two strings agree at `positions`, tolerating one mismatch.

    A matching position is worth 2 points before the first mismatch and 1
    point after it; the first mismatch itself is worth 1 point; scanning
    stops at the second mismatch.
    """
    score = 0
    bonus = 1  # 1 until the first mismatch has been seen, 0 afterwards
    for pos in positions:
        if candidate[pos] != target[pos]:
            if not bonus:
                break
            bonus = 0
        score += 1 + bonus
    return score


def find_from_best(w, best):
    """Rank candidate corrections for `w` by agreement at the word edges.

    Each candidate is compared with `w` (both preprocessed) on up to the
    first four and the last three characters, and is penalized by twice the
    difference in preprocessed length.

    Args:
        w: the misspelled word.
        best: iterable of candidate words.

    Returns:
        (top, ranking): the highest-scoring candidate and all candidates
        sorted by descending score (ties keep their input order). When `best`
        is empty there is nothing to rank, so `w` itself is returned together
        with an empty ranking instead of raising IndexError.
    """
    if not best:
        return w, []

    target = preprocess(w)
    scores = {}
    for candidate in best:
        processed = preprocess(candidate)
        # Bounding the scans by the overlap also makes empty strings safe.
        overlap = min(len(processed), len(target))
        score = _edge_score(processed, target, range(min(overlap, 4)))
        score += _edge_score(processed, target, range(-1, -min(overlap, 3) - 1, -1))
        scores[candidate] = score - 2 * abs(len(processed) - len(target))

    ranking = sorted(scores, key=scores.get, reverse=True)
    return ranking[0], ranking

In [8]:
# Evaluate the corrector on the misspelling set, walking it in reverse order.
acc = 0
rows = ERRORS[::-1].itertuples(index=False, name=None)
# total= is required for a real progress bar: the row iterator has no length,
# so without it tqdm can only show a bare iteration counter.
for r, (correct, error) in enumerate(tqdm(rows, total=len(ERRORS)), start=1):
    best = correct_error(error)
    corrected, scores = find_from_best(error, best)
    if corrected == correct:
        acc += 1
    else:
        # Show the miss: chosen word, expected word, input, then the ranking.
        print(corrected, correct, error)
        print(scores)
    print(acc / r)  # running accuracy over the rows seen so far
print(100 * acc/len(ERRORS))  # final accuracy in percent

0it [00:00, ?it/s]

1.0
1.0
pospuszczać podpuszczać posdupszcać
['pospuszczać', 'poduszczać']
0.6666666666666666
poduszczać podpuszczać podpszżczać
['poduszczać', 'podpuszczać']
0.5
0.6
0.6666666666666666
0.7142857142857143
0.75
0.7777777777777778
0.8
wydatkowego wyjątkowego wyatkowrego
['wydatkowego', 'wyjątkowego', 'wątkowego']
0.7272727272727273
0.75
0.7692307692307693
0.7857142857142857
0.8
0.8125
0.8235294117647058
0.8333333333333334
0.8421052631578947
0.85
0.8571428571428571
kontrreakcja komunikacja konuniakcaja
['kontrreakcja', 'kontrreakcją', 'koniunkcja', 'kontrakcja', 'kontrakcją', 'koniunkcją']
0.8181818181818182
0.8260869565217391
0.8333333333333334
komunizacja komunikacja komunijajca
['komunizacja', 'komunikacja', 'komunizacją', 'komunikacją']
0.8
kominiarska komunikacja kominiakaca
['kominiarska', 'kominiarską', 'komunikacja', 'komiśniaka', 'komunikacją', 'kominiarscy', 'kominiarka', 'komiśniakach', 'kominiarkach', 'kominiarzami', 'kominiarką', 'kominiarza', 'kominiarce', 'kominiarzach', 'ko