In [23]:
import re
import textdistance
from collections import Counter
from nltk import sent_tokenize
from string import punctuation
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, cosine_distances


punctuation += "«»—…“”"
punct = set(punctuation)

In [2]:
def normalize(text):
    normalized_text = [
        (word.strip(punctuation)) \
        for word in text.lower().split()
    ]
    normalized_text = [word for word in normalized_text if word]
    return normalized_text

In [3]:
corpus = []
for text in open('correct_sents.txt').read().splitlines():
    sents = sent_tokenize(text)
    norm_sents = [normalize(sent) for sent in sents]
    corpus += norm_sents

In [4]:
WORDS = Counter()
for sent in corpus:
    WORDS.update(sent)

In [5]:
WORDS.most_common(10)

[('и', 338),
 ('в', 292),
 ('не', 207),
 ('на', 189),
 ('что', 146),
 ('с', 118),
 ('а', 111),
 ('я', 102),
 ('очень', 68),
 ('все', 67)]

In [6]:
N = sum(WORDS.values())
def P(word, N=N):
    return WORDS[word] / N

In [7]:
def one_del1(word):
    letters = 'йцукенгшщзхъфывапролджэячсмитьбюё'
    splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
    deletes = [L + R[1:] for L, R in splits if R]
    return set(deletes)

def two_del2(word): 
    return (e2 for e1 in one_del1(word) for e2 in one_del1(e1))

In [8]:
deletions_vocab = {
    1: {(tuple(one_del1(d)), d) for d in WORDS},
    2: {(tuple(two_del2(d)), d) for d in WORDS},
}

In [9]:
def correction(word):
    return max(candidates(word), key=P)

def candidates(word): 
    sure = known([word])
    not_sure = compare_w_deletions(one_del1(word), deletions_vocab[1])
    maybe = compare_w_deletions(two_del2(word), deletions_vocab[2])
    no_idea = [word]
    return sure or not_sure or maybe or no_idea

def compare_w_deletions(word_options, vocab):
    candidates = []
    for word in word_options:
        for possible, correct in vocab:
            if word in possible:
                candidates.append(correct)
    return candidates

def known(words): 
    return set(w for w in words if w in WORDS)

In [10]:
correction('питон')

'потом'

In [11]:
true = open('correct_sents.txt', encoding='utf8').read().splitlines()
bad = open('sents_with_mistakes.txt', encoding='utf8').read().splitlines()

def align_words(sent_1, sent_2):
    tokens_1 = sent_1.lower().split()
    tokens_2 = sent_2.lower().split()
    
    tokens_1 = [re.sub('(^\W+|\W+$)', '', token) for token in tokens_1 if (set(token)-punct)]
    tokens_2 = [re.sub('(^\W+|\W+$)', '', token) for token in tokens_2 if (set(token)-punct)]
    
    return list(zip(tokens_1, tokens_2))

In [12]:
correct = 0
total = 0

total_mistaken = 0
mistaken_fixed = 0

total_correct = 0
correct_broken = 0

cashed = {}
for i in range(len(true)):
    word_pairs = align_words(true[i], bad[i])
    for pair in word_pairs:
        predicted = cashed.get(pair[1], correction(pair[1]))
        cashed[pair[0]] = predicted
        if predicted == pair[0]:
            correct += 1
        total += 1
        
        if pair[0] == pair[1]:
            total_correct += 1
            if pair[0] !=  predicted:
                correct_broken += 1
        else:
            total_mistaken += 1
            if pair[0] == predicted:
                mistaken_fixed += 1
        
    if not i % 100:
        print(i)

0
100


KeyboardInterrupt: 

In [13]:
# Работает безумно долго, не досчиталось
print(correct/total)
print(mistaken_fixed/total_mistaken)
print(correct_broken/total_correct)

0.8323353293413174
0.3132075471698113
0.0885566417481311


In [15]:
corpus_news = [['<start>', '<start>'] + sent + ['<end>'] for sent in corpus]
def ngrammer(tokens, n=2):
    ngrams = []
    for i in range(0,len(tokens)-n+1):
        ngrams.append(' '.join(tokens[i:i+n]))
    return ngrams

unigrams = Counter()
bigrams = Counter()
trigrams = Counter()

for sentence in corpus_news:
    unigrams.update(sentence)
    bigrams.update(ngrammer(sentence))
    trigrams.update(ngrammer(sentence), n=3)

In [31]:
def get_closest_hybrid_match(text, X, vec, topn=5, metric=textdistance.damerau_levenshtein):
    candidates = get_closest_match_vec(text, X, vec, topn*4)
    sims = Counter()
    lookup = [cand[0] for cand in candidates]
    closest = get_closest_match_with_metric(text, lookup,topn, metric=metric)

    
    return closest

def get_closest_match_vec(text, X, vec, topn=20):
    # превращаем слово в вектор такой же размерности
    v = vec.transform([text])
    
    # вся эффективноть берется из того, что мы сразу считаем близость 
    # 1 вектора ко всей матрице (словам в словаре)
    # считать по отдельности циклом было бы дольше
    # вместо одного вектора может даже целая матрица
    # тогда считаться в итоге будет ещё быстрее
    
    similarities = cosine_distances(v, X)[0] #distance - чем больше, тем хуже, а similarity наоборот
    topn = similarities.argsort()[:topn] 
    
    return [(id2word[top], similarities[top]) for top in topn]

def get_closest_match_with_metric(text, lookup,topn=20, metric=textdistance.levenshtein):
    # Counter можно использовать и с не целыми числами
    similarities = Counter()
    
    for word in lookup:
        similarities[word] = metric.normalized_similarity(text, word) 
    
    return similarities.most_common(topn)

In [33]:
vocab = list(WORDS.keys())
id2word = {i:word for i, word in enumerate(vocab)}

vec = CountVectorizer(analyzer='char', ngram_range=(1,1), min_df=10)
X = vec.fit_transform(vocab)

In [34]:
mistakes = []
total_mistaken = 0
mistaken_fixed = 0

total_correct = 0
correct_broken = 0

total = 0
correct = 0



for i in range(len(true)):
    word_pairs = align_words(true[i], bad[i])
    
    word_pairs = [('<start>', '<start>')] + word_pairs
    pred_sent = []
    for j in range(1, len(word_pairs)):
        
        pred = None
        predicted = get_closest_hybrid_match(word_pairs[j][1], X, vec)
        
        
        prev_word = word_pairs[j-1][1]
        
        
        if prev_word not in unigrams:
            pred = predicted[0][0]
            
        
        else:
            
            lm_predicted = []
            for word, m in predicted:
                bigram = ' '.join([prev_word, word])
                # домножаем полученную метрику для слова на вероятность биграма
                # биграм - предыдущее слово + текущее слово кандидат
                lm_predicted.append((word, (m)*(1+(bigrams[bigram]/unigrams[prev_word]))))
            if lm_predicted:
                
                pred = sorted(lm_predicted, key=lambda x: -x[1])[0][0]
            
        
        if pred is None:
            pred = word_pairs[j][1]
        

        
        if pred == word_pairs[j][0]:
            correct += 1
        else:
            mistakes.append((word_pairs[j][0], word_pairs[j][1], pred))
        total += 1
            
        if word_pairs[j][0] == word_pairs[j][1]:
            total_correct += 1
            if word_pairs[j][0] !=  pred:
                correct_broken += 1
        else:
            total_mistaken += 1
            if word_pairs[j][0] == pred:
                mistaken_fixed += 1
    
    if not i % 50:
        print(i)
        print(correct/total)

0
1.0
50
0.9677938808373591
100
0.9731182795698925
150
0.9740177439797212
200
0.9730639730639731
250
0.9779654792508263
300
0.9768629807692307
350
0.9761842449620518
400
0.9781953143122245
450
0.9795833333333334
500
0.9782689450222882
550
0.9796334012219959
600
0.9808789514263685
650
0.981043994907342
700
0.9791338582677165
750
0.9772783099975436
800
0.9764624437521634
850
0.9750134625740442
900
0.9757261832216129


In [35]:
print(correct/total)
print(mistaken_fixed/total_mistaken)
print(correct_broken/total_correct)

0.9756243756243757
0.8227168073676132
0.001493051567704146
