In [2]:
!pip install textdistance

Collecting textdistance
  Downloading https://files.pythonhosted.org/packages/90/19/0ec54a05e269537341b489da4e8a7b86e34b94e0096f67ecfcbab98cd3aa/textdistance-4.1.3-py3-none-any.whl
Installing collected packages: textdistance
Successfully installed textdistance-4.1.3


You are using pip version 10.0.1, however version 19.1.1 is available.
You should consider upgrading via the 'python -m pip install --upgrade pip' command.


In [55]:
import requests
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, cosine_distances
import textdistance
from collections import Counter
from string import punctuation
# Set of the characters in string.punctuation; align_words uses it to skip
# tokens that consist of nothing but these characters.
punct = set(punctuation)
import re

In [59]:
# Read the corpus file: every line is treated as one sentence and split on
# whitespace into tokens. The context manager closes the file handle as soon
# as the content has been read.
with open(r'D:\corpus_ng.txt', encoding='utf-8') as corpus_file:
    corpus = [sent.split() for sent in corpus_file.read().splitlines()]

# Token frequency table accumulated over all sentences of the corpus.
WORDS = Counter()
for sent in corpus:
    WORDS.update(sent)

In [60]:
# Vocabulary = distinct corpus tokens, in the order the Counter stores them.
vocab = list(WORDS)
# Position in `vocab` -> word, used to map matrix row indices back to words.
id2word = dict(enumerate(vocab))

# Character-level (single-character n-grams) TF-IDF representation of every
# vocabulary word; row i of X corresponds to vocab[i].
vec = TfidfVectorizer(analyzer='char', ngram_range=(1, 1))
X = vec.fit_transform(vocab)

In [61]:
def get_closest_hybrid_match(text, X, vec, TOPN=3, metric=textdistance.levenshtein,
                             index_to_word=None):
    """Return the best vocabulary candidate for ``text`` via a two-stage search.

    Stage 1 vectorizes ``text`` with ``vec`` and keeps the ``TOPN`` rows of
    ``X`` that have the smallest cosine distance to it. Stage 2 re-scores
    those candidates with ``metric.normalized_similarity`` and returns the
    highest-scoring one.

    Parameters
    ----------
    text : str
        Word to look up.
    X : matrix
        Vectorized vocabulary, one row per word, in the space produced by
        ``vec``.
    vec : fitted vectorizer
        Object whose ``transform`` maps a list of strings into the space
        of ``X``.
    TOPN : int, default 3
        Number of nearest rows of ``X`` handed to the second stage.
    metric : object, default ``textdistance.levenshtein``
        Must provide ``normalized_similarity(a, b)``.
    index_to_word : mapping, optional
        Maps a row index of ``X`` to its word. When omitted, the
        notebook-level ``id2word`` is used, which is only correct for the
        ``X`` built from the same vocabulary.

    Returns
    -------
    tuple
        ``(word, similarity)`` of the best candidate. If several candidates
        share the top similarity, the one ranked first by cosine distance
        is returned.

    Raises
    ------
    IndexError
        If there is no candidate to rank (for example ``TOPN=0``).
    """
    if index_to_word is None:
        # Backward-compatible fallback to the global lookup table.
        index_to_word = id2word

    query_vec = vec.transform([text])
    # cosine_distances returns distances (smaller = closer), shape (1, n_rows).
    distances = cosine_distances(query_vec, X)
    nearest_rows = distances.argsort()[0][:TOPN]
    candidates = [index_to_word[row] for row in nearest_rows]

    scored = [(word, metric.normalized_similarity(text, word)) for word in candidates]
    if not scored:
        raise IndexError('no candidate words to rank; TOPN must be at least 1')

    # max() keeps the first maximal element, i.e. the cosine-nearest on ties.
    return max(scored, key=lambda item: item[1])

In [31]:
# Spot-check the matcher on a single word.
get_closest_hybrid_match('опофеоз', X, vec, TOPN=3, metric=textdistance.levenshtein)

('апофеоз', 0.8571428571428572)

In [62]:
# Load the two sentence files as lists of lines; later cells pair them up by
# line index. Context managers make sure both file handles are closed.
with open(r'D:\correct_sents.txt', 'r', encoding='utf-8') as correct_file:
    correct = correct_file.read().splitlines()
with open(r'D:\sents_with_mistakes.txt', 'r', encoding='utf-8') as incorrect_file:
    incorrect = incorrect_file.read().splitlines()

In [63]:
def _normalize_tokens(sentence):
    """Lower-case ``sentence``, split it on whitespace and clean every token."""
    cleaned = []
    for token in sentence.lower().split():
        # Skip tokens made up solely of characters from `punct`.
        if not set(token) - punct:
            continue
        # Trim non-word characters from both ends of the token. The pattern is
        # a raw string so that `\W` is not parsed as a string escape.
        cleaned.append(re.sub(r'(^\W+|\W+$)', '', token))
    return cleaned


def align_words(sent_1, sent_2):
    """Pair up the cleaned tokens of two sentences position by position.

    Both sentences are lower-cased, split on whitespace, stripped of
    punctuation-only tokens and of leading/trailing non-word characters.

    Parameters
    ----------
    sent_1, sent_2 : str
        The sentences to align.

    Returns
    -------
    list of tuple
        ``(token_from_sent_1, token_from_sent_2)`` pairs.

    Notes
    -----
    NOTE(review): ``zip`` stops at the shorter token list, so when the two
    sentences contain a different number of tokens, every pair after the
    first extra or missing token is shifted and the tail is dropped.
    """
    return list(zip(_normalize_tokens(sent_1), _normalize_tokens(sent_2)))

In [66]:
# Walk the correct sentences together with the same-index sentence from
# `incorrect` and keep every token of the latter that differs from the token
# aligned with it.
mistakes = []
for idx, correct_sent in enumerate(correct):
    aligned = align_words(correct_sent, incorrect[idx])
    mistakes.extend(wrong for right, wrong in aligned if right != wrong)

In [67]:
# Preview the first five collected tokens.
mistakes[:5]

['симпатичнейшое', 'опофеозом', 'пояним', 'полчатся', 'оччччень']

In [68]:
# Look up a suggestion for every collected token; each entry is the
# (word, similarity) tuple returned by get_closest_hybrid_match.
true = [
    get_closest_hybrid_match(word, X, vec, TOPN=3, metric=textdistance.levenshtein)
    for word in mistakes
]

In [69]:
# Show every (suggested word, similarity) pair computed above.
true

[('пластичнейшими', 0.4285714285714286),
 ('апофеозом', 0.8888888888888888),
 ('поясним', 0.8571428571428572),
 ('ополчатся', 0.8888888888888888),
 ('чечни', 0.375),
 ('защищено', 0.375),
 ('отсутствие', 0.9),
 ('основная', 0.875),
 ('роснано', 0.2857142857142857),
 ('общем', 0.8333333333333334),
 ('как', 1.0),
 ('вы', 1.0),
 ('знаете', 1.0),
 ('из', 1.0),
 ('моего', 1.0),
 ('не', 1.0),
 ('давнего', 1.0),
 ('ящика', 0.8),
 ('предлагаю', 0.8888888888888888),
 ('сегодняшнее', 0.9090909090909091),
 ('хорошее', 0.8571428571428572),
 ('выходных', 0.875),
 ('трампу', 0.33333333333333337),
 ('штат', 0.75),
 ('лучше', 0.8),
 ('аффектов', 0.33333333333333337),
 ('компьютерная', 0.9166666666666666),
 ('ссоре', 0.8),
 ('что', 0.6),
 ('машинный', 0.625),
 ('хорошее', 0.8571428571428572),
 ('молодежи', 0.875),
 ('участвовать', 0.9166666666666666),
 ('сиднея', 0.6666666666666667),
 ('вешать', 1.0),
 ('ответственность', 0.9333333333333333),
 ('это', 0.6666666666666667),
 ('канаш', 0.5714285714285714)