In [6]:
import nltk

In [7]:
import requests as rq
import json
import pandas as pd
import functools
from tqdm import tqdm_notebook
import pickle

In [11]:
# Proxy mapping handed to requests in malt_parse: both schemes map to None.
# NOTE(review): presumably this keeps environment proxy settings from being
# applied to calls to the parser service — confirm.
proxies = dict.fromkeys(("http", "https"))
def malt_parse(text, url='http://localhost:2000', timeout=None):
    """
    Send `text` to the parser service and return its answer as a token table.

    Args:
        text (str): Text to parse.
        url (str): Base URL of the service; '/parse' is appended to it.
        timeout (float or None): Seconds to wait for the service. None (the
            default) waits indefinitely, which is the previous behaviour.

    Returns:
        pandas.DataFrame: One row per line of the response, with the columns
        Id, Form, Lemma, CPOSTAG, Postag, Features, Head, Deprel, Phead and
        Pdeprel. 'Id' and 'Head' are converted to int; the index is named 'ind'.

    Raises:
        requests.HTTPError: If the service answers with an error status.
        ValueError: If a response line does not split into exactly ten
            tab-separated fields.
    """
    # Local import so that the cell does not depend on a separate import cell.
    from urllib.parse import quote

    columns = ['Id', 'Form', 'Lemma', 'CPOSTAG', 'Postag', 'Features',
               'Head', 'Deprel', 'Phead', 'Pdeprel']

    # Escape the text before putting it into the query string: unescaped
    # '&', '#', '+' or '%' in a sentence would otherwise change the request.
    query = quote(str(text), safe='')
    response = rq.get(f'{url}/parse?text={query}', proxies=proxies, timeout=timeout)
    # Fail on an HTTP error instead of trying to decode an error page as JSON.
    response.raise_for_status()
    lines = json.loads(response.content)

    rows = []
    for line_no, line in enumerate(lines):
        fields = line.split('\t')
        if len(fields) != len(columns):
            raise ValueError(
                f'Parser line {line_no} has {len(fields)} fields, '
                f'expected {len(columns)}: {line!r}')
        rows.append(fields)

    # Build the frame in one go; adding rows one by one with .loc is quadratic.
    df = pd.DataFrame(rows, columns=columns)
    df['Head'] = df['Head'].astype('int')
    df['Id'] = df['Id'].astype('int')
    df.index.name = 'ind'
    return df

In [12]:
def extract_dependency_features(data):
    """
    Collect the distinct (head features, child features, relation) triples.

    Rows are read in order and grouped into sentences by position, so the
    index labels of `data` do not matter (they may repeat, as they do when
    several parses are concatenated). A sentence ends after a row whose
    'Features' value is 'SENT', or when 'Id' stops increasing, which marks
    the start of another parse. Heads are looked up only inside the sentence
    of the child token.

    Args:
        data (pandas.DataFrame): Token table with the columns 'Id', 'Head',
            'Features' and 'Deprel'. 'Id' and 'Head' must be comparable
            numbers, as returned by malt_parse.

    Returns:
        pandas.DataFrame: Columns 'Root features', 'Child features' and
        'Deprel', with duplicate rows removed. Rows with 'Features' == 'SENT'
        and tokens whose head id is not present in their sentence (for
        example Head == 0) produce no triple.
    """
    ids = data['Id'].tolist()
    heads = data['Head'].tolist()
    features = data['Features'].tolist()
    deprels = data['Deprel'].tolist()
    total = len(ids)

    triples = []
    with tqdm_notebook(total=total) as bar:
        start = 0
        while start < total:
            # Grow the span [start, stop) until the previous row closes the
            # sentence or the token ids restart.
            stop = start + 1
            while (stop < total
                   and features[stop - 1] != 'SENT'
                   and ids[stop] > ids[stop - 1]):
                stop += 1

            # Token id -> features for this sentence; the first occurrence wins.
            features_by_id = {}
            for pos in range(start, stop):
                features_by_id.setdefault(ids[pos], features[pos])

            for pos in range(start, stop):
                # Advance for every row so that the bar reaches its total.
                bar.update(1)
                if features[pos] == 'SENT':
                    continue
                if heads[pos] in features_by_id:
                    triples.append(
                        (features_by_id[heads[pos]], features[pos], deprels[pos]))
            start = stop

    df = pd.DataFrame(triples, columns=['Root features', 'Child features', 'Deprel'])
    return df.drop_duplicates()

In [13]:
# Base URL of the parser service used in this notebook; it replaces the
# 'http://localhost:2000' default of malt_parse when mparse is built.
maltparser_endpoint = 'http://192.168.99.100:2000'

In [14]:
# malt_parse with `url` bound to maltparser_endpoint, so callers pass only the text.
mparse = functools.partial(malt_parse, url=maltparser_endpoint)

In [15]:
def process(fname):
    """
    Parse every sentence of a text file and extract its dependency triples.

    Args:
        fname (str): Path to a UTF-8 text file with one sentence per line.
            Blank lines are skipped.

    Returns:
        pandas.DataFrame: The result of extract_dependency_features for all
        parsed sentences, or an empty frame with the columns 'Root features',
        'Child features' and 'Deprel' when the file has no sentences.
    """
    # Load all sentences, dropping the line breaks and blank lines.
    with open(fname, encoding='utf-8') as f:
        sents = [line.replace('\n', '') for line in f]
    sents = [s for s in sents if s.strip()]

    print('Parsing sentences')
    frames = []
    with tqdm_notebook(total=len(sents)) as prog:
        # Each sentence is parsed exactly once.
        for sent in sents:
            frames.append(mparse(sent))
            prog.update(1)

    print('Extracting deprels')
    if not frames:
        # pd.concat rejects an empty list, so return the empty result directly.
        return pd.DataFrame(columns=['Root features', 'Child features', 'Deprel'])

    # Concatenate once (appending inside the loop is quadratic, and
    # DataFrame.append no longer exists in pandas 2.x). ignore_index gives
    # every token a unique row label instead of repeating 0..k per sentence.
    parsed = pd.concat(frames, ignore_index=True)
    return extract_dependency_features(parsed)

In [13]:
# Parse 'sents_don.txt' and keep the distinct dependency triples in `deps`.
# NOTE(review): `deps` is rebound further down from 'deps/deps_war_and_peace';
# this result is presumably what is stored in 'deps/deps_don', but no visible
# cell saves it — confirm.
deps = process('sents_don.txt')

Parsing sentences


HBox(children=(IntProgress(value=0, max=19353), HTML(value='')))


Extracting deprels


HBox(children=(IntProgress(value=0, max=194516), HTML(value='')))




In [112]:
from random import random, randint, sample

In [118]:
# Letters that make_errors substitutes into words: the 33 Russian letters,
# each in upper and lower case.
cyrillic_letters = "АаБбВвГгДдЕеЁёЖжЗзИиЙйКкЛлМмНнОоПпРрСсТтУуФфХхЦцЧчШшЩщЪъЫыЬьЭэЮюЯя"

In [137]:
def make_errors(text, word_prob=0.1, alphabet=None):
    """
    Replace one character in randomly chosen words of `text`.

    The text is split on whitespace and joined back with single spaces.
    A trailing character that is not a letter (for example a comma or a full
    stop attached to the word) is never replaced. Tokens that have no
    replaceable character, such as a lone dash, are left unchanged.

    Args:
        text (str): The target text
        word_prob (float): probability of making a mistake in a single word
        alphabet (str or None): characters to draw the replacement from;
            None (the default) uses the module-level `cyrillic_letters`

    Returns:
        str: The text with the errors inserted.
    """
    if alphabet is None:
        alphabet = cyrillic_letters

    words = text.split()
    for i, word in enumerate(words):
        # random() is in [0, 1): '<' makes 0 mean "never" and 1 mean "always".
        if random() >= word_prob:
            continue
        # Keep a trailing punctuation mark out of the replaceable range.
        editable_len = len(word) if word[-1].isalpha() else len(word) - 1
        if editable_len < 1:
            # Nothing to replace (e.g. "-"); randint(0, -1) would raise.
            continue
        ind_error = randint(0, editable_len - 1)
        # Exclude the current character so that the word really changes.
        candidates = [ch for ch in alphabet if ch != word[ind_error]]
        if not candidates:
            continue
        words[i] = word[:ind_error] + sample(candidates, 1)[0] + word[ind_error + 1:]
    return ' '.join(words)

In [140]:
# Sample sentence for trying make_errors; several words carry attached
# punctuation ("для?", "проверки,", "ошибок.").
test = "Предложение для? проверки, функции создания ошибок."

In [157]:
# Each word is altered with probability 0.2. No visible cell seeds the random
# module, so the displayed result differs from run to run.
make_errors(test, 0.2)

'Предложение для? эроверки, функциЫ соПдания ошибВк.'

## Читаем зависимости из файлов в ./deps/ и составляем множество троек (head_features, tail_features, dep_rel)

In [21]:
def get_deps(df):
    """
    Yield one (root features, child features, relation) tuple per row of `df`.

    Args:
        df (pandas.DataFrame): Table with the columns 'Root features',
            'Child features' and 'Deprel'.

    Yields:
        tuple: The values of those three columns, in that order.
    """
    wanted = ('Root features', 'Child features', 'Deprel')
    for _, record in df.iterrows():
        yield tuple(record[column] for column in wanted)

In [17]:
# Read the dependency table stored in 'deps/deps_war_and_peace'.
# NOTE(review): pickle.load can run arbitrary code from the file; only load
# files that were produced locally.
# This rebinds `deps`, replacing the value assigned from process('sents_don.txt').
with open('deps/deps_war_and_peace', 'rb') as f:
    deps = pickle.load(f)

In [18]:
# Read the dependency table stored in 'deps/deps_don'.
# NOTE(review): pickle.load can run arbitrary code from the file; only load
# files that were produced locally.
with open('deps/deps_don', 'rb') as f:
    deps_don = pickle.load(f)

In [20]:
# Set of (root features, child features, deprel) triples; the following cells
# fill it from the loaded tables.
possible_deps = set()

In [22]:
# Add the triple of every row of deps_don to the set of known dependencies.
possible_deps.update(
    (record['Root features'], record['Child features'], record['Deprel'])
    for _, record in deps_don.iterrows()
)

In [24]:
# Add the triple of every row of deps to the set of known dependencies.
possible_deps.update(
    (record['Root features'], record['Child features'], record['Deprel'])
    for _, record in deps.iterrows()
)

In [25]:
# Number of distinct triples collected from both tables.
len(possible_deps)

47283

In [27]:
# Try the parser and the triple extraction on one short sentence.
# check_sent uses local variables with these same names; they do not touch
# the module-level ones assigned here.
test_parsed = mparse("Будет дождь.")
test_deps = extract_dependency_features(test_parsed)

HBox(children=(IntProgress(value=0, max=3), HTML(value='')))




In [28]:
def check_sent(sent, possible_deps):
    """
    Parse a sentence and report which of its dependency triples are known.

    For every extracted triple the function prints the triple, then whether
    it is contained in `possible_deps`, then an empty line.

    Args:
        sent (str): Sentence to check
        possible_deps (set): The set of all possible dependency relations

    Returns:
        tuple: (report, parsed, triples), where `report` concatenates
        '<triple>&&<True|False>\\' for every triple, `parsed` is the frame
        returned by mparse and `triples` is the frame returned by
        extract_dependency_features.
    """
    parsed = mparse(sent)
    found = extract_dependency_features(parsed)

    report_parts = []
    for _, dep in found.iterrows():
        triple = (dep['Root features'], dep['Child features'], dep['Deprel'])
        is_known = triple in possible_deps
        print(triple)
        print(is_known)
        print()
        report_parts.append(str(triple) + "&&" + str(is_known) + "\\")

    return "".join(report_parts), parsed, found

In [30]:
# Check a sentence without final punctuation; `a` holds the returned
# (report, parsed frame, triples frame) tuple.
a = check_sent("Я иду дом", possible_deps)

HBox(children=(IntProgress(value=0, max=3), HTML(value='')))


('Vmip1s-a-e', 'P-1-snn', 'предик')
True

('Vmip1s-a-e', 'Ncmsan', '1-компл')
True



In [118]:
# NOTE(review): same sentence and same set as the cell that assigns `a`;
# the result is stored in `b`. Looks like a repeated run — confirm it is needed.
b = check_sent("Я иду дом", possible_deps)

HBox(children=(IntProgress(value=0, max=3), HTML(value='')))


('Vmip1s-a-e', 'P-1-snn', 'предик')
True

('Vmip1s-a-e', 'Ncmsan', '1-компл')
True



In [96]:
# The call is not assigned, so the whole returned tuple (report string,
# parsed frame, triples frame) is displayed as the cell output.
check_sent("Люди ходят.", possible_deps)

HBox(children=(IntProgress(value=0, max=3), HTML(value='')))


('Vmip3p-a-e', 'Ncmpny', 'предик')
False



("('Vmip3p-a-e', 'Ncmpny', 'предик')&&False\\",
      Id   Form    Lemma CPOSTAG Postag    Features  Head  Deprel Phead Pdeprel
 ind                                                                           
 0     1   Люди  человек       N      N      Ncmpny     2  предик     _       _
 1     2  ходят   ходить       V      V  Vmip3p-a-e     0    ROOT     _       _
 2     3      .        .       S      S        SENT     2    PUNC     _       _,
   Root features Child features  Deprel
 0    Vmip3p-a-e         Ncmpny  предик)

In [64]:
# Unpack the result: r is the report string, t the parsed frame, d the triples frame.
r,t,d = check_sent("Ты выходишь учить тех, кому уже шесть тысяч лет.", possible_deps)

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


('Vmip2s-a-e', 'P-2-snn', 'предик')
False

('Vmip2s-a-e', 'Vmn----a-e', '1-компл')
False

('Vmn----a-e', 'P---pga', '1-компл')
False

('P---pga', 'P--msdn', 'опред')
False

('Ncfpgn', 'R', 'огранич')
True

('Ncfpgn', 'Mc--n', 'количест')
True

('Vmip2s-a-e', 'Ncfpgn', 'длительн')
False

('Ncfpgn', 'Ncmpgn', 'квазиагент')
True



In [66]:
# Display the report string: entries of the form '<triple>&&<True|False>',
# each followed by a backslash.
r

"('Vmip2s-a-e', 'P-2-snn', 'предик')&&False\\('Vmip2s-a-e', 'Vmn----a-e', '1-компл')&&False\\('Vmn----a-e', 'P---pga', '1-компл')&&False\\('P---pga', 'P--msdn', 'опред')&&False\\('Ncfpgn', 'R', 'огранич')&&True\\('Ncfpgn', 'Mc--n', 'количест')&&True\\('Vmip2s-a-e', 'Ncfpgn', 'длительн')&&False\\('Ncfpgn', 'Ncmpgn', 'квазиагент')&&True\\"