In [2]:
import pandas as pd

from typing import *

import yargy as y
import yargy.predicates as yp
import yargy.morph as ytm
import yargy.tokenizer as yt

from pymorphy2 import MorphAnalyzer

from functools import lru_cache
CACHE_SIZE=10000

In [3]:
class MostProbMorphAnalyzer(ytm.MorphAnalyzer):
    """Morph analyzer that keeps only the highest-scoring parses of a word."""

    def __call__(self, word):
        """Return prepared forms for every parse of ``word`` that shares the top score.

        Parses come from ``self.raw.parse``; several forms are returned when
        more than one parse ties for the maximum score.
        """
        parses = self.raw.parse(word)
        best_score = max(parse.score for parse in parses)
        return [
            ytm.prepare_form(parse)
            for parse in parses
            if parse.score == best_score
        ]
    
    
class CachedMostProbMorphAnalyzer(MostProbMorphAnalyzer):
    """``MostProbMorphAnalyzer`` whose per-word results are memoised."""

    def __init__(self):
        super().__init__()

    # The cache wraps the plain function, so entries are keyed on (self, word):
    # all instances share one cache bounded by CACHE_SIZE, but an entry is only
    # reused by the instance that created it.
    __call__ = lru_cache(maxsize=CACHE_SIZE)(MostProbMorphAnalyzer.__call__)

In [4]:
rules = pd.read_csv("../data/rules/rules_formatted.csv")

In [5]:
predicates_ = pd.read_csv("../data/rules/predicates.csv")

In [6]:
# Split the predicate table into one word set per value of its `type` column.
deverbal_nouns, predicates, status_categories = (
    set(predicates_.loc[predicates_.type == kind, 'predicate'].to_list())
    for kind in ('deverbal_noun', 'predicate', 'status_category')
)
# Every value of the rules table's `predicate` column.
# NOTE(review): if that column contains the '?' wildcard handled by
# req_predicate, it ends up in this set as well — confirm against the data.
rule_specific = set(rules.predicate.to_list())

In [7]:
all_predicates = predicates | deverbal_nouns | status_categories | rule_specific

In [8]:
def create_predicate_rule(
    require_deverbal_noun: str,
    require_reflexive: str,
    require_status_category: str,
    predicate: str,
    **kwargs
):
    """Build the yargy rule that recognises a predicate word.

    Args:
        require_deverbal_noun: flag forwarded to ``req_deverbal``.
        require_reflexive: flag forwarded to ``req_reflexive``.
        require_status_category: recorded in the rule id only; it does not
            add any constraint to the rule itself.
        predicate: word forwarded to ``req_predicate``.
        **kwargs: ignored; lets a whole rules-table row be passed in.

    Returns:
        ``(rule_id, rule)`` where ``rule_id`` is a string built from the four
        named arguments.
    """
    id_parts = (
        f"predicate={predicate}",
        f"deverbal={require_deverbal_noun}",
        f"reflexive={require_reflexive}",
        f"status_category={require_status_category}",
    )
    rule_id = ",".join(id_parts)
    constraints = [
        req_predicate(predicate),
        req_deverbal(require_deverbal_noun),
        req_reflexive(require_reflexive),
    ]
    return rule_id, y.rule(y.and_(*constraints))

In [9]:
def create_argument_role(argument_type: str, case: str, preposition: str, **kwargs):
    """Build the yargy rule that recognises an argument phrase.

    The rule accepts either ``<preposition> <argument>`` or
    ``<preposition> <"этот"/"тот" adjective> <argument>``, where the
    preposition part comes from ``req_preposition`` and the argument must
    satisfy ``req_argument``, ``req_animacy`` and ``req_case``.

    Args:
        argument_type: animacy requirement forwarded to ``req_animacy``.
        case: case letter forwarded to ``req_case``.
        preposition: value forwarded to ``req_preposition``.
        **kwargs: ignored; lets a whole rules-table row be passed in.

    Returns:
        ``(rule_id, rule)``.
    """
    rule_id = f"argument_type={argument_type},case={case},preposition={preposition}"
    head = y.and_(req_argument(), req_animacy(argument_type), req_case(case))
    # Optional demonstrative adjective between the preposition and the head.
    demonstrative = y.and_(
        yp.gram("ADJF"),
        y.or_(yp.normalized("этот"), yp.normalized("тот")),
    )
    bare = y.rule(req_preposition(preposition), head)
    with_demonstrative = y.rule(req_preposition(preposition), demonstrative, head)
    return rule_id, y.or_(bare, with_demonstrative)

In [10]:
def req_deverbal(require_deverbal_noun: str = '?'):
    """Return the predicate constraining a word to a deverbal noun and/or a verb.

    Args:
        require_deverbal_noun: ``'1'`` — a noun listed in ``deverbal_nouns``;
            ``'0'`` — a verb or infinitive; ``'?'`` — either of the two.

    Raises:
        ValueError: for any other flag value.
    """
    def deverbal_noun():
        return y.and_(yp.gram("NOUN"), yp.in_caseless(deverbal_nouns))

    if require_deverbal_noun == '1':
        return deverbal_noun()
    if require_deverbal_noun == '0':
        return y.or_(yp.gram("VERB"), yp.gram("INFN"))
    if require_deverbal_noun == '?':
        return y.or_(deverbal_noun(), yp.gram("VERB"), yp.gram("INFN"))
    raise ValueError("Incorrect deverbal status")

In [11]:
def req_reflexive(reflexive_status: str = '?'):
    """Return the predicate constraining a word by reflexivity.

    A word counts as reflexive when it ends with the reflexive postfix in
    either of its forms, "-ся" or "-сь" (e.g. "возмущается", "радовалась").
    The comparison is case-insensitive.

    Args:
        reflexive_status: ``'1'`` — must be reflexive; ``'0'`` — must not be
            reflexive; ``'?'`` — no constraint.

    Raises:
        ValueError: for any other flag value.
    """
    def is_reflexive_verb(verb: str):
        # str.endswith accepts a tuple of alternative suffixes.
        return verb.lower().endswith(("ся", "сь"))

    if reflexive_status == "1":
        return yp.custom(is_reflexive_verb)
    elif reflexive_status == "0":
        return y.not_(yp.custom(is_reflexive_verb))
    elif reflexive_status == "?":
        return yp.true()
    else:
        raise ValueError("Incorrect reflexive status")

In [12]:
def req_animacy(animacy: str = 'любой'):
    """Return the predicate constraining an argument by animacy.

    Args:
        animacy: ``'любой'`` — no constraint; ``'одуш.'`` — the word is not
            tagged ``inan``, or is tagged ``anim``, ``NPRO`` or ``ADJF``.

    Raises:
        ValueError: for any other value.
    """
    if animacy == 'любой':
        return yp.true()
    if animacy != 'одуш.':
        raise ValueError("Incorrect Animacy Type")

    alternatives = [y.not_(yp.gram('inan'))]
    alternatives += [yp.gram(grammeme) for grammeme in ("anim", "NPRO", "ADJF")]
    return y.or_(*alternatives)

In [13]:
def req_argument():
    """Return the predicate for a word that may head an argument.

    The word must be tagged NOUN, NPRO or ADJF and must not be tagged
    PREP, CONJ, PRCL or INTJ.
    """
    function_word = y.or_(*[yp.gram(pos) for pos in ('PREP', "CONJ", 'PRCL', "INTJ")])
    nominal = y.or_(*[yp.gram(pos) for pos in ("NOUN", "NPRO", "ADJF")])
    return y.and_(y.not_(function_word), nominal)

In [14]:
def req_predicate(word: str = "?"):
    """Return the predicate matching a predicate word.

    The word must be tagged VERB, INFN or NOUN. Unless ``word`` is the
    wildcard ``'?'``, its normal form must additionally equal ``word``.
    """
    pos_filter = y.or_(yp.gram("VERB"), yp.gram("INFN"), yp.gram("NOUN"))
    if word == '?':
        return pos_filter
    return y.and_(yp.normalized(word), pos_filter)

In [15]:
from collections import defaultdict

In [16]:
def req_case(case: str = 'в'):
    """Return the predicate constraining an argument to a grammatical case.

    Args:
        case: one-letter case code: ``'в'`` → ``accs``, ``'т'`` → ``ablt``,
            ``'д'`` → ``datv``, ``'р'`` → ``gent``, ``'и'`` → ``nomn``.

    Raises:
        ValueError: for any other code.
    """
    grammeme_by_letter = (
        ('в', "accs"),
        ('т', "ablt"),
        ('д', 'datv'),
        ('р', "gent"),
        ('и', "nomn"),
    )
    for letter, grammeme in grammeme_by_letter:
        if case == letter:
            return y.or_(yp.gram(grammeme))
    raise ValueError("Incorrect Case")

In [17]:
def req_preposition(preposition: str = None):
    """Return the rule part matching the required preposition.

    "No preposition" may be expressed as ``None`` (the default), as the
    literal string ``'None'``, or as a float NaN (how pandas represents a
    missing cell). All three yield ``y.empty()``.

    Args:
        preposition: exact preposition text, or one of the "no preposition"
            markers described above.

    Returns:
        ``y.empty()`` when no preposition is required; otherwise a predicate
        requiring a PREP token equal to ``preposition``.
    """
    # NaN is the only float that is not equal to itself.
    is_nan = isinstance(preposition, float) and preposition != preposition
    if preposition is None or is_nan or preposition == 'None':
        return y.empty()
    return y.and_(
        yp.gram("PREP"),
        yp.eq(preposition)
    )

In [18]:
def soft_parser_pass(parser, text):
    """Collect every match that ``parser.findall`` reports in ``text``.

    Returns:
        A list of dicts with the space-joined token values under ``'text'``
        and the match span as a tuple under ``'span'``.
    """
    return [
        {
            'text': " ".join(token.value for token in found.tokens),
            'span': tuple(found.span)
        }
        for found in parser.findall(text)
    ]

In [19]:
def strict_parser_pass(parser, text):
    """Match ``text`` as a whole with ``parser.match``.

    Returns:
        A list in the same shape as ``soft_parser_pass`` produces: one dict
        with ``'text'`` (space-joined token values) and ``'span'`` (tuple)
        when the parser matches, and an empty list when ``parser.match``
        returns ``None``.
    """
    match = parser.match(text)
    if match is None:
        return []
    return [{
        'text': " ".join(token.value for token in match.tokens),
        'span': tuple(match.span)
    }]

In [20]:
def create_rules(**kwargs):
    """Build the predicate and argument parsers for one rules-table row.

    ``kwargs`` is forwarded unchanged to ``create_predicate_rule`` and
    ``create_argument_role``.

    Returns:
        A dict with the two rule ids (``'predicate_id'``, ``'argument_id'``)
        and a ``y.Parser`` for each rule (``'predicate_parser'``,
        ``'argument_parser'``).
    """
    predicate_rule_id, predicate_rule = create_predicate_rule(**kwargs)
    argument_rule_id, argument_rule = create_argument_role(**kwargs)

    def build_parser(rule):
        # Each parser gets its own tokenizer with a freshly constructed analyzer.
        return y.Parser(rule, yt.MorphTokenizer(morph=CachedMostProbMorphAnalyzer()))

    return {
        'predicate_id': predicate_rule_id,
        'argument_id': argument_rule_id,
        'predicate_parser': build_parser(predicate_rule),
        'argument_parser': build_parser(argument_rule)
    }

In [21]:
roleset = set(rules.role)

In [22]:
ruleset = {}

In [23]:
from tqdm.auto import tqdm

In [24]:
# Build, for every role, the list of parser bundles created from that role's rows.
# `roleset` is a set, so the order in which roles are processed is not fixed.
for role in roleset:
    ruleset[role] = []
    
    # Each rules-table row is passed to create_rules as keyword arguments.
    for rule_dict in tqdm(rules[rules.role == role].to_dict(orient='records'), desc=role):
        ruleset[role].append(create_rules(**rule_dict))

HBox(children=(HTML(value='инструмент'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(HTML(value='каузатив'), FloatProgress(value=0.0, max=2.0), HTML(value='')))




HBox(children=(HTML(value='объект'), FloatProgress(value=0.0, max=4.0), HTML(value='')))




HBox(children=(HTML(value='каузатор'), FloatProgress(value=0.0, max=31.0), HTML(value='')))




HBox(children=(HTML(value='экспериенцер'), FloatProgress(value=0.0, max=5.0), HTML(value='')))




In [25]:
# Index argument parsers by "<argument_id>+<role>"; a later rule with the same
# key replaces an earlier one.
argument_rules = {
    f"{rule['argument_id']}+{role}": {
        'role': role,
        'argument_parser': rule['argument_parser']
    }
    for role, role_rules in ruleset.items()
    for rule in role_rules
}

In [26]:
# Group argument rules under the predicate rule they belong to. The parser of
# the first rule seen for a predicate id is the one that is kept.
predicate_orient = {}
for role, role_rules in ruleset.items():
    for rule in role_rules:
        entry = predicate_orient.setdefault(rule['predicate_id'], {
            'predicate_parser': rule['predicate_parser'],
            'arguments': []
        })
        entry['arguments'].append({
            'role': role,
            'argument_id': rule['argument_id'],
            'argument_parser': rule['argument_parser']
        })

In [27]:
# For every predicate rule, resolve its de-duplicated "<argument_id>+<role>"
# keys to the shared entries of `argument_rules`.
predicate_orient_rules = {}
for predicate_id, entry in predicate_orient.items():
    # dict.fromkeys removes duplicates while keeping first-seen order. A set
    # would make the order of argument rules (and of the role lists derived
    # from them) vary with string hash randomisation between kernel runs.
    argument_keys = dict.fromkeys(
        f"{x['argument_id']}+{x['role']}" for x in entry['arguments']
    )
    predicate_orient_rules[predicate_id] = {
        'predicate_parser': entry['predicate_parser'],
        'arguments': [argument_rules[key] for key in argument_keys]
    }

In [28]:
import yargy.pipelines as pipelines

In [29]:
# Parser over a morph pipeline built from all known predicate words; used by
# check_parseable as a pre-filter before the per-rule parsers are tried.
filter_pipeline = y.Parser(
    pipelines.morph_pipeline(list(all_predicates)),
    tokenizer=yt.MorphTokenizer(
        morph=CachedMostProbMorphAnalyzer()
    )
)

In [30]:
def check_parseable(text, parser):
    """Return True when ``parser.findall`` yields at least one match in ``text``."""
    found = list(parser.findall(text))
    return bool(found)

In [31]:
check_parseable("Вашингтон возмущается бездействием Москвы", filter_pipeline)

True

In [32]:
check_parseable("взволнованные", filter_pipeline)

True

In [33]:
class ArgumentExtractor:
    """Interface for predicate/argument extractors; both methods are no-ops here."""

    def __init__(self, *args, **kwargs):
        """Accept and ignore any arguments."""

    def extract(self, sentence: str) -> List[Dict[str, Any]]:
        """Extract predicate/argument structures from ``sentence``.

        This base implementation does nothing and returns ``None``;
        subclasses provide the actual extraction.
        """

In [36]:
from ufal.udpipe import Model, Pipeline, ProcessingError
from predpatt import PredPatt, load_conllu
from predpatt import PredPattOpts
from predpatt.util.ud import dep_v1, dep_v2

class PredPattArgumentExtractor(ArgumentExtractor):
    """Argument extractor that runs a UDPipe pipeline and feeds its CoNLL-U output to PredPatt."""

    def __init__(
        self,
        path_to_udpipe: str,
        resolve_relcl: bool = True,
        resolve_appos: bool = True,
        resolve_amod: bool = True,
        resolve_conj: bool = True,
        resolve_poss: bool = True,
        ud = dep_v2.VERSION
    ):
        """Load the UDPipe model and store the PredPatt options.

        Args:
            path_to_udpipe: path passed to ``Model.load``.
            resolve_relcl, resolve_appos, resolve_amod, resolve_conj,
            resolve_poss, ud: forwarded unchanged to ``PredPattOpts``.
        """
        super().__init__()
        self.model = Model.load(path_to_udpipe)
        self.pipeline = Pipeline(self.model, 'tokenize', Pipeline.DEFAULT, Pipeline.DEFAULT, 'conllu')
        self._error = ProcessingError()
        self._opts = PredPattOpts(
            resolve_relcl=resolve_relcl,
            resolve_appos=resolve_appos,
            resolve_amod=resolve_amod,
            resolve_conj=resolve_conj,
            resolve_poss=resolve_poss,
            ud=ud
        )

    def extract(self, sentence: str) -> List[Dict[str, Any]]:
        """Return the predicate/argument structures PredPatt finds in ``sentence``.

        Only the first sentence of the pipeline output is analysed.

        Returns:
            A list of dicts with the predicate tokens under ``'predicate'``
            and a list of token lists under ``'arguments'``. The list is empty
            when the pipeline reports an error (the message is printed) or
            produces no parse.
        """
        processed = self.pipeline.process(sentence, self._error)
        if self._error.occurred():
            print(f"=== Error occurred: {self._error.message}")
            # Start from a clean error object for the next call.
            self._error = ProcessingError()
            return []

        parses = [ud_parse for _, ud_parse in load_conllu(processed)]
        if not parses:
            return []

        ppatt = PredPatt(parses[0], opts=self._opts)
        return [
            {
                'predicate': predicate.tokens,
                'arguments': [argument.tokens for argument in predicate.arguments]
            }
            for predicate in ppatt.instances
        ]

In [40]:
extractor = PredPattArgumentExtractor("../data/models/russian-syntagrus-ud-2.5-191206.udpipe")

In [41]:
%%timeit -n 100 -r 7
check_parseable('не опечалься на в твоем', filter_pipeline)

3.33 ms ± 202 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [42]:
from navec import Navec
from slovnet import Syntax
navec = Navec.load('../data/models/navec_news_v1_1B_250K_300d_100q.tar')
syntax = Syntax.load('../data/models/slovnet_syntax_news_v1.tar')
_ = syntax.navec(navec)

In [67]:
m = next(syntax.map([['продолжает','радовать']]))

In [68]:
m.tokens

[SyntaxToken(
     id='1',
     text='продолжает',
     head_id='0',
     rel='root'
 ),
 SyntaxToken(
     id='2',
     text='радовать',
     head_id='1',
     rel='xcomp'
 )]

In [100]:
# NOTE(review): MainPhraseExtractor is defined in a later cell (In [104]), so on a
# fresh kernel this cell fails unless that definition has been run first.
extr = MainPhraseExtractor(syntax, MorphAnalyzer())

In [103]:
extr.get_main_phrase(['бездействием','Москвы'], get_prep=True)

[SyntaxToken(
     id='1',
     text='бездействием',
     head_id='0',
     rel='root'
 )]

In [104]:
from collections import defaultdict
class MainPhraseExtractor:
    """Picks the syntactic head of a short phrase, optionally with its preposition."""

    def __init__(self, syntax_parser, pymorphy_analyzer):
        self.syntax = syntax_parser
        self.morph = pymorphy_analyzer

    def get_main_phrase(self, words, get_prep=False, verbose=False):
        """Return the head token of ``words`` according to the syntax parser.

        Root candidates are tokens whose head id is ``'0'`` or their own id;
        the candidate with the most direct dependents wins (the last one on a
        tie).

        Args:
            words: list of word strings forming one phrase.
            get_prep: also look for a preposition among the head's dependents
                and return the one closest to the head by token id.
            verbose: print the intermediate head/dependent maps.

        Returns:
            All parsed tokens when there is no root candidate; a tuple
            ``(preposition, head)`` when ``get_prep`` is set and a preposition
            is found; otherwise a one-element list with the head token.
        """
        markup = next(self.syntax.map([words]))

        forward = {}                  # token id -> head id
        backward = defaultdict(list)  # head id -> ids of direct dependents
        token_map = {}                # token id -> token
        candidates = []
        for token in markup.tokens:
            token_map[token.id] = token
            forward[token.id] = token.head_id
            backward[token.head_id].append(token.id)
            if token.head_id == '0' or token.head_id == token.id:
                candidates.append(token.id)

        if verbose:
            print("forward ", forward)
            print("backward ", backward)
            print("candidates ", candidates)

        if not candidates:
            return markup.tokens

        # ">=" keeps the last candidate among those with equally many dependents.
        head = candidates[0]
        for other in candidates[1:]:
            if len(backward[other]) >= len(backward[head]):
                head = other

        if not get_prep:
            return [token_map[head]]

        prepositions = [
            child for child in backward[head]
            if self.morph.tag(token_map[child].text)[0].POS == 'PREP'
        ]
        if not prepositions:
            return [token_map[head]]

        nearest = min(prepositions, key=lambda child: abs(int(child) - int(head)))
        return (token_map[nearest], token_map[head])

In [105]:
class RoleLabeler:
    """Assigns semantic roles to the arguments of predicates found in a sentence."""

    def __init__(
        self,
        argument_extractor: ArgumentExtractor,
        main_phrase_extractor: MainPhraseExtractor,
        filter_pipeline,
        predicate_ruleset,
        mode: str = 'soft'
    ):
        """Store the collaborators and select the parser pass function.

        Args:
            argument_extractor: provides ``extract(sentence)``.
            main_phrase_extractor: provides ``get_main_phrase(words, get_prep)``.
            filter_pipeline: parser used to pre-filter predicate groups.
            predicate_ruleset: mapping whose values hold a ``'predicate_parser'``
                and a list of ``'arguments'`` entries with ``'role'`` and
                ``'argument_parser'``.
            mode: ``'soft'`` uses ``soft_parser_pass``; ``'strict'`` uses
                ``strict_parser_pass``.

        Raises:
            ValueError: for any other ``mode``.
        """
        self.argument_extractor = argument_extractor
        self.main_phrase_extractor = main_phrase_extractor
        self.filter_pipeline = filter_pipeline
        self.ruleset = predicate_ruleset
        if mode == 'soft':
            self.pass_fn = soft_parser_pass
        elif mode == 'strict':
            self.pass_fn = strict_parser_pass
        else:
            raise ValueError(f"Incorrect mode = {mode}, can be 'soft' or 'strict'")

    def check_parse(self, text, parser):
        """Return True when the selected pass function finds a match of ``parser`` in ``text``."""
        return len(self.pass_fn(parser, text)) > 0

    def run(self, sentence):
        """Label the arguments of every predicate group extracted from ``sentence``.

        Returns:
            One dict per predicate group that passes the filter pipeline, with
            a descriptive ``'group'`` name and a list of ``'parses'``. Each
            parse belongs to one matching predicate rule and lists the
            arguments for which at least one role rule matched. The result is
            an empty list when the extractor yields nothing (including
            ``None``).
        """
        arg_groups = self.argument_extractor.extract(sentence)
        if not arg_groups:
            # Covers both "no predicates" and an extractor that returns None on error.
            return []

        arg_groups = [
            group for group in arg_groups
            if check_parseable(
                " ".join([token.text for token in group['predicate']]),
                self.filter_pipeline
            )
        ]

        result = []
        for group in arg_groups:
            predicate_tokens = [token.text for token in group['predicate']]
            predicate_txt = " ".join(predicate_tokens)
            predicate_main = " ".join([
                x.text for x in self.main_phrase_extractor.get_main_phrase(predicate_tokens)
            ])
            # Arguments with identical surface text collapse to the last one.
            forward_map = {
                " ".join([token.text for token in argument]): argument
                for argument in group['arguments']
            }
            group_name = f"predicate={predicate_txt},arguments=[{','.join(forward_map.keys())}]"

            # The head phrase of an argument does not depend on the predicate
            # rule, so it is computed once per group, on first use.
            argument_heads = {}
            group_result = []
            for predicate in self.ruleset.values():
                if not self.check_parse(predicate_main, predicate['predicate_parser']):
                    continue

                labeled = []
                for argument, argument_tokens in forward_map.items():
                    if argument not in argument_heads:
                        words = [x.text for x in argument_tokens]
                        argument_heads[argument] = " ".join([
                            x.text for x in self.main_phrase_extractor.get_main_phrase(words, True)
                        ])
                    argument_main = argument_heads[argument]

                    roles = [
                        rule['role'] for rule in predicate['arguments']
                        if self.check_parse(argument_main, rule['argument_parser'])
                    ]
                    if roles:
                        labeled.append({
                            'argument': argument,
                            'argument_analyzed': argument_main,
                            'argument_tokens': argument_tokens,
                            'roles': tuple(roles)
                        })

                if labeled:
                    group_result.append({
                        'predicate': predicate_txt,
                        'predicate_analyzed': predicate_main,
                        'predicate_tokens': group['predicate'],
                        'arguments': tuple(labeled)
                    })
            result.append({'group': group_name, 'parses': group_result})
        return result
            

In [106]:
main_phrase_extractor = MainPhraseExtractor(syntax, MorphAnalyzer())

In [107]:
labeler = RoleLabeler(extractor, main_phrase_extractor, filter_pipeline, predicate_orient_rules, mode='soft')

In [108]:
from pprint import pprint

In [109]:
import razdel

In [49]:
%%timeit -n 100 -r 7
extractor.extract("Вашингтон возмущается бездействием Москвы")

3.42 ms ± 146 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [110]:
labeler.run("Вашингтон возмущается бездействием Москвы")

[{'group': 'predicate=возмущается,arguments=[Вашингтон,бездействием Москвы]',
  'parses': [{'predicate': 'возмущается',
    'predicate_analyzed': 'возмущается',
    'predicate_tokens': [возмущается/1],
    'arguments': ({'argument': 'бездействием Москвы',
      'argument_analyzed': 'бездействием',
      'argument_tokens': [бездействием/2, Москвы/3],
      'roles': ('каузатор',)},)}]}]

In [51]:
%%timeit -n 100 -r 7
labeler.run("Все остальные более сложные или более сомнительные вещи можно выдавать на факультативных занятиях и кружках , если мелкому будет не интересно.|")

115 ms ± 3.3 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [111]:
def visualize(text, parse):
    """Render one parse as bracket-annotated text plus summary fields.

    Argument spans are rendered as ``[... #role1/role2]`` and the predicate
    span as ``[... @Предикат]``. Role names inside a bracket are
    de-duplicated and sorted so the output is the same on every run.

    Args:
        text: original sentence; tokenized with ``razdel.tokenize``.
        parse: dict with ``'predicate_analyzed'``, ``'predicate_tokens'`` and
            ``'arguments'``; each argument has ``'argument_analyzed'``,
            ``'argument_tokens'`` and ``'roles'``. Tokens expose ``text`` and
            ``position``.
            NOTE(review): ``position`` is used as an index into the razdel
            tokenization — confirm both tokenizers agree on token indices.

    Returns:
        Dict with ``'formatted_text'``, ``'orig_text'``, ``'predicate'``,
        ``'arguments'`` (list of analysed argument strings) and ``'role'``
        (first role of the first argument that has roles, else ``None``).
    """
    def mark_span(span_tokens, suffix):
        # Map token position -> bracketed text; an empty span marks nothing.
        if not span_tokens:
            return {}
        labels = [tok.text for tok in span_tokens]
        labels[0] = f"[{labels[0]}"
        labels[-1] = f"{labels[-1]}{suffix}]"
        return {tok.position: label for tok, label in zip(span_tokens, labels)}

    tokens = {index: tok.text for index, tok in enumerate(razdel.tokenize(text))}
    arguments = []
    sample_role = None
    for arg in parse['arguments']:
        arguments.append(arg['argument_analyzed'])
        if sample_role is None and len(arg['roles']) > 0:
            sample_role = arg['roles'][0]
        role_label = '/'.join(sorted(set(arg['roles'])))
        tokens.update(mark_span(arg['argument_tokens'], f"#{role_label}"))

    # The predicate is applied last, so it wins over overlapping argument marks.
    tokens.update(mark_span(parse['predicate_tokens'], "@Предикат"))
    ordered = sorted(tokens.items(), key=lambda item: item[0])
    return {
        'formatted_text': " ".join([label for _, label in ordered]),
        'orig_text': text,
        'predicate': parse['predicate_analyzed'],
        'arguments': arguments,
        'role': sample_role
    }

In [112]:
import os

In [113]:
from itertools import chain

In [114]:
from pathlib import Path

In [115]:
cats = [list(x.iterdir()) for x in Path("../data/txts/").iterdir()]

In [116]:
cats = list(chain.from_iterable(cats))

In [117]:
# Descend one more directory level.
# NOTE(review): `cats` is rebound from its own previous value, so re-running this
# cell descends yet another level; the cell is not idempotent.
cats = list(chain.from_iterable([x.iterdir() for x in cats]))

In [118]:
files = list(chain.from_iterable([x.iterdir() for x in cats]))

In [119]:
len(files)

9429

In [120]:
# Read every file as UTF-8 and collect the sentence spans razdel finds in it.
sentences = []
for file in tqdm(files):
    text = file.read_text(encoding='utf-8')
    sentences.extend(razdel.sentenize(text))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=9429.0), HTML(value='')))




In [121]:
sentences = [x.text for x in sentences]

In [122]:
import traceback

# Label the first 100000 sentences and keep one visualisation record per parse.
parse_res = []
for text in tqdm(sentences[:100000]):
    # NOTE(review): labeler.run is outside the try block, so an exception raised
    # while labeling a sentence stops the whole loop.
    groups = labeler.run(text)
    for group in groups:
        for parse in group['parses']:
            try:
                parse_res.append(visualize(text, parse))
            except Exception:
                # Report the failing sentence and carry on with the next parse.
                print(f"== Error with {text}")
                print(traceback.format_exc())

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=100000.0), HTML(value='')))




In [123]:
len(parse_res)

1453

In [126]:
parse_res[0]

{'formatted_text': '[Гениальный руководитель#каузатор/экспериенцер] Беглов [продолжает радовать@Предикат] .',
 'orig_text': 'Гениальный руководитель Беглов продолжает радовать.',
 'predicate': 'продолжает',
 'arguments': ['руководитель'],
 'role': 'экспериенцер'}

In [125]:
def tag2str(tag):
    """Return the tag's ``grammemes_cyr`` as a sorted, comma-separated string."""
    grammemes = list(tag.grammemes_cyr)
    grammemes.sort()
    return ",".join(grammemes)

In [132]:
import pymorphy2
morph = pymorphy2.MorphAnalyzer()
def get_most_prob_tags(word, morph):
    """Return the tags of every parse of ``word`` that shares the highest score.

    Args:
        word: the word to analyse.
        morph: analyzer exposing ``parse(word)``; each parse has ``score``
            and ``tag``.
    """
    parses = morph.parse(word)
    best_score = max(parse.score for parse in parses)
    return [parse.tag for parse in parses if parse.score == best_score]

def get_morph_string(parse):
    """Describe the morphology of a parse's predicate and arguments as one string.

    Every word is rendered as ``word#[tags]``, where alternative top-scoring
    tags are separated by ``/``. Predicate words are concatenated without a
    separator; every argument word is followed by ``;``.

    Args:
        parse: dict with a ``'predicate'`` string and an ``'arguments'`` list
            of strings; both are tokenized with ``razdel.tokenize``.

    Returns:
        ``"Предикат=<predicate words>||Аргументы=<argument words>"``.
    """
    def annotate(word):
        tags = "/".join(tag2str(tag) for tag in get_most_prob_tags(word, morph))
        return f"{word}#[{tags}]"

    predicate_words = [tok.text for tok in razdel.tokenize(parse['predicate'])]
    argument_words = [
        tok.text
        for argument in parse['arguments']
        for tok in razdel.tokenize(argument)
    ]

    predicate_str = "".join(annotate(word) for word in predicate_words)
    arguments_str = "".join(annotate(word) + ";" for word in argument_words)
    return f"Предикат={predicate_str}||Аргументы={arguments_str}"

In [133]:
get_morph_string(parse_res[0])

'Предикат=продолжает#[3л,ГЛ,ед,изъяв,наст,несов,перех]||Аргументы=руководитель#[СУЩ,ед,им,мр,од];'

In [136]:
# Add the morphology string to every record and flatten its argument list to a
# ';'-separated string.
for obj in tqdm(parse_res):
    # Records already processed hold a string here. Skipping them keeps the cell
    # safe to re-run: otherwise the second run would tokenize and join the
    # individual characters of the already-joined string.
    if isinstance(obj['arguments'], str):
        continue
    obj['morph'] = get_morph_string(obj)
    obj['arguments'] = ";".join(obj['arguments'])

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1453.0), HTML(value='')))




In [137]:
parse_res[0]

{'formatted_text': '[Гениальный руководитель#каузатор/экспериенцер] Беглов [продолжает радовать@Предикат] .',
 'orig_text': 'Гениальный руководитель Беглов продолжает радовать.',
 'predicate': 'продолжает',
 'arguments': 'руководитель',
 'role': 'экспериенцер',
 'morph': 'Предикат=продолжает#[3л,ГЛ,ед,изъяв,наст,несов,перех]||Аргументы=руководитель#[СУЩ,ед,им,мр,од];'}

In [138]:
parse = pd.DataFrame(parse_res)

In [139]:
parse['_pr'] = parse['predicate'] + "+" + parse['role']

In [140]:
parse.to_csv("parsed_roles-100k.csv", index=False, encoding='utf-8')

In [141]:
pairs = parse['_pr'].value_counts().to_dict().keys()

In [142]:
# Shuffle the rows before taking the per-pair heads below. The fixed seed makes
# the exported sample the same on every run.
parse = parse.sample(frac=1.0, random_state=42).reset_index(drop=True)

In [143]:
# Keep at most 10 rows for every predicate+role pair, in the order of `pairs`.
refined = []
for pair in tqdm(pairs):
    matching = parse.loc[parse['_pr'] == pair]
    refined.extend(matching.head(10).to_dict(orient='records'))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=856.0), HTML(value='')))




In [150]:
new_parse = pd.DataFrame(refined)

In [151]:
new_parse = new_parse.loc[:, ['role','predicate','arguments','morph', 'formatted_text']]

In [152]:
# NOTE(review): this is the same path the full `parse` frame was written to
# earlier in the notebook, so that file is overwritten by the refined sample.
# Unlike the earlier call, the index is written as well (no index=False).
new_parse.to_csv("./parsed_roles-100k.csv")