In [52]:
import pandas as pd

from typing import *

import yargy as y
import yargy.predicates as yp
import yargy.morph as ytm
import yargy.tokenizer as yt

from pymorphy2 import MorphAnalyzer
from isanlp import PipelineCommon
from isanlp.processor_remote import ProcessorRemote
from isanlp.ru.processor_mystem import ProcessorMystem
from isanlp.ru.converter_mystem_to_ud import ConverterMystemToUd 

from pyhash import city_32
import joblib as jb
import json

from rich import print
import razdel

import os

from functools import lru_cache
# Maximum number of entries passed to `lru_cache` when memoizing the morph analyzer's __call__.
CACHE_SIZE=10000

In [2]:
class MostProbMorphAnalyzer(ytm.MorphAnalyzer):
    """Morph analyzer that keeps only the top-scoring parses of a word."""

    def __call__(self, word):
        """Return prepared forms for every parse of ``word`` that ties for the best score."""
        parses = self.raw.parse(word)
        best_score = max(parse.score for parse in parses)
        return [
            ytm.prepare_form(parse)
            for parse in parses
            if parse.score == best_score
        ]
    
    
class CachedMostProbMorphAnalyzer(MostProbMorphAnalyzer):
    """``MostProbMorphAnalyzer`` whose ``__call__`` results are memoized via ``lru_cache``."""

    def __init__(self):
        super().__init__()

    # The cache wraps the parent's function object at class-definition time, so a
    # single cache is shared by all instances and entries are keyed on (self, word).
    __call__ = lru_cache(maxsize=CACHE_SIZE)(MostProbMorphAnalyzer.__call__)

In [3]:
# Rule table; later cells read its `role` and `predicate` columns and pass each row
# as keyword arguments to `create_rules`.
rules = pd.read_csv("../data/rules/rules_formatted.csv")

In [4]:
# Predicate vocabulary; later cells read its `type` and `predicate` columns.
predicates_ = pd.read_csv("../data/rules/predicates.csv")

In [5]:
# Split the predicate vocabulary by its `type` column; `rule_specific` collects the
# predicates mentioned directly in the rule table.
deverbal_nouns = set(predicates_.loc[predicates_.type == 'deverbal_noun', 'predicate'].to_list())
predicates = set(predicates_.loc[predicates_.type == 'predicate', 'predicate'].to_list())
status_categories = set(predicates_.loc[predicates_.type == 'status_category', 'predicate'].to_list())
rule_specific = set(rules['predicate'].to_list())

In [6]:
# Union of every known predicate; used below to build the pre-filter pipeline.
all_predicates = predicates | deverbal_nouns | status_categories | rule_specific

In [7]:
def create_predicate_rule(
    require_deverbal_noun: str,
    require_reflexive: str,
    require_status_category: str,
    predicate: str,
    **kwargs
):
    """Build the yargy rule that recognizes a predicate token.

    Args:
        require_deverbal_noun: '1', '0' or '?', forwarded to ``req_deverbal``.
        require_reflexive: '1', '0' or '?', forwarded to ``req_reflexive``.
        require_status_category: only embedded in the rule id.
        predicate: normalized predicate word, or '?' for any; forwarded to ``req_predicate``.
        **kwargs: extra keys are accepted and ignored.

    Returns:
        Tuple ``(rule_id, rule)``.

    NOTE(review): ``require_status_category`` is part of the id but is not enforced
    by the rule itself -- confirm whether that is intended.
    """
    rule_id = ",".join([
        f"predicate={predicate}",
        f"deverbal={require_deverbal_noun}",
        f"reflexive={require_reflexive}",
        f"status_category={require_status_category}",
    ])
    token_constraint = y.and_(
        req_predicate(predicate),
        req_deverbal(require_deverbal_noun),
        req_reflexive(require_reflexive)
    )
    return rule_id, y.rule(token_constraint)

In [8]:
def create_argument_role(argument_type: str, case: str, preposition: str, **kwargs):
    """Build the yargy rule that recognizes an argument phrase.

    Two shapes are accepted: ``<preposition> <argument>`` and
    ``<preposition> <demonstrative adjective> <argument>``, where the demonstrative
    is an ADJF token normalized to "этот" or "тот".

    Returns:
        Tuple ``(rule_id, rule)``. Extra ``**kwargs`` are accepted and ignored.
    """
    rule_id = f"argument_type={argument_type},case={case},preposition={preposition}"

    head = y.and_(
        req_argument(),
        req_animacy(argument_type),
        req_case(case)
    )
    demonstrative = y.and_(
        yp.gram("ADJF"),
        y.or_(
            yp.normalized("этот"),
            yp.normalized("тот")
        )
    )

    plain_variant = y.rule(req_preposition(preposition), head)
    demonstrative_variant = y.rule(req_preposition(preposition), demonstrative, head)
    return rule_id, y.or_(plain_variant, demonstrative_variant)

In [9]:
def req_deverbal(require_deverbal_noun: str = '?'):
    """Return a predicate constraining whether the token is a deverbal noun or a verb.

    '1' -> a NOUN listed in ``deverbal_nouns``; '0' -> VERB or INFN;
    '?' -> either of the above. Any other value raises ``ValueError``.
    """

    def deverbal_noun():
        return y.and_(
            yp.gram("NOUN"),
            yp.in_caseless(deverbal_nouns)
        )

    if require_deverbal_noun == '1':
        return deverbal_noun()
    if require_deverbal_noun == '0':
        return y.or_(
            yp.gram("VERB"),
            yp.gram("INFN")
        )
    if require_deverbal_noun == '?':
        return y.or_(
            deverbal_noun(),
            yp.gram("VERB"),
            yp.gram("INFN")
        )
    raise ValueError("Incorrect deverbal status")

In [10]:
def req_reflexive(reflexive_status: str = '?'):
    """Return a predicate on reflexivity, judged by the "ся"/"сь" ending.

    '1' -> must be reflexive, '0' -> must not be, '?' -> no constraint.
    Any other value raises ``ValueError``.
    """

    def is_reflexive_verb(verb: str):
        return verb.endswith(("ся", "сь"))

    if reflexive_status == "?":
        return yp.true()
    if reflexive_status == "1":
        return yp.custom(is_reflexive_verb)
    if reflexive_status == "0":
        return y.not_(yp.custom(is_reflexive_verb))
    raise ValueError ("Incorrect reflexive status")

In [11]:
def req_animacy(animacy: str = 'любой'):
    """Return the animacy predicate for an argument.

    'любой' places no constraint; 'одуш.' accepts a token that is not tagged
    ``inan``, or is tagged ``anim``, ``NPRO`` or ``ADJF``. Other values raise
    ``ValueError``.
    """
    if animacy == 'любой':
        return yp.true()
    if animacy != 'одуш.':
        raise ValueError("Incorrect Animacy Type")
    return y.or_(
        y.not_(yp.gram('inan')),
        yp.gram("anim"),
        yp.gram("NPRO"),
        yp.gram("ADJF")
    )

In [12]:
def req_argument():
    """Return the part-of-speech predicate for an argument head.

    The token must be a NOUN or NPRO and must not carry any of the excluded
    grammemes (PREP, CONJ, PRCL, INTJ, ADJF).
    """
    # Parts of speech an argument is never allowed to be.
    excluded_pos = y.or_(
        yp.gram('PREP'),
        yp.gram("CONJ"),
        yp.gram('PRCL'),
        yp.gram("INTJ"),
        yp.gram("ADJF")
    )
    allowed_pos = y.or_(
        yp.gram("NOUN"),
        yp.gram("NPRO")
    )
    return y.and_(y.not_(excluded_pos), allowed_pos)

In [13]:
def req_predicate(word: str = "?"):
    """Return a predicate matching VERB/INFN/NOUN tokens, optionally tied to a lemma.

    With ``word == '?'`` only the part of speech is checked; otherwise the token
    must additionally normalize to ``word``.
    """
    pos_filter = y.or_(
        yp.gram("VERB"),
        yp.gram("INFN"),
        yp.gram("NOUN")
    )
    if word == '?':
        return pos_filter
    return y.and_(
        yp.normalized(word),
        pos_filter
    )

In [14]:
from collections import defaultdict

In [15]:
def req_case(case: str = 'в'):
    """Map a one-letter Russian case code to the matching grammeme predicate.

    Supported codes: 'в' -> accs, 'т' -> ablt, 'д' -> datv, 'р' -> gent,
    'и' -> nomn. Any other value raises ``ValueError``.
    """
    grammeme_by_code = (
        ('в', "accs"),
        ('т', "ablt"),
        ('д', 'datv'),
        ('р', "gent"),
        ('и', "nomn"),
    )
    for code, grammeme in grammeme_by_code:
        if case == code:
            return y.or_(yp.gram(grammeme))
    raise ValueError("Incorrect Case")

In [16]:
def req_preposition(preposition: str = None):
    """Return the rule element for the (optional) preposition in front of an argument.

    Args:
        preposition: the exact preposition text, or a "no preposition" marker.
            The markers are the string ``'None'``, the default ``None``, and a
            float NaN (what pandas yields when it parses an empty or NA cell).

    Returns:
        ``y.empty()`` when no preposition is required, otherwise a predicate that
        matches a PREP token equal to ``preposition``.
    """
    # Previously only the literal string 'None' was recognized, so the declared
    # default (None) and NaN cells ended up in `yp.eq(...)` and could never match.
    is_nan = isinstance(preposition, float) and preposition != preposition
    if preposition is None or is_nan or preposition == 'None':
        return y.empty()
    return y.and_(
        yp.gram("PREP"),
        yp.eq(preposition)
    )

In [17]:
def soft_parser_pass(parser, text):
    """Collect every match ``parser.findall`` yields for ``text``.

    Returns:
        A list of dicts with the matched token values joined by spaces (``'text'``)
        and the match span as a tuple (``'span'``).
    """
    return [
        {
            'text': " ".join(token.value for token in found.tokens),
            'span': tuple(found.span)
        }
        for found in parser.findall(text)
    ]

In [18]:
def strict_parser_pass(parser, text):
    """Match ``text`` with ``parser.match`` and report the result like ``soft_parser_pass``.

    Returns:
        A one-element list holding a dict with ``'text'`` (token values joined by
        spaces) and ``'span'`` (tuple), or an empty list when ``parser.match``
        returns ``None``.
    """
    match = parser.match(text)
    # The original appended to an undefined `matches` list (NameError) and returned
    # the raw match object; callers only need a possibly-empty list of match dicts.
    if match is None:
        return []

    return [{
        'text': " ".join([x.value for x in match.tokens]),
        'span': tuple(match.span)
    }]

In [19]:
def create_rules(**kwargs):
    """Build the predicate and argument parsers for one rule row.

    ``kwargs`` is forwarded unchanged to ``create_predicate_rule`` and
    ``create_argument_role``. Each parser gets its own ``MorphTokenizer`` backed by
    a fresh ``CachedMostProbMorphAnalyzer``.

    Returns:
        Dict with ``predicate_id``, ``argument_id``, ``predicate_parser`` and
        ``argument_parser``.
    """
    predicate_rule_id, predicate_rule = create_predicate_rule(**kwargs)
    argument_rule_id, argument_rule = create_argument_role(**kwargs)

    predicate_parser = y.Parser(
        predicate_rule,
        yt.MorphTokenizer(morph=CachedMostProbMorphAnalyzer())
    )
    argument_parser = y.Parser(
        argument_rule,
        yt.MorphTokenizer(morph=CachedMostProbMorphAnalyzer())
    )
    return {
        'predicate_id': predicate_rule_id,
        'argument_id': argument_rule_id,
        'predicate_parser': predicate_parser,
        'argument_parser': argument_parser
    }

In [20]:
# Distinct values of the `role` column of the rule table.
roleset = set(rules.role)

In [21]:
# role -> list of rule dicts produced by `create_rules`; filled in by the loop below.
ruleset = {}

In [22]:
from tqdm.auto import tqdm

In [23]:
# Compile every rule row into parsers, grouped by the role it assigns.
for role in roleset:
    ruleset[role] = []
    role_rows = rules[rules.role == role].to_dict(orient='records')

    for rule_dict in tqdm(role_rows, desc=role):
        compiled = create_rules(**rule_dict)
        ruleset[role].append(compiled)

HBox(children=(HTML(value='инструмент'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(HTML(value='каузатив'), FloatProgress(value=0.0, max=2.0), HTML(value='')))




HBox(children=(HTML(value='каузатор'), FloatProgress(value=0.0, max=31.0), HTML(value='')))




HBox(children=(HTML(value='объект'), FloatProgress(value=0.0, max=4.0), HTML(value='')))




HBox(children=(HTML(value='экспериенцер'), FloatProgress(value=0.0, max=5.0), HTML(value='')))




In [24]:
# Index argument parsers by "<argument_id>+<role>"; a later rule with the same key
# replaces an earlier one.
argument_rules = {}
for role, role_rules in ruleset.items():
    for rule in role_rules:
        key = f"{rule['argument_id']}+{role}"
        argument_rules[key] = {
            'role': role,
            'argument_parser': rule['argument_parser']
        }

In [25]:
# Regroup the rules by predicate id: one predicate parser (from the first rule seen
# with that id) plus the list of argument rules attached to it.
predicate_orient = {}
for role, role_rules in ruleset.items():
    for rule in role_rules:
        rule_id = rule['predicate_id']
        if rule_id not in predicate_orient:
            predicate_orient[rule_id] = {
                'predicate_parser': rule['predicate_parser'],
                'arguments': []
            }

        predicate_orient[rule_id]['arguments'].append({
            'role': role,
            'argument_id': rule['argument_id'],
            'argument_parser': rule['argument_parser']
        })

In [26]:
# Final rule table: predicate id -> predicate parser + de-duplicated argument rules.
predicate_orient_rules = {}
for predicate_id in predicate_orient.keys():
    # De-duplicate "<argument_id>+<role>" keys while keeping first-seen order.
    # A plain `set` made the order of argument rules (and therefore the order of
    # roles reported downstream) depend on string hash randomization between runs.
    argument_tuples = list(dict.fromkeys(
        f"{x['argument_id']}+{x['role']}" for x in predicate_orient[predicate_id]['arguments']
    ))
    predicate_orient_rules[predicate_id] = {
        'predicate_parser': predicate_orient[predicate_id]['predicate_parser'],
        'arguments': [argument_rules[key] for key in argument_tuples]
    }

In [27]:
import yargy.pipelines as pipelines

In [28]:
# Parser over a morph pipeline built from all known predicates; used by
# `check_parseable` as a quick "does this text contain a known predicate" test.
filter_pipeline = y.Parser(
    pipelines.morph_pipeline(list(all_predicates)),
    tokenizer=yt.MorphTokenizer(
        morph=CachedMostProbMorphAnalyzer()
    )
)

In [29]:
def check_parseable(text, parser):
    """Return True when ``parser.findall(text)`` yields at least one match."""
    found = list(parser.findall(text))
    return bool(found)

In [30]:
# Sanity check: a sentence that should contain a known predicate.
check_parseable("Вашингтон возмущается бездействием Москвы", filter_pipeline)

True

In [31]:
# Sanity check: a word that is not expected to match the predicate pipeline.
check_parseable("коррелируют", filter_pipeline)

False

In [32]:
class ArgumentExtractor:
    """Interface for predicate/argument extractors; both methods are no-ops here."""

    def __init__(self, *args, **kwargs):
        """Accept and ignore any arguments."""

    def extract(self, sentence: str) -> List[Dict[str, Any]]:
        """Subclasses return predicate/argument structures; the base returns ``None``."""
        return None

In [33]:
from ufal.udpipe import Model, Pipeline, ProcessingError
from predpatt import PredPatt, load_conllu
from predpatt import PredPattOpts
from predpatt.util.ud import dep_v1, dep_v2

class PredPattArgumentExtractor(ArgumentExtractor):
    """Predicate/argument extractor: UDPipe produces CoNLL-U, PredPatt reads it."""

    def __init__(
        self,
        path_to_udpipe: str,
        resolve_relcl: bool = True,
        resolve_appos: bool = True,
        resolve_amod: bool = True,
        resolve_conj: bool = True,
        resolve_poss: bool = True,
        ud = dep_v2.VERSION
    ):
        """Load the UDPipe model and store the PredPatt options.

        Args:
            path_to_udpipe: path of the UDPipe model file.
            resolve_relcl, resolve_appos, resolve_amod, resolve_conj, resolve_poss:
                forwarded unchanged to ``PredPattOpts``.
            ud: forwarded to ``PredPattOpts`` as ``ud``.
        """
        super().__init__()
        self.model = Model.load(path_to_udpipe)
        self.pipeline = Pipeline(self.model, 'tokenize', Pipeline.DEFAULT, Pipeline.DEFAULT, 'conllu')
        self._error = ProcessingError()
        self._opts = PredPattOpts(
            resolve_relcl=resolve_relcl,
            resolve_appos=resolve_appos,
            resolve_amod=resolve_amod,
            resolve_conj=resolve_conj,
            resolve_poss=resolve_poss,
            ud=ud
        )

    def extract(self, sentence: str) -> List[Dict[str, Any]]:
        """Extract predicate/argument token groups from ``sentence``.

        Only the first sentence found in the UDPipe output is analyzed.

        Returns:
            A list of dicts with ``'predicate'`` (token list) and ``'arguments'``
            (list of token lists). The list is empty when UDPipe reports an error
            or yields no parse. The original returned ``None`` on error, which
            contradicted the annotation and broke callers that iterate the result;
            on empty output it raised ``IndexError``.
        """
        processed = self.pipeline.process(sentence, self._error)
        if self._error.occurred():
            print(f"=== Error occurred: {self._error.message}")
            # Replace the error object so that one failure does not stick to later calls.
            self._error = ProcessingError()
            return []

        ud_parses = [ud_parse for _sent_id, ud_parse in load_conllu(processed)]
        if not ud_parses:
            return []

        ppatt = PredPatt(ud_parses[0], opts=self._opts)
        return [
            {
                'predicate': predicate.tokens,
                'arguments': [x.tokens for x in predicate.arguments]
            }
            for predicate in ppatt.instances
        ]

In [34]:
from collections import defaultdict
class MainPhraseExtractor:
    """Pick the syntactic head of a token sequence, optionally with its preposition."""

    def __init__(self, syntax_parser, pymorphy_analyzer):
        self.syntax = syntax_parser
        self.morph = pymorphy_analyzer

    def get_main_phrase(self, words, get_prep=False, verbose=False):
        """Return the head token of ``words`` according to the syntax parser.

        Root candidates are tokens whose head is themselves or ``'0'``; among
        them the one with the most direct dependents wins (the last one on ties).

        Args:
            words: list of token strings forming one chunk.
            get_prep: when True, also look for a dependent of the head tagged
                ``PREP`` by the morph analyzer and return the one closest to the head.
            verbose: print the intermediate maps.

        Returns:
            ``markup.tokens`` when there is no root candidate; ``(prep, head)`` as a
            tuple when ``get_prep`` is set and a preposition is found; otherwise
            ``[head]``.
        """
        markup = next(self.syntax.map([words]))
        head_of = {}
        children_of = defaultdict(list)
        by_id = {}
        roots = []
        for tok in markup.tokens:
            by_id[tok.id] = tok
            head_of[tok.id] = tok.head_id
            children_of[tok.head_id].append(tok.id)

            if tok.head_id in (tok.id, '0'):
                roots.append(tok.id)

        if verbose:
            print("forward ", head_of)
            print("backward ", children_of)
            print("candidates ", roots)

        if not roots:
            return markup.tokens

        # Iterating in reverse makes `max` return the last of several equally
        # good candidates.
        root = max(reversed(roots), key=lambda tid: len(children_of[tid]))
        if not get_prep:
            return [by_id[root]]

        preps = [
            child for child in children_of[root]
            if self.morph.tag(by_id[child].text)[0].POS == 'PREP'
        ]
        if not preps:
            return [by_id[root]]

        nearest = min(preps, key=lambda child: abs(int(child) - int(root)))
        return (by_id[nearest], by_id[root])

In [35]:
class RstClauseSeparator:
    """Split text into clauses with a remote RST pipeline, caching results on disk.

    NOTE(review): cache entries are keyed by the value returned by ``city_32()``;
    if that is a 32-bit hash, two different texts could collide and share an
    entry -- confirm this is acceptable.
    NOTE(review): the cache file is read with ``joblib.load`` (pickle-based); only
    load files from a trusted location.
    """

    def __init__(self, udpipe=('tsa05.isa.ru', 3334), rst=('tsa05.isa.ru', 3335), cache_path="./rst-cache.pkl"):
        """Assemble the processing pipeline and load the cache file if it exists.

        Args:
            udpipe: ``(host, port)`` of the remote UDPipe processor.
            rst: ``(host, port)`` of the remote RST processor.
            cache_path: path of the joblib cache file.
        """
        udpipe_host, udpipe_port = udpipe
        rst_host, rst_port = rst
        self.cache_path = cache_path

        udpipe_stage = (
            ProcessorRemote(udpipe_host, udpipe_port, '0'),
            ['text'],
            {'sentences': 'sentences',
             'tokens': 'tokens',
             'lemma': 'lemma',
             'syntax_dep_tree': 'syntax_dep_tree',
             'postag': 'ud_postag'}
        )
        mystem_stage = (
            ProcessorMystem(delay_init=False),
            ['tokens', 'sentences'],
            {'postag': 'postag'}
        )
        converter_stage = (
            ConverterMystemToUd(),
            ['postag'],
            {'morph': 'morph',
             'postag': 'postag'}
        )
        rst_stage = (
            ProcessorRemote(rst_host, rst_port, 'default'),
            ['text', 'tokens', 'sentences', 'postag', 'morph', 'lemma', 'syntax_dep_tree'],
            {'rst': 'rst'}
        )
        self.ppl = PipelineCommon([udpipe_stage, mystem_stage, converter_stage, rst_stage])

        self.__cache = {}
        self.__hasher = city_32()
        if os.path.exists(self.cache_path):
            self.__cache = jb.load(self.cache_path)

    def extract(self, text):
        """Return the clause texts for ``text``, running the pipeline only on a cache miss."""
        text_hash = self.__hasher(text)
        if text_hash in self.__cache:
            return self.__cache[text_hash]

        result = self.ppl(text)
        clauses = [unit.text for unit in result['rst']]
        self.__cache[text_hash] = clauses
        return clauses

    def flush(self):
        """Write the in-memory cache to ``cache_path``."""
        jb.dump(self.__cache, self.cache_path)

In [36]:
class RoleLabeler:
    """Assign semantic roles to extracted arguments using the compiled rule set."""

    def __init__(
        self,
        argument_extractor: ArgumentExtractor,
        main_phrase_extractor: MainPhraseExtractor,
        filter_pipeline,
        predicate_ruleset,
        mode: str = 'soft'
    ):
        """Store the collaborators and choose the parser pass function.

        Args:
            argument_extractor: object whose ``extract(sentence)`` returns groups
                with ``'predicate'`` and ``'arguments'`` token lists.
            main_phrase_extractor: object providing ``get_main_phrase``.
            filter_pipeline: parser used to pre-filter predicates.
            predicate_ruleset: mapping whose values hold ``'predicate_parser'`` and
                ``'arguments'`` (each with ``'role'`` and ``'argument_parser'``).
            mode: ``'soft'`` uses ``soft_parser_pass``, ``'strict'`` uses
                ``strict_parser_pass``.

        Raises:
            ValueError: for any other ``mode``.
        """
        self.argument_extractor = argument_extractor
        self.main_phrase_extractor = main_phrase_extractor
        self.filter_pipeline = filter_pipeline
        self.ruleset = predicate_ruleset
        if mode == 'soft':
            self.pass_fn = soft_parser_pass
        elif mode == 'strict':
            self.pass_fn = strict_parser_pass
        else:
            raise ValueError(f"Incorrect mode = {mode}, can be 'soft' or 'strict'")

    def check_parse(self, text, parser):
        """Return True when the configured pass function finds a match for ``text``."""
        return len(self.pass_fn(parser, text)) > 0

    def run(self, sentence):
        """Label the arguments of every known predicate found in ``sentence``.

        Returns:
            A list with one dict per predicate group that passes the filter
            pipeline: ``'group'`` is a descriptive name and ``'parses'`` is the list
            of rule matches that assigned at least one role.
        """
        # The extractor may return None when its parser fails; treat that as "no groups"
        # instead of crashing while iterating.
        arg_groups = self.argument_extractor.extract(sentence) or []

        result = []
        for group in arg_groups:
            predicate_tokens = [token.text for token in group['predicate']]
            predicate_txt = " ".join(predicate_tokens)
            # Skip groups whose predicate is not in the known vocabulary.
            if not check_parseable(predicate_txt, self.filter_pipeline):
                continue

            predicate_main = " ".join([x.text for x in self.main_phrase_extractor.get_main_phrase(predicate_tokens)])
            forward_map = {" ".join([token.text for token in argument]): argument for argument in group['arguments']}
            group_name = f"predicate={predicate_txt},arguments=[{','.join(forward_map.keys())}]"

            # The main phrase of an argument does not depend on the predicate rule, so
            # it is computed at most once per argument, and only if some rule matches.
            argument_main_cache = {}
            group_result = []
            for predicate in self.ruleset.values():
                if not self.check_parse(predicate_main, predicate['predicate_parser']):
                    continue

                predicate_result = {
                    'predicate': predicate_txt,
                    'predicate_analyzed': predicate_main,
                    'predicate_tokens': group['predicate'],
                    'arguments': []
                }
                for argument, argument_token_objs in forward_map.items():
                    if argument not in argument_main_cache:
                        argument_tokens = [x.text for x in argument_token_objs]
                        argument_main_cache[argument] = " ".join([
                            x.text for x in self.main_phrase_extractor.get_main_phrase(argument_tokens, True)
                        ])
                    argument_main = argument_main_cache[argument]

                    roles = [
                        rule['role'] for rule in predicate['arguments']
                        if self.check_parse(argument_main, rule['argument_parser'])
                    ]
                    if len(roles) > 0:
                        predicate_result['arguments'].append({
                            'argument': argument,
                            'argument_analyzed': argument_main,
                            'argument_tokens': argument_token_objs,
                            'roles': tuple(roles)
                        })
                if len(predicate_result['arguments']) > 0:
                    predicate_result['arguments'] = tuple(predicate_result['arguments'])
                    group_result.append(predicate_result)
            result.append({'group': group_name, 'parses': group_result})
        return result
            

In [37]:
class ConstraintEnforcer:
    """Apply a chain of constraint callables to a parse, stopping once it becomes empty."""

    def __init__(self, constraints=None):
        """Use the given constraint list as-is, or start with a new empty list."""
        self.constraints = list() if constraints is None else constraints

    def add(self, constraint):
        """Append a callable taking a parse and returning the (possibly modified) parse."""
        self.constraints.append(constraint)

    def enforce(self, parse):
        """Run every constraint, in order, on a shallow copy of ``parse``.

        Returns:
            The result of the last constraint applied; processing stops as soon as
            a constraint returns something of length zero.
        """
        current = parse.copy()
        for apply_constraint in self.constraints:
            current = apply_constraint(current)
            if len(current) == 0:
                break

        return current

In [38]:
from navec import Navec
from slovnet import Syntax
# Load the Navec embeddings and the SlovNet syntax model from local archives, then
# pass the embeddings to the syntax model. The return value is bound to `_` so the
# cell produces no output.
navec = Navec.load('../data/models/navec_news_v1_1B_250K_300d_100q.tar')
syntax = Syntax.load('../data/models/slovnet_syntax_news_v1.tar')
_ = syntax.navec(navec)

In [39]:
# Instantiate the pipeline components: head-phrase extractor (syntax model + pymorphy2),
# UDPipe/PredPatt argument extractor, and the clause separator with its default
# remote endpoints and cache path.
main_phrase_extractor = MainPhraseExtractor(syntax, MorphAnalyzer())
extractor = PredPattArgumentExtractor("../data/models/russian-syntagrus-ud-2.5-191206.udpipe")
clause_extractor = RstClauseSeparator()

In [40]:
# Role labeler in 'soft' mode, i.e. rules are checked with `soft_parser_pass`.
labeler = RoleLabeler(extractor, main_phrase_extractor, filter_pipeline, predicate_orient_rules, mode='soft')

In [41]:
# Post-processing chain applied to every parse; constraints are registered further down in this cell.
enforcer = ConstraintEnforcer()
def enforce_parseable_predicate(parse):
    """Keep ``parse`` only if its analyzed predicate passes ``filter_pipeline``; else return ``{}``."""
    if not check_parseable(parse['predicate_analyzed'], filter_pipeline):
        return {}
    return parse
    
def reduce_duplicate_roles(parse):
    """Remove repeated roles from every argument, keeping first-seen order.

    The argument dicts are updated in place and ``parse['arguments']`` is replaced
    by a list of the same dicts.

    Returns:
        The same ``parse`` object.
    """
    new_args = []
    for arg in parse['arguments']:
        # `dict.fromkeys` de-duplicates while preserving order. `tuple(set(...))`
        # produced an order that varied with string hash randomization, which made
        # the first role (used downstream as the sample role) unstable across runs.
        arg['roles'] = tuple(dict.fromkeys(arg['roles']))
        new_args.append(arg)
    parse['arguments'] = new_args
    return parse
    
def resolve_multiple_expirirencers(parse):
    """Drop the experiencer role from ambiguous arguments when another argument owns it.

    If the parse has at least two arguments and one of them carries exactly the
    roles ``('экспериенцер',)``, every argument with two or more roles loses its
    (first) ``'экспериенцер'`` entry. Argument dicts are updated in place.

    Returns:
        The same ``parse`` object.
    """
    if len(parse['arguments']) >= 2:
        parse_roles = set(arg['roles'] for arg in parse['arguments'])
        if ('экспериенцер',) in parse_roles:
            new_args = []
            for arg in parse['arguments']:
                if len(arg['roles']) >= 2:
                    new_roles = list(arg['roles'])
                    # A multi-role argument need not contain the experiencer role at
                    # all; an unconditional `remove` raised ValueError in that case.
                    if 'экспериенцер' in new_roles:
                        new_roles.remove('экспериенцер')
                    arg['roles'] = tuple(new_roles)
                new_args.append(arg)
            parse['arguments'] = new_args
    return parse

def resolve_single_expiriencer(parse):
    """Collapse the only ambiguous experiencer candidate to a pure experiencer.

    When exactly one argument has two or more roles including ``'экспериенцер'``,
    its roles become ``('экспериенцер',)`` and ``parse['arguments']`` is replaced by
    a list of the same argument dicts. Otherwise the parse is returned untouched.
    """

    def is_ambiguous_experiencer(arg):
        return len(arg['roles']) >= 2 and 'экспериенцер' in arg['roles']

    n_exp = sum(1 for arg in parse['arguments'] if is_ambiguous_experiencer(arg))
    if n_exp != 1:
        return parse

    for arg in parse['arguments']:
        if is_ambiguous_experiencer(arg):
            arg['roles'] = ('экспериенцер',)
    parse['arguments'] = list(parse['arguments'])
    return parse
        
# Register the constraints; `ConstraintEnforcer.enforce` applies them in this order.
enforcer.add(enforce_parseable_predicate)
enforcer.add(reduce_duplicate_roles) # ('каузатор', 'экспериенцер', 'экспериенцер') ->  ('каузатор', 'экспериенцер')
enforcer.add(resolve_multiple_expirirencers)
enforcer.add(resolve_single_expiriencer) # ('каузатор', 'экспериенцер') -> ('экспериенцер')

In [46]:
def visualize(text, parse, main_text):
    """Render one parse as bracketed text plus summary fields.

    Arguments are rendered as ``[tokens#role1/role2]``; the predicate span is wrapped
    in ``[...]`` and the analyzed predicate word is suffixed with ``@Предикат``.

    Args:
        text: clause text; tokenized with ``razdel.tokenize``.
        parse: dict with ``'arguments'``, ``'predicate_tokens'`` and
            ``'predicate_analyzed'``.
        main_text: the full source sentence, copied to the output unchanged.

    Returns:
        Dict with ``main_text``, ``formatted_clause_text``, ``orig_clause_text``,
        ``predicate``, ``arguments`` (list of analyzed argument strings) and ``role``
        (first role of the first argument that has any role, else ``None``).

    Raises:
        ValueError: if the last word of ``predicate_analyzed`` is not among the
            predicate tokens.

    NOTE(review): the rendered tokens are placed by each token's ``.position``,
    which presumably matches the index of the razdel token -- confirm that the two
    tokenizations agree.
    """
    tokens = {i: x.text for i, x in enumerate(razdel.tokenize(text))}
    arguments = []
    sample_role = None
    for arg in parse['arguments']:
        arguments.append(arg['argument_analyzed'])
        if sample_role is None and len(arg['roles']) > 0:
            sample_role = arg['roles'][0]

        rendered = [x.text for x in arg['argument_tokens']]
        rendered[0] = f"[{rendered[0]}"
        # De-duplicate roles in first-seen order: joining a `set` made the label
        # order depend on string hash randomization, so the same parse could be
        # rendered differently from run to run.
        role_label = '/'.join(dict.fromkeys(arg['roles']))
        rendered[-1] = f"{rendered[-1]}#{role_label}]"
        for token, rendered_token in zip(arg['argument_tokens'], rendered):
            tokens[token.position] = rendered_token

    rendered = [x.text for x in parse['predicate_tokens']]
    target_token = parse['predicate_analyzed'].split()[-1] # taking last word of analyzed part of predicate phrase
    target_idx = rendered.index(target_token)
    rendered[0] = f"[{rendered[0]}"
    rendered[target_idx] = f"{rendered[target_idx]}@Предикат"
    rendered[-1] = f"{rendered[-1]}]"
    # Predicate tokens are written last, so they win over argument tokens at the same position.
    for token, rendered_token in zip(parse['predicate_tokens'], rendered):
        tokens[token.position] = rendered_token

    ordered = sorted(tokens.items(), key=lambda x: x[0])
    return {
        'main_text': main_text,
        'formatted_clause_text': " ".join([x[1] for x in ordered]),
        'orig_clause_text': text,
        'predicate': parse['predicate_analyzed'],
        'arguments': arguments,
        'role': sample_role
    }

In [43]:
from itertools import chain
from pathlib import Path

In [44]:
# Walk the corpus directory level by level: each statement descends one level with
# `iterdir()` and flattens the result, so `files` holds the entries four levels
# below "../data/txts/".
cats = [list(x.iterdir()) for x in Path("../data/txts/").iterdir()]
cats = list(chain.from_iterable(cats))
cats = list(chain.from_iterable([x.iterdir() for x in cats]))
files = list(chain.from_iterable([x.iterdir() for x in cats]))

In [45]:
# Read every file as UTF-8 and split it into sentences with razdel.
sentences = []
for file in tqdm(files):
    with open(file, 'r', encoding='utf-8') as f:
        text = f.read()
    sentences.extend(razdel.sentenize(text))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=9429.0), HTML(value='')))




In [47]:
# Keep only the sentence text. NOTE: this rebinds `sentences`, so the cell cannot be
# re-run without re-running the loading cell first (strings have no `.text`).
sentences = [x.text for x in sentences]

In [None]:
import traceback

# Run the pipeline: sentence -> clauses -> role-labelled groups -> constrained parses.
parse_res = []   # visualized parses (dicts produced by `visualize`)
groups_res = []  # raw labeler output for every clause
parses = []      # parses that survived the constraint chain
try:
    for text in tqdm(sentences[:100000]):
        clauses = clause_extractor.extract(text)
        for clause in clauses:
            groups = labeler.run(clause)
            groups_res += groups
            for group in groups:
                for parse in group['parses']:
                    parse = enforcer.enforce(parse)
                    if len(parse) != 0:
                        try:
                            parses.append(parse)
                            parse_res.append(visualize(clause, parse, text))
                        except Exception:
                            print(f"== Error with {text}")
                            print(traceback.format_exc())
finally:
    # Persist the clause cache even when the loop is interrupted or fails, so the
    # clause-extraction results gathered so far are not lost.
    clause_extractor.flush()

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=100000.0), HTML(value='')))

In [51]:
# Write the clause cache to disk (useful after interrupting the loop above).
clause_extractor.flush()

In [49]:
# Number of parses that survived the constraint chain.
len(parses)

638

In [54]:
# Persist the raw parses with joblib.
jb.dump(parses, "parsed-100k.pkl")

['parsed-100k.pkl']

In [55]:
def tag2str(tag):
    """Return the tag's ``grammemes_cyr`` sorted and joined by commas."""
    ordered_grammemes = sorted(tag.grammemes_cyr)
    return ",".join(ordered_grammemes)

In [56]:
import pymorphy2
# Module-level pymorphy2 analyzer used by `get_morph_string`.
morph = pymorphy2.MorphAnalyzer()
def get_most_prob_tags(word, morph):
    """Return the tags of all parses of ``word`` that share the highest score."""
    variants = morph.parse(word)
    max_score = max(variant.score for variant in variants)
    return [variant.tag for variant in variants if variant.score == max_score]

def get_morph_string(parse):
    """Describe the morphology of a visualized parse as one string.

    Every word is rendered as ``word#[tags1/tags2]`` using the most probable
    pymorphy2 tags. Predicate words are concatenated without a separator; each
    argument word is followed by ``;``.

    Args:
        parse: dict with ``'predicate'`` (string) and ``'arguments'`` (iterable of
            strings).

    Returns:
        ``"Предикат=<...>||Аргументы=<...>"``.
    """

    def describe(word):
        tags = [tag2str(tag) for tag in get_most_prob_tags(word, morph)]
        return f"{word}#[{'/'.join(tags)}]"

    predicate_words = [x.text for x in razdel.tokenize(parse['predicate'])]
    argument_words = [
        [x.text for x in razdel.tokenize(arg)]
        for arg in parse['arguments']
    ]

    predicate_str = "".join(describe(word) for word in predicate_words)
    arguments_str = "".join(
        describe(word) + ";"
        for words in argument_words
        for word in words
    )

    return f"Предикат={predicate_str}||Аргументы={arguments_str}"

In [57]:
# Example: morphology string of the first visualized parse.
get_morph_string(parse_res[0])

'Предикат=беситесь#[2л,ГЛ,изъяв,мн,наст,неперех,несов/ГЛ,выкл,мн,неперех,несов,повел]||Аргументы=вы#[2л,МС,им,мн];'

In [58]:
# Add the morphology string to every record and flatten its argument list to text.
for obj in tqdm(parse_res):
    # Records are modified in place. Skipping already-processed ones keeps the cell
    # safe to re-run: a second pass would otherwise tokenize and join the characters
    # of the already-joined `arguments` string.
    if 'morph' in obj:
        continue
    obj['morph'] = get_morph_string(obj)
    obj['arguments'] = ";".join(obj['arguments'])

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=638.0), HTML(value='')))




In [59]:
# Inspect one fully annotated record.
parse_res[0]

{'main_text': 'Включить рециркуляцию автокондиционера летом - это, дружище, вы от злобы беситесь.',
 'formatted_clause_text': 'Включить рециркуляцию автокондиционера летом - [это , дружище , [вы#экспериенцер] от злобы беситесь@Предикат] .',
 'orig_clause_text': 'Включить рециркуляцию автокондиционера летом - это, дружище, вы от злобы беситесь.',
 'predicate': 'беситесь',
 'arguments': 'вы',
 'role': 'экспериенцер',
 'morph': 'Предикат=беситесь#[2л,ГЛ,изъяв,мн,наст,неперех,несов/ГЛ,выкл,мн,неперех,несов,повел]||Аргументы=вы#[2л,МС,им,мн];'}

In [60]:
# Tabulate the annotated records. Note that `parse` was the loop variable of the labeling cell.
parse = pd.DataFrame(parse_res)

In [61]:
# Grouping key "<predicate>+<role>".
parse['_pr'] = parse['predicate'] + "+" + parse['role']

In [63]:
# Save the complete table of annotated records (no index column).
parse.to_csv("parsed_roles-100k.csv", index=False, encoding='utf-8')

In [64]:
# Distinct predicate+role pairs, in the order returned by `value_counts()`.
pairs = parse['_pr'].value_counts().to_dict().keys()

In [65]:
# Shuffle the rows before taking the first examples of each pair. The fixed seed
# makes the selected examples reproducible across runs.
parse = parse.sample(frac=1.0, random_state=42).reset_index(drop=True)

In [66]:
# Keep at most ten (shuffled) examples for each predicate+role pair.
refined = []
for pair in tqdm(pairs):
    pair_rows = parse[parse['_pr'] == pair].head(10)
    refined.extend(pair_rows.to_dict(orient='records'))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=380.0), HTML(value='')))




In [67]:
# Table of the sampled examples.
new_parse = pd.DataFrame(refined)

In [68]:
# Select and order the columns to export.
new_parse = new_parse.loc[:, ['role','predicate','arguments','formatted_clause_text', 'morph', 'main_text']]

In [69]:
# Drop rows that render to the same bracketed clause text.
new_parse = new_parse.drop_duplicates(subset='formatted_clause_text')

In [70]:
# Save the sampled table (index column included).
# NOTE(review): this path resolves to the same file as the earlier
# `parse.to_csv("parsed_roles-100k.csv", ...)` call, so the complete table written
# there is overwritten by this sample -- confirm that this is intended.
new_parse.to_csv("./parsed_roles-100k.csv")

In [71]:
# Rows where the "@Предикат" marker is followed by a space in the rendered clause,
# i.e. the marked predicate token is not glued to the closing bracket.
_interest = new_parse.formatted_clause_text.str.contains("@Предикат ")

In [72]:
# How many sampled rows do / do not satisfy the condition above.
_interest.value_counts()

False    494
True      93
Name: formatted_clause_text, dtype: int64