In [1]:
import pandas as pd

from typing import *

import yargy as y
import yargy.predicates as yp
import yargy.morph as ytm
import yargy.tokenizer as yt

from pymorphy2 import MorphAnalyzer
from isanlp import PipelineCommon
from isanlp.processor_remote import ProcessorRemote
from isanlp.ru.processor_mystem import ProcessorMystem
from isanlp.ru.converter_mystem_to_ud import ConverterMystemToUd 

from pyhash import city_32
from rich import print, inspect
import joblib as jb
import json

from rich import print
import razdel

import os
from tqdm.auto import tqdm

from functools import lru_cache
CACHE_SIZE=10000
from src.contextual_morphology import UdMorphTokenizer, spacy_udpipe

In [2]:
tqdm.pandas()

  from pandas import Panel


In [3]:
FILENAME = "results/results_youtube_emojipunct_new_morph.pkl"
RST_CACHE_NAME = "cache/cache_w1.pkl"

In [4]:
model = spacy_udpipe.load_from_path(
    lang="ru",
    path="./data/models/russian-syntagrus-ud-2.5-191206.udpipe",
    meta={"description": "Custom 'hr' model"}
)
ud_tokenizer = UdMorphTokenizer(model)

In [5]:
def get_morph(word, cyr=True):
    morph = MorphAnalyzer()
    if cyr:
        return morph.lat2cyr(morph.parse(word)[0].tag)
    else:
        return morph.parse(word)[0].tag

In [11]:
rules = pd.read_csv("data/rules/rules_formatted.csv")

In [12]:
predicates_ = pd.read_csv("data/rules/predicates.csv")

In [13]:
rules['predicate_type'] = rules['require_deverbal_noun'].apply(lambda x: 'сущ' if x == '1' else 'глаг')

In [14]:
rules = rules[rules.require_deverbal_noun != '1']
rules = rules[rules.require_reflexive != '1']

In [15]:
types = set(predicates_.type.tolist())
if 'deverbal_noun' in types:
    deverbal_nouns = set(predicates_[predicates_.type == 'deverbal_noun']['predicate'].to_list())
else:
    deverbal_nouns = set()
    
if 'status_category' in types:
    status_categories = set(predicates_[predicates_.type == 'status_category']['predicate'].to_list())
else:
    status_categories = set()
    
predicates = set(predicates_[predicates_.type == 'predicate']['predicate'].to_list())

rule_specific = set(rules['predicate'].to_list())

In [16]:
all_predicates = predicates | rule_specific # | deverbal_nouns | status_categories |

In [17]:
all_predicates -= {'?'}

In [18]:
def create_predicate_rule(
    require_deverbal_noun: str,
    require_reflexive: str,
    require_status_category: str,
    predicate: str,
    predicate_type: str,
    **kwargs
):
    rule_id = f"predicate={predicate},deverbal={require_deverbal_noun},reflexive={require_reflexive},status_category={require_status_category},predicate_type={predicate_type}"
    return rule_id, y.rule(
        y.and_(
            yp.gram("VERB")
            #req_predicate(predicate, predicate_type),
            #req_deverbal(require_deverbal_noun),
            #req_reflexive(require_reflexive)
        )
    )

In [19]:
def create_argument_role(argument_type: str, case: str, preposition: str, **kwargs):
    rule_id = f"argument_type={argument_type},case={case},preposition={preposition}"
    arg = y.and_(
        req_argument(),
        req_animacy(argument_type),
        req_case(case)
    )
    internal = y.or_(
        y.and_(
            yp.gram("ADJF"), 
            y.or_(
                yp.normalized("этот"),
                yp.normalized("тот")
            )
        ),
        y.not_(yp.gram("ADJF"))
    )
    
    rule = y.or_(
        y.rule(req_preposition(preposition), arg),
        y.rule(req_preposition(preposition), internal, arg)
    )
    return rule_id, rule

In [20]:
def req_deverbal(require_deverbal_noun: str = '?'):
    if require_deverbal_noun == '1': ## strictly deverbal noun
        return y.and_(
            yp.gram("NOUN"),
            yp.in_caseless(deverbal_nouns)
        )
    elif require_deverbal_noun == '0': ## strictly regular verb
        return yp.gram("VERB")
        
    elif require_deverbal_noun == '?': ## anything
        return y.or_(
            y.and_(
                yp.gram("NOUN"),
                yp.in_caseless(deverbal_nouns)
            ),
            yp.gram("VERB"),
        )
    else:
        raise ValueError("Incorrect deverbal status")
    return yp.gram("VERB")

In [21]:
def req_reflexive(reflexive_status: str = '?'):
    
    def is_reflexive_verb(verb: str):
        return verb.endswith("ся") or verb.endswith("сь")
    
    if reflexive_status == "1":
        return yp.custom(is_reflexive_verb)
    if reflexive_status == "0":
        return y.not_(yp.custom(is_reflexive_verb))
    elif reflexive_status == "?":
        return yp.true()
    else:
        raise ValueError ("Incorrect reflexive status")

In [22]:
def req_animacy(animacy: str = 'любой'):
    if animacy == 'любой':
        return yp.true()
    elif animacy == 'одуш.':
        return y.or_(
            y.not_(yp.gram('Inan')),
            yp.gram("Anim")
        )
    elif animacy == 'неодуш.':
        return y.or_(
            yp.gram('Inan'),
            y.not_(yp.gram("Anim"))
        )
    else:
        raise ValueError("Incorrect Animacy Type")

In [23]:
def req_argument():
    return y.and_(
        y.not_(
            y.or_( ## prohibits arguments from being any of following parts-of-speech
                yp.gram('PART'),
                yp.gram("ADP"),
                yp.gram("CCONJ"),
                yp.gram('SCONJ'),
                yp.gram("INTJ"),
                yp.gram("ADJ"),
                yp.gram("VERB")
            )
        ),
        y.or_(
            yp.gram("NOUN"),
            yp.gram("PROPN"),
            yp.gram("PRON")
        )
    )

In [24]:
def req_predicate(word: str = "?", predicate_type: str = 'глаг'):
    if predicate_type == 'глаг':
        predicate = yp.gram("VERB")
        
    elif predicate_type == 'сущ':
        predicate = yp.gram("NOUN")
        
    elif predicate_type == 'любой':
        predicate = y.or_(
            yp.gram("VERB"),
            yp.gram("NOUN")
        )
    else:
        raise ValueError("predicate_type must be глаг or сущ or любой")
    if word != '?':
        if "|" not in word:
            # single-word scope
            predicate = y.and_(
                yp.normalized(word),
                predicate
            )
        else:
            predicate_words = word.split("|")
            scope_rule = list(map(yp.normalized, predicate_words))
            scope_rule = y.or_(*scope_rule)
            predicate = y.and_(
                scope_rule,
                predicate
            )
        
    return predicate

In [25]:
from collections import defaultdict

In [26]:
def req_case(case: str = 'в'):
    mapping = {
        'в': 'Acc',
        'т': 'Abl',
        'д': 'Dat',
        'р': 'Gen',
        'и': 'Nom',
        'п': 'Loc'
    }
    
    if case not in mapping:
        raise ValueError(f"{case} unknown")
        
    case_rule = yp.gram(mapping[case])
    #del mapping[case]
    #not_rule = y.not_(y.or_(*(yp.gram(other_case) for other_case in mapping.values())))
    
    return case_rule#, not_rule)

In [27]:
def req_preposition(preposition: str = None):
    preposition_rule = yp.eq(preposition)
    
    if preposition == 'None':
        return y.not_(preposition_rule)
    else:
        return preposition_rule

In [28]:
def soft_parser_pass(parser, text):
    matches = []
    for match in parser.findall(text):
        matches.append({
            'text': " ".join([x.value for x in match.tokens]),
            'span': tuple(match.span)
        })

    return matches

In [29]:
def strict_parser_pass(parser, text):
    match = parser.match(text)
    if match is not None:
        matches = [{
            'text': " ".join([x.value for x in match.tokens]),
            'span': tuple(match.span)
        }]
    else:
        matches = []
    
    return matches

In [30]:
def create_rules(**kwargs):
    predicate_rule_id, predicate_rule = create_predicate_rule(**kwargs)
    argument_rule_id, argument_rule = create_argument_role(**kwargs)
    return {
        'predicate_id': predicate_rule_id,
        'argument_id': argument_rule_id,
        'predicate_parser': y.Parser(predicate_rule, tokenizer=ud_tokenizer),
        'argument_parser': y.Parser(argument_rule, tokenizer=ud_tokenizer)
    }

In [31]:
roleset = set(rules.role)

In [32]:
ruleset = {}

In [33]:
from tqdm.auto import tqdm

In [34]:
for role in roleset:
    ruleset[role] = []
    
    for rule_dict in tqdm(rules[rules.role == role].to_dict(orient='records'), desc=role):
        ruleset[role].append(create_rules(**rule_dict))

HBox(children=(HTML(value='экспериенцер'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(HTML(value='объект'), FloatProgress(value=0.0, max=4.0), HTML(value='')))




HBox(children=(HTML(value='инструмент'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(HTML(value='каузатор'), FloatProgress(value=0.0, max=23.0), HTML(value='')))




In [35]:
argument_rules = {}
for role in ruleset.keys():
    for rule in ruleset[role]:
        argument_rules[f"{rule['argument_id']}+{role}"] = {
            'role':role,
            'rule_id': rule['argument_id'],
            'argument_parser': rule['argument_parser']
        }

In [36]:
predicate_orient = {}
for role in ruleset.keys():
    for rule in ruleset[role]:
        rule_id = rule['predicate_id']
        if rule_id not in predicate_orient:
            predicate_orient[rule_id] = {}
            predicate_orient[rule_id]['predicate_parser'] = rule['predicate_parser']
            predicate_orient[rule_id]['arguments'] = []
            
        predicate_orient[rule_id]['arguments'].append({
            'role': role,
            'argument_id': rule['argument_id'],
            'argument_parser': rule['argument_parser']
        })

In [37]:
predicate_orient_rules = {}
for predicate_id in predicate_orient.keys():
    argument_tuples = set([
        f"{x['argument_id']}+{x['role']}" for x in predicate_orient[predicate_id]['arguments']
    ])
    predicate_orient_rules[predicate_id] = {
        'predicate_parser': predicate_orient[predicate_id]['predicate_parser'],
        'arguments': [argument_rules[key] for key in argument_tuples]
    }

In [38]:
import yargy.pipelines as pipelines

In [39]:
filter_pipeline = y.Parser(
    pipelines.morph_pipeline(list(all_predicates)),
    tokenizer=ud_tokenizer
)

In [40]:
def check_parseable(text, parser):
    return len(list(parser.findall(text))) > 0

In [41]:
class ArgumentExtractor:
    
    def __init__(self, *args, **kwargs):
        pass
    
    def extract(self, sentence: str) -> List[Dict[str, Any]]:
        pass

In [42]:
from src.predicate_argument_extract import UdPredicateArgumentExtractor

In [43]:
from ufal.udpipe import Model, Pipeline, ProcessingError
from predpatt import PredPatt, load_conllu
from predpatt.patt import Token
from predpatt import PredPattOpts
from predpatt.util.ud import dep_v1, dep_v2

class PredPattArgumentExtractor(ArgumentExtractor):
    def __init__(
        self,
        path_to_udpipe: str,
        resolve_relcl: bool = True,
        resolve_appos: bool = True,
        resolve_amod: bool = True,
        resolve_conj: bool = True,
        resolve_poss: bool = True,
        ud = dep_v2.VERSION
    ):
        super().__init__()
        self.model = Model.load(path_to_udpipe)
        self.pipeline = Pipeline(self.model, 'tokenize', Pipeline.DEFAULT, Pipeline.DEFAULT, 'conllu')
        self._error = ProcessingError()
        self._opts = PredPattOpts(
            resolve_relcl=resolve_relcl,
            resolve_appos=resolve_appos,
            resolve_amod=resolve_amod,
            resolve_conj=resolve_conj,
            resolve_poss=resolve_poss,
            ud=ud
        )
        
    @lru_cache(maxsize=100000)
    def extract(self, sentence: str) -> List[Dict[str, Any]]:
        processed = self.pipeline.process(sentence, self._error)
        if self._error.occurred():
            print(f"=== Error occurred: {self._error.message}")
            self._error = ProcessingError()
            return None
        else:
            conll_example = [ud_parse for sent_id, ud_parse in load_conllu(processed)][0]
            ppatt = PredPatt(conll_example, opts=self._opts)
            result = []
            for predicate in ppatt.instances:
                structure = {
                    'predicate': predicate.tokens,
                    'arguments': [x.tokens for x in predicate.arguments]
                }
                result.append(structure)
                
            return result        

In [44]:
from collections import defaultdict
class MainPhraseExtractor:
    
    def __init__(self, syntax_parser, pymorphy_analyzer):
        self.syntax = syntax_parser
        self.morph = pymorphy_analyzer

    def get_main_phrase(self, words, get_prep=False, verbose=False):
        markup = next(self.syntax.map([words]))
        forward = {}
        backward = defaultdict(list)
        token_map = {}
        candidates = []
        for token in markup.tokens:
            if token.head_id not in backward:
                backward[token.head_id] = []

            token_map[token.id] = token
            forward[token.id] = token.head_id
            backward[token.head_id].append(token.id)

            if token.id == token.head_id or token.head_id == '0':
                candidates.append(token.id)
             
        if verbose:
            print("forward ", forward)
            print("backward ", backward)
            print("candidates ", candidates)
                
        if len(candidates) == 0:
            return markup.tokens

        candidate = sorted(candidates, key=lambda x: len(backward[x]))[-1]
        if get_prep:
            prep_candidates = backward[candidate]
            prep_candidates = list(
                filter(lambda x: self.morph.tag(token_map[x].text)[0].POS == 'PREP', prep_candidates)
            )
            if len(prep_candidates) == 0:
                return [token_map[candidate]]
            
            prep = sorted(prep_candidates, key=lambda x: abs(int(x) - int(candidate)))[0]
            return (token_map[prep], token_map[candidate])

        return [token_map[candidate]]

In [45]:
from isanlp.processor_udpipe import ProcessorUDPipe
class RstClauseSeparator:
    def __init__(
        self,
        udpipe_model: str = "./data/models/russian-syntagrus-ud-2.5-191206.udpipe", 
        rst=('papertext.ru', 5555),
        cache_path="./rst-cache.pkl"
    ):
        rst_host, rst_port = rst
        self.cache_path = cache_path
        self.ppl = PipelineCommon([
            (ProcessorUDPipe(udpipe_model),
             ['text'],
             {'sentences': 'sentences',
              'tokens': 'tokens',
              'lemma': 'lemma',
              'syntax_dep_tree': 'syntax_dep_tree',
              'postag': 'ud_postag'}),
            (ProcessorMystem(delay_init=False),
             ['tokens', 'sentences'],
             {'postag': 'postag'}),
            (ConverterMystemToUd(),
             ['postag'],
             {'morph': 'morph',
              'postag': 'postag'}),
            (ProcessorRemote(rst_host, rst_port, 'default'),
             ['text', 'tokens', 'sentences', 'postag', 'morph', 'lemma', 'syntax_dep_tree'],
             {'clauses': 'clauses'})])
        self.__cache = {}
        self.__hasher = city_32()
        if os.path.exists(self.cache_path):
            self.__cache = jb.load(self.cache_path)
        
    def extract(self, text):
        text_hash = self.__hasher(text)
        if text_hash in self.__cache:
            return self.__cache[text_hash]
        else:
            result = self.ppl(text)
            clauses = [x.text for x in result['clauses']]
            self.__cache[text_hash] = clauses
            return clauses
        
        
    def flush(self):
        jb.dump(self.__cache, self.cache_path)

In [46]:
class RoleLabeler:
    def __init__(
        self,
        argument_extractor: ArgumentExtractor,
        filter_pipeline,
        predicate_ruleset,
        mode: str = 'soft',
        extend_arguments: bool = False,
        drop_soft_predicates: bool = False
    ):
        
        self.argument_extractor = argument_extractor
        self.filter_pipeline = filter_pipeline
        self.ruleset = predicate_ruleset
        self.extend_arguments = extend_arguments
        self.drop_soft_predicates = drop_soft_predicates
        if mode == 'soft':
            self.pass_fn = soft_parser_pass
        elif mode == 'strict':
            self.pass_fn = strict_parser_pass
        else:
            raise ValueError(f"Incorrect mode = {mode}, can be 'soft' or 'strict'")
            
    def check_parse(self, text, parser):
        return len(self.pass_fn(parser, text)) > 0
    
    def run(self, sentence: str, return_applied_rules: bool = False):
        tokenized = map(lambda x: x.text, razdel.tokenize(sentence))
        words = list(map(lambda x: Token(x[0], x[1], None), enumerate(tokenized)))
        arg_groups = self.argument_extractor.extract(sentence)
        #arg_groups = list(
        #    filter(
        #        lambda x: check_parseable(
        #            x['predicate'].text,
        #            self.filter_pipeline
        #        ),
        #        arg_groups
        #    )
        #)
        result = []
        for group in arg_groups:
            
            predicate_txt = group['predicate'].text
            predicate_tokens = [predicate_txt]
            predicate_main = predicate_txt
            forward_map = {argument.text: [argument] for argument in group['arguments']}
            group_name = f"predicate={predicate_txt},arguments=[{','.join(forward_map.keys())}]"
            group_result = []
            
            if predicate_main.endswith("ть") or predicate_main.endswith("ться"):
                if self.drop_soft_predicates:
                    continue
                    
            # iterating over predicates to find match
            for predicate_id, predicate in self.ruleset.items():
                if self.check_parse(predicate_main, predicate['predicate_parser']):
                    predicate_result = {
                        'predicate': predicate_txt,
                        'predicate_analyzed': predicate_main,
                        'predicate_morph': get_morph(predicate_main),
                        'predicate_tokens': group['predicate'],
                        'arguments': []
                    }
                    if return_applied_rules:
                        predicate_result['applied_predicate_rule'] = predicate_id,

                    for argument in forward_map.keys():
                        argument_tokens = [x.text for x in forward_map[argument]]
                        offset = min(x.position for x in forward_map[argument]) - 1
                        argument_main_phrase = forward_map[argument]
                        argument_main = " ".join([
                            x.text for x in argument_main_phrase
                        ])
                        argument_word = argument_main
                        
                        token_positions = [(offset + int(x.position)) for x in argument_main_phrase]
                        try:
                            if self.extend_arguments: # extending argument with up to 2 previous tokens to ensure preposition included
                                min_pos = min(token_positions)
                                if min_pos >= 2:
                                    argument_main = f"{words[min_pos - 2].text} {words[min_pos - 1].text} {argument_main}"
                                elif min_pos == 1:
                                    argument_main = f"{words[0].text} {argument_main}"
                        except IndexError as e:
                            pass
                            #print(f"Index error for {token_positions} at {words}")
                        
                        # iterating over possible arguments of matched predicate
                        roles = []
                        for argument_rule in predicate['arguments']:
                            parser = argument_rule['argument_parser']
                            rule_id = argument_rule['rule_id']
                            role = argument_rule['role']
                            if self.check_parse(argument_main, parser):
                                #print(f"Applied rule: {rule_id} to {argument_main}")
                                if return_applied_rules:
                                    roles.append({
                                        'role': role,
                                        'applied_rule': rule_id
                                    })
                                else:
                                    roles.append(role)
                                
                        if len(roles) > 0: 
                            predicate_result['arguments'].append({
                                'argument': argument,
                                'argument_main': argument_main,
                                'argument_morph': get_morph(argument_main),
                                'argument_analyzed': argument_word,
                                'argument_tokens': forward_map[argument],
                                'roles': tuple(roles)
                            })
                    if len(predicate_result['arguments']) > 0:
                        predicate_result['arguments'] = tuple(predicate_result['arguments'])
                        group_result.append(predicate_result)
            result.append({'group': group_name, 'parses': group_result})
        return result
            

In [47]:
class ConstraintEnforcer:
    def __init__(self, constraints=None):
        if constraints is None:
            constraints = list()
            
        self.constraints = constraints
        
    def add(self, constraint):
        self.constraints.append(constraint)
        
    def enforce(self, parse):
        a_parse = parse.copy()
        for constraint in self.constraints:
            a_parse = constraint(a_parse)
            if len(a_parse) == 0:
                return a_parse
            
        return a_parse

In [48]:
extractor = UdPredicateArgumentExtractor("./data/models/russian-syntagrus-ud-2.5-191206.udpipe")

In [44]:
clause_extractor = RstClauseSeparator(cache_path="./experiments/rst-cache.pkl")

In [49]:
labeler = RoleLabeler(
    extractor,
    filter_pipeline, 
    predicate_orient_rules,
    mode='soft',
    extend_arguments=True,
    drop_soft_predicates=False
)

In [50]:
enforcer = ConstraintEnforcer()
def enforce_parseable_predicate(parse):
    if check_parseable(parse['predicate_analyzed'], filter_pipeline):
        return parse
    else:
        return {}
    
def reduce_duplicate_roles(parse):
    new_args = []
    for arg in parse['arguments']:
        arg['roles'] = tuple(set(arg['roles']))
        new_args.append(arg)
    parse['arguments'] = new_args
    return parse
    
def resolve_multiple_expirirencers(parse):
    if len(parse['arguments']) >= 2:
        parse_roles = set(arg['roles'] for arg in parse['arguments'])
        if ('экспериенцер',) in parse_roles:
            new_args = []
            for arg in parse['arguments']:
                if len(arg['roles']) >= 2:
                    new_roles = list(arg['roles'])
                    if 'экспериенцер' in new_roles:
                        new_roles.remove('экспериенцер')
                    arg['roles'] = tuple(new_roles)
                new_args.append(arg)
            parse['arguments'] = new_args
    return parse

def resolve_single_expiriencer(parse):
    parse_roles = [arg['roles'] for arg in parse['arguments'] if len(arg['roles']) >= 2]
    if len(parse_roles) > 0:
        n_exp = 0
        for role in parse_roles:
            if 'экспериенцер' in role:
                n_exp += 1
                
        if n_exp == 1:
            new_args = []
            for arg in parse['arguments']:
                if len(arg['roles']) >= 2 and 'экспериенцер' in arg['roles']:
                    arg['roles'] = ('экспериенцер',)
                new_args.append(arg)
            parse['arguments'] = new_args
    return parse
        
enforcer.add(enforce_parseable_predicate)
enforcer.add(reduce_duplicate_roles) # ('каузатор', 'экспериенцер', 'экспериенцер') ->  ('каузатор', 'экспериенцер')
enforcer.add(resolve_multiple_expirirencers)
enforcer.add(resolve_single_expiriencer) # ('каузатор', 'экспериенцер') -> ('экспериенцер')

In [51]:
pd.set_option("display.max_columns", 1000)

In [48]:
data = pd.read_parquet("./data/youtube/youtube_1yeat.parquet")

In [46]:
data.head()

Unnamed: 0,file,text
0,D5S_deYcRqI,У нас на всех ток шоу обсуждают всё кроме реал...
1,D5S_deYcRqI,Не волнуйся! Все будет хорошо!!\n
2,D5S_deYcRqI,А есле полезити занимать кредит у МВФ мерового...
3,D5S_deYcRqI,"@Александр Чемезов Я знаю и помню, когда не за..."
4,D5S_deYcRqI,Глупость! Умный истеблишмент США стравливает п...


In [60]:
import emoji
import re
from multiprocessing import Pool

In [52]:
def emoji_to_punct(target: str, symbol: str = ";"):
    return re.sub(
        r'{{(.*)}}',
        symbol,
        emoji.demojize(target, delimiters=("{{", "}}"))
    )

In [61]:
def parallel_apply(series, func, n_cores=4):
    pool = Pool(n_cores)
    series = pool.map(func, tqdm(series))
    pool.close()
    pool.join()
    return series

In [54]:
parallel_processed = parallel_apply(data['text'], emoji_to_punct, 24)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=596160.0), HTML(value='')))




In [55]:
data['text'] = parallel_processed

In [59]:
clauses = []
errors = []
for text in tqdm(data['text'], total=len(data)):
    text_clauses = []
    for sentence in razdel.sentenize(text):
        try:
            text_clauses += clause_extractor.extract(sentence.text.replace(". ", ""))
        except Exception as e:
            errors.append((sentence.text, e))
            
    text_clauses = list(map(str.lower, text_clauses))
            
    clauses.append(text_clauses)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=596160.0), HTML(value='')))




In [60]:
data['clauses'] = clauses

In [62]:
data.to_parquet("./data/youtube/youtube_1year_preprocessed.parquet", index=False)

In [52]:
data = pd.read_parquet("./data/youtube/youtube_1year_preprocessed.parquet")

In [53]:
data.shape

(596160, 3)

In [54]:
data.drop_duplicates("text", inplace=True)

In [55]:
data.shape

(488087, 3)

In [56]:
data.head()

Unnamed: 0,file,text,clauses
0,D5S_deYcRqI,У нас на всех ток шоу обсуждают всё кроме реал...,[у нас на всех ток шоу обсуждают всё кроме реа...
1,D5S_deYcRqI,Не волнуйся! Все будет хорошо!!\n,"[не волнуйся!, все будет хорошо!!]"
2,D5S_deYcRqI,А есле полезити занимать кредит у МВФ мерового...,[а есле полезити занимать кредит у мвф меровог...
3,D5S_deYcRqI,"@Александр Чемезов Я знаю и помню, когда не за...","[@александр чемезов я знаю и помню, когда не з..."
4,D5S_deYcRqI,Глупость! Умный истеблишмент США стравливает п...,"[глупость!, умный истеблишмент сша стравливает..."


In [76]:
def filter_clauses(clauses):
    return list(filter(lambda x: check_parseable(x, filter_pipeline), clauses))

In [77]:
filtered_clauses = parallel_apply(data['clauses'][:100000], filter_clauses, n_cores=48)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=100000.0), HTML(value='')))




In [78]:
def parsing_fn(clauses):
    results = []
    for clause in clauses:
        try:
            groups = labeler.run(clause)
            for i, group in enumerate(groups):
                for j, parse in enumerate(group['parses']):
                    groups[i]['parses'][j] = enforcer.enforce(parse)
            results.append(groups)
        except Exception as e:
            results.append([])
            
    return results

In [None]:
parses = parallel_apply(filtered_clauses, parsing_fn, n_cores=47)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=100000.0), HTML(value='')))

In [72]:
subdata = data[:10000].copy()

In [73]:
subdata['clauses'] = filtered_clauses
subdata['parses'] = parses

In [75]:
subdata.to_pickle("./result.pickle")

# Predicate-Argument Structure Extraction works okay, problem is withing the rules and/or contextual morphology

In [68]:
parses = []
empty_parses = []
subdata = data[:10000]
subdata['clauses'] = filtered_clauses
for row in tqdm(subdata.itertuples(), total=len(subdata)):
    text = row.text
    file = row.file
    clauses = row.clauses
        
    for i, clause in enumerate(clauses):
        try:
            groups = labeler.run(clause)
            if len(groups) != 0:
                for j, group in enumerate(groups):
                    for k, parse in enumerate(group['parses']):
                        parse = enforcer.enforce(parse)
                        if len(parse) != 0:
                            parses.append({
                                'text': text,
                                'clause_text': clause,
                                'content_hash': file,
                                'clause_idx': i,
                                'group_idx': j,
                                'parse_idx': k,
                                'parse': parse
                            })
                        else:
                            empty_parses.append({
                                'text': text,
                                'clause_text': clause,
                                'content_hash': file,
                                'clause_idx': i,
                                'group_idx': j,
                                'parse_idx': k,
                                'parse': parse
                                    })
        except Exception as e:
            print(f"Error {e} with text {clause}")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subdata['clauses'] = filtered_clauses


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10000.0), HTML(value='')))




KeyboardInterrupt: 

In [55]:
len(parses)

407

In [57]:
print(parses[-30])

In [51]:
clause_extractor.flush()

In [52]:
from joblib import dump, load

In [53]:
dump({'full_parses': parses, 'empty_parses': empty_parses}, "results/results_youtube_new_morph.pkl")

['results/results_youtube_new_morph.pkl']

In [54]:
print("Done")