In [8]:
import joblib as jb
from functools import lru_cache
from tqdm.auto import tqdm
CACHE_SIZE=10000
from rich import print, inspect
import razdel
import pandas as pd
import spacy_udpipe

In [9]:
model = spacy_udpipe.load_from_path(lang='ru', path="./data/models/russian-syntagrus-ud-2.5-191206.udpipe")

In [10]:
import time

In [11]:
def visualize(text, parse):
    tokens = {i: x.text for i, x in enumerate(razdel.tokenize(text))}
    arguments = []
    sample_role = None
    for i, arg in enumerate(parse['arguments']):
        arguments.append(arg['argument_analyzed'])
        if i == 0 or sample_role is None:
            if len(arg['roles']) > 0:
                sample_role = arg['roles'][0]
        new_tokens = [x.text for x in arg['argument_tokens']]
        new_tokens[0] = f"[{new_tokens[0]}"
        new_tokens[-1] = f"{new_tokens[-1]}#{'/'.join(set(arg['roles']))}]"
        new_tokens = {arg['argument_tokens'][i].position: new_tokens[i] for i in range(len(new_tokens))}
        tokens = {**tokens, **new_tokens}
        
    new_tokens = [x.text for x in [parse['predicate_tokens']]]
    target_token = parse['predicate_analyzed'].split()[-1] # taking last word of analyzed part of predicate phrase
    target_idx = new_tokens.index(target_token)
    new_tokens[0] = f"[{new_tokens[0]}"
    new_tokens[target_idx] = f"{new_tokens[target_idx]}@Предикат"
    new_tokens[-1] = f"{new_tokens[-1]}]"
    new_tokens = {[parse['predicate_tokens']][i].position: new_tokens[i] for i in range(len(new_tokens))}
    tokens = {**tokens, **new_tokens}
    tokens = sorted(tokens.items(), key=lambda x: x[0])
    return " ".join([x[1] for x in tokens])

In [12]:
parses_control = pd.read_pickle("./result.pickle")
#parses_1year = jb.load("youtube-parses.jbl")['full_parses']

In [15]:
total = parses_control

In [16]:
def get_morph_predicate(parse, doc):
    tokens = {x.position for x in [parse['parse']['predicate_tokens']]}
    tokens_morph = {}
    for i, tok in enumerate(doc):
        if i in tokens:
            tokens_morph[i] = f"{tok.pos_}#{tok.morph}"
            
    result = ""
    for x in [parse['parse']['predicate_tokens']]:
        result += f"<{x.text.upper()}={tokens_morph[x.position]}>"
        
    return result

In [17]:
def get_morph_arguments(parse, doc):
    result = []
    for argument in parse['parse']['arguments']:
        tokens = {x.position for x in argument['argument_tokens']}
        tokens_morph = {}
        for i, tok in enumerate(doc):
            if i in tokens:
                tokens_morph[i] = f"{tok.pos_}#{tok.morph}"
                
        arg_result = ""
        for x in argument['argument_tokens']:
            arg_result += f"<{x.text.upper()}={tokens_morph[x.position]}>"
            
        result.append(arg_result)
        
    return result

In [18]:
def get_morph_full(parse):
    doc = model(parse['clause_text'])
    predicate_morph = get_morph_predicate(parse, doc)
    arguments_morph = get_morph_arguments(parse, doc)
        
    return predicate_morph, arguments_morph

In [25]:
parses_control.head()

Unnamed: 0,file,text,clauses,parses
0,D5S_deYcRqI,У нас на всех ток шоу обсуждают всё кроме реал...,[сегодня жену и дочь обрадовали пособием по бе...,"[[{'group': 'predicate=обрадовали,arguments=[ж..."
1,D5S_deYcRqI,Не волнуйся! Все будет хорошо!!\n,[не волнуйся!],"[[{'group': 'predicate=волнуйся,arguments=[]',..."
2,D5S_deYcRqI,А есле полезити занимать кредит у МВФ мерового...,[а есле полезити занимать кредит у мвф меровог...,"[[{'group': 'predicate=занимать,arguments=[пол..."
3,D5S_deYcRqI,"@Александр Чемезов Я знаю и помню, когда не за...",[],[]
4,D5S_deYcRqI,Глупость! Умный истеблишмент США стравливает п...,[],[]


In [38]:
parses_control['n_parses'] = parses_control['parses'].apply(lambda x: len(x))

In [48]:
total = []
for row in tqdm(parses_control.itertuples(), total=len(parses_control)):
    idx = row.file
    text = row.text
    for i, (clause, parses) in enumerate(zip(row.clauses, row.parses)):
            for j, group in enumerate(parses):
                for k, parse in enumerate(group['parses']):
                    total.append({
                        'text': text,
                        'clause_text': clause,
                        'idx': idx,
                        'clause_idx': i,
                        'group_idx': j,
                        'parse_idx': k,
                        'parse': parse
                    })

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10000.0), HTML(value='')))




In [50]:
result = []
for obj in tqdm(total):
    try:
        predicate_morph, arguments_morph = get_morph_full(obj)
        file = obj['idx']
        if len(obj['parse']) == 0:
            continue
        else:
            for i, argument in enumerate(obj['parse']['arguments']):
                for role in argument['roles']:
                    predicate_word = obj['parse']['predicate_analyzed']
                    argument_word = argument['argument_analyzed']
                    result.append({
                        'id': file,
                        'role': role,
                        'argument': argument_word,
                        'predicate': predicate_word,
                        'clause_text': obj['clause_text'],
                        'formatted_text': visualize(obj['clause_text'], obj['parse']),
                        'predicate_morph': predicate_morph,
                        'arguments_morph': arguments_morph[i]
                    })
    except Exception as e:
        pass

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=20330.0), HTML(value='')))




In [51]:
dataframe = pd.DataFrame(result)

In [52]:
dataframe.head()

Unnamed: 0,id,role,argument,predicate,clause_text,formatted_text,predicate_morph,arguments_morph
0,D5S_deYcRqI,экспериенцер,жену,обрадовали,сегодня жену и дочь обрадовали пособием по без...,сегодня [жену#экспериенцер] и дочь [обрадовали...,<ОБРАДОВАЛИ=VERB#Aspect=Perf|Mood=Ind|Number=P...,<ЖЕНУ=NOUN#Animacy=Anim|Case=Acc|Gender=Fem|Nu...
1,D5S_deYcRqI,каузатор,кредит,занимать,а есле полезити занимать кредит у мвф мерового...,а есле полезити [занимать@Предикат] [кредит#ка...,<ЗАНИМАТЬ=VERB#Aspect=Imp|VerbForm=Inf|Voice=Act>,<КРЕДИТ=NOUN#Animacy=Inan|Case=Acc|Gender=Masc...
2,D5S_deYcRqI,каузатор,полезити,занимать,а есле полезити занимать кредит у мвф мерового...,а есле [полезити#каузатор] [занимать@Предикат]...,<ЗАНИМАТЬ=VERB#Aspect=Imp|VerbForm=Inf|Voice=Act>,<ПОЛЕЗИТИ=NOUN#Animacy=Inan|Case=Loc|Gender=Ma...
3,D5S_deYcRqI,каузатор,полезити,занимать,а есле полезити занимать кредит у мвф мерового...,а есле [полезити#каузатор] [занимать@Предикат]...,<ЗАНИМАТЬ=VERB#Aspect=Imp|VerbForm=Inf|Voice=Act>,<ПОЛЕЗИТИ=NOUN#Animacy=Inan|Case=Loc|Gender=Ma...
4,D5S_deYcRqI,каузатор,полезити,занимать,а есле полезити занимать кредит у мвф мерового...,а есле [полезити#каузатор] [занимать@Предикат]...,<ЗАНИМАТЬ=VERB#Aspect=Imp|VerbForm=Inf|Voice=Act>,<ПОЛЕЗИТИ=NOUN#Animacy=Inan|Case=Loc|Gender=Ma...


In [53]:
with open("./data/rules/bad-predicates.txt", 'r', encoding='utf-8') as f:
    bad_predicates = f.read().split("\n")

In [54]:
bad_predicates = set(map(lambda x: x.lower().strip(), bad_predicates))

In [55]:
from pymorphy2 import MorphAnalyzer
pymorph = MorphAnalyzer()
def filter_fn(row):
    predicate = row['predicate']
    predicate_parse = pymorph.parse(predicate)
    normal_forms = [x.normal_form for x in predicate_parse]
    if any(normal_form in bad_predicates for normal_form in normal_forms):
        return False
    else:
        return True

In [56]:
bad_mask = list(map(filter_fn, tqdm(result)))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4328.0), HTML(value='')))




In [57]:
dataframe['no_bad_predicate'] = bad_mask

In [58]:
dataframe['no_bad_predicate'].value_counts()

True     3500
False     828
Name: no_bad_predicate, dtype: int64

In [59]:
dataframe = dataframe[dataframe.no_bad_predicate]

In [60]:
dataframe['has_tense'] = dataframe.predicate_morph.apply(lambda x: "VERB" in x and "Tense" in x)

In [61]:
dataframe['has_tense'].value_counts()

True     2301
False    1199
Name: has_tense, dtype: int64

In [62]:
has_tense = dataframe[dataframe.has_tense]

In [63]:
has_tense['status_category'] = has_tense.predicate.str.endswith("о")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  has_tense['status_category'] = has_tense.predicate.str.endswith("о")


In [64]:
has_tense.status_category.value_counts()

False    2253
True       48
Name: status_category, dtype: int64

In [65]:
dataframe = has_tense[~has_tense.status_category]

In [66]:
dataframe['correctness'] = [""]*len(dataframe)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe['correctness'] = [""]*len(dataframe)


In [67]:
dataframe = dataframe[['correctness', 'id', 'role', 'argument', 'predicate', 'clause_text', 'formatted_text',
       'predicate_morph', "arguments_morph"]]

In [68]:
no_dups = dataframe.drop_duplicates(subset=['role', 'argument', 'predicate', 'clause_text'])

In [73]:
no_dups['possible_inf'] = no_dups['predicate'].apply(lambda x: x.endswith("ся") or x.endswith("сь"))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  no_dups['possible_inf'] = no_dups['predicate'].apply(lambda x: x.endswith("ся") or x.endswith("сь"))


In [75]:
no_dups = no_dups[~no_dups.possible_inf]

In [78]:
no_dups.reset_index(drop=True, inplace=True)

In [84]:
writer = pd.ExcelWriter("./results/youtube-new-morph.xlsx")
no_dups.to_excel(writer, index=False, encoding='utf-8')
writer.close()

In [85]:
no_dups.shape

(560, 10)

In [55]:
dataframe.to_csv("../youtube_control.csv", encoding='utf-8', index=False)

In [11]:
import yargy as y
import yargy.predicates as yp
import yargy.morph as ytm
import yargy.tokenizer as yt
import yargy.pipelines as pipelines

In [12]:
predicate_groups = {}
with open("../data/rules/predicate_list.txt", 'r', encoding='utf-8') as f:
    lines = f.readlines()
    accum = []    
    for i, line in enumerate(lines):
        if ((line == '\n') or i == (len(lines) - 1)) and (len(accum) > 0):
            predicate_groups[accum[0]] = accum
            accum = []
        else:
            accum.append(line.replace('\n', ''))

In [13]:
pipeline_map = {}
for group, predicates in tqdm(predicate_groups.items()):
    pipeline_map[group] = y.Parser(
        pipelines.morph_pipeline(predicates),
        tokenizer=yt.MorphTokenizer(
            morph=CachedMostProbMorphAnalyzer()
        )
    )

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=86.0), HTML(value='')))




In [14]:
def predicate2group(predicate):
    for g, pipeline in pipeline_map.items():
        if check_parseable(predicate, pipeline):
            return g
    return None

In [15]:
def get_normal_form(word: str) -> str:
    return morph(word)[0].normalized

In [16]:
import razdel
def get_n_tokens(text):
    return len(list(razdel.tokenize(text)))

In [17]:
from pymorphy2 import MorphAnalyzer
from isanlp import PipelineCommon
from isanlp.processor_remote import ProcessorRemote
from isanlp.ru.processor_mystem import ProcessorMystem
from isanlp.ru.converter_mystem_to_ud import ConverterMystemToUd 

from pyhash import city_32
import joblib as jb
import os

class RstClauseSeparator:
    def __init__(self, udpipe=('tsa05.isa.ru', 3334), rst=('papertext.ru', 5555), cache_path="./rst-cache.pkl"):
        udpipe_host, udpipe_port = udpipe
        rst_host, rst_port = rst
        self.cache_path = cache_path
        self.ppl = PipelineCommon([
            (ProcessorRemote(udpipe_host, udpipe_port, '0'),
             ['text'],
             {'sentences': 'sentences',
              'tokens': 'tokens',
              'lemma': 'lemma',
              'syntax_dep_tree': 'syntax_dep_tree',
              'postag': 'ud_postag'}),
            (ProcessorMystem(delay_init=False),
             ['tokens', 'sentences'],
             {'postag': 'postag'}),
            (ConverterMystemToUd(),
             ['postag'],
             {'morph': 'morph',
              'postag': 'postag'}),
            (ProcessorRemote(rst_host, rst_port, 'default'),
             ['text', 'tokens', 'sentences', 'postag', 'morph', 'lemma', 'syntax_dep_tree'],
             {'clauses': 'clauses'})])
        self.__cache = {}
        self.__hasher = city_32()
        if os.path.exists(self.cache_path):
            self.__cache = jb.load(self.cache_path)
        
    def extract(self, text):
        text_hash = self.__hasher(text)
        if text_hash in self.__cache:
            return self.__cache[text_hash]
        else:
            result = self.ppl(text)
            clauses = [x.text for x in result['clauses']]
            self.__cache[text_hash] = clauses
            return clauses
        
        
    def flush(self):
        jb.dump(self.__cache, self.cache_path)

In [18]:
clause_extractor = RstClauseSeparator()

In [19]:
def get_n_clauses(text):
    sentences = [x.text for x in razdel.sentenize(text)]
    n_clauses = 0
    for s in sentences:
        try:
            n_clauses += len(clause_extractor.extract(s.replace(". ", "")))
        except Exception as e:
            pass
    return n_clauses

In [20]:
reverse_map = {}
for g, l in predicate_groups.items():
    for p in l:
        reverse_map[p] = g

In [21]:
roleset = {'инструмент', 'каузатив', 'каузатор', 'объект', 'экспериенцер'}

In [22]:
result = {}

for parse in tqdm(total):
    group = parse['group']
    if group not in result:
        result[group] = {}
        for k in roleset:
            result[group][f"{k}_predicates"] = {}
            result[group][k] = []
        result[group]['predicates'] = []
        result[group]['n_tokens'] = 0
        result[group]['n_clauses'] = 0
        result[group]['n_predicates'] = 0
        
    result[group]['n_tokens'] += get_n_tokens(parse['text'])
    result[group]['n_clauses'] += get_n_clauses(parse['text'])
    
    result[group]['n_predicates'] += 1
    predicate = get_normal_form(parse['parse']['predicate_analyzed'])
    for arg in parse['parse']['arguments']:
        for role in arg['roles']:
            if predicate not in result[group][f"{role}_predicates"]:
                result[group][f"{role}_predicates"][predicate] = []
            result[group][f"{role}_predicates"][predicate].append(get_normal_form(arg['argument_analyzed']))
            result[group][role].append(get_normal_form(arg['argument_analyzed']))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=150255.0), HTML(value='')))




In [23]:
result

{'target': {'экспериенцер_predicates': {'обрадовать': ['жена',
    'дочь',
    'вы',
    'вы',
    'вы',
    'вы',
    'вы',
    'я',
    'я',
    'я',
    'это',
    'я',
    'слово',
    'я',
    'они',
    'ты',
    'это',
    'он',
    'богдасарян',
    'я',
    'раба " галер',
    'я',
    'жена',
    'я',
    'я',
    'я',
    'вы',
    'мама',
    'вы',
    'вы',
    'весь',
    'я',
    'я',
    'я',
    'он',
    'вы',
    'говорун',
    'это',
    'я',
    'я',
    'я',
    'ты',
    'спешить',
    'я',
    'они',
    'я',
    'ты',
    'вы',
    'я',
    'он',
    'номер карты тинькофф 5536 9138 2597',
    'я',
    'то , что прямо сейчас это пустили по нашему канал',
    'он',
    'номер карты тинькофф 5536 9138 2597',
    'я',
    'мама',
    'он',
    'ты',
    'он',
    'номер карты тинькофф 5536 9138 2597',
    'вы',
    'смерть',
    'я',
    'вы',
    'смерть',
    'я',
    'я',
    'то , что прямо сейчас это пустили по нашему канал',
    'вы',
    'смерть',
    'ты',


In [28]:
df = pd.DataFrame(result)
df = df.transpose()
df = df.reset_index()

In [29]:
from collections import Counter

In [30]:
def check_parseable(text, parser):
    return len(list(parser.findall(text))) > 0

In [39]:
def get_group_statistics_for_range(dataframe, prefix: str):
    result = {}
    total_freqs = {k:[] for k in roleset}
    grouped_freqs = {g:{k:[] for k in roleset} for g in predicate_groups}
    for week in tqdm(dataframe.to_dict(orient='records')):
        for k in roleset:
            total_freqs[k] += week[k]
            for predicate, freq in week[f"{k}_predicates"].items():
                group = predicate2group(predicate)
                if group is not None:
                    grouped_freqs[group][k] += freq
        
    total_freqs = {k:Counter(v) for k, v in total_freqs.items()}
    total_freqs_counts = {k:sum(v[1] for v in counter.most_common()) for k, counter in total_freqs.items()}
    grouped_freqs = {
        g:{k:Counter(v) for k, v in group.items()}
        for g, group in grouped_freqs.items()
    }
    print("========")
    print(total_freqs_counts)
    for g, role_freqs in grouped_freqs.items():
        new_freqs = {}
        for k in roleset:
            mc = role_freqs[k].most_common()
            new_freqs[f"{prefix}_{k}"] = [x[0] for x in mc]
            new_freqs[f"{prefix}_{k}_freq"] = [x[1]/total_freqs[k][x[0]] for x in mc]
            new_freqs[f"{prefix}_{k}_total_freq"] = [total_freqs[k][x[0]]/total_freqs_counts[k] for x in mc]
            
        grouped_freqs[g] = new_freqs
        
    return total_freqs, grouped_freqs

In [40]:
total_target, stats_target = get_group_statistics_for_range(df[df['index'] == 'target'], 'target')

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1.0), HTML(value='')))




In [87]:
total_control, stats_control = get_group_statistics_for_range(df[df['index'] == 'control'], 'control')

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1.0), HTML(value='')))




In [88]:
writer = pd.ExcelWriter("grouped_stats_youtube.xlsx")
for g in predicate_groups:
    base = stats_target[g]
    base.update(stats_control[g])
    maxlen = max(len(v) for v in base.values())
    for k in base:
        if len(base[k]) < maxlen:
            to_fill = maxlen - len(base[k])
            base[k] += [None]*to_fill
            
    df = pd.DataFrame(base)
    df.to_excel(writer, g, index=False, encoding='utf-8')
writer.close()