In [1]:
from navec import Navec
from razdel import tokenize
from pymystem3 import Mystem
from slovnet import Morph, Syntax
from ipymarkup import show_dep_ascii_markup as show_markup

In [2]:
def get_data():
    data = []
    with open('data.txt', 'r', encoding='utf8') as lines:
        for line in lines:
            data.append([_.text for _ in tokenize(line.split('\t')[0])])
            data.append([_.text for _ in tokenize(line.split('\t')[1].strip())])
    return data

In [3]:
def get_a_dict(fname):
    d = {}
    with open(fname, 'r', encoding='utf8') as lines:
        if 'particles' in fname:
            return [l.strip() for l in lines]
        for line in lines:
            key, item = line.strip().split('\t')
            d[key] = int(item)
    return d

In [4]:
def get_dicts():
    d = {}
    
    d['ADJ_NEG'] = get_a_dict('./RUS/adjectives/list_adjectives_neg_utf.txt')
    d['ADJ_POS'] = get_a_dict('./RUS/adjectives/list_adjectives_pos_utf.txt')
    d['ADJ_AMPLF'] = get_a_dict('./RUS/adjectives/list_adjectives_amplf_utf.txt')
    
    d['ADV_NEG'] = get_a_dict('./RUS/adverbs/list_adverbs_neg_utf.txt')
    d['ADV_POS'] = get_a_dict('./RUS/adverbs/list_adverbs_pos_utf.txt')
    d['ADV_AMPLF'] = get_a_dict('./RUS/adverbs/list_adverbs_amplf_utf.txt')
    
    d['NOUN_NEG'] = get_a_dict('./RUS/nouns/list_nouns_neg_utf.txt')
    d['NOUN_POS'] = get_a_dict('./RUS/nouns/list_nouns_pos_utf.txt')
    d['NOUN_ANEG'] = get_a_dict('./RUS/nouns/list_nouns_aneg_utf.txt')
    d['NOUN_APOS'] = get_a_dict('./RUS/nouns/list_nouns_apos_utf.txt')
    
    d['VERB_FLX_NEG'] = get_a_dict('./RUS/verbs/list_verbs_flxneg_utf.txt')
    d['VERB_FLX_POS'] = get_a_dict('./RUS/verbs/list_verbs_flxpos_utf.txt')
    d['VERB_NEG'] = get_a_dict('./RUS/verbs/list_verbs_neg_utf.txt')
    d['VERB_P_NEG'] = get_a_dict('./RUS/verbs/list_verbs_pure_neg_utf.txt')
    d['VERB_OPP_NEG'] = get_a_dict('./RUS/verbs/list_verbs_opp_neg_utf.txt')
    d['VERB_OPP_POS'] = get_a_dict('./RUS/verbs/list_verbs_opp_pos_utf.txt')
    d['VERB_POS'] = get_a_dict('./RUS/verbs/list_verbs_pos_utf.txt')
    d['VERB_P_POS'] = get_a_dict('./RUS/verbs/list_verbs_opp_pos_utf.txt')
    
    d['COL_VPOS'] = get_a_dict('./RUS/collocations/list_collocation_vpos_utf.txt')
    d['COL_VNEG'] = get_a_dict('./RUS/collocations/list_collocation_vneg_utf.txt')
    d['COL_VNEUT'] = get_a_dict('./RUS/collocations/list_collocation_vneut_utf.txt')
    d['COL_POS'] = get_a_dict('./RUS/collocations/list_collocation_pos_utf.txt')
    d['COL_NEG'] = get_a_dict('./RUS/collocations/list_collocation_neg_utf.txt')
    d['COL_NEUT'] = get_a_dict('./RUS/collocations/list_collocation_neut_utf.txt')
    d['COL_AMPLF'] = get_a_dict('./RUS/collocations/list_collocation_amplf_utf.txt')
    
    d['IMPOS'] = get_a_dict('./RUS/polarity/impossibility_utf.txt')
    d['AMPLF'] = get_a_dict('./RUS/polarity/amplificators_utf.txt')
    d['INVERT_POLARITY'] = get_a_dict('./RUS/polarity/inversion_polarity_utf.txt')
    d['INVERT_POLARITY_PRED'] = get_a_dict('./RUS/polarity/inversion_polarity_predication_utf.txt')
    d['INVERT_POLARITY_SHIFT_NEG'] = get_a_dict('./RUS/polarity/inversion_polarity_shifters_neg_utf.txt')
    d['INVERT_POLARITY_SHIFT_POS'] = get_a_dict('./RUS/polarity/inversion_polarity_shifters_pos_utf.txt')
    d['INVERT_POLARITY_PARTICLE'] = get_a_dict('./RUS/polarity/inversion_polarity_particles.txt')
    
    return d

In [5]:
def stage1():
    data = get_data()
    d = get_dicts()
    m = Mystem()
    
    navec = Navec.load('navec_news_v1_1B_250K_300d_100q.tar')
    morph = Morph.load('slovnet_morph_news_v1.tar', batch_size=4)
    morph.navec(navec)
    
    res = []
    
    markups = morph.map(data)
    for markup in markups:
        for sentence in markup:
            item = []
            for token in sentence:
                n = 0
                tag = token.tag.split('|')[0] + '_NEUT'
                lem = m.lemmatize(token.text)[0]
                for t in d.keys():
                    if lem in d[t]:
                        if t == 'INVERT_POLARITY_PARTICLE':
                            n = 0
                            tag = t
                            continue
                        n = d[t][lem]
                        tag = t
                item.append('\t'.join([token.text, lem, str(n), tag]))
        res.append(item)
    
    return res

In [6]:
def stage2():
    data = get_data()
    
    navec = Navec.load('navec_news_v1_1B_250K_300d_100q.tar')
    syntax = Syntax.load('slovnet_syntax_news_v1.tar')
    syntax.navec(navec)
    
    markups = syntax.map(data)
    
    words, deps = [], []
    
    for markup in markups:
        for sentence in markup:
            word = []
            dep = []
            for token in sentence:
                word.append(token.text)
                source = int(token.head_id) - 1
                target = int(token.id) - 1
                #print(token.text, token.head_id, token.id, token.rel)
                #if source > 0 and source != target:  # skip root, loops
                dep.append([source, target, token.rel])
            words.append(word)
            deps.append(dep)
            #print(word, dep)
            #show_markup(word, dep)
            #print()
            
    res = []
    
    for i in range(len(words)):
        item = []
        for j in range(len(words[i])):
            item.append('\t'.join([words[i][j], str(deps[i][j][0]), str(deps[i][j][1]), deps[i][j][2]]))
        res.append(item)
            
    return res

In [7]:
morphs = stage1()

In [8]:
synts = stage2()

In [9]:
res = []
for i in range(len(morphs)):
    item = []
    for j in range(len(morphs[i])):
        item.append(morphs[i][j] + '\t' + synts[i][j].split('\t', 1)[1])
    res.append(item)

In [10]:
def get_sent(sent):
    is_amplf = 0

    s = 0
    for w in sent:
        if 'AMPLF' in w:
            is_amplf = int(w.split('\t')[2])
            
        elif 'NEG' in w:
            s -= int(w.split('\t')[2])
            s -= 1
            if 'root' in w:
                s -= 1
            if is_amplf:
                s -= is_amplf
                is_amplf = 0

        elif 'POS' in w:
            s += int(w.split('\t')[2])
            s += 1
            if 'root' in w:
                s += 1
            if is_amplf:
                s += is_amplf
                is_amplf = 0
        
        else:
            is_amplf = 0
            
    return s

In [11]:
data = []
with open('data.txt', 'r', encoding='utf8') as lines:
        for line in lines:
            data.append(line.split('\t')[0])
            data.append(line.split('\t')[1].strip())

In [12]:
with open('results.txt', 'w', encoding='utf8') as ouf:
    for i, sent in enumerate(res):
        s = get_sent(sent)
        if s == 0:
            s = 'neutral'
        elif s > 0:
            s = 'positive'
        else:
            s = 'negative'
        ouf.write(data[i] + '\t' + s + '\n')

In [13]:
d = {'neutral': 0, 'positive': 0, 'negative': 0}

for i, sent in enumerate(res):
    s = get_sent(sent)
    if s == 0:
        d['neutral'] += 1
    elif s > 0:
        d['positive'] += 1
    else:
        d['negative'] += 1
        
print(d)

{'neutral': 53, 'positive': 83, 'negative': 104}
