In [1]:
from catboost import CatBoostClassifier
import pandas as pd
import numpy as np
import pymorphy2
import codecs
import itertools  
import string
import os, re, fnmatch
from collections import defaultdict
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from pymorphy2.tokenizers import simple_word_tokenize
import re
import string
from sklearn.metrics import accuracy_score
from tqdm import tqdm

In [2]:
morph = pymorphy2.MorphAnalyzer()

In [3]:
def format_lines(lines):
    formatted = []
    for line in lines[18:]:
        parts = line.split(',')
        formatted.append(parts[:4])
    return formatted

In [4]:
def make_sent_dict(path):
    sent_words = defaultdict(list)
    with codecs.open(path,'r',encoding='cp1251') as f:
        print(f.encoding)
        lines = f.readlines()
    #for line in lines:
    #    print(line)
    formatted = format_lines(lines)
    words = dict(word = [], tone = [], part = [])
    
    for line in formatted:
        words['word'].append(line[0].strip())
        words['tone'].append(line[1].strip())
        words['part'].append(line[3].strip())
        sent_words[line[0].strip()] = [line[1].strip(), line[3].strip()]
    #print(pd.DataFrame(words))
    return sent_words, pd.DataFrame(words)

In [5]:
sent_words, words = make_sent_dict('RuSentiLex2017_revised_2.txt')
print(sent_words['гадкий'])

cp1251
['Adj', 'negative']


In [6]:
def get_file_paths(source_path, mask):
    find_files = []
    for root, dirs, files in os.walk(source_path):
        find_files += [os.path.join(root, name) for name in files if fnmatch.fnmatch(name, mask)]
    return find_files

In [7]:
def get_source_paths(dirname, mod):
    file_paths = []
    opin_paths = []
    ann_paths = []
    for i in range(100):
        path = get_file_paths(dirname, 'art' + str(i) + '.txt')
        if len(path) > 0:
            file_paths.append(path[0])
    for path in file_paths:
        idx = path[len(dirname) + 3:-4]
        #print(path, idx)
        ann_paths.append(dirname + 'art' + str(idx) + '.ann')
    if mod == "train":
        for path in file_paths:
            idx = path[len(dirname) + 3:-4]
            opin_paths.append(dirname + 'art' + str(idx) + '.opin.txt')
    if mod == "test":
        return file_paths, ann_paths
    return file_paths, opin_paths, ann_paths

In [8]:
file_paths, opin_paths, ann_paths = get_source_paths("Texts/", mod="train")
#opin_paths

In [9]:
l = [a for a in string.punctuation]

In [10]:
def prepare_text(text):
    snts = text.replace('\n', '').split('{Author, Unknown}')
    snts = [snt for snt in snts if len(snt) > 5]    
    norm_snts = []
    for sent in snts:
        for k in l:
            sent = sent.replace(k, ' ')
        words = simple_word_tokenize(sent)
        words = [word for word in words if len(word) > 0]
        words = [morph.parse(word)[0].normal_form for word in words]
        norm_snts.append(' '.join(words))
    return norm_snts

In [11]:
def find_sample(sample, pairs):
    for pair in pairs:
        if (pair[0] == sample[1] and pair[1] == sample[0]) or (pair[0] == sample[0] and pair[1] == sample[1]):
            return True
    return False

In [12]:
find_sample(("aa", "bb"), [("tt", "rr"), ("bb", "aa")])

True

In [13]:
def make_file_dict(text, dd, sent_words):
    slovar = dict(first = [], second = [], num_neg = [], num_pos = [], sent = [], ans = [])
    norm_snts = prepare_text(text)
    #print(norm_snts)
    
    entity_pairs_train = []
    for idx, row in dd.iterrows():
        entity_pairs_train.append((row[0].strip(), row[1].strip(), row[2].strip()))
        
    #print(entity_pairs_train)
    filtered_entities = [] 
    for pair in entity_pairs_train:
        if find_sample(pair, filtered_entities) == False:
            filtered_entities.append(pair)
            
    #print(filtered_entities)
    for first_entity, second_entity, tag in entity_pairs_train: #!entity_pairs_train-> filtered_entities
        first_entity = morph.parse(first_entity)[0].normal_form
        second_entity = morph.parse(second_entity)[0].normal_form
        for sent in norm_snts:
            #print(first_entity, second_entity, row[2])
            #print(sent)            
            if first_entity in sent and second_entity in sent:
                #print(sent)
                num_neg = 0
                num_pos = 0
                for word in sent.split(' '):
                    #print(word)
                    if len(sent_words[word]) != 2:
                        continue
                    if sent_words[word][1] == 'negative':
                        num_neg += 1
                    if sent_words[word][1] == 'positive':
                        num_pos += 1
                
                #if num_pos == 0 and num_neg == 0:
                #    continue
                
                slovar['sent'].append(sent)
                slovar['ans'].append(tag)
                slovar['first'].append(first_entity)
                slovar['second'].append(second_entity)
                slovar['num_neg'].append(num_neg)
                slovar['num_pos'].append(num_pos)

    #print(slovar.items())
    filtered_slovar = []
    
    '''for line in zip(slovar['first'], slovar['second'], slovar['ans'], slovar['sent'], slovar['num_neg'], slovar['num_pos']):
        #print(first, second, ans, sent, num_neg, num_pos)
        if find_sample(line, filtered_slovar) == True:
            filtered_slovar = splash(line, filtered_slovar)
        else:
            filtered_slovar.append(line)
            
    #print(filtered_slovar)  
    slovar = dict(first = [], second = [], num_neg = [], num_pos = [], sent = [], ans = [])
    for first, second, ans, sent, num_neg, num_pos in filtered_slovar: 
        #print(first, second, ans, sent, num_neg, num_pos)
        slovar['first'].append(first)
        slovar['second'].append(second)
        slovar['ans'].append(ans)
        slovar['sent'].append(sent)
        slovar['num_neg'].append(num_neg)
        slovar['num_pos'].append(num_pos)
    '''   
    return pd.DataFrame(slovar)

In [14]:
def splash(sample, filtered_slovar):
    new_slovar = []
    for pair in filtered_slovar:
        if (pair[0] == sample[1] and pair[1] == sample[0]) or (pair[0] == sample[0] and pair[1] == sample[1]):
            new_slovar.append((pair[0], pair[1], pair[2], pair[3] + sample[3], pair[4] + sample[4], pair[5] + sample[5]))
        else:
            new_slovar.append(pair)
    return new_slovar

In [102]:
def splash_test(sample, filtered_slovar):
    new_slovar = []
    for pair in filtered_slovar:
        if (pair[0] == sample[1] and pair[1] == sample[0]) or (pair[0] == sample[0] and pair[1] == sample[1]):
            new_slovar.append((pair[0], pair[1], pair[2] + sample[2], pair[3] + sample[3], pair[4] + sample[4]))
        else:
            new_slovar.append(pair)
    return new_slovar

In [104]:
def make_file_dict_test(text, entity_pairs, sent_words):
    slovar = dict(sent = [], first = [], second = [], num_neg = [], num_pos = [])

    norm_snts = prepare_text(text)
    filtered_entities = [] 
    for pair in entity_pairs:
        if find_sample(pair, filtered_entities) == False:
            filtered_entities.append(pair)    
    for first_entity, second_entity in filtered_entities:
        #print(first_entity, second_entity)
        for sent in norm_snts:
            if first_entity in sent and second_entity in sent:
                num_neg = 0
                num_pos = 0
                for word in sent.split(' '):
                    if len(sent_words[word]) != 2:
                        continue
                    if sent_words[word][1] == 'negative':
                        num_neg += 1
                    if sent_words[word][1] == 'positive':
                        num_pos += 1
                        
                #if num_pos == 0 and num_neg == 0:
                #    continue
                        
                slovar['sent'].append(sent)
                slovar['first'].append(first_entity)
                slovar['second'].append(second_entity)
                slovar['num_neg'].append(num_neg)
                slovar['num_pos'].append(num_pos)
    
    filtered_slovar = [] 
    for line in zip(slovar['first'], slovar['second'], slovar['sent'], slovar['num_neg'], slovar['num_pos']):
        if find_sample(line, filtered_slovar) == True:
            filtered_slovar = splash_test(line, filtered_slovar)
        else:
            filtered_slovar.append(line)
            
    #print(filtered_slovar)  
    slovar = dict(first = [], second = [], num_neg = [], num_pos = [], sent = [])
    for first, second, sent, num_neg, num_pos in filtered_slovar: 
        slovar['first'].append(first)
        slovar['second'].append(second)
        slovar['sent'].append(sent)
        slovar['num_neg'].append(num_neg)
        slovar['num_pos'].append(num_pos)
    
    return pd.DataFrame(slovar)

In [16]:
def get_entities(path):
    ann_file = pd.read_csv(path, sep='\t', header = None)
    ann_file.columns = ['Term', 'Type', 'Name']

    digits = []
    for i in range(len(ann_file.loc[:, 'Type'])):
        string = ann_file.loc[i, 'Type']
        digits.append(re.findall('\d+', string))
        idx = string.find(re.findall('\d+', string)[0])
        ann_file.loc[i, 'Type'] = string[:idx]
    
    for i in range(len(ann_file.loc[:, 'Name'])):
        string = ann_file.loc[i, 'Name']
        lemma = morph.parse(string)[0]
        ann_file.loc[i, 'Name'] = lemma.normal_form    
    
    dgts = pd.DataFrame(digits, columns=['BeginPos', 'EndPos'])
    ann_file['BeginPos'], ann_file['EndPos'] = dgts.loc[:, 'BeginPos'], dgts.loc[:, 'EndPos']
    ann_file.drop('Term', inplace=True, axis=1)

    for i in range(len(ann_file)):
        if ann_file.loc[i, 'Name'] == 'author' or ann_file.loc[i, 'Name'] == 'unknown':
            ann_file.loc[i, 'Name'] = np.NaN
        
    ann_file.dropna(inplace=True)
    ann_file.index = range(ann_file.shape[0])
    return ann_file

In [17]:
l = [a for a in string.punctuation]

In [18]:
def analize_files(file_paths, opin_paths, sent_words):
    spisok = []
    for path, opin in zip(file_paths, opin_paths):
        with open(path,'r') as file:
            text = file.read()
        
        dd = pd.read_csv(opin, header = None).drop(3, axis = 1)
        
        #print(dd)
        #print(make_file_dict(text, dd, sent_words))
        spisok.append(make_file_dict(text, dd, sent_words))
        result = pd.concat(spisok, ignore_index=True)
    return result

In [19]:
def analize_files_test(file_paths, ann_paths, sent_words):
    spisok = []
    test_data = list(zip(file_paths, ann_paths))
    for path, ann_path in tqdm(test_data):
        with open(path,'r') as file:
            text = file.read()
        
        ann_df = get_entities(ann_path)
        entities = list(set(ann_df['Name'].tolist()))
        #print(len(entities))
        entities = [' '.join([morph.parse(word)[0].normal_form for word in entity.split(' ')]) for entity in entities]
        
        entity_pairs = list(itertools.combinations(entities, 2))
        #print(entity_pairs)
        spisok.append(make_file_dict_test(text, entity_pairs, sent_words))
        result = pd.concat(spisok, ignore_index=True)
    return spisok

In [20]:
train = analize_files(file_paths, opin_paths, sent_words)

In [21]:
train.shape

(878, 6)

In [35]:
test = analize_files_test(file_paths, ann_paths, sent_words)

100%|██████████████████████████████████████████| 42/42 [00:28<00:00,  1.36it/s]


In [37]:
len(test)

42

## Training

In [22]:
from sklearn.preprocessing import LabelEncoder

In [23]:
train.head()

Unnamed: 0,ans,first,num_neg,num_pos,second,sent
0,neg,сша,0.0,2.0,игил,согласно заявление из пентагон цель сша — спос...
1,neg,сирия,3.0,2.0,сша,беспокойство сша очевидно тот кто оказаться по...
2,neg,россия,3.0,0.0,сша,инспирировать сша и поддержать европа « револю...
3,neg,россия,2.0,0.0,сша,последний отметить алеми наиболее откровенно в...
4,neg,нато,1.0,0.0,россия,сми иран финляндия не хотеть вступать в нато и...


In [33]:
X_ent = train[['first', 'second']] 
#X_test_ent = test[['first', 'second']]

In [41]:
#X_test_ent

In [42]:
#X_ent

In [34]:
enc = LabelEncoder()
#enc.fit(X_ent['first'].append(X_ent['second']).append(X_test_ent['first']).append(X_test_ent['second']))
enc.fit(X_ent['first'].append(X_ent['second']))


X_ent['first'] = enc.transform(X_ent['first'])
X_ent['second'] = enc.transform(X_ent['second'])

#X_test_ent['second'] = enc.transform(X_test_ent['second'])
#X_test_ent['first'] = enc.transform(X_test_ent['first'])




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


In [35]:
target = list(train['ans'])
print(len(target))
print(X_ent.shape)

878
(878, 2)


In [36]:
counter = 0
for i in range(len(target)):
    if target[i].strip() == 'neg':
        target[i] = 0
        counter += 1
    else:
        target[i] = 1

In [37]:
len(target)

878

In [38]:
X_ent.shape

(878, 2)

In [39]:
model_ent = CatBoostClassifier(iterations = 400, depth=12)
model_ent.fit(np.array(X_ent), target)

<catboost.core.CatBoostClassifier at 0x118009c88>

In [40]:
ans = model_ent.predict(np.array(X_ent[600:]))

In [41]:
accuracy_score(target[600:], ans)

0.97841726618705038

## Testing

In [24]:
test_file_paths, test_ann_paths = get_source_paths('test/', mod="test")

In [105]:
test_data = analize_files_test(ann_paths=test_ann_paths, file_paths=test_file_paths, sent_words=sent_words)

100%|██████████| 27/27 [02:04<00:00,  2.94s/it]


In [106]:
test_data[0]

Unnamed: 0,first,num_neg,num_pos,second,sent
0,финляндия,1,0,сша,поддержать президент и министр оборона страна ...
1,финляндия,0,2,путин,особый внимание сми привлечь неожиданный вопро...
2,финляндия,1,0,франция,поддержать президент и министр оборона страна ...
3,финляндия,1,1,запасть,журналист ярмо мякель jarmo makela не понравит...
4,финляндия,0,0,таусить,сари таусить sari tausi журналистка финский те...
5,финляндия,1,0,германия,поддержать президент и министр оборона страна ...
6,финляндия,1,1,helsingin sanomat,журналист ярмо мякель jarmo makela не понравит...
7,финляндия,1,2,россия,сми финляндия мы не нужный совет швеция о тот ...
8,финляндия,0,2,yle,особый внимание сми привлечь неожиданный вопро...
9,финляндия,1,3,сми,сми финляндия мы не нужный совет швеция о тот ...


In [107]:
test_data_full = pd.concat(test_data, ignore_index=True)

In [108]:
X_ent = train[['first', 'second']] 
X_test_ent = test_data_full[['first', 'second']]
X_test_ent.shape

(5826, 2)

In [73]:
X_ent

Unnamed: 0,first,second
0,609,288
1,565,609
2,528,609
3,528,609
4,443,528
5,443,528
6,443,528
7,528,655
8,528,655
9,702,629


In [109]:
enc = LabelEncoder()
enc.fit(X_ent['first'].append(X_ent['second']).append(X_test_ent['first']).append(X_test_ent['second']))

X_ent.loc[:, 'first'] = enc.transform(X_ent['first'])
X_ent.loc[:, 'second'] = enc.transform(X_ent['second'])

X_test_ent.loc[:, 'second'] = enc.transform(X_test_ent['second'])
X_test_ent.loc[:, 'first'] = enc.transform(X_test_ent['first'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item_labels[indexer[info_axis]]] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [110]:
model_ent.fit(np.array(X_ent), target)
ans = model_ent.predict(np.array(X_test_ent))

In [112]:
print(len([item for item in ans if item == 0]))

2970


In [65]:
test_data[0][['first', 'second']].loc[:, 'first']

0             финляндия
1             финляндия
2             финляндия
3             финляндия
4             финляндия
5             финляндия
6             финляндия
7             финляндия
8             финляндия
9             финляндия
10            финляндия
11            финляндия
12            финляндия
13            финляндия
14            финляндия
15            финляндия
16                  сша
17                  сша
18                  сша
19                  сша
20                  сша
21                путин
22                путин
23                путин
24                путин
25                путин
26                путин
27                путин
28                путин
29              лиухтый
30              лиухтый
31              франция
32              франция
33              франция
34              франция
35         вильякайнный
36         вильякайнный
37              запасть
38              запасть
39              запасть
40             германия
41             г

In [77]:
enc.inverse_transform

<bound method LabelEncoder.inverse_transform of LabelEncoder()>

In [126]:
for el, path in zip(test_data, test_file_paths):
    XX = el[['first', 'second']]
    #print(XX)
    #print(XX.loc['first'])
    XX.loc[:, 'first'] = enc.transform(XX['first'])
    XX.loc[:, 'second'] = enc.transform(XX['second'])
    ans = model_ent.predict(np.array(XX))
    #print(ans)
    XX.loc[:, 'ans'] = ans
    #print(path[5:-4])
    XX.loc[:, 'first'] = enc.inverse_transform(XX['first'])
    XX.loc[:, 'second'] = enc.inverse_transform(XX['second'])
    col_ans = XX['ans'] 
    for idx, item in col_ans.items():
        if col_ans[idx] > 0:
            col_ans[idx] = "pos"
        else:
            col_ans[idx] = "neg"
    #print(col_ans)
    XX['ans'] = col_ans
    #print(XX)
    XX.to_csv('res2/' + path[5:-4] + '.opin.csv', index = False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item_labels[indexer[info_axis]]] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stab

In [45]:
from sklearn.feature_extraction.text import TfidfVectorizer,  CountVectorizer
vect = TfidfVectorizer(ngram_range=(1, 4), min_df=2)
vect_c = CountVectorizer(ngram_range=(1, 4), min_df=2)

In [46]:
X_tfidf = vect.fit_transform(X)
X_count = vect_c.fit_transform(X)

In [284]:
XX = np.hstack((X_tfidf.toarray(), X_count.toarray()))

In [52]:
XX.shape

(57, 2)