In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import pymystem3
import nltk
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score

import xml.etree.ElementTree as ET
import re

In [2]:
train_path = 'data/train/news_eval_train.xml'
test_path = 'data/test/news_eval_test.xml'

In [3]:
SPLIT_RGX = re.compile(u'[A-Za-zА-Яа-я0-9]+', re.UNICODE)

def split(string):
    words = re.findall(SPLIT_RGX, string)
    return words

In [4]:
stem = pymystem3.Mystem()

In [5]:
def parse_xml(path):
    d = {'+': 1, '0': 0, '-': -1}
    
    tree = ET.parse(path)
    root = tree.getroot()
    data, target = [], []
    for sent in tqdm(root):
        t = sent.find('evaluation').text.strip()
        if t not in d:
            continue
        target.append(d[t])
        
        t = sent.find('speech').text.strip()
        t = split(' '.join(stem.lemmatize(' '.join(list(map(lambda x: x.lower(), split(t)))))).strip())
        data.append(t)
        
    return data, target

In [6]:
train_text, train_target = parse_xml(train_path)
test_text, test_target = parse_xml(test_path)

100%|█████████████████████████████████████| 4260/4260 [00:03<00:00, 1156.18it/s]
100%|█████████████████████████████████████| 5500/5500 [00:04<00:00, 1283.31it/s]


In [7]:
from nltk.corpus import stopwords
nltk.download('stopwords')
stopwords = stopwords.words('russian')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/svasilyev/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
def remove_words(data, remove_words):
    remove_words = set(remove_words)
    res = []
    for d in data:
        res.append([e for e in d if e not in remove_words])
    return res

In [9]:
train_text_nostop = remove_words(train_text, stopwords)
test_text_nostop = remove_words(test_text, stopwords)

In [10]:
train_text = list(map(lambda x: ' '.join(x), train_text))
test_text = list(map(lambda x: ' '.join(x), test_text))
train_text_nostop = list(map(lambda x: ' '.join(x), train_text_nostop))
test_text_nostop = list(map(lambda x: ' '.join(x), test_text_nostop))

In [11]:
vect = CountVectorizer().fit(train_text)
vect_nostop = CountVectorizer().fit(train_text_nostop)
train_fea = vect.transform(train_text).toarray()
test_fea = vect.transform(test_text).toarray()
train_fea_nostop = vect_nostop.transform(train_text_nostop).toarray()
test_fea_nostop = vect_nostop.transform(test_text_nostop).toarray()

In [12]:
# 01
# logreg
# binary vectors
# with stopwords

clf_01 = LogisticRegression(C=0.05, max_iter=500, random_state=0).fit((train_fea > 0).astype(int), train_target)
preds_01 = clf_01.predict((test_fea > 0).astype(int))
print(f'Accuracy:\t{accuracy_score(test_target, preds_01)}')
print(f'F1-micro:\t{f1_score(test_target, preds_01, average="micro")}')
print(f'F1-macro:\t{f1_score(test_target, preds_01, average="macro")}')

Accuracy:	0.6153509731029958
F1-micro:	0.6153509731029958
F1-macro:	0.5623278684302425


In [13]:
# 02
# logreg
# tfs vectors
# with stopwords

clf_02 = LogisticRegression(C=0.05, max_iter=500, random_state=0).fit(train_fea, train_target)
preds_02 = clf_02.predict(test_fea)
print(f'Accuracy:\t{accuracy_score(test_target, preds_02)}')
print(f'F1-micro:\t{f1_score(test_target, preds_02, average="micro")}')
print(f'F1-macro:\t{f1_score(test_target, preds_02, average="macro")}')

Accuracy:	0.6146949486114148
F1-micro:	0.6146949486114148
F1-macro:	0.5671031718658105


In [14]:
# 03
# logreg
# tfs-idf vectors
# with stopwords

idf = np.log(train_fea.shape[0] / (train_fea > 0).astype(int).sum(axis=0))

clf_03 = LogisticRegression(C=0.1, max_iter=1000, random_state=0).fit(train_fea * idf, train_target)
preds_03 = clf_03.predict(test_fea * idf)
print(f'Accuracy:\t{accuracy_score(test_target, preds_03)}')
print(f'F1-micro:\t{f1_score(test_target, preds_03, average="micro")}')
print(f'F1-macro:\t{f1_score(test_target, preds_03, average="macro")}')

Accuracy:	0.6094467526787667
F1-micro:	0.6094467526787667
F1-macro:	0.5813070490171646


In [15]:
# 04
# logreg
# binary vectors
# withOUT stopwords

clf_04 = LogisticRegression(C=0.09, max_iter=500, random_state=0).fit((train_fea_nostop > 0).astype(int), train_target)
preds_04 = clf_04.predict((test_fea_nostop > 0).astype(int))
print(f'Accuracy:\t{accuracy_score(test_target, preds_04)}')
print(f'F1-micro:\t{f1_score(test_target, preds_04, average="micro")}')
print(f'F1-macro:\t{f1_score(test_target, preds_04, average="macro")}')

Accuracy:	0.6125082003061447
F1-micro:	0.6125082003061447
F1-macro:	0.5692549088194366


In [16]:
# 05
# logreg
# tfs vectors
# withOUT stopwords

clf_05 = LogisticRegression(C=0.15, max_iter=500, random_state=0).fit(train_fea_nostop, train_target)
preds_05 = clf_05.predict(test_fea_nostop)
print(f'Accuracy:\t{accuracy_score(test_target, preds_05)}')
print(f'F1-micro:\t{f1_score(test_target, preds_05, average="micro")}')
print(f'F1-macro:\t{f1_score(test_target, preds_05, average="macro")}')

Accuracy:	0.6050732560682266
F1-micro:	0.6050732560682266
F1-macro:	0.5675443959376305


In [17]:
# 06
# logreg
# tfs-idf vectors
# withOUT stopwords

idf = np.log(train_fea_nostop.shape[0] / (train_fea_nostop > 0).astype(int).sum(axis=0))

clf_06 = LogisticRegression(C=0.005, max_iter=1000, random_state=0).fit(train_fea_nostop * idf, train_target)
preds_06 = clf_06.predict(test_fea_nostop * idf)
print(f'Accuracy:\t{accuracy_score(test_target, preds_06)}')
print(f'F1-micro:\t{f1_score(test_target, preds_06, average="micro")}')
print(f'F1-macro:\t{f1_score(test_target, preds_06, average="macro")}')

Accuracy:	0.6096654275092936
F1-micro:	0.6096654275092936
F1-macro:	0.5647910278163352


In [18]:
print(f'F1 (-): {f1_score((np.array(test_target) == -1).astype(int), (preds_06 == -1).astype(int))}')
print(f'F1 (0): {f1_score((np.array(test_target) == 0).astype(int), (preds_06 == 0).astype(int))}')
print(f'F1 (+): {f1_score((np.array(test_target) == 1).astype(int), (preds_06 == 1).astype(int))}')

F1 (-): 0.7031214399635451
F1 (0): 0.3732317736670294
F1 (+): 0.6180198698184309
