In [1]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import style
style.use('ggplot')
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from seqlearn.hmm import MultinomialHMM
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.model_selection import KFold
import os
import openpyxl
from bs4 import BeautifulSoup
import re

In [2]:
class Closed:
    """Closed-class word lists loaded from ``Closed_lists_*.txt`` files.

    Every list file is read line by line; on each non-blank line the first
    whitespace-separated token becomes a dictionary key and the second token
    its value (any further tokens are ignored).  One dictionary is stored per
    file under the attribute names listed in ``_SOURCES``.

    Parameters
    ----------
    base_dir : str, optional
        Directory that contains the ``Closed_lists_*.txt`` files.  Defaults to
        ``'../closed_class'``, the location that was previously hard-coded.

    Raises
    ------
    OSError
        If one of the list files cannot be opened.
    IndexError
        If a non-blank line holds fewer than two tokens.
    """

    attrs = ['P', 'PART', 'ADV', 'CONJ', 'CONN', 'PRON', 'fragment', 'REL', 'PRV', 'POST', 'NEG', 'INDEF', 'Q', 'DEM', 'POSS', 'NUM']

    # (attribute name, file-name suffix) pairs, in loading order.
    _SOURCES = (
        ('adverb', 'adverbs'),
        ('compliment', 'complementizers'),
        ('conjunct', 'conjunctions'),
        ('connector', 'connectors'),
        ('preverb', 'preverbs'),
        ('pronoun', 'pronouns'),
        ('q_word', 'q-words'),
        ('fragment', 'fragment'),
    )

    def __init__(self, base_dir='../closed_class'):
        for attr_name, suffix in self._SOURCES:
            path = os.path.join(base_dir, 'Closed_lists_' + suffix + '.txt')
            setattr(self, attr_name, self._read_pairs(path))

    @staticmethod
    def _read_pairs(path):
        """Return ``{first_token: second_token}`` for every non-blank line of ``path``."""
        # The context manager closes the handle; the original left it open.
        with open(path) as handle:
            lines = handle.read().split('\n')
        pairs = {}
        for line in lines:
            tokens = line.split()
            if not tokens:
                # Empty or whitespace-only line: nothing to record.
                continue
            pairs[tokens[0]] = tokens[1]
        return pairs

In [3]:
# Build the training set.

# Load the closed-class word lists.
closed = Closed()

# Map every part-of-speech tag (second token of a class.txt line) to its
# ordinal number (first token, kept as a string).
with open('../class.txt') as class_file:
    class_lines = [entry for entry in class_file.read().split('\n') if entry != '']
speach_class = {entry.split()[1]: entry.split()[0] for entry in class_lines}

# Read the whitespace-separated features; each one is tested as a word ending below.
with open('../features.txt') as features_file:
    features = features_file.read().split()

train_word_list = []  # training words
train_features_set = []  # one feature vector per training word
train_tag_list = []  # gold part-of-speech class for every training word

fname_list = os.listdir('../materials_xlsx/')  # names of the .xlsx files with training material

for fname in fname_list:  # for every source file

    print('processing ' + fname + '\n')
    # The spreadsheet has to be converted to XML beforehand, e.g. with
    # os.system('libreoffice --convert-to xml --outdir ./materials_xml/ ../materials_xlsx/' + fname)
    # which puts the result into the materials_xml folder of the current directory.

    # Parse the tree while the file is open, then release the handle.
    with open('./materials_xml/' + fname.replace('xlsx', 'xml')) as xml_file:
        soup = BeautifulSoup(xml_file, 'xml')

    row_list = []
    for sheet in soup.find_all('table', attrs={'table:name': 'Word Forms'}):  # the "Word Forms" sheet
        for table_row in sheet.find_all('table-row'):
            # Each row becomes a list with the text of every cell in that row.
            cells = re.split('\n{1,3}', table_row.get_text().replace('\xa0', '').strip('\n'))
            row_list.append(cells)

    row_list.pop(0)  # drop the first two rows,
    row_list.pop(0)  # because they hold the column titles

    for row in row_list:
        if row[0] == '':
            continue
        if len(row) > 8 and row[8].strip() in speach_class:
            word = row[5].strip('!: ')
            train_word_list.append(word)  # the word itself
            # 1.0 when the word ends with the feature, 0.0 otherwise.
            example = [1.0 if word.endswith(feat) else 0.0 for feat in features]
            # NOTE(review): row[10] is read although only len(row) > 8 is checked - confirm
            # that every tagged row really has at least 11 cells.
            example.append(float(row[10]))  # position of the word in its sentence
            train_tag_list.append(speach_class[row[8].strip()])  # ordinal number of the gold part of speech
            train_features_set.append(example)  # rows of the example matrix

X = np.array(train_features_set)
y = np.array(train_tag_list)

print('train_matrix made' + '\n')

processing Letters-NH-3_KUB-19-5-KBo-19-79.xlsx

processing Letters-NH-4_KUB-14-3.xlsx

train_matrix made



In [4]:
def get_estimation(algorithm, X_train, X_test, y_train, y_test):
    """Fit ``algorithm`` on the training split and score it on the test split.

    Returns a list ``[accuracy, precision, recall, F1]``; precision, recall
    and F1 are computed with ``average='weighted'``.
    """
    algorithm.fit(X_train, y_train)
    predicted = algorithm.predict(X_test)

    scores = [accuracy_score(y_test, predicted)]
    # Same evaluation order as the metric order in the returned list.
    for weighted_metric in (precision_score, recall_score, f1_score):
        scores.append(weighted_metric(y_test, predicted, average='weighted'))

    return scores

In [5]:
def knn_pred(n, X_train, X_test, y_train, y_test):
    """Score a k-nearest-neighbours classifier with ``n`` neighbours (Euclidean metric).

    Returns whatever ``get_estimation`` returns for the fitted classifier.
    """
    return get_estimation(
        KNeighborsClassifier(n_neighbors=n, metric='euclidean'),
        X_train, X_test, y_train, y_test,
    )

In [6]:
def lr_pred(с, X_train, X_test, y_train, y_test):
    """Score an L2-penalised logistic regression with regularisation parameter ``C=с``.

    Returns whatever ``get_estimation`` returns for the fitted classifier.
    """
    # NOTE: the first parameter is named with the Cyrillic letter "с" (U+0441),
    # not the Latin "c"; it is kept as is so the signature does not change.
    lr_options = dict(penalty="l2", fit_intercept=True, max_iter=100, C=с, solver="lbfgs")
    return get_estimation(LogisticRegression(**lr_options), X_train, X_test, y_train, y_test)

In [7]:
def svm_pred(c, X_train, X_test, y_train, y_test):
    """Score a support vector classifier built with ``C=c``.

    Returns whatever ``get_estimation`` returns for the fitted classifier.
    """
    return get_estimation(SVC(C=c), X_train, X_test, y_train, y_test)

In [22]:
def dtree_pred(m, X_train, X_test, y_train, y_test, random_state=12345):
    """Score a decision tree classifier with maximum depth ``m``.

    Parameters
    ----------
    m : int
        Passed to the tree as ``max_depth``.
    X_train, X_test, y_train, y_test
        Train/test split forwarded to ``get_estimation``.
    random_state : int or None, optional
        Seed handed to the tree so that repeated runs give the same scores.
        Defaults to 12345, the seed this notebook already uses for ``KFold``.

    Returns whatever ``get_estimation`` returns for the fitted classifier.
    """
    # Without a seed the tree's internal randomness makes every run differ.
    dtree = DecisionTreeClassifier(max_depth=m, random_state=random_state)
    dtree_estimation = get_estimation(dtree, X_train, X_test, y_train, y_test)
    return dtree_estimation

In [9]:
def rforest_pred(n, X_train, X_test, y_train, y_test, random_state=12345):
    """Score a random forest classifier made of ``n`` trees.

    Parameters
    ----------
    n : int
        Passed to the forest as ``n_estimators``.
    X_train, X_test, y_train, y_test
        Train/test split forwarded to ``get_estimation``.
    random_state : int or None, optional
        Seed handed to the forest so that repeated runs give the same scores.
        Defaults to 12345, the seed this notebook already uses for ``KFold``.

    Returns whatever ``get_estimation`` returns for the fitted classifier.
    """
    # An unseeded forest yields different scores on every run.
    rforest = RandomForestClassifier(n_estimators=n, random_state=random_state)
    rforest_estimation = get_estimation(rforest, X_train, X_test, y_train, y_test)
    return rforest_estimation

In [10]:
def sgd_pred(a, X_train, X_test, y_train, y_test, random_state=12345):
    """Score a linear classifier trained with stochastic gradient descent.

    Parameters
    ----------
    a : float
        Passed to the classifier as ``alpha``.
    X_train, X_test, y_train, y_test
        Train/test split forwarded to ``get_estimation``.
    random_state : int or None, optional
        Seed handed to the classifier so that repeated runs give the same
        scores.  Defaults to 12345, the seed this notebook already uses for
        ``KFold``.

    Returns whatever ``get_estimation`` returns for the fitted classifier.
    """
    # SGD depends on random state during training; seed it for reproducible scores.
    sgd = SGDClassifier(alpha=a, random_state=random_state)
    sgd_estimation = get_estimation(sgd, X_train, X_test, y_train, y_test)
    return sgd_estimation

In [11]:
def hmm_pred(a, X_train, X_test, y_train, y_test):
    """Score a ``MultinomialHMM`` built with ``alpha=a``.

    Returns a list ``[accuracy, precision, recall, F1]``; precision, recall
    and F1 are computed with ``average='weighted'``.
    """
    hmm = MultinomialHMM(alpha=a)
    # Every training sample is passed as its own sequence of length 1.
    unit_lengths = np.array([1] * len(y_train))
    hmm.fit(X_train, y_train, lengths=unit_lengths)
    predicted = hmm.predict(X_test)

    scores = [accuracy_score(y_test, predicted)]
    for weighted_metric in (precision_score, recall_score, f1_score):
        scores.append(weighted_metric(y_test, predicted, average='weighted'))
    return scores

In [12]:
def kfolds_validation(model, param, X, y):
    """Cross-validate ``model`` with hyperparameter ``param`` on 4 shuffled folds.

    ``model`` is called as ``model(param, X_train, X_test, y_train, y_test)``
    and element ``[3]`` of its result is collected for every fold.  The mean
    of the collected values is returned.
    """
    splitter = KFold(n_splits=4, shuffle=True, random_state=12345)

    fold_scores = [
        model(param, X[train_idx], X[test_idx], y[train_idx], y[test_idx])[3]
        for train_idx, test_idx in splitter.split(X)
    ]

    # Average over the folds.
    return np.array(fold_scores).mean()

In [13]:
def get_params_dic(model, params, X, y):
    """Evaluate ``model`` for every value in ``params``.

    Returns a dictionary whose keys are the hyperparameter values and whose
    values are the corresponding ``kfolds_validation`` results.
    """
    return {param: kfolds_validation(model, param, X, y) for param in params}

In [15]:
def create_plot(model, estimation):
    """Plot the F1-score against the hyperparameter value and save the figure.

    Parameters
    ----------
    model : str
        Model name; used in the plot title, the output file name and the
        progress message.
    estimation : dict
        Maps each hyperparameter value (x axis) to its score (y axis).

    Side effects
    ------------
    Creates ``./graphs/`` when it is missing, saves the figure there as
    ``'<model> F1-score'`` (no extension is given in the file name), closes
    the figure and prints a confirmation line.
    """
    graphs_dir = './graphs/'
    # savefig does not create missing directories, so make sure the target exists.
    os.makedirs(graphs_dir, exist_ok=True)

    plt.plot(list(estimation.keys()), list(estimation.values()))
    plt.title('F1-score of ' + model)
    plt.xlabel('Hyperparameters')
    plt.ylabel('F1-score')
    plt.tight_layout()
    plt.savefig(graphs_dir + model + ' F1-score')
    plt.close()
    print(model + ' plot created\n')

In [16]:
# Candidate hyperparameter values tried for each model.
ns_knn = np.arange(start=1, stop=150, step=10)  # number of neighbours
cs_lr = np.logspace(start=-2, stop=10, num=8, base=10)  # regularisation parameter for logistic regression
cs_svm = [
    0.01, 0.05,
    0.1, 0.5,
    1, 5,
    10, 50,
    100, 500,
]  # regularisation parameter for the support vector machine
ms = np.arange(start=1, stop=150, step=10)  # maximum tree depth
ns_rf = np.arange(start=1, stop=150, step=10)  # number of trees in the forest
# regularisation parameter for stochastic gradient descent and the hidden Markov model
als = [
    1e-05, 5e-05,
    1e-04, 5e-04,
    1e-03, 5e-03,
    1e-02, 5e-02,
    0.1, 0.5,
    1, 5,
]

In [23]:
# Cross-validated F1 for every candidate number of neighbours: print the table, save the plot.
knn_estimation = get_params_dic(knn_pred, ns_knn, X, y)
print(
    'knn F1-score: ' + '\n',
    '\n'.join('{}: {}'.format(param, score) for param, score in knn_estimation.items()),
    '\n',
)
create_plot('kNN Classifier', knn_estimation)

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)


knn F1-score: 
 1: 0.6478423879850923
11: 0.6417157368231654
21: 0.6112645330406992
31: 0.5880382375474585
41: 0.5611931414061432
51: 0.563153641694421
61: 0.5842704050113614
71: 0.6045543373438369
81: 0.588100201033911
91: 0.5819309844064607
101: 0.5836251284935134
111: 0.589430333268643
121: 0.5946708295945445
131: 0.6044231171719052
141: 0.6097420152625423 

kNN Classifier plot created



In [24]:
# Cross-validated F1 for every candidate regularisation value: print the table, save the plot.
lr_estimation = get_params_dic(lr_pred, cs_lr, X, y)
print(
    'log reg F1-score: ' + '\n',
    '\n'.join('{}: {}'.format(param, score) for param, score in lr_estimation.items()),
    '\n',
)
create_plot('Logistic Regression', lr_estimation)

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)


log reg F1-score: 
 0.01: 0.6135133160955508
0.517947467923121: 0.6995614815149891
26.826957952797247: 0.7124883138514506
1389.4954943731361: 0.7083285695408699
71968.56730011514: 0.7083285695408699
3727593.720314938: 0.7089621363003767
193069772.88832456: 0.706651939292811
10000000000.0: 0.704143836713699 

Logistic Regression plot created



In [20]:
# Cross-validated F1 for every candidate C value: print the table, save the plot.
svm_estimation = get_params_dic(svm_pred, cs_svm, X, y)
print(
    'svm F1-score: ' + '\n',
    '\n'.join('{}: {}'.format(param, score) for param, score in svm_estimation.items()),
    '\n',
)
create_plot('SVM', svm_estimation)

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)


svm F1-score: 
 0.01: 0.26726750715874215
0.05: 0.31592987403303685
0.1: 0.5594647196724212
0.5: 0.6078844599288422
1: 0.6332864017809111
5: 0.6858864587804382
10: 0.6950401644317002
50: 0.71312557010882
100: 0.7084206549584185
500: 0.715266766993222 

SVM plot created



In [25]:
# Cross-validated F1 for every candidate maximum depth: print the table, save the plot.
dtree_estimation = get_params_dic(dtree_pred, ms, X, y)
print(
    'decision tree F1-score: ' + '\n',
    '\n'.join('{}: {}'.format(param, score) for param, score in dtree_estimation.items()),
    '\n',
)
create_plot('Decision Tree', dtree_estimation)

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)


decision tree F1-score: 
 1: 0.5103516311799995
11: 0.6825577455368509
21: 0.6838788837638717
31: 0.6809720874966773
41: 0.6787248221392218
51: 0.6787248221392218
61: 0.6813392012241891
71: 0.6838788837638717
81: 0.6787248221392218
91: 0.6838788837638717
101: 0.6838788837638717
111: 0.6813392012241891
121: 0.6838788837638717
131: 0.6813392012241891
141: 0.6813392012241891 

Decision Tree plot created



In [21]:
# Cross-validated F1 for every candidate number of trees: print the table, save the plot.
rforest_estimation = get_params_dic(rforest_pred, ns_rf, X, y)
print(
    'random forest F1-score: ' + '\n',
    '\n'.join('{}: {}'.format(param, score) for param, score in rforest_estimation.items()),
    '\n',
)
create_plot('Random Forest', rforest_estimation)

  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)

random forest F1-score: 
 1: 0.6614491434396056
11: 0.6915625026788181
21: 0.7006945054066397
31: 0.7095110723180877
41: 0.6876987054336957
51: 0.7033511575064901
61: 0.699685265220255
71: 0.7086431473460834
81: 0.706030989993555
91: 0.7036868857529992
101: 0.7163219425543643
111: 0.7052990639946086
121: 0.6944772559564469
131: 0.7005954084141772
141: 0.6928551324603309 

Random Forest plot created



In [26]:
# Cross-validated F1 for every candidate alpha: print the table, save the plot.
sgd_estimation = get_params_dic(sgd_pred, als, X, y)
print(
    'Stochastic gradient descent F1-score: ' + '\n',
    '\n'.join('{}: {}'.format(param, score) for param, score in sgd_estimation.items()),
    '\n',
)
create_plot('Stochastic Gradient Descent', sgd_estimation)

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)


Stochastic gradient descent F1-score: 
 1e-05: 0.6026124406444265
5e-05: 0.6324811772220092
0.0001: 0.5595697037539279
0.0005: 0.4604395761514881
0.001: 0.6431995964785142
0.005: 0.6375580841087698
0.01: 0.6131787981863663
0.05: 0.661747547277761
0.1: 0.5520062268005715
0.5: 0.5908727606470947
1: 0.47215033845297005
5: 0.4171510251184114 

Stochastic Gradient Descent plot created



In [27]:
# Cross-validated F1 for every candidate alpha: print the table, save the plot.
hmm_estimation = get_params_dic(hmm_pred, als, X, y)
print(
    'Hidden Markov model F1-score: ' + '\n',
    '\n'.join('{}: {}'.format(param, score) for param, score in hmm_estimation.items()),
    '\n',
)
create_plot('Hidden Markov Model', hmm_estimation)

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Hidden Markov model F1-score: 
 1e-05: 0.5578850845809207
5e-05: 0.5578850845809207
0.0001: 0.5578850845809207
0.0005: 0.5578850845809207
0.001: 0.5578850845809207
0.005: 0.5578850845809207
0.01: 0.5578850845809207
0.05: 0.5578850845809207
0.1: 0.5578850845809207
0.5: 0.5542730302917831
1: 0.5548899191104263
5: 0.5428701577036441 

Hidden Markov Model plot created

