In [1]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import style
style.use('ggplot')
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from seqlearn.hmm import MultinomialHMM
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.model_selection import KFold
import os
import openpyxl
from bs4 import BeautifulSoup
import re

In [2]:
class Closed:
    """Closed-class word lists loaded from whitespace-separated text files.

    Each source file is read line by line; on every non-blank line the first
    token becomes a dictionary key and the second token its value. One
    dictionary is stored per file, under the attribute names listed in
    ``_SOURCES``.
    """

    # Tag-like labels kept from the original class; nothing in this class reads them.
    attrs = ['P', 'PART', 'ADV', 'CONJ', 'CONN', 'PRON', 'fragment', 'REL', 'PRV', 'POST', 'NEG', 'INDEF', 'Q', 'DEM', 'POSS', 'NUM']

    # (attribute name, file name) pairs; the order matches the original loading order.
    _SOURCES = (
        ('adverb', 'Closed_lists_adverbs.txt'),
        ('compliment', 'Closed_lists_complementizers.txt'),
        ('conjunct', 'Closed_lists_conjunctions.txt'),
        ('connector', 'Closed_lists_connectors.txt'),
        ('preverb', 'Closed_lists_preverbs.txt'),
        ('pronoun', 'Closed_lists_pronouns.txt'),
        ('q_word', 'Closed_lists_q-words.txt'),
        ('fragment', 'Closed_lists_fragment.txt'),
    )

    def __init__(self, base_dir='../closed_class'):
        """Load every closed-class list found in ``base_dir``.

        Parameters
        ----------
        base_dir : str
            Directory holding the ``Closed_lists_*.txt`` files. Defaults to
            the location that was previously hard-coded.

        Raises
        ------
        OSError
            If one of the list files cannot be opened.
        IndexError
            If a non-blank line has fewer than two tokens.
        """
        for attr_name, file_name in self._SOURCES:
            setattr(self, attr_name, self._read_pairs(os.path.join(base_dir, file_name)))

    @staticmethod
    def _read_pairs(path):
        """Return ``{first_token: second_token}`` for each non-blank line of ``path``."""
        pairs = {}
        # The context manager closes the handle even if a malformed line raises.
        with open(path) as handle:
            for line in handle:
                tokens = line.split()
                if not tokens:
                    # Blank or whitespace-only line: nothing to record.
                    continue
                pairs[tokens[0]] = tokens[1]
        return pairs

In [3]:
# Build the training set.

# Load the closed-class word lists.
closed = Closed()

# Map each part-of-speech tag (second token on a line) to its ordinal number
# (first token on the line).
with open('../class.txt') as class_file:
    speach_class = {line.split()[1]: line.split()[0]
                    for line in class_file.read().split('\n') if line != ''}

# Features are word endings: a feature fires when the word ends with it.
with open('../features.txt') as features_file:
    features = features_file.read().split()

train_word_list = []  # training word forms
train_features_set = []  # one feature vector per training word
train_tag_list = []  # gold part-of-speech numbers

fname_list = os.listdir('../materials_xlsx/')  # names of the .xlsx training files

for fname in fname_list:

    print('processing ' + fname + '\n')
    # The .xml files are expected to exist already in ./materials_xml/; they were
    # produced with:
    # os.system('libreoffice --convert-to xml --outdir ./materials_xml/ ../materials_xlsx/' + fname)

    # Swap only the extension, so an 'xlsx' elsewhere in the name is left alone.
    xml_path = './materials_xml/' + os.path.splitext(fname)[0] + '.xml'
    with open(xml_path) as xml_file:
        soup = BeautifulSoup(xml_file, 'xml')  # parse the tree

    row_list = []
    for sheet in soup.find_all('table', attrs={'table:name': 'Word Forms'}):  # the "Word Forms" sheet
        for line in sheet.find_all('table-row'):
            # Each row becomes a list holding the text of every cell in that row.
            row_list.append(re.split('\n{1,3}', line.get_text().replace('\xa0', '').strip('\n')))

    # Drop the first two rows (column titles); a slice delete is a no-op on short sheets.
    del row_list[:2]

    for row in row_list:
        if row[0] == '':
            continue
        # row[10] is read below, so the row needs at least 11 cells
        # (the old guard only ensured 9 and could raise IndexError).
        if len(row) <= 10 or row[8].strip() not in speach_class:
            continue

        word = row[5].strip('!: ')
        # 1.0 where the word ends with the feature, 0.0 otherwise.
        example = [1.0 if word.endswith(feat) else 0.0 for feat in features]
        example.append(float(row[10]))  # position of the word in its sentence

        # Append to all three lists together so they stay aligned.
        train_word_list.append(word)
        train_tag_list.append(speach_class[row[8].strip()])  # ordinal number of the gold tag
        train_features_set.append(example)

X = np.array(train_features_set)
y = np.array(train_tag_list)

print('train_matrix made' + '\n')

processing Letters-NH-3_KUB-19-5-KBo-19-79.xlsx

processing Letters-NH-4_KUB-14-3.xlsx

train_matrix made



In [4]:
def get_estimation(algorithm, X_train, X_test, y_train, y_test):
    """Fit ``algorithm`` on the training split and score it on the test split.

    Returns
    -------
    list
        ``[accuracy, precision, recall, F1]``; precision, recall and F1 are
        computed with ``average='weighted'``.
    """
    algorithm.fit(X_train, y_train)
    predicted = algorithm.predict(X_test)

    scores = [accuracy_score(y_test, predicted)]
    # The remaining three metrics share the same weighted-average call shape.
    for weighted_metric in (precision_score, recall_score, f1_score):
        scores.append(weighted_metric(y_test, predicted, average='weighted'))
    return scores

In [5]:
def knn_pred(X_train, X_test, y_train, y_test):
    """Score an 11-nearest-neighbours classifier (Euclidean metric) on one split.

    Returns whatever ``get_estimation`` returns for that classifier.
    """
    classifier = KNeighborsClassifier(metric='euclidean', n_neighbors=11)
    return get_estimation(classifier, X_train, X_test, y_train, y_test)

In [6]:
def lr_pred(X_train, X_test, y_train, y_test):
    """Score an L2-penalised logistic regression (C=27, lbfgs solver) on one split.

    Returns whatever ``get_estimation`` returns for that classifier.
    """
    classifier = LogisticRegression(
        penalty="l2",
        fit_intercept=True,
        max_iter=100,
        C=27,
        solver="lbfgs",
    )
    return get_estimation(classifier, X_train, X_test, y_train, y_test)

In [7]:
def svm_pred(X_train, X_test, y_train, y_test):
    """Score a support vector classifier with C=500 on one split.

    Returns whatever ``get_estimation`` returns for that classifier.
    """
    classifier = SVC(C=500)
    return get_estimation(classifier, X_train, X_test, y_train, y_test)

In [8]:
def dtree_pred(X_train, X_test, y_train, y_test, random_state=12345):
    """Score a decision tree (max depth 21) on one train/test split.

    Parameters
    ----------
    X_train, X_test, y_train, y_test
        The split, passed through to ``get_estimation`` unchanged.
    random_state : int or None
        Seed handed to the tree so repeated runs give the same scores.
        Pass ``None`` for the previous unseeded behaviour.

    Returns whatever ``get_estimation`` returns for that classifier.
    """
    dtree = DecisionTreeClassifier(max_depth=21, random_state=random_state)
    return get_estimation(dtree, X_train, X_test, y_train, y_test)

In [9]:
def rforest_pred(X_train, X_test, y_train, y_test, random_state=12345):
    """Score a 31-tree random forest on one train/test split.

    Parameters
    ----------
    X_train, X_test, y_train, y_test
        The split, passed through to ``get_estimation`` unchanged.
    random_state : int or None
        Seed handed to the forest so repeated runs give the same scores.
        Pass ``None`` for the previous unseeded behaviour.

    Returns whatever ``get_estimation`` returns for that classifier.
    """
    rforest = RandomForestClassifier(n_estimators=31, random_state=random_state)
    return get_estimation(rforest, X_train, X_test, y_train, y_test)

In [10]:
def sgd_pred(X_train, X_test, y_train, y_test, random_state=12345):
    """Score an SGD-trained linear classifier (alpha=0.005) on one train/test split.

    Parameters
    ----------
    X_train, X_test, y_train, y_test
        The split, passed through to ``get_estimation`` unchanged.
    random_state : int or None
        Seed handed to the classifier so repeated runs give the same scores.
        Pass ``None`` for the previous unseeded behaviour.

    Returns whatever ``get_estimation`` returns for that classifier.
    """
    sgd = SGDClassifier(alpha=0.005, random_state=random_state)
    return get_estimation(sgd, X_train, X_test, y_train, y_test)

In [11]:
def hmm_pred(X_train, X_test, y_train, y_test):
    """Score a multinomial hidden Markov model (alpha=0.1) on one split.

    Returns
    -------
    list
        ``[accuracy, precision, recall, F1]``; precision, recall and F1 are
        computed with ``average='weighted'``.
    """
    model = MultinomialHMM(alpha=0.1)
    # Every training sample is declared to be its own sequence of length 1.
    # NOTE(review): predict() below is called without lengths - confirm that
    # this asymmetry between fit and predict is intended.
    unit_lengths = np.array([1] * len(y_train))
    model.fit(X_train, y_train, lengths=unit_lengths)
    predicted = model.predict(X_test)

    weighted = [
        metric(y_test, predicted, average='weighted')
        for metric in (precision_score, recall_score, f1_score)
    ]
    return [accuracy_score(y_test, predicted)] + weighted

In [12]:
def kfolds_validation(model, X, y):
    """Run 4-fold shuffled cross-validation and average the per-fold scores.

    Parameters
    ----------
    model : callable
        Called as ``model(X_train, X_test, y_train, y_test)`` for each fold;
        the first four items of its result are averaged.
    X, y
        Indexed with the index arrays produced by ``KFold.split``.

    Returns
    -------
    list
        ``[accuracy_mean, precision_mean, recall_mean, F1_mean]``.
    """
    splitter = KFold(n_splits=4, shuffle=True, random_state=12345)
    fold_scores = [
        model(X[train_idx], X[test_idx], y[train_idx], y[test_idx])
        for train_idx, test_idx in splitter.split(X)
    ]

    # Positions 0..3 hold accuracy, precision, recall and F1; average each across folds.
    return [
        np.array([scores[position] for scores in fold_scores]).mean()
        for position in range(4)
    ]

In [13]:
def create_plot(estimation, estimation_dic, color):
    """Draw one metric as a line across models on the current pyplot figure.

    Parameters
    ----------
    estimation : str
        Legend label for the line.
    estimation_dic : dict
        Scores to plot, in the dictionary's iteration order.
    color : str
        Format/colour string passed to ``plt.plot``.
    """
    # Entries are spaced 10 units apart on the x axis: 0, 10, 20, ...
    x_positions = [10 * slot for slot in range(len(estimation_dic))]
    scores = list(estimation_dic.values())

    plt.plot(x_positions, scores, color, label=estimation)
    plt.title('Dependence of Estimations on Model')
    plt.xlabel('Model')
    plt.ylabel('Estimations')
    plt.tight_layout()

In [14]:
# Cross-validate every model in turn, reporting progress after each one.
estimations_by_label = {}
for progress_label, model_fn in (
    ("knn", knn_pred),
    ("logreg", lr_pred),
    ("svm", svm_pred),
    ("decision tree", dtree_pred),
    ("random forest", rforest_pred),
    ("sgd", sgd_pred),
    ("hmm", hmm_pred),
):
    estimations_by_label[progress_label] = kfolds_validation(model_fn, X, y)
    print("got " + progress_label + " estimations")

# Expose the results under the names the following cells read.
knn_estimation = estimations_by_label["knn"]
lr_estimation = estimations_by_label["logreg"]
svm_estimation = estimations_by_label["svm"]
dtree_estimation = estimations_by_label["decision tree"]
rforest_estimation = estimations_by_label["random forest"]
sgd_estimation = estimations_by_label["sgd"]
hmm_estimation = estimations_by_label["hmm"]

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


got knn estimations


  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)


got logreg estimations
got svm estimations
got decision tree estimations


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


got random forest estimations
got sgd estimations
got hmm estimations


  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [15]:
# Per-model score lists, in the order the models appear on the plot.
# Each list is [accuracy, precision, recall, F1].
scores_by_model = {
    'knn': knn_estimation,
    'logreg': lr_estimation,
    'svm': svm_estimation,
    'dtree': dtree_estimation,
    'rforest': rforest_estimation,
    'sgd': sgd_estimation,
    'hmm': hmm_estimation,
}

# One dictionary per metric, keyed by model name.
accuracy_dic = {model_key: model_scores[0] for model_key, model_scores in scores_by_model.items()}
precision_dic = {model_key: model_scores[1] for model_key, model_scores in scores_by_model.items()}
recall_dic = {model_key: model_scores[2] for model_key, model_scores in scores_by_model.items()}
F1_dic = {model_key: model_scores[3] for model_key, model_scores in scores_by_model.items()}

In [16]:
# Draw one line per metric; accuracy is deliberately left off the plot
# (its create_plot('accuracy', accuracy_dic, 'g') call stays disabled).
for metric_label, metric_scores, line_color in (
    ('precision', precision_dic, 'c'),
    ('recall', recall_dic, 'r'),
    ('F1-score', F1_dic, 'b'),
):
    create_plot(metric_label, metric_scores, line_color)

In [17]:
# Label the x positions (0, 10, 20, ...) with the model names. The labels come
# from the same dictionary that sets the tick count, so the two cannot drift apart.
model_labels = tuple(accuracy_dic.keys())
plt.xticks(list(range(0, len(model_labels) * 10, 10)), model_labels)
plt.legend()

# savefig does not create missing directories, so make sure the target exists.
os.makedirs('./graphs', exist_ok=True)
# NOTE(review): with the inline backend the figure drawn in the previous cell may
# already be closed when this cell runs - confirm the saved image is not empty.
plt.savefig('./graphs/Dependency of estimation on model.png')
print('Estimation plot created')

Estimation plot created


In [18]:
# Append one Markdown table row per model to README.md:
# | <model title> | accuracy | precision | recall | F1 |
# The file is opened in append mode, so every run of this cell adds seven more rows.
readme_rows = (
    ('KNeighbors Classifier', 'knn'),
    ('Logistic Regression', 'logreg'),
    ('Support Vector Machine', 'svm'),
    ('Decision Tree', 'dtree'),
    ('Random Forest', 'rforest'),
    ('Stochastic Gradient Descent', 'sgd'),
    ('Hidden Markov Model', 'hmm'),
)
metric_tables = (accuracy_dic, precision_dic, recall_dic, F1_dic)

# The context manager closes the file even if a lookup or write fails part-way.
with open('README.md', 'a') as fw:
    for row_title, model_key in readme_rows:
        row_scores = ' | '.join([str(table[model_key]) for table in metric_tables])
        fw.write('| ' + row_title + ' | ' + row_scores + ' |\n')
print('README.md edit')

README.md edit
