# Лемматизация файла оценки

In [1]:
from mystem import mystem as mstm
import csv
from collections import namedtuple

def lemmatize(dataset, mode):
    """Read an evaluation file and lemmatize its contexts per target word.

    The file ``Result/<dataset>/<mode>.csv`` is read as tab-separated text
    with one header line.  Columns 0, 1, 2, 4 and 5 of every row are used as
    context id, target word, gold sense id, positions and context; the
    context is passed through ``remove_accents`` before it is stored.

    Returns:
        A tuple ``(contextDictClean, originList)``: the dictionary built by
        ``write_mystem_dict`` and the list of ``WordBag`` records in file
        order.
    """
    testFile = 'Result/' + dataset + '/' + mode + '.csv'
    WordBag = namedtuple('WordBag', 'context_id word gold_sense_id positions context')
    originList = []

    with open(testFile, 'r', encoding='utf-8', newline='') as f:
        reader = csv.reader(f, delimiter='\t', quoting=csv.QUOTE_NONE)
        next(reader, None)  # skip the header line

        for row in reader:
            if not row:
                # csv.reader yields [] for a blank line (e.g. a trailing one);
                # there is nothing to index in it, so skip it.
                continue

            originList.append(
                WordBag(context_id=int(row[0]),
                        word=row[1],
                        gold_sense_id=row[2],
                        positions=row[4],
                        context=remove_accents(row[5]))
            )

    contextDictClean = write_mystem_dict(originList)

    return contextDictClean, originList

In [2]:
def write_mystem_dict(originList):
    """Group contexts by target word and lemmatize every group with mystem.

    All contexts of one word are concatenated (each one followed by a space,
    two newlines and a space) and handed to ``mstm`` in a single call; its
    output is converted by ``lemmatized_text_list``.  For every word whose
    number of lemmatized entries differs from its number of contexts an
    'ERROR in mystem:' line is printed.

    Returns a dict mapping each word to its list of lemmatized contexts.
    """
    joined = {}
    counts = {}
    for record in originList:
        key = record.word
        joined[key] = joined.get(key, str()) + record.context + ' \n\n '
        counts[key] = counts.get(key, 0) + 1

    cleaned = {
        key: lemmatized_text_list(mstm(text))
        for key, text in joined.items()
    }

    # Sanity check: one lemmatized entry is expected per original context.
    for key in joined:
        mismatch = counts[key] - len(cleaned[key])
        if mismatch != 0:
            print('ERROR in mystem:', key, mismatch)

    return cleaned

In [3]:
import unicodedata
import re

def remove_accents(input_str):
    """Keep only Russian letters, whitespace and hyphens in *input_str*.

    The string is first normalized to NFC, so a letter typed as a base
    letter plus a combining mark is merged into its precomposed form when
    one exists (for example ``й`` and ``ё``).  Every character outside
    ``А-Я``, ``а-я``, ``Ё``, ``ё``, whitespace and ``-`` is then deleted;
    this removes stress marks as well as Latin letters, digits and
    punctuation.

    Returns the cleaned string.
    """
    nfc_form = unicodedata.normalize('NFC', input_str)
    # The character class admits no combining marks, so this substitution
    # already strips accents; a separate unicodedata.combining() pass over
    # the result could never remove anything.
    return re.sub(r'[^А-Яа-яЁё\s\-]', u'', nfc_form, flags=re.UNICODE)

In [4]:
# Extract lemmas from the mystem output
def lemmatized_text_list(contextList):
    """
    Turn mystem output into one space-separated lemma string per sentence.

    Each sentence is a sequence of items whose element 1 is the lemma and
    element 2 the part-of-speech tag.  Items tagged UNKNOWN, CONJ, INTJ,
    PART or PR and the lemma '.' are dropped; a trailing '?' is cut from
    lemmas longer than one character.  Empty sentences produce no entry.
    Every kept lemma is followed by a single space.
    """
    skipped_tags = ('UNKNOWN', 'CONJ', 'INTJ', 'PART', 'PR')
    textList = []
    for sentence in contextList:
        if not len(sentence):
            continue
        pieces = []
        for token in sentence:
            if token[2] in skipped_tags:
                continue
            lemma = token[1]
            if lemma == '.':
                continue
            if len(lemma) > 1 and lemma[-1:] == '?':
                lemma = lemma[:-1]
            pieces.append(lemma + ' ')
        textList.append(''.join(pieces))
    return textList

# Обучение модели и предсказание

In [5]:
def predict_results(trainwords, clf, printscreen=True):
    """Fit per-word classifiers and predict senses for the lemmatized contexts.

    ``learn_clf`` supplies a dict of fitted classifiers keyed by word.  For
    every word of *trainwords* found in that dict, the contexts stored under
    the word in the module-level ``contextDictClean`` are classified and the
    predictions are appended to one flat list.

    NOTE(review): the returned list is ordered by *trainwords*; code that
    pairs it with other records by index presumably relies on the same
    order - confirm against the caller.
    """
    fitted = learn_clf(clf, printscreen)
    predictions = []
    for target in trainwords:
        if target not in fitted:
            continue
        predictions.extend(fitted[target].predict(contextDictClean[target]))
    return predictions

In [6]:
from sklearn.base import clone

def learn_clf(clf, printscreen):
    """Fit an independent copy of *clf* for every word in the global ``trainwords``.

    The training folder is built from the module-level ``dataset``, ``mode``
    and ``learnModel``; each word is trained by ``learn_word`` on a fresh
    ``clone`` of *clf*.  When *printscreen* equals True the word and the
    classes of its fitted classifier are printed.

    Returns a dict mapping each word to its fitted classifier.
    """
    train_dir = ''.join(['Data/НКРЯ/labeled txt/', dataset, '(', mode, ')/', learnModel, '/'])
    fitted = {}
    for target in trainwords:
        fitted[target] = learn_word(train_dir, target, clone(clf))
        if printscreen == True:
            print(target, fitted[target].classes_)
    return fitted

In [7]:
import csv
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

def learn_word(path, word, model):
    """
    Build and fit a TF-IDF + *model* pipeline for a single word.

    Training rows are read from the tab-separated file ``path + word + '.csv'``:
    column 0 is used as the target label and column 1 as the context text.
    Returns the fitted pipeline.
    """
    pipeline = Pipeline([
        ('tdidfvect', TfidfVectorizer(ngram_range=(1, 2))),
        ('model', model),
    ])

    with open(path + word + '.csv', 'r', encoding='utf-8', newline='') as f:
        rows = list(csv.reader(f, delimiter='\t', quoting=csv.QUOTE_NONE))

    labels = [fields[0] for fields in rows]
    texts = [fields[1] for fields in rows]

    pipeline.fit(texts, labels)
    return pipeline

# Запись результата

In [8]:
def write_result_file(dataset, mode, learnModel, testModel, originList, resultList):
    """Write the records with their predicted senses as a tab-separated file.

    The output goes to
    ``Result/<dataset>/<mode>.<learnModel>.<testModel>.csv`` with the columns
    context_id, word, gold_sense_id, predict_sense_id, positions, context.
    Record ``i`` of *originList* is paired with ``resultList[i]``.

    If *resultList* is shorter than *originList*, the records without a
    prediction are not written and a warning is printed.
    """
    outputName = 'Result/' + dataset + '/' + mode + '.' + learnModel + '.' + testModel + '.csv'

    missing = len(originList) - len(resultList)
    if missing > 0:
        print('WARNING: no prediction for the last', missing, 'rows; they are not written')

    with open(outputName, 'w', encoding='utf-8', newline='') as myfile:
        # The tab must be the writer's delimiter: with the default ',' and
        # QUOTE_NONE every comma inside a field (e.g. in positions) would be
        # written with a backslash in front of it.
        wr = csv.writer(myfile, delimiter='\t', quoting=csv.QUOTE_NONE, escapechar='\\')

        wr.writerow(['context_id', 'word', 'gold_sense_id',
                     'predict_sense_id', 'positions', 'context'])
        # zip() stops at the shorter sequence, so records without a
        # prediction are left out instead of raising IndexError.
        for row, predicted in zip(originList, resultList):
            wr.writerow([
                str(row.context_id),
                row.word,
                str(row.gold_sense_id),
                str(predicted),
                row.positions,
                row.context,
            ])

# Вычисление результата

In [9]:
from evaluate import evaluate2 as eval_score

def result_score(semilearnModel, testModel, dataset, mode, printscreen=True):
    """Score a result file with ``eval_score`` and optionally print the outcome.

    The scored file is
    ``Result/<dataset>/<mode>.<semilearnModel>.<testModel>.csv``.  When
    *printscreen* is true, the four identifying names and the score are
    printed.

    Returns whatever ``eval_score`` returns for that file.
    """
    # Forward slashes, as in write_result_file: they are accepted on Windows
    # too, whereas a literal backslash is a path separator only there.
    file = 'Result/' + dataset + '/' + mode + '.' + semilearnModel + '.' + testModel + '.csv'
    result = eval_score(file)

    if printscreen:
        print(dataset, mode, semilearnModel, testModel)
        print(result)

    return result

In [None]:
# from sklearn import model_selection

# def find_best_params(model, parameters_grid, df):
    
#     classifier = Pipeline(
#         [('tfidf', TfidfVectorizer()),
#          ('clf', model),
#         ])   
    
#     # Деление данных на обучающую и тестовую выборки
#     train_data, test_data, train_labels, test_labels = model_selection.train_test_split(df.name, df.tare, 
#                                                                                      test_size = 0.3,random_state = 0)
         
#     # Обучаем grid search
#     grid_cv = model_selection.GridSearchCV(classifier, parameters_grid, scoring = 'accuracy')
#     grid_cv.fit(train_data, train_labels)
    
#     print('best score:', grid_cv.best_score_, '\n')
#     print('best parametes:', grid_cv.best_params_)

## Общая функция

In [10]:
def test_model(clf, trainwords, dataset, mode, learnModel, originList):
    """Run one full evaluation: predict, write the result file, print its score.

    The class name of *clf* is used as the test-model part of the result
    file name.  Nothing is returned.
    """
    classifier_name = str(clf.__class__.__name__)
    predictions = predict_results(trainwords, clf, printscreen=False)

    write_result_file(dataset, mode, learnModel, classifier_name, originList, predictions)
    result_score(learnModel, classifier_name, dataset, mode, printscreen=True)

# Начало работы

## Предварительная обработка

In [11]:
# trainwords = ['балка',
#              'вид',
#              'винт',
#              'горн',
#              'губа',
#              'жаба',
#              'клетка',
#              'крыло',
#              'купюра', 
#              'курица', 
#              'лавка', 
#              'лайка', 
#              'лев', 
#              'лира', 
#              'мина', 
#              'мишень', 
#              'обед', 
#              'оклад', 
#              'опушка', 
#              'полис', 
#              'пост', 
#              'поток', 
#              'проказа', 
#              'пропасть', 
#              'проспект', 
#              'пытка',
#              'рысь',
#              'среда',
#              'хвост',
#              'штамп',
#             ]

# Target words: learn_clf() fits one classifier per entry (from the file
# '<word>.csv' in the training folder) and predict_results() iterates the
# same list when collecting predictions.
trainwords = [
    'акция',
    'баба',
    'байка',
    'бум',
    'бычок',
    'вал',
    'газ',
    'гвоздика',
    'гипербола', 
    'град',
    'гусеница', 
    'дождь', 
    'домино', 
    'забой', 
    'икра', 
    'кабачок',
    'капот', 
    'карьер', 
    'кличка', 
    'ключ', 
    'кок', 
    'кольцо', 
    'концерт', 
    'котелок', 
    'крона', 
    'круп',
    'кулак',
    'лейка',
    'лук',
    'мандарин',
    'ножка', 
    'опора', 
    'патрон', 
    'печать', 
    'пол',
    'полоз', 
    'почерк', 
    'пробка', 
    'рак', 
    'рок', 
    'свет', 
    'секрет', 
    'скат', 
    'слог', 
    'стан',
    'стопка',
    'таз',
    'такса',
    'тюрьма',
    'шах',
    'шашка'
] 

# dataset and mode select the evaluation file 'Result/<dataset>/<mode>.csv'
# read by lemmatize() and are part of the training-folder path in learn_clf().
dataset = 'bts-rnc'
mode = 'test'

In [12]:
%%time
# Read and lemmatize the evaluation file; both results are used as globals
# by the cells below (predict_results() reads contextDictClean).
contextDictClean, originList = lemmatize(dataset, mode)

Wall time: 1min 38s


## Тестовый файл

In [13]:
from sklearn.naive_bayes import MultinomialNB

# Use the per-word training files from the KNeighborsClassifier folder
# (learn_clf() builds its path from this global).
learnModel = 'KNeighborsClassifier'

# Classifier to fit per word; its class name becomes part of the result file name.
clf = MultinomialNB(alpha=0.01, fit_prior=False)
testModel = str(clf.__class__.__name__)

# Predict senses for the evaluation contexts and write them to the result
# file; unlike test_model(), this cell does not compute a score.
resultList = predict_results(trainwords, clf, printscreen=False)
write_result_file(dataset, mode, learnModel, testModel, originList, resultList)

## Работа с моделями

In [13]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC

## На данных KNN

In [61]:
# Train on the files in the KNeighborsClassifier folder: learn_clf() ends its
# training path with this name (presumably data labeled by that model).
learnModel = 'KNeighborsClassifier'

In [66]:
%%time
# Multinomial naive Bayes: test_model() fits one copy per target word on the
# training files selected by learnModel, writes the predictions and prints
# the score.
clf = MultinomialNB(alpha=0.01, fit_prior=False)
# Score noted for alpha=0.01, fit_prior=False: 0.301357 (presumably from an earlier run)
test_model(clf, trainwords, dataset, mode, learnModel, originList)

bts-rnc train KNeighborsClassifier MultinomialNB
0.302541
Wall time: 3.65 s


In [67]:
%%time
# Bernoulli naive Bayes: test_model() fits one copy per target word on the
# training files selected by learnModel, writes the predictions and prints
# the score.
clf = BernoulliNB(alpha=0.01, fit_prior=False)
# Score noted for alpha=0.01, fit_prior=False: 0.236203 (presumably from an earlier run)
test_model(clf, trainwords, dataset, mode, learnModel, originList)

bts-rnc train KNeighborsClassifier BernoulliNB
0.236935
Wall time: 3.58 s


In [68]:
%%time
# Linear model trained with SGD: test_model() fits one copy per target word
# on the training files selected by learnModel, writes the predictions and
# prints the score.
clf = SGDClassifier(max_iter=100, tol=0.01, loss='squared_hinge', penalty='elasticnet')
# Score noted for max_iter=100, tol=0.01, loss='squared_hinge',
# penalty='elasticnet': 0.250178 (presumably from an earlier run)
test_model(clf, trainwords, dataset, mode, learnModel, originList)

bts-rnc train KNeighborsClassifier SGDClassifier
0.231198
Wall time: 4.45 s


In [69]:
%%time
# Linear SVM: test_model() fits one copy per target word on the training
# files selected by learnModel, writes the predictions and prints the score.
clf = LinearSVC(penalty='l1', loss='squared_hinge', dual=False, tol=0.0001, C=2.0)
# Score noted for penalty='l1', loss='squared_hinge', dual=False, C=2.0:
# 0.24 (presumably from an earlier run)
test_model(clf, trainwords, dataset, mode, learnModel, originList)

bts-rnc train KNeighborsClassifier LinearSVC
0.247301
Wall time: 4.68 s


**Вывод.** Частичное обучение на KNN + дальнейшее обучение на MNB дают результат 0.3.

Также были использованы, но по разным причинам не включены в ноутбук: GradientBoostingClassifier (время), AdaBoostClassifier (точность), KNeighborsClassifier (точность), RandomForestClassifier (точность).

## На данных MNB

In [25]:
# Train on the files in the MultinomialNB folder: learn_clf() ends its
# training path with this name (presumably data labeled by that model).
learnModel = 'MultinomialNB'

In [57]:
%%time
# Multinomial naive Bayes, fitted per word on the training files selected by
# learnModel; test_model() writes the predictions and prints the score.
clf = MultinomialNB(alpha=0.01, fit_prior=False)
test_model(clf, trainwords, dataset, mode, learnModel, originList)

bts-rnc train MultinomialNB MultinomialNB
0.178091
Wall time: 650 ms


In [58]:
%%time
# Bernoulli naive Bayes, fitted per word on the training files selected by
# learnModel; test_model() writes the predictions and prints the score.
clf = BernoulliNB(alpha=0.01, fit_prior=False)
test_model(clf, trainwords, dataset, mode, learnModel, originList)

bts-rnc train MultinomialNB BernoulliNB
0.235459
Wall time: 642 ms


In [59]:
%%time
# SGD-trained linear model, fitted per word on the training files selected by
# learnModel; test_model() writes the predictions and prints the score.
clf = SGDClassifier(max_iter=100, tol=0.01, loss='squared_hinge', penalty='elasticnet')
test_model(clf, trainwords, dataset, mode, learnModel, originList)

bts-rnc train MultinomialNB SGDClassifier
0.160804
Wall time: 671 ms


In [60]:
%%time
# Linear SVM, fitted per word on the training files selected by learnModel;
# test_model() writes the predictions and prints the score.
clf = LinearSVC(penalty='l1', loss='squared_hinge', dual=False, tol=0.0001, C=2.0)
test_model(clf, trainwords, dataset, mode, learnModel, originList)

bts-rnc train MultinomialNB LinearSVC
0.206220
Wall time: 758 ms


**Вывод.** MNB - не лучший способ частичного обучения.

## На данных Бернулли NB

In [20]:
# Train on the files in the BernoulliNB folder: learn_clf() ends its
# training path with this name (presumably data labeled by that model).
learnModel = 'BernoulliNB'

In [21]:
%%time
# Multinomial naive Bayes, fitted per word on the training files selected by
# learnModel; test_model() writes the predictions and prints the score.
clf = MultinomialNB(alpha=0.01, fit_prior=False)
test_model(clf, trainwords, dataset, mode, learnModel, originList)

bts-rnc train BernoulliNB MultinomialNB
0.220795
Wall time: 4.68 s


In [22]:
%%time
# Bernoulli naive Bayes, fitted per word on the training files selected by
# learnModel; test_model() writes the predictions and prints the score.
clf = BernoulliNB(alpha=0.01, fit_prior=False)
test_model(clf, trainwords, dataset, mode, learnModel, originList)

bts-rnc train BernoulliNB BernoulliNB
0.139015
Wall time: 4.9 s


In [23]:
%%time
# SGD-trained linear model, fitted per word on the training files selected by
# learnModel; test_model() writes the predictions and prints the score.
clf = SGDClassifier(max_iter=100, tol=0.01, loss='squared_hinge', penalty='elasticnet')
test_model(clf, trainwords, dataset, mode, learnModel, originList)

bts-rnc train BernoulliNB SGDClassifier
0.147161
Wall time: 6.18 s


In [24]:
%%time
# Linear SVM, fitted per word on the training files selected by learnModel;
# test_model() writes the predictions and prints the score.
clf = LinearSVC(penalty='l1', loss='squared_hinge', dual=False, tol=0.0001, C=2.0)
test_model(clf, trainwords, dataset, mode, learnModel, originList)

bts-rnc train BernoulliNB LinearSVC
0.180712
Wall time: 6.48 s


## На данных GradientBoostingClassifier

In [70]:
# Train on the files in the GradientBoostingClassifier folder: learn_clf()
# ends its training path with this name (presumably data labeled by that model).
learnModel = 'GradientBoostingClassifier'

In [71]:
%%time
# Multinomial naive Bayes, fitted per word on the training files selected by
# learnModel; test_model() writes the predictions and prints the score.
clf = MultinomialNB(alpha=0.01, fit_prior=False)
test_model(clf, trainwords, dataset, mode, learnModel, originList)

bts-rnc train GradientBoostingClassifier MultinomialNB
0.228913
Wall time: 4.71 s


In [72]:
%%time
# Bernoulli naive Bayes, fitted per word on the training files selected by
# learnModel; test_model() writes the predictions and prints the score.
clf = BernoulliNB(alpha=0.01, fit_prior=False)
test_model(clf, trainwords, dataset, mode, learnModel, originList)

bts-rnc train GradientBoostingClassifier BernoulliNB
0.164874
Wall time: 4.95 s


In [73]:
%%time
# SGD-trained linear model, fitted per word on the training files selected by
# learnModel; test_model() writes the predictions and prints the score.
clf = SGDClassifier(max_iter=100, tol=0.01, loss='squared_hinge', penalty='elasticnet')
test_model(clf, trainwords, dataset, mode, learnModel, originList)

bts-rnc train GradientBoostingClassifier SGDClassifier
0.184168
Wall time: 5.57 s


In [74]:
%%time
# Linear SVM, fitted per word on the training files selected by learnModel;
# test_model() writes the predictions and prints the score.
clf = LinearSVC(penalty='l1', loss='squared_hinge', dual=False, tol=0.0001, C=2.0)
test_model(clf, trainwords, dataset, mode, learnModel, originList)

bts-rnc train GradientBoostingClassifier LinearSVC
0.202593
Wall time: 5.56 s
