# Лемматизация файла оценки

In [1]:
from mystem import mystem as mstm
import csv
from collections import namedtuple

def lemmatize(dataset, mode):
    """Read an evaluation TSV file and lemmatize its contexts.

    The file ``Result/<dataset>/<mode>.csv`` is read as tab-separated text
    without quoting; its first line is treated as a header and skipped.
    Columns 0, 1, 2, 4 and 5 of every data row are stored as ``context_id``
    (converted to ``int``), ``word``, ``gold_sense_id``, ``positions`` and
    ``context``; the context is passed through ``remove_accents`` first.
    Column 3 is not used.

    Args:
        dataset: directory name under ``Result/``.
        mode: file stem; the file read is ``<mode>.csv``.

    Returns:
        A tuple ``(contextDictClean, originList)`` where ``originList`` is
        the list of ``WordBag`` rows in file order and ``contextDictClean``
        is whatever ``write_mystem_dict(originList)`` returns.

    Raises:
        OSError: if the file cannot be opened.
        IndexError: if a non-empty row has fewer than six columns.
        ValueError: if column 0 of a row is not an integer.
    """
    testFile = 'Result/' + dataset + '/' + mode + '.csv'
    WordBag = namedtuple('WordBag', 'context_id word gold_sense_id positions context')
    originList = list()

    with open(testFile, 'r', encoding='utf-8', newline='') as f:
        reader = csv.reader(f, delimiter='\t', quoting=csv.QUOTE_NONE)
        next(reader, None)  # skip the header line

        for row in reader:
            if not row:
                # csv.reader yields an empty list for a blank line;
                # there is nothing to index, so skip it instead of crashing.
                continue

            originList.append(
                WordBag(context_id=int(row[0]),
                        word=row[1],
                        gold_sense_id=row[2],
                        positions=row[4],
                        context=remove_accents(row[5]),
                        )
            )

    contextDictClean = write_mystem_dict(originList)

    return contextDictClean, originList

In [2]:
def write_mystem_dict(originList):
    """Group contexts by word and lemmatize each group in one mystem call.

    For every distinct ``row.word`` the contexts are concatenated, each one
    followed by the separator ``' \\n\\n '``, and the resulting text is passed
    to ``mstm``. The value it returns is converted by
    ``lemmatized_text_list`` into a list of lemma strings.
    (Presumably the separator makes mystem report each context as its own
    sentence -- confirm against the ``mystem`` wrapper.)

    After all words are processed, a line starting with ``ERROR in mystem:``
    is printed for every word whose number of lemmatized strings differs
    from its number of input contexts.

    Args:
        originList: iterable of rows exposing ``word`` and ``context``
            attributes.

    Returns:
        dict mapping each word to its list of lemmatized context strings,
        in order of first appearance of the word.
    """
    separator = ' \n\n '

    # Collect contexts in lists and join once per word: repeated ``str + str``
    # in a loop re-copies the accumulated text on every iteration.
    contexts_by_word = dict()
    for row in originList:
        contexts_by_word.setdefault(row.word, []).append(row.context)

    contextDictClean = dict()
    for word, contexts in contexts_by_word.items():
        text = ''.join(context + separator for context in contexts)
        contextDictClean[word] = lemmatized_text_list(mstm(text))

    # Sanity check: one lemmatized string is expected per input context.
    for word, contexts in contexts_by_word.items():
        word_diff = len(contexts) - len(contextDictClean[word])
        if word_diff != 0:
            print('ERROR in mystem:', word, word_diff)

    return contextDictClean

In [3]:
import unicodedata
import re

def remove_accents(input_str):
    """Keep only Russian letters, whitespace and hyphens in *input_str*.

    The text is NFC-normalized first, so a letter written as a base letter
    plus a combining mark is composed into a single code point when such a
    form exists (for example, the letter "short i") and survives the filter.
    Every character outside ``А-Я``, ``а-я``, ``Ё``, ``ё``, whitespace and
    ``-`` is then deleted. That removes digits, Latin letters, punctuation
    and any remaining combining marks such as stress accents.

    Args:
        input_str: text to clean.

    Returns:
        The cleaned string; the relative order of kept characters is
        unchanged.
    """
    composed = unicodedata.normalize('NFC', input_str)
    # No combining character is in the allowed set, so no separate
    # ``unicodedata.combining`` pass is needed after this substitution.
    return re.sub(r'[^А-Яа-яЁё\s\-]', u'', composed, flags=re.UNICODE)

In [4]:
# Extract lemmas from mystem output
def lemmatized_text_list(contextList):
    """
    Build one space-terminated lemma string per non-empty sentence.

    Each sentence is a sequence of tokens; a token is indexable, with the
    lemma at index 1 and the tag at index 2. Tokens tagged ``UNKNOWN``,
    ``CONJ``, ``INTJ``, ``PART`` or ``PR`` are dropped, as are tokens whose
    lemma is exactly ``'.'``. A trailing ``'?'`` is cut from lemmas longer
    than one character. Every kept lemma is followed by a single space.

    Empty sentences produce no entry; a non-empty sentence whose tokens are
    all dropped produces an empty string.
    """
    skipped_tags = ['UNKNOWN', 'CONJ', 'INTJ', 'PART', 'PR']
    textList = []

    for sentence in contextList:
        if len(sentence) == 0:
            continue

        pieces = []
        for token in sentence:
            if token[2] in skipped_tags:
                continue
            lemma = token[1]
            if lemma == '.':
                continue
            # Cut a trailing '?' from the lemma, but leave a bare '?' intact.
            if len(lemma) > 1 and lemma[-1:] == '?':
                lemma = lemma[:-1]
            pieces.append(lemma + ' ')

        textList.append(''.join(pieces))

    return textList

# Обучение модели и предсказание

In [5]:
def predict_results(trainwords, clf, printscreen=True):
    """Train per-word classifiers and predict labels for the loaded contexts.

    ``learn_clf(clf, printscreen)`` supplies a dict of fitted classifiers
    keyed by word. For each word of *trainwords* present in that dict, the
    word's classifier predicts on ``contextDictClean[word]`` (a module-level
    name, not a parameter) and the predictions are appended to the result.
    Words without a classifier are skipped.

    Args:
        trainwords: words to predict for, in output order.
        clf: estimator passed on to ``learn_clf``.
        printscreen: forwarded to ``learn_clf``.

    Returns:
        Flat list of predictions, grouped by word in *trainwords* order.
    """
    word_models = learn_clf(clf, printscreen)

    predictions = []
    for word in trainwords:
        if word not in word_models:
            continue
        predictions.extend(word_models[word].predict(contextDictClean[word]))

    return predictions

In [6]:
from sklearn.base import clone

def learn_clf(clf, printscreen):
    """Fit a fresh copy of *clf* for every word in ``trainwords``.

    Training files are read from
    ``Input/full txt/<dataset>(<mode>)/<learnModel>/``. NOTE: ``dataset``,
    ``mode``, ``learnModel`` and ``trainwords`` are module-level names, not
    parameters, so their current values at call time are used.

    Args:
        clf: estimator used as a template; it is cloned for each word, so
            the object passed in is never fitted itself.
        printscreen: when truthy, print each word with the ``classes_`` of
            its fitted classifier.

    Returns:
        dict mapping each word to the classifier returned by ``learn_word``.
    """
    path = 'Input/full txt/{}({})/{}/'.format(dataset, mode, learnModel)
    clfDict = dict()
    for word in trainwords:
        clfDict[word] = learn_word(path, word, clone(clf))

        if printscreen:
            print(word, clfDict[word].classes_)
    return clfDict

In [7]:
import csv
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

def learn_word(path, word, model):
    """
    Fit a TF-IDF + *model* pipeline on the training file of one word.

    The file ``<path><word>.csv`` is read as tab-separated text without
    quoting and without a header; column 0 of each row is the target label
    and column 1 is the context text. The vectorizer uses unigrams and
    bigrams (``ngram_range=(1, 2)``).

    Returns the fitted ``Pipeline``.
    """
    pipeline = Pipeline([
        ('tdidfvect', TfidfVectorizer(ngram_range=(1, 2))),
        ('model', model),
    ])

    samples = []
    with open(path + word + '.csv', 'r', encoding='utf-8', newline='') as f:
        for row in csv.reader(f, delimiter='\t', quoting=csv.QUOTE_NONE):
            samples.append((row[0], row[1]))

    targetTrain = [label for label, _ in samples]
    contextTrain = [text for _, text in samples]

    pipeline.fit(contextTrain, targetTrain)
    return pipeline

# Запись результата

In [8]:
def write_result_file(dataset, mode, learnModel, testModel, originList, resultList):
    """Write source rows together with their predictions as a TSV file.

    The file ``Result/<dataset>/<mode>.<learnModel>.<testModel>.csv`` gets a
    header line followed by one line per row of *originList*, with the
    prediction at the same index of *resultList* inserted as the
    ``predict_sense_id`` column.

    Fields are written by a tab-delimited ``csv.writer``. The previous
    version pushed a pre-joined line through a comma-delimited writer,
    which backslash-escaped every comma inside a field.

    If *resultList* is shorter than *originList*, the rows without a
    prediction are left out and a warning is printed. Extra predictions are
    ignored.

    Args:
        dataset: directory name under ``Result/``.
        mode: first component of the output file name.
        learnModel: second component of the output file name.
        testModel: third component of the output file name.
        originList: rows exposing ``context_id``, ``word``,
            ``gold_sense_id``, ``positions`` and ``context``.
        resultList: predictions, index-aligned with *originList*.

    Raises:
        OSError: if the output file cannot be opened for writing.
    """
    outputName = 'Result/' + dataset + '/' + mode + '.' + learnModel + '.' + testModel + '.csv'

    if len(resultList) != len(originList):
        # Made visible instead of being swallowed by a bare ``except``.
        print('WARNING: rows and predictions differ in length:',
              len(originList), len(resultList))

    with open(outputName, 'w', encoding='utf-8', newline='') as myfile:
        wr = csv.writer(myfile, delimiter='\t', quoting=csv.QUOTE_NONE, escapechar='\\')

        wr.writerow(['context_id', 'word', 'gold_sense_id',
                     'predict_sense_id', 'positions', 'context'])
        # zip() stops at the shorter sequence, i.e. rows lacking a prediction
        # are skipped exactly as before.
        for row, predicted in zip(originList, resultList):
            wr.writerow([
                str(row.context_id),
                row.word,
                str(row.gold_sense_id),
                str(predicted),
                row.positions,
                row.context,
            ])

# Вычисление результата

In [9]:
from evaluate import evaluate2 as eval_score

def result_score(semilearnModel, testModel, dataset, mode, printscreen=True):
    """Score a previously written result file with ``eval_score``.

    The file evaluated is
    ``Result/<dataset>/<mode>.<semilearnModel>.<testModel>.csv``. The path
    uses forward slashes, the same form ``write_result_file`` uses to create
    the file, so it resolves on any operating system and not only where
    ``\\`` is a path separator.

    Args:
        semilearnModel: second component of the result file name.
        testModel: third component of the result file name.
        dataset: directory name under ``Result/``.
        mode: first component of the result file name.
        printscreen: when truthy, print the configuration and the score.

    Returns:
        Whatever ``eval_score`` returns for the file.
    """
    result_path = 'Result/' + dataset + '/' + mode + '.' + semilearnModel + '.' + testModel + '.csv'
    result = eval_score(result_path)

    if printscreen:
        print(dataset, mode, semilearnModel, testModel)
        print(result)

    return result

## Общая функция

In [10]:
def test_model(clf, trainwords, dataset, mode, learnModel, originList):
    """Run one experiment: predict, write the result file, print the score.

    The result file is named after *learnModel* and the class name of *clf*.
    NOTE: ``predict_results`` receives only *trainwords* and *clf*; the
    training-data location it ends up using comes from module-level
    ``dataset``, ``mode`` and ``learnModel`` read inside ``learn_clf``, not
    from the arguments given here.

    Returns ``None``; the score is printed by ``result_score``.
    """
    model_name = str(clf.__class__.__name__)
    predictions = predict_results(trainwords, clf, printscreen=False)

    write_result_file(dataset, mode, learnModel, model_name, originList, predictions)
    result_score(learnModel, model_name, dataset, mode, printscreen=True)

# Поиск оптимального алгоритма

## Предварительная обработка

In [11]:
# Target words of the training split (alphabetical order).
trainwords = [
    'балка', 'вид', 'винт', 'горн', 'губа',
    'жаба', 'клетка', 'крыло', 'купюра', 'курица',
    'лавка', 'лайка', 'лев', 'лира', 'мина',
    'мишень', 'обед', 'оклад', 'опушка', 'полис',
    'пост', 'поток', 'проказа', 'пропасть', 'проспект',
    'пытка', 'рысь', 'среда', 'хвост', 'штамп',
]

In [12]:
# Dataset directory and split name; lemmatize() reads 'Result/<dataset>/<mode>.csv'
# and learn_clf() reads these same module-level names when building its input path.
dataset = 'bts-rnc'
mode = 'train'

In [13]:
%%time
# Read and lemmatize the evaluation file
contextDictClean, originList = lemmatize(dataset, mode)

Wall time: 55.7 s


## Работа с моделями

In [14]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC

## На данных KNN

In [15]:
# learnModel is the last directory component of the training-file path built in
# learn_clf() and part of the result file name; mode is re-assigned to the same
# value it already has.
learnModel = 'KNeighborsClassifier'
mode = 'train'

In [16]:
%%time
clf = MultinomialNB(alpha=0.01, fit_prior=False)
test_model(clf, trainwords, dataset, mode, learnModel, originList)

bts-rnc train KNeighborsClassifier MultinomialNB
0.302541
Wall time: 4.21 s


In [17]:
%%time
clf = BernoulliNB(alpha=0.01, fit_prior=False)
test_model(clf, trainwords, dataset, mode, learnModel, originList)

bts-rnc train KNeighborsClassifier BernoulliNB
0.236935
Wall time: 3.68 s


In [18]:
%%time
clf = SGDClassifier(max_iter=100, tol=0.01, loss='squared_hinge', penalty='elasticnet')
test_model(clf, trainwords, dataset, mode, learnModel, originList)

bts-rnc train KNeighborsClassifier SGDClassifier
0.248732
Wall time: 5.06 s


In [19]:
%%time
clf = LinearSVC(penalty='l1', loss='squared_hinge', dual=False, tol=0.0001, C=2.0)
test_model(clf, trainwords, dataset, mode, learnModel, originList)

bts-rnc train KNeighborsClassifier LinearSVC
0.246685
Wall time: 5.07 s


**Вывод.** Частичное обучение на KNN с последующим обучением MNB даёт лучший результат в этой серии — около 0.30 (0.302541).

## На данных MNB

In [20]:
learnModel = 'MultinomialNB'

In [21]:
%%time
clf = MultinomialNB(alpha=0.01, fit_prior=False)
test_model(clf, trainwords, dataset, mode, learnModel, originList)

bts-rnc train MultinomialNB MultinomialNB
0.178091
Wall time: 1.07 s


In [22]:
%%time
clf = BernoulliNB(alpha=0.01, fit_prior=False)
test_model(clf, trainwords, dataset, mode, learnModel, originList)

bts-rnc train MultinomialNB BernoulliNB
0.235459
Wall time: 726 ms


In [23]:
%%time
clf = SGDClassifier(max_iter=100, tol=0.01, loss='squared_hinge', penalty='elasticnet')
test_model(clf, trainwords, dataset, mode, learnModel, originList)

bts-rnc train MultinomialNB SGDClassifier
0.139709
Wall time: 724 ms


In [24]:
%%time
clf = LinearSVC(penalty='l1', loss='squared_hinge', dual=False, tol=0.0001, C=2.0)
test_model(clf, trainwords, dataset, mode, learnModel, originList)

bts-rnc train MultinomialNB LinearSVC
0.204999
Wall time: 718 ms


**Вывод.** MNB — не лучший способ частичного обучения: лучший результат в этой серии — 0.235 против 0.303 на данных KNN.

## На данных Бернулли NB

In [25]:
learnModel = 'BernoulliNB'

In [26]:
%%time
clf = MultinomialNB(alpha=0.01, fit_prior=False)
test_model(clf, trainwords, dataset, mode, learnModel, originList)

bts-rnc train BernoulliNB MultinomialNB
0.220795
Wall time: 5.66 s


In [27]:
%%time
clf = BernoulliNB(alpha=0.01, fit_prior=False)
test_model(clf, trainwords, dataset, mode, learnModel, originList)

bts-rnc train BernoulliNB BernoulliNB
0.139015
Wall time: 5.27 s


In [28]:
%%time
clf = SGDClassifier(max_iter=100, tol=0.01, loss='squared_hinge', penalty='elasticnet')
test_model(clf, trainwords, dataset, mode, learnModel, originList)

bts-rnc train BernoulliNB SGDClassifier
0.157445
Wall time: 6.1 s


In [29]:
%%time
clf = LinearSVC(penalty='l1', loss='squared_hinge', dual=False, tol=0.0001, C=2.0)
test_model(clf, trainwords, dataset, mode, learnModel, originList)

bts-rnc train BernoulliNB LinearSVC
0.180712
Wall time: 7.09 s


## На данных GradientBoostingClassifier

In [30]:
learnModel = 'GradientBoostingClassifier'

In [31]:
%%time
clf = MultinomialNB(alpha=0.01, fit_prior=False)
test_model(clf, trainwords, dataset, mode, learnModel, originList)

bts-rnc train GradientBoostingClassifier MultinomialNB
0.228913
Wall time: 4.93 s


In [32]:
%%time
clf = BernoulliNB(alpha=0.01, fit_prior=False)
test_model(clf, trainwords, dataset, mode, learnModel, originList)

bts-rnc train GradientBoostingClassifier BernoulliNB
0.164874
Wall time: 4.7 s


In [33]:
%%time
clf = SGDClassifier(max_iter=100, tol=0.01, loss='squared_hinge', penalty='elasticnet')
test_model(clf, trainwords, dataset, mode, learnModel, originList)

bts-rnc train GradientBoostingClassifier SGDClassifier
0.183972
Wall time: 4.96 s


In [34]:
%%time
clf = LinearSVC(penalty='l1', loss='squared_hinge', dual=False, tol=0.0001, C=2.0)
test_model(clf, trainwords, dataset, mode, learnModel, originList)

bts-rnc train GradientBoostingClassifier LinearSVC
0.203076
Wall time: 5.43 s


Также были использованы, но по разным причинам не включены в ноутбук: GradientBoostingClassifier (время), AdaBoostClassifier (точность), KNeighborsClassifier (точность), RandomForestClassifier (точность).

# Детальное изучение наилучшего алгоритма

In [35]:
from evaluate import evaluate as eval_score_details
# Per-word breakdown for the best-scoring combination. Forward slashes match
# the path form used by write_result_file() and resolve on every OS, unlike
# the Windows-only '\\' separators.
eval_score_details('Result/bts-rnc/train.KNeighborsClassifier.MultinomialNB.csv')

word	ari	count
балка	0.228342	119
вид	0.246520	77
винт	0.401767	123
горн	0.127359	51
губа	0.218559	137
жаба	0.103429	121
клетка	0.382173	150
крыло	0.269338	91
купюра	0.469913	150
курица	0.067821	93
лавка	0.084412	149
лайка	0.607617	99
лев	0.462464	44
лира	-0.037962	49
мина	0.234059	65
мишень	0.107486	121
обед	0.012241	100
оклад	0.780554	146
опушка	0.898969	148
полис	0.471737	142
пост	0.131352	144
поток	-0.082392	136
проказа	0.043743	146
пропасть	0.155421	127
проспект	0.534390	139
пытка	0.199271	143
рысь	0.593041	120
среда	0.222034	144
хвост	0.587657	121
штамп	0.078159	96
	0.302541	3491


# Формирование тестового файла

In [40]:
# Target words of the test split (alphabetical order), grouped by first letter.
trainwords = [
    'акция',
    'баба', 'байка', 'бум', 'бычок',
    'вал',
    'газ', 'гвоздика', 'гипербола', 'град', 'гусеница',
    'дождь', 'домино',
    'забой',
    'икра',
    'кабачок', 'капот', 'карьер', 'кличка', 'ключ', 'кок',
    'кольцо', 'концерт', 'котелок', 'крона', 'круп', 'кулак',
    'лейка', 'лук',
    'мандарин',
    'ножка',
    'опора',
    'патрон', 'печать', 'пол', 'полоз', 'почерк', 'пробка',
    'рак', 'рок',
    'свет', 'секрет', 'скат', 'слог', 'стан', 'стопка',
    'таз', 'такса', 'тюрьма',
    'шах', 'шашка',
]

In [41]:
# Switch to the test split: lemmatize() will now read 'Result/bts-rnc/test.csv'.
dataset = 'bts-rnc'
mode = 'test'

In [42]:
%%time
# Read and lemmatize the evaluation file
contextDictClean, originList = lemmatize(dataset, mode)

Wall time: 1min 30s


In [39]:
from sklearn.naive_bayes import MultinomialNB

# Same learnModel / classifier pair that scored highest on the train split.
learnModel = 'KNeighborsClassifier'
mode = 'test'

clf = MultinomialNB(alpha=0.01, fit_prior=False)
testModel = str(clf.__class__.__name__)

# Predict and write the result file; unlike test_model(), no score is computed.
# NOTE(review): the recorded FileNotFoundError names 'балка.csv', a word from the
# train list, and this cell is numbered In [39], i.e. it appears to have run
# before the test-split trainwords cell (In [40]) -- re-run the cells in order.
resultList = predict_results(trainwords, clf, printscreen=False)
write_result_file(dataset, mode, learnModel, testModel, originList, resultList)

FileNotFoundError: [Errno 2] No such file or directory: 'Input/full txt/bts-rnc(test)/KNeighborsClassifier/балка.csv'