In [1]:
from mystem import mystem as mstm

In [2]:
import unicodedata
import re

def remove_accents(input_str):
    """
    Removes non-unicode symbols from string
    """
    nfс_form = unicodedata.normalize('NFC', input_str)
    nfс_form = re.sub(r'[^А-Яа-яЁё\s\-]', u'', nfс_form, flags=re.UNICODE)
    return u"".join([c for c in nfс_form if not unicodedata.combining(c)])

In [3]:
# Выделение лемм из выхлопа mystem
def lemmatized_text_list(contextList):
    """
    Gets lemmas from list of tuples (mystem func)
    """
    line = ''
    textList = list()
    for sentence in contextList:
        if len(sentence) == 0:
            continue
        for word in sentence:
            if word[2] in ['UNKNOWN', 'CONJ', 'INTJ', 'PART', 'PR']:
                continue
            lemma = word[1]
            if lemma != '.':
                if len(lemma) > 1 and lemma[-1:] == '?':
                    lemma = lemma[:-1]                     
                line += lemma + ' '
        textList.append(line)
        line = ''
    return textList

In [4]:
def printresult(word, context, clf):
    predicted = clf.predict(context)
    print(word, len(predicted))
    return predicted

In [5]:
import csv
def learn_word(path, word, clf):
    """
    Make word classifier
    """
    contextTrain = list()
    targetTrain = list() 
    
    trainFile = path + word + '.csv'
    with open(trainFile, 'r', encoding='utf-8', newline='') as f:
        reader = csv.reader(f, delimiter='\t', quoting=csv.QUOTE_NONE)
        for row in reader:
            targetTrain.append(row[0])
            contextTrain.append(row[1])
    
    clf.fit(contextTrain, targetTrain)
    
    return clf

In [6]:
# from sklearn.naive_bayes import MultinomialNB
# from sklearn.naive_bayes import BernoulliNB
# from sklearn.neighbors.nearest_centroid import NearestCentroid
# from sklearn.linear_model import SGDClassifier
# from sklearn.linear_model import Perceptron
# from sklearn.ensemble import GradientBoostingClassifier

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline

from sklearn.neural_network import MLPClassifier

model = MLPClassifier(solver='lbfgs', alpha=0.001, max_iter=1000, tol=0.001)

# from sklearn.svm import LinearSVC
# model = LinearSVC() 

clf = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('model', model),
               ])


# model = Perceptron(penalty=None,
#                  alpha=0.0001, 
#                  fit_intercept=True, 
#                  max_iter=30000, 
#                  tol=0.1, 
#                  shuffle=True,   
#                 ).fit(X_train_tfidf, targetList) # 0.459148    
          
     
# model = SGDClassifier(loss="hinge",
#                     penalty="l2",
#                     max_iter=10000,
#                     alpha=0.00001,
#                    ).fit(X_train_tfidf, targetList) # 0.374446

# model = NearestCentroid().fit(X_train_tfidf, targetList) # 0.377511
# model = MultinomialNB(alpha=0.1, fit_prior=False).fit(X_train_tfidf, targetList) # 0.434186
# model = BernoulliNB(alpha=0.1, fit_prior=False).fit(X_train_tfidf, targetList) # 0.383812
# model = MLPClassifier(solver='lbfgs', alpha=0.001, max_iter=1000, tol=0.001).fit(X_train_tfidf, targetList) # 0.450178
# model = GradientBoostingClassifier(n_estimators=5000).fit(X_train_tfidf, targetList) # 0.384457

In [7]:
# trainwords = ['замок', 'лук', 'суда', 'бор']

trainwords = ['балка',
             'вид',
             'винт',
             'горн',
             'губа',
             'жаба',
             'клетка',
             'крыло',
             'купюра', 
             'курица', 
             'лавка', 
             'лайка', 
             'лев', 
             'лира', 
             'мина', 
             'мишень', 
             'обед', 
             'оклад', 
             'опушка', 
             'полис', 
             'пост', 
             'поток', 
             'проказа', 
             'пропасть', 
             'проспект', 
             'пытка',
             'рысь',
             'среда',
             'хвост',
             'штамп',
            ]

In [8]:
path = 'Data/НКРЯ/labeled txt/'
clfDict = dict()
for word in trainwords:
    
    model = MLPClassifier(solver='lbfgs', alpha=0.001, max_iter=1000, tol=0.001)
    
    clf = Pipeline([('vect', CountVectorizer()),
                    ('tfidf', TfidfTransformer()),
                    ('model', model),
                   ])
    
    clfDict[word] = learn_word(path, word, clf)
    print(word, clfDict[word].classes_)
    
print(clfDict['балка'].classes_)

балка ['34297' '39329']
вид ['16525' '18983' '38473' '46934']
винт ['16398' '17916' '32507' '39939']
горн ['0' '1' '30349' '32374']
губа ['32217' '36563']
жаба [' 38123' '0' '1' '22390' '38123']
клетка ['15531' '15764' '16575' '19864']
крыло ['0' '19801' '20531' '29548' '31524' '35257' '36977']
купюра [' 12767' '12767' '25844']
курица ['33131' '38375']
лавка ['38881']
лайка ['21481' '30243']
лев ['0' '1' '28022' '39252']
лира ['0' '34938']
мина ['0' '14200' '23313']
мишень ['28103' '35543']
обед [' 42118' '20613' '27747' '32048' '42118']
оклад ['20522' '25098' '29484']
опушка ['16750' '40487']
полис ['0' '20510']
пост ['19615' '21584' '22320' '36343']
поток ['41114']
проказа ['24456' '37005']
пропасть ['0' '1' '38487' '39536']
проспект ['0' '1' '13284' '28382']
пытка ['24806' '28262']
рысь ['20083' '36983']
среда ['14872' '27906' '37772' '41363']
хвост [' 18002' '0' '12982' '14619' '18002' '40730']
штамп [' 30859' '30859' '34398' '35716' '39926']
['34297' '39329']


In [14]:
import csv
from collections import namedtuple

testFile = 'Result//bts-rnc//train.csv'

WordBag = namedtuple('WordBag', 'context_id word gold_sense_id positions context')
originList = list()

contextAll = ''
lastWord = trainwords[0]

with open(testFile, 'r', encoding='utf-8', newline='') as f:
    reader = csv.reader(f, delimiter='\t', quoting=csv.QUOTE_NONE)
    next(reader, None)  # skip the headers
    
    for row in reader:
        word, context = row[1], remove_accents(row[5])
        
        originList.append(
            WordBag(context_id=int(row[0]),
                    word=word,
                    gold_sense_id=row[2],
                    positions=row[4],
                    context=context
                    )
        )

In [15]:
%%time
contextDict = dict()
contextDictClean = dict()
contextDictCount = dict()

for row in originList:
    
    if row.word not in contextDict:
        contextDict[row.word] = str()
        contextDictCount[row.word] = 0
        
    contextDict[row.word] = contextDict[row.word] + row.context + ' \n\n '
    contextDictCount[row.word] += 1
    
for word in contextDict:
    contextList = mstm(contextDict[word])
    contextDictClean[word] = lemmatized_text_list(contextList)

Wall time: 1min 27s


In [16]:
for word in contextDict:
    print(word, contextDictCount[word], len(contextDictClean[word]))

балка 119 119
вид 77 77
винт 123 123
горн 51 51
губа 137 137
жаба 121 121
клетка 150 150
крыло 91 91
купюра 150 150
курица 93 93
лавка 149 149
лайка 99 99
лев 44 44
лира 49 49
мина 65 65
мишень 121 121
обед 100 100
оклад 146 146
опушка 148 148
полис 142 142
пост 144 144
поток 136 136
проказа 146 146
пропасть 127 127
проспект 139 139
пытка 143 143
рысь 120 120
среда 144 144
хвост 121 121
штамп 96 96


In [17]:
resultList = list()

for word in trainwords:
    if word in clfDict.keys():
        context = contextDictClean[word]
        clf = clfDict[word]
        wordresult = printresult(word, context, clf)
        for result in wordresult:
            resultList.append(result)
    else:
        continue
        
print(len(resultList))

балка 119
вид 77
винт 123
горн 51
губа 137
жаба 121
клетка 150
крыло 91
купюра 150
курица 93
лавка 149
лайка 99
лев 44
лира 49
мина 65
мишень 121
обед 100
оклад 146
опушка 148
полис 142
пост 144
поток 136
проказа 146
пропасть 127
проспект 139
пытка 143
рысь 120
среда 144
хвост 121
штамп 96
3491


In [18]:
# To do:
# При записи приходится использовать escapechar для кавычек, попробовать исправить
outputName = 'Result//bts-rnc//train.KRWSD.csv'

with open(outputName, 'w', encoding='utf-8', newline='') as myfile:
    wr = csv.writer(myfile, quoting=csv.QUOTE_NONE, escapechar='\\')

    wr.writerow(['context_id\tword\tgold_sense_id\tpredict_sense_id\tpositions\tcontext'])
    for index, row in enumerate(originList):
        try:
            line = '\t'.join([
                str(row.context_id),
                row.word,
                str(row.gold_sense_id),
#                 str(resultList[row.context_id - 1]),
                str(resultList[index]),
                row.positions,
                row.context
            ])
        except:
            continue
        wr.writerow([line])