## Частичное обучение

In [1]:
from sklearn.neural_network import MLPClassifier
# Imports for the disabled alternative classifier configured below.
# from sklearn.svm import LinearSVC
# from sklearn.calibration import CalibratedClassifierCV

# Classifier instance shared by every word: the driver loop passes this same
# object to learn_clf() on each self-training iteration.
model = MLPClassifier(solver='lbfgs', alpha=0.001, max_iter=1000, tol=0.001)
# Disabled alternative: a linear SVM wrapped in CalibratedClassifierCV.
# Presumably the wrapper is there because the driver loop calls
# predict_proba() on the fitted pipeline -- TODO confirm before re-enabling.
# svm = LinearSVC()
# model = CalibratedClassifierCV(svm, cv=2)

In [2]:
def read_file(path, word):
    """Read the labeled and unlabeled sentences stored for one word.

    The file ``path + word + '.txt'`` is parsed as tab-separated text with
    quote handling disabled. Each row is classified by its number of fields:

    * two or more fields -- a labeled example: field 0 is the label and
      field 1 is the text (any further fields are ignored);
    * exactly one field -- an unlabeled sentence;
    * no fields (blank line) -- reported on stdout and skipped.

    Args:
        path: Prefix that is concatenated with ``word`` as-is, so a directory
            must already end with a path separator.
        word: File name without the ``.txt`` extension.

    Returns:
        A tuple ``(trainList, targetList, textList)`` where ``trainList`` and
        ``targetList`` are parallel lists of labeled texts and their labels,
        and ``textList`` holds the unlabeled sentences.

    Raises:
        OSError: If the file cannot be opened.
    """
    # Imported locally so the function works even if the cell that imports
    # csv at module level has not been executed yet.
    import csv

    trainList = []
    targetList = []
    textList = []
    filename = path + word + '.txt'

    with open(filename, 'r', encoding='utf-8', newline='') as f:
        reader = csv.reader(f, delimiter='\t', quoting=csv.QUOTE_NONE)
        for index, row in enumerate(reader):
            if len(row) > 1:
                targetList.append(row[0])
                trainList.append(row[1])
            elif row:
                textList.append(row[0])
            else:
                # A blank line yields an empty row. Report it but keep
                # reading: stopping here would silently discard every
                # sentence that follows the blank line.
                print('СМОТРИ:', row, index)
    return trainList, targetList, textList

In [3]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline

def learn_clf(model, trainList, targetList):
    """Fit a text-classification pipeline on the labeled examples.

    The pipeline chains three steps: a ``CountVectorizer`` ('vect'), a
    ``TfidfTransformer`` ('tfidf') and the supplied ``model`` ('clf').

    Args:
        model: Estimator used as the final 'clf' step.
        trainList: Training texts.
        targetList: Labels, parallel to ``trainList``.

    Returns:
        The fitted ``Pipeline``.
    """
    steps = [
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', model),
    ]
    pipeline = Pipeline(steps)
    pipeline.fit(trainList, targetList)
    return pipeline

In [4]:
import numpy as np

def semi_learn(clf, predicted, trainList, targetList, textList, threshold=0.9):
    """Move confidently predicted sentences from the unlabeled pool to the training set.

    For every row of ``predicted`` whose largest value is at least
    ``threshold``, the sentence at the same index of ``textList`` is appended
    to ``trainList`` and the matching entry of ``clf.classes_`` is appended to
    ``targetList``. Those sentences are then removed from ``textList``; the
    remaining sentences keep their relative order.

    All three lists are modified in place and the same objects are returned.

    Args:
        clf: Object exposing ``classes_``, indexed by the position of the
            largest value in a prediction row.
        predicted: Per-sentence score rows, index-aligned with ``textList``.
        trainList: Labeled texts; extended in place.
        targetList: Labels parallel to ``trainList``; extended in place.
        textList: Unlabeled sentences; pruned in place.
        threshold: Minimum top score for a sentence to be accepted.

    Returns:
        A tuple ``(trainList, targetList, textList, count)`` where ``count``
        is the number of sentences moved during this call.
    """
    accepted = set()
    for index, result in enumerate(predicted):
        if max(result) >= threshold:
            targetList.append(clf.classes_[np.argmax(result)])
            trainList.append(textList[index])
            accepted.add(index)

    count = len(accepted)
    if count:
        # Drop accepted sentences by position in a single pass. Removing by
        # value would rescan the list for every sentence and could pick a
        # different occurrence when the same text appears more than once.
        textList[:] = [
            sentence
            for index, sentence in enumerate(textList)
            if index not in accepted
        ]

    return trainList, targetList, textList, count

In [5]:
import csv

def write_learn_result(path, word, trainList, targetList):
    """Write every labeled example for one word to ``path + word + '.csv'``.

    Each output row is a single field of the form ``label<TAB>text``.

    Args:
        path: Prefix that is concatenated with ``word`` as-is, so a directory
            must already end with a path separator.
        word: Output file name without the ``.csv`` extension.
        trainList: Texts to write.
        targetList: Labels, parallel to ``trainList``.

    Raises:
        OSError: If the output file cannot be opened for writing.
    """
    outputName = path + word + '.csv'

    with open(outputName, 'w', encoding='utf-8', newline='') as myfile:
        # NOTE(review): label and text are written as ONE field while the
        # writer keeps its default ',' delimiter, so commas inside the text
        # are prefixed with the escape character -- confirm that downstream
        # readers expect this before changing the format.
        wr = csv.writer(myfile, quoting=csv.QUOTE_NONE, escapechar='\\')

        # zip() visits every (label, text) pair, including the last one.
        for target, text in zip(targetList, trainList):
            wr.writerow([target + '\t' + text])

In [6]:
# Target words, in processing order. The driver loop reads one input file
# named '<word>.txt' for each entry.
wordList = [
    'балка', 'вид', 'винт', 'горн', 'губа', 'жаба',
    'клетка', 'крыло', 'купюра', 'курица',
    'лавка', 'лайка', 'лев', 'лира',
    'мина', 'мишень',
    'обед', 'оклад', 'опушка',
    'полис', 'пост', 'поток', 'проказа', 'пропасть', 'проспект', 'пытка',
    'рысь', 'среда', 'хвост', 'штамп',
]

In [7]:
# Prefix of the per-word input files; read_file() opens path + word + '.txt'.
path = 'Data/НКРЯ/lemma txt/'

# Self-training loop, run independently for every word: fit on the labeled
# examples, score the unlabeled sentences, move the confident ones into the
# training set, and repeat until nothing moves or the pool is empty.
for word in wordList:

    bool_break = False
    trainList, targetList, textList = read_file(path, word)
    # -1 is a sentinel so the first iteration runs whenever unlabeled
    # sentences exist; afterwards count is the number moved by semi_learn().
    count = -1
    while count != 0 and len(textList) > 0:
        try:
            clf = learn_clf(model, trainList, targetList)
            predicted = clf.predict_proba(textList)
            trainList, targetList, textList, count = semi_learn(clf, predicted, trainList, targetList, textList)
        except Exception as error:
            # Catch Exception rather than everything, so Ctrl-C still
            # interrupts the run, and show what went wrong instead of
            # hiding the cause.
            print('Error', word, repr(error))
            bool_break = True
            break

    # Results are written only for words that finished without an error.
    if not bool_break:
        write_learn_result('Data/НКРЯ/labeled txt/', word, trainList, targetList)
        print('Finish', word)

Finish балка
Finish вид
Finish винт
Finish горн
Finish губа
Finish жаба
СМОТРИ: [] 5
Finish клетка
Finish крыло
Finish купюра
Finish курица
СМОТРИ: [] 2
Finish лавка
Finish лайка
Finish лев
Finish лира
Finish мина
Finish мишень
Finish обед
Finish оклад
Finish опушка
Finish полис
Finish пост
СМОТРИ: [] 2
Finish поток
Finish проказа
Finish пропасть
Finish проспект
Finish пытка
Finish рысь
Finish среда
Finish хвост
Finish штамп
