## Частичное обучение

In [1]:
def read_file(path, word):
    """Load the labelled and unlabelled texts stored for one word.

    The file ``path + word + '.txt'`` is parsed as tab-separated text with
    quoting disabled. A row with more than one field is a labelled example
    (field 0 is the label, field 1 is the text). A row with exactly one
    field is an unlabelled text. Blank rows carry no data: they are
    reported on stdout and skipped.

    Args:
        path: Directory prefix; it is concatenated as-is, so it must
            already end with a path separator.
        word: File name without the ``.txt`` extension.

    Returns:
        A tuple ``(trainList, targetList, textList)``: the labelled texts,
        their labels in the same order, and the unlabelled texts.
    """
    trainList = []
    targetList = []
    textList = []
    filename = path + word + '.txt'

    with open(filename, 'r', encoding='utf-8', newline='') as f:
        reader = csv.reader(f, delimiter='\t', quoting=csv.QUOTE_NONE)
        for index, row in enumerate(reader):
            if len(row) > 1:
                targetList.append(row[0])
                trainList.append(row[1])
            elif row:
                textList.append(row[0])
            else:
                # csv.reader yields an empty list for a blank line; test for
                # it explicitly instead of hiding it behind a bare `except`.
                print('СМОТРИ:', row, index)
    return trainList, targetList, textList

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

def learn_clf(model, trainList, targetList):
    """Fit a TF-IDF + classifier pipeline on the labelled texts.

    Args:
        model: Estimator used as the final ``'clf'`` step of the pipeline.
        trainList: Training texts.
        targetList: Labels, one per entry of ``trainList``.

    Returns:
        The fitted ``Pipeline``.
    """
    # Word unigrams and bigrams are vectorised before reaching the model.
    steps = [
        ('tdidfvect', TfidfVectorizer(ngram_range=(1, 2))),
        ('clf', model),
    ]
    pipeline = Pipeline(steps)
    pipeline.fit(trainList, targetList)
    return pipeline

In [3]:
import numpy as np

def semi_learn(clf, predicted, trainList, targetList, textList, threshold=0.9):
    """Move confidently classified texts from the unlabelled pool to the training set.

    For every row of ``predicted`` whose largest value is at least
    ``threshold``, the text at the same position in ``textList`` is appended
    to ``trainList`` and the matching entry of ``clf.classes_`` is appended
    to ``targetList``. Those texts are then dropped from ``textList``.
    All three lists are modified in place and also returned.

    Args:
        clf: Object exposing ``classes_``; column ``j`` of a prediction row
            is mapped to ``clf.classes_[j]``.
        predicted: Iterable of per-class score rows; row ``i`` is paired
            with ``textList[i]``.
        trainList: Labelled texts (extended in place).
        targetList: Labels for ``trainList`` (extended in place).
        textList: Unlabelled texts (shrunk in place).
        threshold: Minimum top score required to accept a prediction.

    Returns:
        ``(trainList, targetList, textList, count)`` where ``count`` is the
        number of texts removed from ``textList`` by this call.
    """
    accepted = set()

    for index, result in enumerate(predicted):
        if max(result) >= threshold:
            label = np.argmax(result)
            targetList.append(clf.classes_[label])
            trainList.append(textList[index])
            accepted.add(index)

    lenBefore = len(textList)

    # Drop the moved texts by position in a single pass. Slice assignment
    # keeps the caller's list object, as the original in-place removal did.
    textList[:] = [
        sentence
        for index, sentence in enumerate(textList)
        if index not in accepted
    ]

    count = lenBefore - len(textList)

    return trainList, targetList, textList, count

In [4]:
import csv

def write_learn_result(path, word, trainList, targetList):
    """Write every labelled example to ``path + word + '.csv'``.

    Each output row is ``<label>\\t<text>``, in list order.

    Args:
        path: Directory prefix; it is concatenated as-is, so it must
            already end with a path separator.
        word: File name without the ``.csv`` extension.
        trainList: Texts to write.
        targetList: Labels, one per entry of ``trainList``.

    Raises:
        IndexError: If ``targetList`` is shorter than ``trainList``.
    """
    outputName = path + word + '.csv'

    with open(outputName, 'w', encoding='utf-8', newline='') as myfile:
        # NOTE(review): the writer keeps the default ',' delimiter with
        # QUOTE_NONE, so commas and double quotes inside a text are emitted
        # with a leading backslash - confirm downstream readers expect that.
        wr = csv.writer(myfile, quoting=csv.QUOTE_NONE, escapechar='\\')

        # Iterate over every example; the previous `len(trainList) - 1`
        # bound silently dropped the last one.
        for index, text in enumerate(trainList):
            line = targetList[index] + '\t' + text
            wr.writerow([line])

In [5]:
import os
from collections import Counter

def result_func(dataset, mode, wordList):
    """Run the self-training loop for every word and save the results.

    For each word the labelled and unlabelled texts are read, then a
    classifier is repeatedly fitted and its confident predictions are moved
    into the training set until a round moves nothing or no unlabelled text
    remains. The classifier is taken from the module-level name ``model``.
    If any round raises, the error is printed and nothing is written for
    that word.

    Args:
        dataset: Dataset name used to build the input and output folders.
        mode: Split name used to build the input and output folders.
        wordList: Words to process; each one names an input file.
    """
    source_dir = ''.join(['Input\\marked txt\\', dataset, '(', mode, ')/'])

    for word in wordList:
        trainList, targetList, textList = read_file(source_dir, word)

        print(word, Counter(targetList))

        moved = -1  # sentinel so that the first round always runs
        while moved != 0 and textList:
            try:
                clf = learn_clf(model, trainList, targetList)
                predicted = clf.predict_proba(textList)
                trainList, targetList, textList, moved = semi_learn(
                    clf, predicted, trainList, targetList, textList
                )
            except Exception as err:
                print('Error', word)
                print(getattr(err, 'message', err))
                break
        else:
            # Reached only when the loop finished without an error.
            target_dir = ''.join([
                'Input\\full txt\\', dataset, '(', mode, ')/',
                model.__class__.__name__, '/',
            ])
            if not os.path.exists(target_dir):
                os.makedirs(target_dir)

            write_learn_result(target_dir, word, trainList, targetList)
            print('-----')

    print('Finish')

# Начало работы

In [6]:
# Words to process; each one is passed to result_func and names an input file.
wordList = [
    'балка', 'вид', 'винт', 'горн', 'губа',
    'жаба', 'клетка', 'крыло', 'купюра', 'курица',
    'лавка', 'лайка', 'лев', 'лира', 'мина',
    'мишень', 'обед', 'оклад', 'опушка', 'полис',
    'пост', 'поток', 'проказа', 'пропасть', 'проспект',
    'пытка', 'рысь', 'среда', 'хвост', 'штамп',
]

# Alternative word list (disabled):
# wordList = [
#     'акция', 'баба', 'байка', 'бум', 'бычок',
#     'вал', 'газ', 'гвоздика', 'гипербола', 'град',
#     'гусеница', 'дождь', 'домино', 'забой', 'икра',
#     'кабачок', 'капот', 'карьер', 'кличка', 'ключ',
#     'кок', 'кольцо', 'концерт', 'котелок', 'крона',
#     'круп', 'кулак', 'лейка', 'лук', 'мандарин',
#     'ножка', 'опора', 'патрон', 'печать', 'пол',
#     'полоз', 'почерк', 'пробка', 'рак', 'рок',
#     'свет', 'секрет', 'скат', 'слог', 'стан',
#     'стопка', 'таз', 'такса', 'тюрьма', 'шах',
#     'шашка',
# ]

# Dataset and split names; result_func builds its folder paths from them.
dataset = 'bts-rnc'
mode = 'train'

In [8]:
# Classifier under evaluation. result_func reads the module-level name
# `model`, so it must be bound before the call at the end of this cell.
# The commented-out pairs are alternative estimators.

# from sklearn.naive_bayes import MultinomialNB
# model = MultinomialNB(alpha=2.0, fit_prior=True)

# from sklearn.naive_bayes import BernoulliNB
# model = BernoulliNB()

# from sklearn.ensemble import GradientBoostingClassifier
# model = GradientBoostingClassifier()

from sklearn.neighbors import KNeighborsClassifier

model = KNeighborsClassifier(weights='uniform')

result_func(dataset, mode, wordList)

балка Counter({'34297': 47, '39329': 44})
-----
вид Counter({'18983': 35, '38473': 34, '46934': 21, '16525': 15})
-----
винт Counter({'16398': 32, '17916': 22, '39939': 22, '32507': 8})
-----
горн Counter({'30349': 40, '1': 31, '32374': 24, '0': 5})
-----
губа Counter({'36563': 50, '32217': 17, '40001': 6})
-----
жаба Counter({'38123': 29, '1': 28, '0': 19, '22390': 18})
-----
клетка Counter({'16575': 33, '15531': 32, '24810': 29, '19864': 29, '15764': 28})
-----
крыло Counter({'35257': 37, '31524': 33, '19801': 28, '36977': 24, '20531': 22, '0': 12, '29548': 10})
-----
купюра Counter({'25844': 54, '12767': 34})
-----
курица Counter({'38375': 33, '33131': 27})
-----
лавка Counter({'19335': 27, '38881': 25})
-----
лайка Counter({'21481': 45, '30243': 21})
-----
лев Counter({'28022': 49, '1': 11, '0': 6, '39252': 2})
-----
лира Counter({'34938': 37, '0': 21})
-----
мина Counter({'14200': 45, '23313': 23, '0': 4})
-----
мишень Counter({'28103': 51, '35543': 27})
-----
обед Counter({'27747