## Частичное обучение

In [2]:
import pandas as pd

def read_file(path, word):
    """Load the tab-separated file for ``word`` and split it by labeling status.

    The file ``path + word + '.csv'`` is read without a header row, so its
    columns are named ``0`` and ``1``.

    * Rows whose column ``1`` is filled are labeled rows: column ``0`` is
      renamed to ``label`` and column ``1`` to ``text``.
    * Rows whose column ``1`` is empty are unlabeled rows: column ``0`` is
      renamed to ``text`` and the empty column ``1`` is dropped.

    Returns:
        tuple: ``(labeledDf, nonLabeledDf)``, each re-indexed from 0.
    """
    table = pd.read_csv(path + word + '.csv', header=None, sep='\t')

    # A filled second column is what marks a row as labeled.
    has_label = table[1].notnull()

    labeledDf = (
        table[has_label]
        .rename(columns={table.columns[0]: "label", table.columns[1]: "text"})
        .reset_index(drop=True)
    )

    nonLabeledDf = (
        table[~has_label]
        .rename(columns={table.columns[0]: "text"})
        .drop(columns=[1])
        .reset_index(drop=True)
    )

    return labeledDf, nonLabeledDf

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

def learn_clf(model, trainDf):
    """Fit a TF-IDF + classifier pipeline on labeled texts.

    Args:
        model: estimator placed as the final ``'clf'`` step of the pipeline.
        trainDf: frame providing a ``text`` column (inputs) and a ``label``
            column (targets).

    Returns:
        The pipeline after ``fit`` has been called on ``trainDf``.
    """
    steps = [
        # 1- and 2-gram TF-IDF features computed from the raw text.
        ('tdidfvect', TfidfVectorizer(ngram_range=(1, 2))),
        ('clf', model),
    ]
    pipeline = Pipeline(steps)
    pipeline.fit(trainDf.text, trainDf.label)
    return pipeline

In [4]:
import numpy as np

def semi_learn(classesList, predicted_proba, trainDf, testDf, threshold=0.9):
    """Move confidently classified rows from the unlabeled set to the training set.

    A row of ``testDf`` is "confident" when the largest probability in its
    row of ``predicted_proba`` is at least ``threshold``. Each confident row
    is appended to the training data with the class that has the largest
    probability, and is removed from the unlabeled data.

    Rows are removed by position, so an unlabeled row is dropped only when it
    was actually added to the training data; ``count`` is therefore exactly
    the number of rows added in this step.

    Args:
        classesList: class labels, indexed like the columns of
            ``predicted_proba`` (e.g. ``clf.classes_``).
        predicted_proba: 2-D array-like with one row per row of ``testDf``
            and one column per class.
        trainDf: labeled frame with ``label`` and ``text`` columns.
        testDf: unlabeled frame with a ``text`` column.
        threshold: minimal winning probability for a row to be accepted.

    Returns:
        tuple: ``(trainDf, testDf, count)`` - the expanded training frame,
        the remaining unlabeled frame (re-indexed from 0) and the number of
        rows moved. The input frames are not modified.

    Raises:
        ValueError: if ``predicted_proba`` is non-empty but does not have
            exactly one row per row of ``testDf``.
    """
    proba = np.asarray(predicted_proba, dtype=float)

    # Nothing was scored, so nothing can be moved.
    if proba.size == 0:
        return trainDf, testDf.reset_index(drop=True), 0

    if proba.ndim != 2 or proba.shape[0] != len(testDf):
        raise ValueError(
            "predicted_proba must have one row per row of testDf "
            "(got shape {}, expected {} rows)".format(proba.shape, len(testDf))
        )

    # Boolean mask over the rows of testDf: True where the prediction is strong.
    confident = proba.max(axis=1) >= threshold
    count = int(confident.sum())

    if count == 0:
        return trainDf, testDf.reset_index(drop=True), 0

    # Column index of the winning class for every confident row.
    winners = proba.argmax(axis=1)[confident]

    newRows = pd.DataFrame({
        'label': [classesList[i] for i in winners],
        'text': testDf.loc[confident, 'text'].tolist(),
    })

    # One concat instead of a per-row DataFrame.append (removed in pandas 2.0).
    trainDf = pd.concat([trainDf, newRows], ignore_index=True)

    # Keep only the rows that were not moved.
    testDf = testDf.loc[~confident].reset_index(drop=True)

    return trainDf, testDf, count

In [9]:
import os
from collections import Counter

def result_func(dataset, mode, wordList, path, savePath, model, model2):
    """Run iterative self-training for every word and save the expanded data.

    For each word the mixed file is read with ``read_file``. Then, as long
    as the previous step moved at least one row and unlabeled rows remain, a
    classifier is fitted on the labeled rows (``learn_clf``), the unlabeled
    rows are scored with ``predict_proba`` and the training data is expanded
    with ``semi_learn``.

    When the loop finishes without an exception, the labeled frame is
    written to ``savePath + word + '.csv'`` (tab-separated, no header, no
    index); ``savePath`` is created if it does not exist. If any step
    raises, the error is printed and nothing is saved for that word.

    ``dataset``, ``mode`` and ``model2`` are accepted but not used in the
    body.
    """
    for word in wordList:
        print(word)

        labeledDf, nonLabeledDf = read_file(path, word)
        print(word, len(labeledDf), Counter(labeledDf.label))

        moved = -1  # sentinel: guarantees the first iteration is attempted
        while moved != 0 and len(nonLabeledDf) > 0:
            try:
                pipeline = learn_clf(model, labeledDf)
                proba = pipeline.predict_proba(nonLabeledDf.text)
                labeledDf, nonLabeledDf, moved = semi_learn(
                    pipeline.classes_, proba, labeledDf, nonLabeledDf
                )
            except Exception as err:
                print('Error', word)
                print(err.message if hasattr(err, 'message') else err)
                # Leaving via break skips the else-branch, so nothing is saved.
                break
        else:
            # Reached only when the loop ended without an error.
            if not os.path.exists(savePath):
                os.makedirs(savePath)

            labeledDf.to_csv(savePath + word + '.csv', sep='\t', header=False, index=False)
            print('-----')

    print('Finish')

# Начало работы

In [6]:
# Corpus name and split; both are substituted into the data directory paths.
dataset = 'bts-rnc'
mode = 'train'

# Target words processed for the 'train' split.
wordList = [
    'балка', 'вид', 'винт', 'горн', 'губа',
    'жаба', 'клетка', 'крыло', 'купюра', 'курица',
    'лавка', 'лайка', 'лев', 'лира', 'мина',
    'мишень', 'обед', 'оклад', 'опушка', 'полис',
    'пост', 'поток', 'проказа', 'пропасть', 'проспект',
    'пытка', 'рысь', 'среда', 'хвост', 'штамп',
]

# Alternative configuration for the 'test' split: comment out the `mode` and
# `wordList` assignments above and uncomment the lines below.
# mode = 'test'
# wordList = ['акция', 'баба', 'байка', 'бум', 'бычок', 'вал', 'газ', 'гвоздика',
#             'гипербола', 'град', 'гусеница', 'дождь', 'домино', 'забой', 'икра',
#             'кабачок', 'капот', 'карьер', 'кличка', 'ключ', 'кок', 'кольцо',
#             'концерт', 'котелок', 'крона', 'круп', 'кулак', 'лейка', 'лук',
#             'мандарин', 'ножка', 'опора', 'патрон', 'печать', 'пол', 'полоз',
#             'почерк', 'пробка', 'рак', 'рок', 'свет', 'секрет', 'скат', 'слог',
#             'стан', 'стопка', 'таз', 'такса', 'тюрьма', 'шах', 'шашка'
#            ]

In [7]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB

# Estimators handed to result_func: `model` and `model2`.
model = KNeighborsClassifier(weights='uniform')
model2 = MultinomialNB(alpha=2.0, fit_prior=True)

# Class names of both estimators; they form the last component of savePath.
model_name_1 = type(model).__name__
model_name_2 = type(model2).__name__

# Directory with the mixed (labeled + unlabeled) input files for this split.
inputPath = "Data/Mixed txt/{}({})/".format(dataset, mode)

# Directory that receives the expanded training files.
savePath = "Data/Expanded txt/{}({})/{}_{}/".format(
    dataset, mode, model_name_1, model_name_2
)

In [10]:
# Run self-training for every configured word; results are written under savePath.
result_func(dataset, mode, wordList, inputPath, savePath, model, model2)

балка
балка 91 Counter({'34297': 47, '39329': 44})
-----
вид
вид 105 Counter({'18983': 35, '38473': 34, '46934': 21, '16525': 15})
-----
винт
винт 84 Counter({'16398': 32, '17916': 22, '39939': 22, '32507': 8})
-----
горн
горн 100 Counter({'30349': 40, '1': 31, '32374': 24, '0': 5})
-----
губа
губа 73 Counter({'36563': 50, '32217': 17, '40001': 6})
-----
жаба
жаба 94 Counter({'38123': 29, '1': 28, '0': 19, '22390': 18})
-----
клетка
клетка 151 Counter({'16575': 33, '15531': 32, '24810': 29, '19864': 29, '15764': 28})
-----
крыло
крыло 166 Counter({'35257': 37, '31524': 33, '19801': 28, '36977': 24, '20531': 22, '0': 12, '29548': 10})
-----
купюра
купюра 88 Counter({'25844': 54, '12767': 34})
-----
курица
курица 60 Counter({'38375': 33, '33131': 27})
-----
лавка
лавка 52 Counter({'19335': 27, '38881': 25})
-----
лайка
лайка 66 Counter({'21481': 45, '30243': 21})
-----
лев
лев 68 Counter({'28022': 49, '1': 11, '0': 6, '39252': 2})
-----
лира
лира 59 Counter({'34938': 37, '0': 22})
-----
