## Частичное обучение

In [1]:
import pandas as pd

def read_file(path, word):
    """Load the tab-separated file for ``word`` and split it by labeling status.

    The file ``path + word + '.csv'`` is read without a header row. A row
    whose second column is present is treated as labeled (first column is the
    label, second column is the text). A row whose second column is missing
    is treated as unlabeled (first column is the text).

    Parameters
    ----------
    path : str
        Directory prefix; it is concatenated with ``word`` as-is, so it must
        already end with a path separator.
    word : str
        File name without the ``.csv`` extension.

    Returns
    -------
    tuple of (DataFrame, DataFrame)
        ``(labeledDf, nonLabeledDf)``. The first frame has its first two
        columns renamed to ``label`` and ``text``; the second frame has its
        first column renamed to ``text`` and column ``1`` dropped. Both are
        re-indexed from zero.
    """
    table = pd.read_csv(path + word + '.csv', header=None, sep='\t',
                        engine='python', encoding='utf-8')

    # A single mask drives both halves of the split.
    has_label = table[1].notnull()

    labeledDf = (
        table[has_label]
        .rename(columns={table.columns[0]: "label", table.columns[1]: "text"})
        .reset_index(drop=True)
    )

    nonLabeledDf = (
        table[~has_label]
        .rename(columns={table.columns[0]: "text"})
        .drop(columns=[1])
        .reset_index(drop=True)
    )

    return labeledDf, nonLabeledDf

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

from sklearn.preprocessing import FunctionTransformer

def _to_dense(matrix):
    """Convert a SciPy sparse matrix into a plain ``numpy.ndarray``."""
    # ``toarray()`` returns an ndarray, whereas ``todense()`` returns the
    # legacy ``numpy.matrix`` type that newer scikit-learn releases reject.
    return matrix.toarray()


def learn_clf(model, trainDf):
    """Fit a TF-IDF -> dense -> ``model`` pipeline on the labeled data.

    Parameters
    ----------
    model : estimator
        Classifier placed as the last pipeline step. It is fitted in place
        (the pipeline does not copy it).
    trainDf : DataFrame
        Must expose a ``text`` column (pipeline input) and a ``label``
        column (targets).

    Returns
    -------
    Pipeline
        The fitted pipeline: word uni- and bi-gram TF-IDF features,
        densified, then passed to ``model``.
    """
    text_clf = Pipeline([
        ('tfidfvect', TfidfVectorizer(ngram_range=(1, 2))),
        # The sparse TF-IDF output is densified before reaching the classifier.
        ('vect_dense', FunctionTransformer(_to_dense, accept_sparse=True)),
        ('clf', model),
    ])

    text_clf.fit(trainDf.text, trainDf.label)

    return text_clf

In [3]:
import numpy as np

def semi_learn(classesList, predicted_proba, trainDf, testDf, t):
    """Move confidently predicted unlabeled rows into the training set.

    Parameters
    ----------
    classesList : sequence
        Class labels, ordered like the columns of ``predicted_proba``.
    predicted_proba : array-like, shape (n_rows, n_classes)
        One row of class probabilities per row of ``testDf`` (same order).
    trainDf : DataFrame
        Current labeled data with ``label`` and ``text`` columns.
    testDf : DataFrame
        Current unlabeled data with a ``text`` column.
    t : float
        Confidence threshold; a row is accepted when its highest class
        probability is ``>= t``.

    Returns
    -------
    tuple of (DataFrame, DataFrame, int)
        The expanded training frame, the remaining unlabeled frame
        (re-indexed from zero) and the number of rows removed from the
        unlabeled frame in this step. The input frames are not modified.

    Notes
    -----
    Rows are removed from the unlabeled frame by matching ``text`` against
    the expanded training frame, so an unlabeled row whose text already
    occurs in the training data is dropped (and counted) even if it was not
    confidently predicted. A probability row containing NaN is never
    accepted.
    """
    proba = np.asarray(predicted_proba, dtype=float)

    # Positions of the rows whose best class probability reaches the threshold.
    if proba.size:
        confident = np.flatnonzero(proba.max(axis=1) >= t)
    else:
        confident = np.array([], dtype=int)

    if len(confident):
        best_class = proba[confident].argmax(axis=1)
        new_rows = pd.DataFrame({
            'label': [classesList[i] for i in best_class],
            'text': testDf['text'].iloc[confident].to_numpy(),
        })
        # A single concat replaces the removed ``DataFrame.append`` and avoids
        # re-copying the training frame once per accepted row.
        trainDf = pd.concat([trainDf, new_rows], ignore_index=True)

    lenBefore = len(testDf)

    # Remove expanded data from non-labeled data
    testDf = testDf.loc[~testDf['text'].isin(trainDf.text)].reset_index(drop=True)

    # How many rows left the unlabeled pool in this step
    count = lenBefore - len(testDf)

    return trainDf, testDf, count

In [4]:
import os
from collections import Counter

def result_func(dataset, mode, wordList, path, savePath, model, t=1.0):
    """Run iterative self-training for every word and save the expanded data.

    For each word the mixed file is read, then the classifier is repeatedly
    fitted on the labeled rows and used to label the unlabeled rows. The loop
    stops when a step moves no rows or no unlabeled rows remain. The expanded
    labeled frame is written to ``savePath + word + '.csv'`` (tab-separated,
    no header, no index).

    Parameters
    ----------
    dataset, mode : str
        Accepted for interface compatibility; not used inside the function.
    wordList : iterable of str
        Words (file stems) to process.
    path : str
        Input directory prefix, passed to ``read_file``.
    savePath : str
        Output directory prefix; created if missing. It is concatenated with
        the word as-is, so it must end with a path separator.
    model : estimator
        Classifier handed to ``learn_clf``; the same instance is re-fitted
        on every step.
    t : float, default 1.0
        Confidence threshold forwarded to ``semi_learn``.

    Notes
    -----
    If fitting or predicting raises for a word, the error is reported and
    nothing is written for that word; processing continues with the next one.
    """
    for word in wordList:
        labeledDf, nonLabeledDf = read_file(path, word)
        print(word, len(labeledDf), Counter(labeledDf.label))

        count = -1  # sentinel so the first iteration always runs
        while count != 0 and len(nonLabeledDf) > 0:
            try:
                clf = learn_clf(model, labeledDf)
                predicted = clf.predict_proba(nonLabeledDf.text)
                labeledDf, nonLabeledDf, count = semi_learn(
                    clf.classes_, predicted, labeledDf, nonLabeledDf, t)
            except Exception as e:
                # Report and skip this word; include the exception type,
                # since the message alone is often ambiguous.
                print('Error', word)
                print('{}: {}'.format(type(e).__name__, e))
                break
        else:
            # Reached only when the loop ended without an error.
            os.makedirs(savePath, exist_ok=True)

            outputName = savePath + word + '.csv'
            labeledDf.to_csv(outputName, sep='\t', header=False, index=False, encoding='utf-8')
            print('Became', len(labeledDf))
            print('-----')

    print('Finish')

# Начало работы

In [5]:
# Name of the corpus; used to build the input and output directory names.
dataset = 'bts-rnc'

# Target words of the 'train' split, one data file per word.
wordList = (
    'балка вид винт горн губа жаба клетка '
    'крыло купюра курица лавка лайка лев лира '
    'мина мишень обед оклад опушка полис пост '
    'поток проказа пропасть проспект пытка рысь '
    'среда хвост штамп'
).split()
mode = 'train'

# wordList = ['акция', 'баба', 'байка', 'бум', 'бычок', 'вал', 'газ', 'гвоздика',
#              'гипербола', 'град', 'гусеница', 'дождь', 'домино', 'забой', 'икра',
#              'кабачок', 'капот', 'карьер', 'кличка', 'ключ', 'кок', 'кольцо',
#              'концерт', 'котелок', 'крона', 'круп', 'кулак', 'лейка', 'лук',
#              'мандарин', 'ножка', 'опора', 'патрон', 'печать', 'пол', 'полоз',
#              'почерк', 'пробка', 'рак', 'рок', 'свет', 'секрет', 'скат', 'слог',
#              'стан', 'стопка', 'таз', 'такса', 'тюрьма', 'шах', 'шашка'
#             ]
# mode = 'test'

In [46]:
import os

# Alternative classifier (disabled):
# from sklearn.neighbors import KNeighborsClassifier
# model = KNeighborsClassifier(weights='uniform')

from sklearn.semi_supervised import LabelPropagation
model = LabelPropagation(kernel='rbf', gamma=15, n_jobs=2)

# The class name of the classifier names the output sub-directory.
model_name = type(model).__name__

# Build the directories with os.path.join so the separator matches the host
# OS; the trailing "" keeps a trailing separator, which the code that appends
# "<word>.csv" to these prefixes relies on.
split_dir = "{}({})".format(dataset, mode)
inputPath = os.path.join("Data", "Mixed txt", split_dir, "")
savePath = os.path.join("Data", "Expanded txt", split_dir, model_name, "")

In [47]:
%%time
# Expand the labeled data of every word in wordList, accepting a prediction
# only when its highest class probability is at least 0.95.
result_func(
    dataset=dataset,
    mode=mode,
    wordList=wordList,
    path=inputPath,
    savePath=savePath,
    model=model,
    t=0.95,
)

балка 91 Counter({'34297': 47, '39329': 44})
Became 705
-----
вид 105 Counter({'18983': 35, '38473': 34, '46934': 21, '16525': 15})
Became 719
-----
винт 84 Counter({'16398': 32, '17916': 22, '39939': 22, '32507': 8})
Became 557
-----
горн 100 Counter({'30349': 40, '1': 31, '32374': 24, '0': 5})
Became 550
-----
губа 73 Counter({'36563': 50, '32217': 17, '40001': 6})
Became 3280
-----
жаба 94 Counter({'38123': 29, '1': 28, '0': 19, '22390': 18})
Became 363
-----
клетка 151 Counter({'16575': 33, '15531': 32, '24810': 29, '19864': 29, '15764': 28})
Became 679
-----
крыло 166 Counter({'35257': 37, '31524': 33, '19801': 28, '36977': 24, '20531': 22, '0': 12, '29548': 10})
Became 475
-----
купюра 88 Counter({'25844': 54, '12767': 34})
Became 912
-----
курица 60 Counter({'38375': 33, '33131': 27})
Became 749
-----
лавка 52 Counter({'19335': 27, '38881': 25})
Became 756
-----
лайка 66 Counter({'21481': 45, '30243': 21})
Became 250
-----
лев 68 Counter({'28022': 49, '1': 11, '0': 6, '39252': 2