## Частичное обучение

In [34]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
import pandas as pd
from sklearn.preprocessing import FunctionTransformer

def _to_dense(matrix):
    """Convert a scipy sparse matrix into a dense ``numpy.ndarray``.

    ``toarray()`` is used instead of ``todense()`` because the latter returns
    a ``numpy.matrix``, which recent scikit-learn estimators refuse to accept.
    Being a module-level function (not a lambda) also keeps the fitted
    pipeline picklable.
    """
    return matrix.toarray()


def learn_clf(model, trainDf, dense):
    """Fit a TF-IDF + classifier pipeline on the labeled data.

    Parameters
    ----------
    model : estimator
        Classifier placed as the last pipeline step (``'clf'``). It is fitted
        in place by this call.
    trainDf : pandas.DataFrame
        Training data; the ``text`` column is vectorised and the ``label``
        column is used as the target.
    dense : bool
        If true, a step converting the sparse TF-IDF matrix to a dense array
        is inserted between the vectoriser and the classifier (for estimators
        that cannot consume sparse input).

    Returns
    -------
    sklearn.pipeline.Pipeline
        The fitted pipeline (steps: ``tfidfvect``, optional ``vect_dense``,
        ``clf``).
    """
    # Unigrams and bigrams are used as TF-IDF features.
    steps = [('tfidfvect', TfidfVectorizer(ngram_range=(1, 2)))]

    if dense:
        steps.append(('vect_dense',
                      FunctionTransformer(_to_dense, accept_sparse=True)))

    steps.append(('clf', model))

    text_clf = Pipeline(steps)
    text_clf.fit(trainDf.text, trainDf.label)

    return text_clf

In [2]:
import numpy as np

def semi_learn(classesList, predicted_proba, trainDf, testDf, t):
    """Move confidently classified samples from the unlabeled set to the training set.

    Parameters
    ----------
    classesList : sequence
        Class labels in the column order of ``predicted_proba``
        (e.g. the ``classes_`` attribute of the fitted classifier).
    predicted_proba : array-like, shape (len(testDf), n_classes)
        Class probabilities predicted for every row of ``testDf``.
    trainDf : pandas.DataFrame
        Current labeled data with ``label`` and ``text`` columns. It is not
        modified; an extended copy is returned.
    testDf : pandas.DataFrame
        Current unlabeled data with a ``text`` column.
    t : float
        Confidence threshold: a sample is accepted when its highest class
        probability is ``>= t``.

    Returns
    -------
    trainDf : pandas.DataFrame
        Training data with the accepted samples appended (index reset).
    testDf : pandas.DataFrame
        Unlabeled data without any row whose text occurs in the returned
        training data (index reset).
    count : int
        Number of rows removed from ``testDf`` in this step. Because removal
        is done by text match, this also counts unlabeled rows whose text was
        already present in ``trainDf`` before the call.
    """
    proba = np.asarray(predicted_proba)
    lenBefore = len(testDf)

    # Positions of rows whose best class probability reaches the threshold.
    if proba.ndim == 2 and proba.shape[0] > 0:
        confident = np.flatnonzero(proba.max(axis=1) >= t)
    else:
        confident = np.array([], dtype=int)

    if len(confident) > 0:
        # Build all new rows at once and concatenate a single time:
        # DataFrame.append was removed in pandas 2.0 and appending row by row
        # copies the whole frame on every iteration.
        bestClass = proba[confident].argmax(axis=1)
        newRows = pd.DataFrame({
            'label': np.asarray(classesList)[bestClass],
            'text': testDf['text'].iloc[confident].to_numpy(),
        })
        trainDf = pd.concat([trainDf, newRows], ignore_index=True)

    # Remove expanded data from non-labeled data
    alreadyLabeled = testDf['text'].isin(trainDf.text)
    testDf = testDf.loc[~alreadyLabeled].reset_index(drop=True)

    # How many expanded per step
    count = lenBefore - len(testDf)

    return trainDf, testDf, count

In [35]:
import os
from collections import Counter

def result_func(dataset, mode, wordList, labeledPath, nonlabeledPath, savePath, model, dense=False, t=1.0):
    """Run self-training for every word and save the expanded data sets.

    For each word the labeled and unlabeled TSV files are read, then the
    classifier is repeatedly re-fitted on the labeled data and every unlabeled
    sample predicted with probability ``>= t`` is moved into the labeled set.
    The loop stops when a step moves nothing or no unlabeled data is left.

    Parameters
    ----------
    dataset, mode : str
        Not used inside the function; kept for interface compatibility.
    wordList : iterable of str
        Words to process; each one names a ``<word>.csv`` file.
    labeledPath : str
        Directory prefix of the labeled files (tab-separated, no header,
        column 0 = label, column 1 = text). Must end with a path separator
        because file names are appended by plain concatenation.
    nonlabeledPath : str
        Directory prefix of the unlabeled files (column 0 = text); same
        trailing-separator requirement.
    savePath : str
        Output directory prefix; same trailing-separator requirement.
        Expanded labeled data go to ``savePath + word + '.csv'`` and the
        remaining unlabeled data to ``savePath + 'Not Expanded/'``.
    model : estimator
        Classifier passed to ``learn_clf``; it must provide ``predict_proba``.
    dense : bool, default False
        Forwarded to ``learn_clf`` (densify TF-IDF features).
    t : float, default 1.0
        Confidence threshold forwarded to ``semi_learn``.

    Notes
    -----
    If training or prediction fails for a word, the error is printed, nothing
    is saved for that word and processing continues with the next one.
    """
    # Single definition of the leftovers directory, used both for creating it
    # and for writing into it.
    leftDir = savePath + 'Not Expanded/'

    for word in wordList:
        labeledDf = pd.read_csv(labeledPath + word + '.csv', header=None,
                                sep='\t', engine='python', encoding='utf-8')
        labeledDf = labeledDf.rename(columns={labeledDf.columns[0]: "label",
                                              labeledDf.columns[1]: "text"})

        try:
            nonLabeledDf = pd.read_csv(nonlabeledPath + word + '.csv', header=None,
                                       sep='\t', engine='python', encoding='utf-8')
        except pd.errors.EmptyDataError:
            # A previous stage that labeled everything writes an empty file;
            # treat it as "nothing left to label" instead of aborting the run.
            nonLabeledDf = pd.DataFrame(columns=['text'])
        else:
            nonLabeledDf = nonLabeledDf.rename(columns={nonLabeledDf.columns[0]: "text"})

        print(word, len(labeledDf), Counter(labeledDf.label))

        failed = False
        count = -1  # sentinel so that the first iteration always runs
        while count != 0 and len(nonLabeledDf) > 0:
            try:
                clf = learn_clf(model, labeledDf, dense)
                predicted = clf.predict_proba(nonLabeledDf.text)
                labeledDf, nonLabeledDf, count = semi_learn(clf.classes_, predicted,
                                                            labeledDf, nonLabeledDf, t)
            except Exception as e:
                # Best effort: report the problem and skip this word.
                print('Error', word)
                print(getattr(e, 'message', e))
                failed = True
                break

        if failed:
            continue

        # Also creates savePath itself when it does not exist yet.
        os.makedirs(leftDir, exist_ok=True)

        labeledDf.to_csv(savePath + word + '.csv', sep='\t', header=False,
                         index=False, encoding='utf-8')
        nonLabeledDf.to_csv(leftDir + word + '.csv', sep='\t', header=False,
                            index=False, encoding='utf-8')

        print('Became', len(labeledDf))
        print('Left', len(nonLabeledDf))
        print('-----')

    print('Finish')

# Начало работы

In [4]:
# Corpus and split to process; both become part of the data directory names.
dataset = 'bts-rnc'
mode = 'train'

# Target words of the train split; each one has its own "<word>.csv" file.
wordList = [
    'балка', 'вид', 'винт', 'горн', 'губа', 'жаба',
    'клетка', 'крыло', 'купюра', 'курица', 'лавка', 'лайка',
    'лев', 'лира', 'мина', 'мишень', 'обед', 'оклад',
    'опушка', 'полис', 'пост', 'поток', 'проказа', 'пропасть',
    'проспект', 'пытка', 'рысь', 'среда', 'хвост', 'штамп',
]

# wordList = ['акция', 'баба', 'байка', 'бум', 'бычок', 'вал', 'газ', 'гвоздика',
#              'гипербола', 'град', 'гусеница', 'дождь', 'домино', 'забой', 'икра',
#              'кабачок', 'капот', 'карьер', 'кличка', 'ключ', 'кок', 'кольцо',
#              'концерт', 'котелок', 'крона', 'круп', 'кулак', 'лейка', 'лук',
#              'мандарин', 'ножка', 'опора', 'патрон', 'печать', 'пол', 'полоз',
#              'почерк', 'пробка', 'рак', 'рок', 'свет', 'секрет', 'скат', 'слог',
#              'стан', 'стопка', 'таз', 'такса', 'тюрьма', 'шах', 'шашка'
#             ]
# mode = 'test'

In [5]:
from sklearn.neighbors import KNeighborsClassifier

# First-stage classifier: k-nearest neighbours with uniformly weighted votes.
model = KNeighborsClassifier(weights='uniform')

# from sklearn.semi_supervised import LabelPropagation
# model = LabelPropagation(kernel='rbf', gamma=15, n_jobs=2)

# The estimator's class name; it is used below as the output sub-directory.
model_name = type(model).__name__

In [6]:
# Stage-1 locations. result_func builds file names by plain string
# concatenation, so every directory must keep its trailing slash.
#   labeledPath    - annotated files (label, text)
#   nonlabeledPath - unannotated files (text)
#   savePath       - output directory, named after the model
labeledPath = "Data/Annotated/lemma/{}({})/".format(dataset, mode)
nonlabeledPath = "Data/Non-Annotated/lemma/{}({})/".format(dataset, mode)
savePath = "Data/Expanded/{}({})/{}/".format(dataset, mode, model_name)

In [8]:
%%time
# Stage 1: self-training with the model defined above on sparse TF-IDF
# features (dense defaults to False). With t=1.00 a sample is accepted only
# when its highest predicted class probability is >= 1.0.
result_func(dataset=dataset, mode=mode, wordList=wordList, 
            labeledPath=labeledPath, nonlabeledPath=nonlabeledPath,
            savePath=savePath, model=model, t=1.00)

балка 91 Counter({34297: 47, 39329: 44})
Became 1471
Left 606
-----
вид 105 Counter({18983: 35, 38473: 34, 46934: 21, 16525: 15})
Became 1226
Left 1618
-----
винт 84 Counter({16398: 32, 17916: 22, 39939: 22, 32507: 8})
Became 757
Left 1170
-----
горн 100 Counter({30349: 40, 1: 31, 32374: 24, 0: 5})
Became 1243
Left 376
-----
губа 73 Counter({36563: 50, 32217: 17, 40001: 6})
Became 3289
Left 46
-----
жаба 94 Counter({38123: 29, 1: 28, 0: 19, 22390: 18})
Became 492
Left 644
-----
клетка 151 Counter({16575: 33, 15531: 32, 24810: 29, 19864: 29, 15764: 28})
Became 2295
Left 709
-----
крыло 166 Counter({35257: 37, 31524: 33, 19801: 28, 36977: 24, 20531: 22, 0: 12, 29548: 10})
Became 1188
Left 1031
-----
купюра 88 Counter({25844: 54, 12767: 34})
Became 1318
Left 225
-----
курица 60 Counter({38375: 33, 33131: 27})
Became 1523
Left 328
-----
лавка 52 Counter({19335: 27, 38881: 25})
Became 1416
Left 738
-----
лайка 66 Counter({21481: 45, 30243: 21})
Became 322
Left 90
-----
лев 68 Counter({28022

In [31]:
# from sklearn.neighbors import KNeighborsClassifier
# model = KNeighborsClassifier(weights='uniform')
# model_name = 'RBF_KNeighborsClassifier'

# Second-stage classifier: label propagation with an RBF kernel.
from sklearn.semi_supervised import LabelPropagation
model = LabelPropagation(kernel='rbf', gamma=15, n_jobs=2)
# model_name is only a label for the output directory; it is set by hand here
# and does not match the estimator's class name (LabelPropagation).
# NOTE(review): presumably it records "KNeighborsClassifier stage followed by
# the RBF stage" - confirm before renaming, since it determines savePath.
model_name = 'KNeighborsClassifier_RBF'

In [30]:
# Stage-2 locations: the input is the output of the KNeighborsClassifier run -
# its expanded labeled files and the leftovers it stored in "Not Expanded/".
# Directories must keep the trailing slash (file names are concatenated).
labeledPath = "Data/Expanded/{}({})/KNeighborsClassifier/".format(dataset, mode)
nonlabeledPath = "Data/Expanded/{}({})/KNeighborsClassifier/Not Expanded/".format(dataset, mode)
savePath = "Data/Expanded/{}({})/{}/".format(dataset, mode, model_name)

In [40]:
%%time
# Stage 2: continue self-training with the current model. dense=True inserts
# the sparse-to-dense conversion step in learn_clf, and samples are accepted
# when their highest predicted class probability is >= 0.95.
result_func(dataset=dataset, mode=mode, wordList=wordList, 
            labeledPath=labeledPath, nonlabeledPath=nonlabeledPath,
            savePath=savePath, model=model, dense=True, t=0.95)

балка 1471 Counter({39329: 1280, 34297: 191})
Became 1528
Left 549
-----
вид 1226 Counter({38473: 537, 18983: 495, 46934: 172, 16525: 22})
Became 1310
Left 1534
-----
винт 757 Counter({39939: 360, 17916: 242, 16398: 147, 32507: 8})
Became 826
Left 1101
-----
горн 1243 Counter({30349: 1173, 1: 39, 32374: 26, 0: 5})
Became 1375
Left 244
-----
губа 3289 Counter({36563: 3259, 32217: 24, 40001: 6})
Became 3320
Left 15
-----
жаба 492 Counter({38123: 407, 22390: 32, 1: 28, 0: 25})
Became 595
Left 541
-----
клетка 2295 Counter({15531: 2061, 19864: 101, 24810: 59, 16575: 39, 15764: 35})
Became 2363
Left 641
-----
крыло 1188 Counter({19801: 956, 35257: 82, 20531: 66, 31524: 38, 36977: 24, 0: 12, 29548: 10})
Became 1271
Left 948
-----
купюра 1318 Counter({25844: 1246, 12767: 72})
Became 1360
Left 183
-----
курица 1523 Counter({33131: 1402, 38375: 121})
Became 1566
Left 285
-----
лавка 1416 Counter({38881: 1158, 19335: 258})
Became 1478
Left 676
-----
лайка 322 Counter({21481: 299, 30243: 23})
Bec