In [1]:
import pandas as pd
import gensim
import numpy as np

import warnings
warnings.filterwarnings('ignore')

In [3]:
dataset = 'bts-rnc'
mode = 'train'

wordList = ['балка', 'вид', 'винт', 'горн', 'губа', 'жаба', 'клетка',
            'крыло', 'купюра', 'курица', 'лавка', 'лайка', 'лев', 'лира',
            'мина', 'мишень', 'обед', 'оклад', 'опушка', 'полис', 'пост', 
            'поток', 'проказа', 'пропасть', 'проспект', 'пытка', 'рысь',
#             'поток', 'проказа', 'проспект', 'пытка', 'рысь',
            'среда', 'хвост', 'штамп'
           ]

In [4]:
folder = 'Data\\Original\\Lemmatized\\' + dataset + '(' + mode + ')\\'

In [5]:
def sentence_vectorize(sentence):
    """Calculate vector for a sentence.
    
    Parameters:
    
    sentence - (list) list of tokens.
    """
    sentence_vector = np.zeros(w2v.vector_size)
    word_count = 0
    
    for word in wordsList:
        try:
            sentence_vector += w2v[word]
            word_count += 1
        except:
            continue
    if word_count != 0:
        sentence_vector /= word_count
    
    return sentence_vector

def df_vectorize(path):
    """Read csv and add vector column."""
    
    semiLearnDf = pd.read_csv(path, engine='python', sep='\t', header=None, encoding="utf-8")
    
    if semiLearnDf.shape[1] == 1:
        col = {semiLearnDf.columns[0]: "text"}
    else:
        col = {semiLearnDf.columns[0]: "label", semiLearnDf.columns[1]: "text"}
        
    semiLearnDf = semiLearnDf.rename(columns=col)
    semiLearnDf['vector'] = semiLearnDf.text.str.split().apply(sentence_vectorize)
    return semiLearnDf

In [6]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score

def cv_score(clf, X, y, metric, cv=5):
    scores = cross_val_score(clf, X, y, cv=cv, scoring=metric, n_jobs=1)
    print("%s: %0.2f (+/- %0.2f)" % (metric, scores.mean(), scores.std() * 2))
    return scores.mean()

def train_model(clf, X, y):
    score = cv_score(clf, X, y, metric='f1_macro')
    clf.fit(X, y)
    return clf

In [7]:
# Исключение доменов, которые уже есть в обучении
def remove_marked(df, textsTrained):
    return df.loc[~df['text'].isin(textsTrained)].reset_index(drop=True)

# Вероятности для записей датафрейма
def get_proba(df, clf, textsTrained):
    
    # Собираем отдельный df для векторов текстов
    VectorDf = pd.DataFrame(df.vector.tolist())
    
    # Записываем уровень уверенности для каждого домена (с округлением)
    predictArray = clf.predict_proba(VectorDf)
    labels = [np.argmax(x) for x in predictArray]
    
    # Фиксируем самый вероятный класс
    df['label'] = [clf.classes_[x] for x in labels]
    df['proba'] = [max(x) for x in predictArray]
    
    return df

# Отбираем записи по порогу t
def get_confidence(semiLearnDf, df_vect, t, delta):
    
    df_vect_confidence = df_vect[df_vect['proba'] >= t].reset_index(drop=True).drop(columns='proba')
    
    # Перекидываем колонку label на первое место для будущего сложения
    cols = df_vect_confidence.columns.tolist()
    cols = cols[-1:] + cols[:-1]
    df_vect_confidence = df_vect_confidence[cols]
    
    length = len(df_vect_confidence)
    
    # Записываем, в чем уверены
    if length > delta:    
        semiLearnDf = semiLearnDf.append(df_vect_confidence, ignore_index=True)
    
    return semiLearnDf, length

In [12]:
# def second_step(word):
#     semiLearnFilepath = folder + word + '.csv'
#     semiLearnDf = df_vectorize(semiLearnFilepath)
#     textFilepath = 'Data\\Non-Annotated\\' + word + '.csv'
#     textDfUnmarked = df_vectorize(textFilepath)
    
#     semiLearnDf = semi_learn_w2v(semiLearnDf, textDfUnmarked)
#     return semiLearnDf

In [39]:
def semi_learn_w2v(semiLearnDf, textDfUnmarked):

    while True:
        # Обучаем классификатор на размеченном тексте
        trainVectorDf = pd.DataFrame(semiLearnDf.vector.tolist())
        model = RandomForestClassifier(criterion='entropy', n_estimators=300, 
                                     max_depth=5, min_samples_leaf=5, min_samples_split=3)

        clf = train_model(model, trainVectorDf, semiLearnDf.label)

        # Пересобираем набор размеченных записей
        textsTrained = semiLearnDf.text.tolist()

        # Находим вероятности для неразмеченных записей
        df_vect = get_proba(textDfUnmarked, clf, textsTrained)

        # Записываем уверенные записи и удаляем их из неразмеченных
        semiLearnDf, delta = get_confidence(semiLearnDf, df_vect, 0.90, 500)
        
        print('Delta =', delta)
        
        if delta <= 500:
            break
        else:
            textDfUnmarked = remove_marked(textDfUnmarked, textsTrained)
            
    return semiLearnDf

In [7]:
def write_word(word, semiLearnDf):
    filepath = 'Data\\Expanded w2v\\' + '\\Original\\' + word + '.csv'
    resultDf = semiLearnDf.drop(columns='vector')
    resultDf.to_csv(filepath, sep='\t', encoding='utf-8', index=False)

In [9]:
import urllib

filename = "http://panchenko.me/data/dsl-backup/w2v-ru/all.norm-sz100-w10-cb0-it1-min100.w2v"
urllib.urlretrieve (filename, "all.norm-sz100-w10-cb0-it1-min100.w2v")

AttributeError: module 'urllib' has no attribute 'urlretrieve'

In [11]:
w2v_fpath = "all.norm-sz100-w10-cb0-it1-min100.w2v"
w2v = gensim.models.KeyedVectors.load_word2vec_format(w2v_fpath, binary=True, unicode_errors='ignore')
w2v.init_sims(replace=True)

In [42]:
%%time

for word in wordList:
    print(word)
    semiLearnDf = second_step(word)
    write_word(word, semiLearnDf)
    print('----------------------------------------------------')

проспект
f1_macro: 0.27 (+/- 0.10)
Delta = 480
----------------------------------------------------
пытка
f1_macro: 0.50 (+/- 0.00)
Delta = 1331
f1_macro: 0.50 (+/- 0.00)
Delta = 1411
f1_macro: 0.50 (+/- 0.00)
Delta = 98
----------------------------------------------------
рысь
f1_macro: 0.77 (+/- 0.14)
Delta = 721
f1_macro: 0.76 (+/- 0.14)
Delta = 814
f1_macro: 0.76 (+/- 0.11)
Delta = 155
----------------------------------------------------
поток
f1_macro: 0.61 (+/- 0.10)
Delta = 715
f1_macro: 0.60 (+/- 0.09)
Delta = 812
f1_macro: 0.61 (+/- 0.08)
Delta = 132
----------------------------------------------------
проказа
f1_macro: 0.52 (+/- 0.11)
Delta = 0
----------------------------------------------------
проспект
f1_macro: 0.24 (+/- 0.00)
Delta = 458
----------------------------------------------------
пытка
f1_macro: 0.50 (+/- 0.00)
Delta = 1329
f1_macro: 0.50 (+/- 0.00)
Delta = 1414
f1_macro: 0.50 (+/- 0.00)
Delta = 99
----------------------------------------------------
рысь
f1_ma