#### Алгоритм "бустинга": берём товары, на которых FastText ошибается или предсказывает неуверенно, обучаем на них иерархический классификатор, а затем пытаемся состыковать два алгоритма так, чтобы там, где FastText предсказывает неуверенно, бралось предсказание иерархического классификатора.

In [100]:
import os
from pathlib import Path
import tqdm
import fasttext
import numpy as np 
import pandas as pd
import csv
from gensim.utils import simple_preprocess

from HierarchicalLibrary import Classifier, CategoryTree, TextProcessor
from HierarchicalLibrary.Encoders import LdaEncoder, NavecEncoder, FasttextEncoder, BertEncoder

Подготавливаем полный, тренировочный и валидационный датасеты:
перемешиваем данные в фрейме,
удаляем колонки рейтинга и кол-ва отзывов,
корректируем типы данных колонок,
заполняем пропущенные значения,
текст из колонок 'title', 'short_description' и 'name_value_characteristics' объединяем в колонку "Document"; чтобы увеличить вес начала заголовка, в конец дополнительно дописываем первые два и первые три слова из 'title' (функция word_pyramid).

In [6]:
# Single seed used both for NumPy's global RNG (set_seeds) and for the row shuffle in prepare_data.
SEED = 1

def word_pyramid(string: str, min_n_words: int, max_n_words: int) -> str:
    """Repeat the leading words of *string* to increase their weight.

    For every ``n`` from ``min_n_words`` to ``max_n_words`` (inclusive) the
    first ``n`` space-separated tokens of *string* are taken; all these
    prefixes are then concatenated into a single space-joined string.

    Example: ``word_pyramid('a b c d', 2, 3)`` returns ``'a b a b c'``.

    Args:
        string: Source text (here: a product title), split on single spaces.
        min_n_words: Length of the shortest prefix.
        max_n_words: Length of the longest prefix (inclusive).

    Returns:
        The concatenated prefixes as one string; ``''`` when
        ``min_n_words > max_n_words``.
    """
    words = string.split(' ')
    prefixes = (words[:n_words] for n_words in range(min_n_words, max_n_words + 1))
    return ' '.join(word for prefix in prefixes for word in prefix)

def prepare_data(full_train_data: pd.DataFrame, seed: int, valid_size: int) -> tuple:
    """Shuffle the raw frame, build the ``Document`` column and split off validation rows.

    Steps, in order: shuffle all rows with ``seed``; drop ``rating`` and
    ``feedback_quantity``; cast the text columns to the pandas ``string``
    dtype and replace missing values with ``''``; concatenate ``title``,
    ``short_description``, ``name_value_characteristics`` and
    ``word_pyramid(title, 2, 3)`` into ``Document``; drop the three source
    text columns. The input frame itself is not modified (work is done on a
    shuffled copy).

    Args:
        full_train_data: Raw data containing at least the columns ``rating``,
            ``feedback_quantity``, ``title``, ``short_description`` and
            ``name_value_characteristics``.
        seed: ``random_state`` for the shuffle.
        valid_size: Number of trailing shuffled rows that form the validation
            set. ``0`` yields an empty validation frame.

    Returns:
        ``(data, data_valid)``, each with a fresh ``0..n-1`` index.
    """
    data_full = full_train_data.sample(frac=1, random_state=seed).copy()
    data_full.drop(['rating', 'feedback_quantity'], axis=1, inplace=True)
    data_full.title = data_full.title.astype('string')
    data_full.short_description = data_full.short_description.astype('string')
    data_full.fillna(value='', inplace=True)
    data_full.name_value_characteristics = data_full.name_value_characteristics.astype('string')

    # Title + description + characteristics, followed by the first 2 and the
    # first 3 title words once more (extra weight for the start of the title).
    documents = [
        f'{title} {description} {characteristics} {word_pyramid(title, 2, 3)}'
        for title, description, characteristics in zip(
            data_full['title'],
            data_full['short_description'],
            data_full['name_value_characteristics'],
        )
    ]
    data_full = data_full.assign(Document=documents)
    data_full.drop(['title', 'short_description', 'name_value_characteristics'], axis=1, inplace=True)
    data_full.Document = data_full.Document.astype('string')

    # `-0 == 0`, so slicing with `-valid_size` directly would return an empty
    # training frame and put every row into validation when valid_size is 0.
    split_at = -valid_size if valid_size else len(data_full)
    data = data_full.iloc[:split_at].reset_index(drop=True)
    data_valid = data_full.iloc[split_at:].reset_index(drop=True)
    return data, data_valid

def set_seeds(seed: int):
    """Seed NumPy's global random number generator with *seed*.

    Only NumPy is seeded here; other sources of randomness are untouched.
    """
    np.random.seed(seed=seed)

def predict_proba(documents: list) -> tuple:
    """Return the top-1 FastText category id and its probability for each document.

    Uses the module-level ``model`` (the trained FastText classifier).

    Args:
        documents: List of preprocessed document strings.

    Returns:
        ``(leaf_ids, probabilities)``: two NumPy arrays with one entry per
        document. ``leaf_ids`` holds the integer category ids parsed from the
        ``'__label__<id>'`` labels; ``probabilities`` holds the probability
        of that top label. Both are empty for an empty ``documents`` list.
    """
    prefix_len = len('__label__')
    # assumes model.predict returns (labels, probabilities) with one top-k
    # sequence per document, as the original indexing implied
    labels, probas = model.predict(documents, k=1)
    leaf_ids = [int(top_labels[0][prefix_len:]) for top_labels in labels]
    # Taking element 0 per document (instead of `np.array(probas)[:, 0]`)
    # also works for an empty batch, where the 2-D index would fail.
    top_probas = [doc_probas[0] for doc_probas in probas]
    return np.array(leaf_ids, dtype=int), np.array(top_probas)

def predict(document):
    """Return the integer category id that the global ``model`` ranks first for *document*."""
    labels = model.predict(document)[0]
    best_label = labels[0]
    # Labels look like '__label__<id>'; strip the prefix before converting.
    return int(best_label[len('__label__'):])

In [7]:
# Load the raw training table from train.parquet in the working directory.
full_train_data = pd.read_parquet('train.parquet')

# Seed NumPy, then shuffle and split: the last 4000 shuffled rows become the validation set.
set_seeds(SEED)
data, data_valid = prepare_data(full_train_data, seed=SEED, valid_size=4000)

Преобразуем данные в формат, принимаемый FastText.

In [8]:
# Apply the same two conversions to the training and the validation frame:
# tokenize/normalize the text with gensim's simple_preprocess and re-join it
# with spaces, then turn every category id into a '__label__<id>' string.
for frame in (data, data_valid):
    frame['Document'] = frame['Document'].apply(lambda text: ' '.join(simple_preprocess(text)))
    frame['category_id'] = frame['category_id'].apply(lambda category: '__label__' + str(category))

FastText принимает данные в виде текстовых файлов, поэтому сохраняем данные на диск.

In [103]:
# Write both splits as plain-text files (one "<document> <label>" line per
# row) that are later passed to the FastText train/test calls.
# QUOTE_NONE together with a space as escape character keeps pandas from
# quoting documents that contain the space separator.
to_csv_options = dict(index=False,
                      sep=' ',
                      header=None,
                      quoting=csv.QUOTE_NONE,
                      quotechar="",
                      escapechar=" ")

path_train = os.path.join(Path(".").parent, 'FastTextBoost', 'train_fasttext.txt')
data[['Document', 'category_id']].to_csv(path_train, **to_csv_options)

path_test = os.path.join(Path(".").parent, 'FastTextBoost', 'test_fasttext.txt')
data_valid[['Document', 'category_id']].to_csv(path_test, **to_csv_options)


Обучаем модель.

In [98]:
# First FastText model: its errors and low-confidence predictions on the
# training set later select the rows for the hierarchical classifier.
# Values in square brackets are the defaults quoted in the original comments.
fasttext_params = {
    'lr': 0.5,        # learning rate [0.1]
    'dim': 48,        # size of word vectors [100]
    'ws': 5,          # size of the context window [5]
    'epoch': 40,      # number of epochs [5]
    'neg': 250,       # NOTE(review): number of negatives sampled; presumably only used with a sampling loss — confirm
    'minn': 2,        # shortest character n-gram
    'maxn': 6,        # longest character n-gram
    'wordNgrams': 0,  # NOTE(review): 0 word n-grams looks unusual; the second model uses 3 — confirm intent
}
model = fasttext.train_supervised(path_train, **fasttext_params)

Read 4M words
Number of words:  87455
Number of labels: 1231
Progress: 100.0% words/sec/thread:   53616 lr:  0.000000 avg.loss:  0.461916 ETA:   0h 0m 0s  0.4% words/sec/thread:   52548 lr:  0.497938 avg.loss:  9.178720 ETA:   0h12m30s 0.454719 avg.loss:  1.635319 ETA:   0h11m13s 11.7% words/sec/thread:   53622 lr:  0.441376 avg.loss:  1.424610 ETA:   0h10m52s 12.5% words/sec/thread:   53678 lr:  0.437577 avg.loss:  1.377536 ETA:   0h10m46s10m30s  53546 lr:  0.377913 avg.loss:  0.965925 ETA:   0h 9m19s lr:  0.372407 avg.loss:  0.944517 ETA:   0h 9m10s  0h 9m 8s 31.6% words/sec/thread:   53701 lr:  0.342002 avg.loss:  0.846909 ETA:   0h 8m24s 32.1% words/sec/thread:   53711 lr:  0.339323 avg.loss:  0.839646 ETA:   0h 8m20s  53800 lr:  0.330121 avg.loss:  0.816017 ETA:   0h 8m 6s 38.0% words/sec/thread:   53807 lr:  0.309783 avg.loss:  0.770510 ETA:   0h 7m36sm35s 52.1% words/sec/thread:   53897 lr:  0.239713 avg.loss:  0.655485 ETA:   0h 5m52s 0.186738 avg.loss:  0.594511 ETA:   0h 4m34

Проверяем качество классификации:

In [99]:
# Evaluate on the held-out file written to path_test.
# model.test returns three values; the first is discarded, the other two are
# used as precision and recall.
_, precision, recall = model.test(path_test)                      
# F1 as the harmonic mean of precision and recall.
leaf_F1 = (2*precision*recall) / (precision+recall)

In [101]:
print(f'Leaf F1={leaf_F1:.4f}')  # 0.8640 is from an earlier run; the output recorded below is 0.8410

Leaf F1=0.8410


In [104]:
# Same metric on the training file (path_train), to compare against the
# held-out score and see how strongly the model fits its training data.
_, precision, recall = model.test(path_train) 
leaf_F1 = (2*precision*recall) / (precision+recall)

In [97]:
print(f'Leaf F1={leaf_F1:.4f}')  # 0.9682 is from an earlier run; the output recorded below is 0.9339

Leaf F1=0.9339


Для того чтобы проверить качество иерархической классификации, инициализируем и заполняем класс дерева категорий.

In [105]:
# Category hierarchy table; the first CSV column becomes the index
# (presumably the category id — confirm). 'parent_id' and 'title' columns are read from it below.
cat_tree_df = pd.read_csv('categories_tree.csv', index_col=0)

In [106]:
# Build the category tree from the hierarchy table, then attach every item of
# the full training table to its category (item id column 'id', category column 'category_id').
cat_tree = CategoryTree()
cat_tree.add_nodes_from_df(cat_tree_df, parent_id_col='parent_id', title_col='title')
cat_tree.add_goods_from_df(full_train_data, category_id_col='category_id', good_id_col='id')

Предсказываем категории в тестовом сете.

In [107]:
# Work on a copy so data_valid keeps its '__label__<id>' strings.
data_valid_test = data_valid.copy()
# Strip the '__label__' prefix to recover integer category ids.
data_valid_test['category_id'] = (
    data_valid_test['category_id']
    .apply(lambda label: label[len('__label__'):])
    .astype('int')
)
# Predict the top-1 category for every document, one row at a time.
data_valid_test['predicted_id'] = (
    data_valid_test['Document'].astype('string').apply(predict).astype('int')
)

Подготавливаем данные для расчета иерархической метрики.

In [108]:
# Plain Python lists of true and predicted category ids for the tree-based metric.
test_target = data_valid_test.category_id.tolist()
pred_leafs = data_valid_test.predicted_id.tolist()

Расчет иерархической F1-меры. 

In [109]:
# Trailing numbers are from an earlier run; the outputs recorded below are 0.9029 and 0.9359.
print(f'hF1={cat_tree.hF1_score(test_target, pred_leafs):.4f}') #0.9187
print(f'hF1_01={cat_tree.hF1_score_01(test_target, pred_leafs):.4f}') #0.9463

hF1=0.9029
hF1_01=0.9359


Теперь то же самое для train датасета (оцениваем на первых 10 000 строк).

In [110]:
# Hierarchical metrics on training data: only the first 10000 training rows are scored.
data_test = data.iloc[:10000].copy()
data_test['category_id'] = (
    data_test['category_id']
    .apply(lambda label: label[len('__label__'):])
    .astype('int')
)
data_test['predicted_id'] = (
    data_test['Document'].astype('string').apply(predict).astype('int')
)

test_target = data_test['category_id'].tolist()
pred_leafs = data_test['predicted_id'].tolist()

# The outputs recorded below for this cell are hF1=0.9685 and hF1_01=0.9792.
print(f'hF1={cat_tree.hF1_score(test_target, pred_leafs):.4f}')
print(f'hF1_01={cat_tree.hF1_score_01(test_target, pred_leafs):.4f}')

hF1=0.9685
hF1_01=0.9792


### Предсказание ошибок для бустинга

In [124]:
# Score the whole training set with FastText and keep, per row, the predicted
# category and the probability assigned to it.
boost_data = data.copy()
boost_data['category_id'] = (
    boost_data['category_id']
    .apply(lambda label: label[len('__label__'):])
    .astype('int')
)
documents = boost_data['Document'].tolist()
pred_ids, pred_probas = predict_proba(documents)
boost_data = boost_data.assign(fastt_leaf=pred_ids, fastt_proba=pred_probas)

In [125]:
# Keep only the "hard" training rows for the hierarchical classifier:
# those FastText got wrong, or got right with probability below 0.9.
misclassified = boost_data.category_id != boost_data.fastt_leaf
low_confidence = boost_data.fastt_proba < 0.9
# .copy() makes the subset an independent frame, so the column assignment in
# the following cell writes to it directly instead of to a slice of the
# original (which pandas reports as SettingWithCopyWarning).
boost_data = boost_data[misclassified | low_confidence].copy()

In [126]:
# NOTE(review): boost_data was copied from `data`, whose Document column was
# already passed through simple_preprocess, so this looks like a repeat of
# the same normalization — confirm it is still needed.
boost_data.Document = boost_data.Document.apply(lambda x: ' '.join(simple_preprocess(x)))

In [127]:
# Display the selected subset (true category, FastText prediction and its probability).
boost_data

Unnamed: 0,id,category_id,Document,fastt_leaf,fastt_proba
1,304936,12917,силиконовый дорожный контейнер футляр чехол дл...,13201,0.330415
5,708294,13982,топ женский открытым декольте базовый топ женс...,13982,0.722982
9,1429067,13451,подарочный набор из предметов подарочный набор...,13451,0.612714
10,897670,13302,подарочный набор на марта для приготовления бь...,13302,0.452063
11,539414,13171,брюки мужские трикотажные брюки прямого кроя и...,13171,0.884519
...,...,...,...,...,...
279434,1187927,11745,комплект бижутерии для девочек ожерелье колечк...,11745,0.768189
279441,818112,12078,резинка пружинка для волос шт резинка пружинка...,12078,0.674274
279445,381587,12523,рубашка коротким рукавом мужская прямая отличн...,12523,0.821654
279448,1002594,12476,цепочка на шею см красивые легкие очень удобны...,12476,0.411630


### Обучение иерархического классификатора

In [128]:
# Lemmatize the Document column of the boosting subset; items are keyed by the 'id' column.
# Slow step: the recorded run took about 7 minutes for 75280 rows.
text_processor = TextProcessor()
text_processor.lemmatize_data(boost_data, document_col='Document', id_col='id')

Lemmatize: 100%|██████████| 75280/75280 [07:10<00:00, 174.83it/s]


In [129]:
# Persist the lemmatized data under the FastTextBoost directory so the slow lemmatization need not be repeated.
text_processor.save_lemms_data('75280_boost_set_lemm', directory='FastTextBoost')

In [130]:
# Reload the saved lemmatized data (presumably the entry point when resuming without re-lemmatizing — confirm).
text_processor.load_lemms_data('75280_boost_set_lemm', directory='FastTextBoost')

In [131]:
# Pre-trained LDA encoder loaded from the FastTextBoost directory
# (presumably 128 topics, judging by the model name — confirm).
lda_encoder = LdaEncoder()
lda_encoder.load_model('full_set_model_128', directory='FastTextBoost')

In [140]:
# Navec word-vector encoder: loads the pre-trained Navec archive plus a saved PCA;
# with dim=128 the encoder outputs 128-dimensional vectors (checked in the next cell).
# NOTE(review): the meaning of alpha=0.2 is defined inside NavecEncoder — confirm.
nevec_encoder = NavecEncoder(alpha=0.2, dim=128)
nevec_encoder.load_model('navec_hudlit_v1_12B_500K_300d_100q.tar', directory='FastTextBoost')
nevec_encoder.load_pca('PCA_navec.pickle', directory='FastTextBoost')

In [141]:
# Sanity check of the Navec encoder's output width on a one-token document (recorded output: 128).
nevec_encoder.transform([['foo']]).shape[1]

128

In [160]:
# FastText-based encoder loaded from a saved model in the FastTextBoost directory.
fasttext_encoder = FasttextEncoder()
fasttext_encoder.load_model('fasttext_model_300_s', directory='FastTextBoost')



In [161]:
# Sanity check of the FastText encoder's output width (recorded output: 300).
fasttext_encoder.transform([['foo']]).shape[1]

300

In [162]:
# Encoders used together for every document embedding: LDA, Navec and FastText.
encoders=[lda_encoder, nevec_encoder, fasttext_encoder]

In [163]:
# Embeddings for the lemmatized boosting items, built with all three encoders
# (presumably keyed by item id — confirm against TextProcessor).
embeddings_dict = text_processor.make_embeddings_dict(encoders=encoders)

In [164]:
# Shape of one embedding (recorded output: (556,)); presumably the concatenation
# 128 (LDA) + 128 (Navec) + 300 (FastText) — confirm.
embeddings_dict[next(iter(embeddings_dict))].shape

(556,)

In [165]:
# Rebuild the category tree, this time attaching only the items of the
# boosting subset. This rebinds `cat_tree`, so the tree populated from the
# full training table is no longer available under that name.
cat_tree = CategoryTree()
cat_tree.add_nodes_from_df(cat_tree_df, parent_id_col='parent_id', title_col='title')
cat_tree.add_goods_from_df(boost_data, category_id_col='category_id', good_id_col='id')

In [166]:
# Push the item embeddings into the tree, then mix in embeddings computed
# from category titles with the same encoders.
# NOTE(review): how weight=25 is applied is defined inside CategoryTree — confirm.
cat_tree.update_embeddings(embeddings_dict)
cat_tree.mix_in_description_embs(lambda titles: text_processor.get_embeddings(titles, encoders=encoders), weight=25)

In [167]:
# Classifier object handed to the tree for fitting and for leaf selection
# (tol / max_iter are presumably solver tolerance and iteration cap — confirm).
classifier = Classifier(tol=0.003, max_iter=3000)

In [None]:
# Fit the tree's local weights with the classifier on the boosting embeddings
# and save the fitted tree under the FastTextBoost directory.
# NOTE(review): C and reg_count_power look like regularization settings — confirm in CategoryTree.
cat_tree.fit_local_weights(classifier, embeddings_dict, C=0.05, reg_count_power=0.5)
cat_tree.save_tree('75280_boost_set_tree.pickle', directory='FastTextBoost')



In [None]:
# Predict a leaf category and its probability with the hierarchical
# classifier for the validation rows in [begin_exampl, end_exampl).
begin_exampl = 0
end_exampl = 4000
valid_documents = data_valid.Document.tolist()[begin_exampl:end_exampl]
valid_ids = data_valid.id.tolist()[begin_exampl:end_exampl]
# Validation labels are stored as '__label__<id>' strings; recover the integer ids.
valid_target = data_valid.category_id.apply(lambda text: text[9:]).astype('int').tolist()[begin_exampl:end_exampl]
# Embeddings come from the TextProcessor instance (`text_processor`), the same
# call that is used for category titles; the previously referenced name
# `encoder` is not defined anywhere in this notebook and raised NameError.
embs_valid = text_processor.get_embeddings(valid_documents, encoders=encoders)

hier_leafs_valid = []
hier_probas_valid = []
# Index-based access is kept because the container type returned by
# get_embeddings is not visible here.
for i in tqdm.tqdm(range(len(embs_valid)), total=len(embs_valid)):
    leaf, proba = cat_tree.choose_leaf_proba(embs_valid[i], classifier)
    hier_leafs_valid.append(leaf)
    hier_probas_valid.append(proba)

In [None]:
# Hierarchical F1 of the hierarchical classifier alone on the validation rows.
print(f'hF1={cat_tree.hF1_score(valid_target, hier_leafs_valid):.3f}')  # earlier runs: 0.721 / 0.71

In [None]:
# Attach the hierarchical predictions to the validation rows they were made for.
# The prediction lists cover only rows [begin_exampl, end_exampl), so the same
# window is taken from data_valid; assigning them to the full frame fails with
# a length mismatch as soon as the window is not the whole validation set.
hier_valid_data = data_valid.iloc[begin_exampl:end_exampl].copy()
hier_valid_data['hier_leaf'] = hier_leafs_valid
hier_valid_data['hier_proba'] = hier_probas_valid

In [None]:
# Convert the '__label__<id>' strings back to integer category ids.
hier_valid_data['category_id'] = (
    hier_valid_data['category_id']
    .apply(lambda label: label[len('__label__'):])
    .astype('int')
)

In [None]:
# Save the hierarchical predictions as CSV (the file name has no extension);
# the stacking section reads this file back.
hier_path = os.path.join(Path(".").parent, 'FastTextBoost', 'hier_valid_data_for_stacking')
hier_valid_data.to_csv(hier_path)

Снова обучаем FastText с гиперпараметрами, обеспечивающими лучшее обучение.

In [124]:
# Training the fastText classifier
model = fasttext.train_supervised(path_train,
                                  lr=0.25,                # learning rate [0.1]
                                  dim=100,               # size of word vectors [100]
                                  ws=4,                # size of the context window [5]
                                  epoch=30,             # number of epochs [5]
                                  neg=5,               # number of negatives sampled [5]
                                  wordNgrams=3) 

Read 4M words
Number of words:  87455
Number of labels: 1231
Progress: 100.0% words/sec/thread:   33411 lr:  0.000000 avg.loss:  0.332084 ETA:   0h 0m 0s 0.240870 avg.loss:  3.849155 ETA:   0h14m18s 2.704057 ETA:   0h13m49s 12.5% words/sec/thread:   33572 lr:  0.218804 avg.loss:  1.700464 ETA:   0h12m54s 1.507039 ETA:   0h12m33sm53ss 30.8% words/sec/thread:   33572 lr:  0.172961 avg.loss:  0.863627 ETA:   0h10m12s words/sec/thread:   33572 lr:  0.172565 avg.loss:  0.859984 ETA:   0h10m11s 0.826417 ETA:   0h 9m56s14s lr:  0.151209 avg.loss:  0.709268 ETA:   0h 8m55s 45.2% words/sec/thread:   33581 lr:  0.136944 avg.loss:  0.636838 ETA:   0h 8m 4s 50.8% words/sec/thread:   33584 lr:  0.123031 avg.loss:  0.579581 ETA:   0h 7m15s  33588 lr:  0.118965 avg.loss:  0.565370 ETA:   0h 7m 1sm18sh 6m11s lr:  0.098407 avg.loss:  0.502883 ETA:   0h 5m48s 0.071615 avg.loss:  0.440273 ETA:   0h 4m13s 77.7% words/sec/thread:   33533 lr:  0.055643 avg.loss:  0.410226 ETA:   0h 3m17s 79.2% words/sec/thr

In [125]:
# Evaluate the retrained model on the held-out file (path_test); the first
# value returned by model.test is discarded.
_, precision, recall = model.test(path_test)                      
# F1 as the harmonic mean of precision and recall.
leaf_F1 = (2*precision*recall) / (precision+recall)

In [126]:
print(f'Test Leaf F1={leaf_F1:.4f}')  # 0.8640 is from an earlier run; the output recorded below shows 0.8595

Leaf F1=0.8595


In [162]:
# Same metric on the training file (path_train) for the retrained model.
_, precision, recall = model.test(path_train) 
leaf_F1 = (2*precision*recall) / (precision+recall)
print(f'Train Leaf F1={leaf_F1:.4f}')  # 0.9682 is from an earlier run; the output recorded below is 0.9851

Train Leaf F1=0.9851


In [161]:
# Hierarchical metrics of the retrained FastText model on the validation set.
data_valid_test = data_valid.copy()
data_valid_test['category_id'] = (
    data_valid_test['category_id']
    .apply(lambda label: label[len('__label__'):])
    .astype('int')
)
data_valid_test['predicted_id'] = (
    data_valid_test['Document'].astype('string').apply(predict).astype('int')
)
test_target = data_valid_test['category_id'].tolist()
pred_leafs = data_valid_test['predicted_id'].tolist()
print(f'hF1={cat_tree.hF1_score(test_target, pred_leafs):.4f}')
print(f'hF1_01={cat_tree.hF1_score_01(test_target, pred_leafs):.4f}')

hF1=0.9154
hF1_01=0.9441


### Стекинг двух алгоритмов

In [152]:
# Read back the saved hierarchical predictions; the first CSV column (the saved row index) becomes the index.
hyer_test_data = pd.read_csv(hier_path, index_col=0)

In [153]:
# Display the hierarchical predictions (hier_leaf, hier_proba) next to the true category.
hyer_test_data

Unnamed: 0,id,category_id,Document,hier_leaf,hier_proba
0,1262422,12456,сорочка цвета подарок на марта сорочка размер ...,13205,0.264725
1,526277,13061,серьги серьги серьги,14076,0.226782
2,113365,12980,беспроводные наушники sports headset bluetooth...,12980,0.551842
3,642934,11937,чехол накладка принтом iphone xs max xiaomi mi...,13408,0.569301
4,448921,13066,дисплеи micromax canvas fire дисплеи micromax ...,12422,0.012839
...,...,...,...,...,...
3995,982751,11567,цепь чокер женская цепь чокер женская цепь чок...,12476,0.316677
3996,747972,12751,школьный бант школьный бант школьный бант школ...,13260,0.029716
3997,832637,12454,наклейка для дизайна ногтей lucky rose тема ли...,12454,0.749909
3998,1378353,11745,ключик замочек ключик ключик замочек,12727,0.009855


In [154]:
# FastText top-1 prediction and probability for every validation row,
# produced by the retrained model.
fastt_test_data = data_valid.copy()
fastt_test_data['category_id'] = (
    fastt_test_data['category_id']
    .apply(lambda label: label[len('__label__'):])
    .astype('int')
)
documents = fastt_test_data['Document'].tolist()
pred_ids, pred_probas = predict_proba(documents)
fastt_test_data = fastt_test_data.assign(fastt_leaf=pred_ids, fastt_proba=pred_probas)

In [155]:
# Align both models' predictions row by row (inner join on the index);
# columns coming from the hierarchical frame get the '_h' suffix on name clashes.
stacked_test = fastt_test_data.join(hyer_test_data, how='inner', rsuffix='_h')
# Two ways of comparing the models' confidence: ratio and difference of
# the FastText and hierarchical probabilities.
stacked_test['prob_ratio'] = stacked_test['fastt_proba'].div(stacked_test['hier_proba'])
stacked_test['prob_diff'] = stacked_test['fastt_proba'].sub(stacked_test['hier_proba'])

Проверим, правильно ли соединились фреймы:

In [156]:
# Rows whose item id agrees on both sides of the join; if the frames were
# aligned correctly this selects every row of stacked_test.
stacked_test[stacked_test.id==stacked_test.id_h]

Unnamed: 0,id,category_id,Document,fastt_leaf,fastt_proba,id_h,category_id_h,Document_h,hier_leaf,hier_proba,prob_ratio,prob_diff
0,1262422,12456,сорочка цвета подарок на марта сорочка размер ...,12456,0.996993,1262422,12456,сорочка цвета подарок на марта сорочка размер ...,13205,0.264725,3.766151,0.732268
1,526277,13061,серьги серьги серьги,13061,0.968346,526277,13061,серьги серьги серьги,14076,0.226782,4.269942,0.741564
2,113365,12980,беспроводные наушники sports headset bluetooth...,12980,0.999829,113365,12980,беспроводные наушники sports headset bluetooth...,12980,0.551842,1.811804,0.447987
3,642934,11937,чехол накладка принтом iphone xs max xiaomi mi...,11937,0.999569,642934,11937,чехол накладка принтом iphone xs max xiaomi mi...,13408,0.569301,1.755781,0.430267
4,448921,13066,дисплеи micromax canvas fire дисплеи micromax ...,13066,0.997870,448921,13066,дисплеи micromax canvas fire дисплеи micromax ...,12422,0.012839,77.723119,0.985032
...,...,...,...,...,...,...,...,...,...,...,...,...
3995,982751,11567,цепь чокер женская цепь чокер женская цепь чок...,12476,0.501856,982751,11567,цепь чокер женская цепь чокер женская цепь чок...,12476,0.316677,1.584760,0.185180
3996,747972,12751,школьный бант школьный бант школьный бант школ...,12751,0.999991,747972,12751,школьный бант школьный бант школьный бант школ...,13260,0.029716,33.651286,0.970275
3997,832637,12454,наклейка для дизайна ногтей lucky rose тема ли...,12454,0.999843,832637,12454,наклейка для дизайна ногтей lucky rose тема ли...,12454,0.749909,1.333285,0.249933
3998,1378353,11745,ключик замочек ключик ключик замочек,12727,0.069996,1378353,11745,ключик замочек ключик ключик замочек,12727,0.009855,7.102964,0.060142


In [160]:
# Stacking rule 1: take the FastText leaf when
# (fastt_proba - hier_proba) > threshold, otherwise the hierarchical leaf.
# Very low thresholds therefore reproduce plain FastText.
# The true labels do not change between thresholds, so they are extracted once.
true_leafs = stacked_test.category_id.tolist()
for treshold in [-3, -1.5, -1, -0.9, -0.8, -0.7, -0.6, -0.5, -0.4, -0.3, -0.25, -0.2, -0.13, -0.095, -0.05, 0, 0.01, 0.02, 0.03, 0.05, 0.07, 0.09, 0.11]:
    stack_pred = [
        f_leaf if diff > treshold else h_leaf
        for diff, f_leaf, h_leaf in zip(stacked_test['prob_diff'], stacked_test['fastt_leaf'], stacked_test['hier_leaf'])
    ]
    # Three decimals: with two, the threshold -0.095 was printed as -0.10.
    print(f'treshold={treshold:.3f}')
    print(f'hF1={cat_tree.hF1_score(true_leafs, stack_pred):.4f}\n')
    

treshold=-3.00
hF1=0.9154

treshold=-1.50
hF1=0.9154

treshold=-1.00
hF1=0.9154

treshold=-0.90
hF1=0.9154

treshold=-0.80
hF1=0.9154

treshold=-0.70
hF1=0.9154

treshold=-0.60
hF1=0.9154

treshold=-0.50
hF1=0.9152

treshold=-0.40
hF1=0.9149

treshold=-0.30
hF1=0.9146

treshold=-0.25
hF1=0.9148

treshold=-0.20
hF1=0.9146

treshold=-0.13
hF1=0.9148

treshold=-0.10
hF1=0.9153

treshold=-0.05
hF1=0.9153

treshold=0.00
hF1=0.9137

treshold=0.01
hF1=0.9134

treshold=0.02
hF1=0.9128

treshold=0.03
hF1=0.9114

treshold=0.05
hF1=0.9106

treshold=0.07
hF1=0.9100

treshold=0.09
hF1=0.9092

treshold=0.11
hF1=0.9074



In [158]:
# Stacking rule 2: take the FastText leaf when
# (fastt_proba / hier_proba) > threshold, otherwise the hierarchical leaf.
for treshold in [0, 0.1, 0.2, 0.3, 0.4, 0.45, 0.5, 0.55, 0.65, 1, 2, 3]:
    stack_pred = []
    for ratio, f_leaf, h_leaf in zip(stacked_test['prob_ratio'], stacked_test['fastt_leaf'], stacked_test['hier_leaf']):
        if ratio > treshold:
            stack_pred.append(f_leaf)
        else:
            stack_pred.append(h_leaf)
    stacked_hf1 = cat_tree.hF1_score(stacked_test.category_id.tolist(), stack_pred)
    print(f'treshold={treshold:.2f}')
    print(f'hF1={stacked_hf1:.4f}\n')

treshold=0.00
hF1=0.9154

treshold=0.10
hF1=0.9154

treshold=0.20
hF1=0.9156

treshold=0.30
hF1=0.9153

treshold=0.40
hF1=0.9147

treshold=0.45
hF1=0.9147

treshold=0.50
hF1=0.9148

treshold=0.55
hF1=0.9146

treshold=0.65
hF1=0.9153

treshold=1.00
hF1=0.9137

treshold=2.00
hF1=0.8623

treshold=3.00
hF1=0.8317

