# Libraries

In [1]:
import json
import pandas as pd

from nltk.tokenize import RegexpTokenizer

import gc
from thefuzz import fuzz
import numpy as np
from tqdm.auto import tqdm

from catboost import CatBoostClassifier, Pool

import warnings

In [2]:
pd.set_option('display.max_columns', 256)
warnings.filterwarnings('ignore')

# Data

In [3]:
train_pairs = pd.read_parquet('./datasets/train_pairs_w_target.parquet')
train_pairs['target'] = train_pairs['target'].astype(int)
rtrain_pairs = train_pairs.copy()
rtrain_pairs.variantid1, rtrain_pairs.variantid2 = rtrain_pairs.variantid2, rtrain_pairs.variantid1
train_pairs

Unnamed: 0,target,variantid1,variantid2
0,0,51197862,51198054
1,1,53062686,536165289
2,1,53602615,587809782
3,1,53888651,89598677
4,0,56930698,551526166
...,...,...,...
306535,0,817327230,822083612
306536,0,817560551,818069912
306537,0,817854719,817857267
306538,0,820036017,820037019


In [4]:
test_pairs = pd.read_parquet('./datasets/test_pairs_wo_target.parquet')
rtest_pairs = test_pairs.copy()
rtest_pairs.variantid1, rtest_pairs.variantid2 = rtest_pairs.variantid2, rtest_pairs.variantid1
test_pairs

Unnamed: 0,variantid1,variantid2
0,52076340,290590137
1,64525522,204128919
2,77243372,479860557
3,86065820,540678372
4,91566575,258840506
...,...,...
18079,666998614,667074522
18080,670036240,670048449
18081,670284509,684323809
18082,692172005,704805270


In [5]:
train_data = pd.read_parquet('./datasets/train_data.parquet').set_index('variantid')
train_data['characteristic_attributes_mapping'] = train_data['characteristic_attributes_mapping'].fillna('{}').apply(lambda x: json.loads(x))
train_data['categories'] = train_data['categories'].apply(lambda x: json.loads(x))
train_data['main_pic_embeddings_resnet_v1'] = train_data['main_pic_embeddings_resnet_v1'].apply(lambda x: x[0])
train_data = pd.concat([train_data, pd.read_parquet('./datasets/name_labse_embs_train.parquet').set_index('variantid')], axis=1)
train_data

Unnamed: 0_level_0,name,categories,color_parsed,pic_embeddings_resnet_v1,main_pic_embeddings_resnet_v1,name_bert_64,characteristic_attributes_mapping,name_labse_768
variantid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
51195767,"Удлинитель Партнер-Электро ПВС 2х0,75 ГОСТ,6A,...","{'1': 'EPG', '2': 'Электроника', '3': 'Сетевые...",[оранжевый],,"[0.04603629, 0.18839523, -0.09973055, -0.66368...","[-0.47045058, 0.67237014, 0.48984158, -0.54485...","{'Номинальный ток, А': ['10'], 'Цвет товара': ...","[-0.033874325, 0.03722446, 0.0029757991, 0.068..."
53565809,Магнитный кабель USB 2.0 A (m) - USB Type-C (m...,"{'1': 'EPG', '2': 'Электроника', '3': 'Кабели ...",[красный],"[[0.26863545, -0.3130674, 0.29023397, 0.073978...","[1.1471839, -0.665361, 0.7745614, 0.26716197, ...","[-0.6575592, 0.6522429, 0.5426037, -0.54347897...",{'Конструктивные особенности': ['Магнитная кон...,"[0.015568526, -0.03899538, 0.064447366, 0.0383..."
56763357,"Набор микропрепаратов Konus 25: ""Клетки и ткан...","{'1': 'EPG', '2': 'Электроника', '3': 'Оптичес...",,"[[0.66954195, 1.0643557, 0.78324044, -0.338267...","[-0.90570974, 1.0296293, 1.0769907, 0.27746, -...","[-0.7384308, 0.70784587, 0.3012653, -0.3583719...","{'Тип аксессуара': ['Набор микропрепаратов'], ...","[-0.033072222, -0.04237577, 0.020771954, 0.065..."
56961772,"Мобильный телефон BQ 1848 Step, черный","{'1': 'EPG', '2': 'Электроника', '3': 'Смартфо...",[черный],"[[0.6580482, -0.35763323, -0.16939065, -0.4249...","[0.13133773, -0.5577079, 0.32498044, 0.1917174...","[-0.44812852, 0.5283565, 0.28981736, -0.506841...","{'Тип карты памяти': ['microSD'], 'Число SIM-к...","[0.014727573, -0.025661988, 0.023943473, -0.00..."
61054740,"Штатив трипод Tripod 330A для фотоаппаратов, в...","{'1': 'EPG', '2': 'Электроника', '3': 'Штативы...",[черный],"[[-0.10406649, 0.080646515, -0.28668788, 0.739...","[0.21696381, 0.10989461, -0.08012986, 0.691861...","[-0.72692573, 0.75206333, 0.37740713, -0.52502...","{'Материал': ['Металл'], 'Количество секций, ш...","[0.043145332, -0.052424084, 0.017260496, 0.045..."
...,...,...,...,...,...,...,...,...
820128810,"Комплект 2 шт, Чернила Cactus CS-EPT6733B пурп...","{'1': 'EPG', '2': 'Электроника', '3': 'Расходн...",[пурпурный],,"[-1.4492652, -0.80129164, -0.12344764, 0.71945...","[-0.8253241, 0.6785133, 0.53978086, -0.4888316...","{'Тип': ['Чернила для принтера'], 'Бренд печат...","[-0.003678058, -0.031628493, 0.0065589263, 0.0..."
821135769,"Защитное стекло закаленное Xiaomi Redmi 7, Y3 ...","{'1': 'EPG', '2': 'Электроника', '3': 'Защитны...",[черный],"[[0.09564891, 0.27437285, -0.19054827, -0.7992...","[0.012127608, -0.8534423, 0.5415518, -0.449125...","[-0.7413257, 0.46105132, 0.5639801, -0.5462132...","{'Вид стекла': ['3D'], 'Тип': ['Защитное стекл...","[-0.06858361, 0.027011767, -0.016400583, -0.02..."
822095690,Системный блок ЮКОМС 9400-268 (AMD A6-9400 (3....,"{'1': 'EPG', '2': 'Электроника', '3': 'Компьют...",[черный],,"[0.4248176, -0.15944786, -0.22844064, 0.427686...","[-0.49261805, 0.56726897, 0.7037877, -0.697246...","{'Общий объем HDD, ГБ': ['10000'], 'Видеокарта...","[-0.04474233, -0.034224413, 0.026076552, 0.026..."
822101044,Системный блок ЮКОМС 9400-9 (AMD A6-9400 (3.4 ...,"{'1': 'EPG', '2': 'Электроника', '3': 'Компьют...",[черный],,"[0.4248176, -0.15944786, -0.22844064, 0.427686...","[-0.44051006, 0.54029673, 0.63768685, -0.68040...","{'Общий объем HDD, ГБ': ['8000'], 'Видеокарта'...","[-0.05541598, 0.000863006, 0.01093415, 0.02208..."


In [6]:
test_data = pd.read_parquet('./datasets/test_data.parquet').set_index('variantid')
test_data['characteristic_attributes_mapping'] = test_data['characteristic_attributes_mapping'].fillna('{}').apply(lambda x: json.loads(x))
test_data['categories'] = test_data['categories'].apply(lambda x: json.loads(x))
test_data['main_pic_embeddings_resnet_v1'] = test_data['main_pic_embeddings_resnet_v1'].apply(lambda x: x[0])
test_data = pd.concat([test_data, pd.read_parquet('./datasets/name_labse_embs_test.parquet').set_index('variantid')], axis=1)
test_data

Unnamed: 0_level_0,name,categories,color_parsed,pic_embeddings_resnet_v1,main_pic_embeddings_resnet_v1,name_bert_64,characteristic_attributes_mapping,name_labse_768
variantid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
51201254,Колодка TDM Electric четырехместная без заземл...,"{'1': 'EPG', '2': 'Электроника', '3': 'Сетевые...",[белый],"[[0.34383398, -0.2962618, 0.07987049, -0.08257...","[0.38310742, -0.7876679, 0.5018278, 0.20900711...","[-0.5060825, 0.5773388, 0.59435517, -0.4958292...","{'Страна-изготовитель': ['Китай'], 'Бренд': ['...","[-0.0058242553, 0.0010011946, 0.015051351, 0.0..."
77151532,Клавиатура черная с черной рамкой для 25-011879,"{'1': 'EPG', '2': 'Электроника', '3': 'Запчаст...",[черный],,"[0.50964713, 0.7958329, -1.4113188, 0.19993813...","[-0.43467724, 0.6614495, 0.48050267, -0.588880...","{'Страна-изготовитель': ['Китай'], 'Комплектац...","[0.0088402, -0.0050699823, 0.026550002, -0.015..."
89664856,"15.6"" Игровой ноутбук Acer Predator Helios 300...","{'1': 'EPG', '2': 'Электроника', '3': 'Компьют...",[черный],"[[0.7804302, -0.245446, -0.67754817, -0.614691...","[0.9958085, -0.113175124, -0.7623152, -0.91648...","[-0.70010763, 0.48152006, 0.47597092, -0.51727...",{'Видеокарта': ['NVIDIA GeForce RTX 2070 (8 Гб...,"[-0.026623247, -0.018851712, 0.011397564, -0.0..."
90701982,Портативная колонка Borofone BR7 Empyreal Spor...,"{'1': 'EPG', '2': 'Электроника', '3': 'Акустик...","[red, красный]","[[-0.24636984, -1.0719914, -0.49986655, 0.3423...","[-0.26596686, -1.143009, -0.5289628, 0.4285588...","[-0.73135185, -0.039796613, 0.38907066, -0.496...","{'Основной материал корпуса': ['Металл'], 'Мак...","[0.016539363, 0.03778109, 0.025718935, 0.08805..."
92484118,Аккумулятор для Meizu BA712 ( M6s ),"{'1': 'EPG', '2': 'Электроника', '3': 'Батарей...",,,"[0.42047608, 0.75828516, 0.5440093, -0.0068945...","[-0.600158, 0.13944691, 0.48706242, -0.5050975...","{'Рекомендовано для': ['Meizu'], 'Бренд': ['Me...","[-0.0024493372, 0.02346121, 0.068452105, 0.023..."
...,...,...,...,...,...,...,...,...
702785891,Кабель USB - Lightning HOCO X21 PLUS (черно-бе...,"{'1': 'EPG', '2': 'Электроника', '3': 'Кабели ...",[черный],"[[1.1820095, -0.16312826, 1.4916217, 0.0288323...","[0.3297959, -0.16444838, 0.9350716, 0.34787956...","[-0.66597974, 0.7140731, 0.43572947, -0.445908...","{'Бренд': ['hoco'], 'Тип': ['Кабель'], 'Цвет т...","[-0.031527344, -0.06875799, 0.03187686, -0.004..."
704096517,Блок питания для ноутбука Asus f5gl (19V 90W 4...,"{'1': 'EPG', '2': 'Электроника', '3': 'Зарядны...",[черный],"[[-0.013610864, -0.68512607, 0.77639246, -1.04...","[0.2785852, -0.16053033, 1.1653559, 1.0619084,...","[-0.7575411, 0.4196694, 0.46428213, -0.4916808...","{'Комплектация': ['Зарядное устройство, сетево...","[-0.023706086, -0.012301952, -0.014316322, -0...."
705874953,Оперативная память HyperX FURY Black DDR4 2666...,"{'1': 'EPG', '2': 'Электроника', '3': 'Операти...",[black],"[[0.34073856, 0.65070343, 0.31146732, 1.261663...","[0.31382418, 0.60041714, 0.3067428, 1.1233345,...","[-0.60506856, 0.4477128, 0.62255704, -0.720129...","{'Тайминги': ['16-18-18-29'], 'Пропускная спос...","[-0.028754005, -0.025122717, 0.048854, -0.0297..."
706965102,8 ТБ Внутренний жесткий диск Toshiba TOSHIBA N...,"{'1': 'EPG', '2': 'Электроника', '3': 'Жесткие...",,"[[-0.9360045, -0.43083164, -1.1651772, 1.23836...","[0.404035, -0.20071658, -0.44533533, 0.2038879...","[-0.62029105, 0.45747545, 0.6659858, -0.671704...","{'Комплектация': ['HDWG480UZSVA'], 'Форм-факто...","[-0.026827315, 0.032079216, 0.040149417, -0.01..."


# Features

## Utils

In [7]:
def calc_dists(df, prefix, embs_1, embs_2):
    l1_dists, l2_dists, cos_dists = [], [], []
    for emb_1, emb_2 in zip(embs_1, embs_2):
        len_1 = (emb_1**2).sum()**0.5
        len_2 = (emb_2**2).sum()**0.5
        l1_dists.append(
            np.abs(emb_1 - emb_2).sum()
        )
        l2_dists.append(
            ((emb_1 - emb_2)**2).sum()**0.5
        )
        cos_dists.append(
            (emb_1 @ emb_2) / len_1 / len_2
        )
    df[f'{prefix}_l1_dist'] = l1_dists
    df[f'{prefix}_l2_dist'] = l2_dists
    df[f'{prefix}_cos_dist'] = cos_dists

In [8]:
nums_tokenizer = RegexpTokenizer(r'\d+[.]\d+|\d+')
tokenizer = RegexpTokenizer(r'\d+[ ]+\d+[ ]+\d+|\d+[ ]+\d+|[a-zA-Z]+[.]+[a-zA-Z]+|[A-Z]+[a-z]+|\d+[.,:+-]+\d+|\w+')

## Categories

In [9]:
train_cat3 = set()
for categories in train_data.categories:
    train_cat3.add(categories['3'])
test_cat3 = set()
for categories in test_data.categories:
    test_cat3.add(categories['3'])
both_cat3 = train_cat3 & test_cat3
len(train_cat3), len(test_cat3), len(both_cat3)

(127, 74, 74)

In [10]:
train_cat4 = set()
for categories in train_data.categories:
    train_cat4.add(categories['4'])
test_cat4 = set()
for categories in test_data.categories:
    test_cat4.add(categories['4'])
both_cat4 = train_cat4 & test_cat4
len(train_cat4), len(test_cat4), len(both_cat4)

(357, 236, 234)

In [11]:
def make_categories_features(pairs, data):
    gc.collect()
    df = pairs.copy()
    categories_1 = data.loc[pairs.variantid1, 'categories']
    categories_2 = data.loc[pairs.variantid2, 'categories']
    df['cat3'] = categories_1.apply(lambda x: x['3']).values
    df.loc[~df.cat3.isin(both_cat3), 'cat3'] = 'rest'
    df['cat4'] = categories_1.apply(lambda x: x['4']).values
    df.loc[~df.cat4.isin(both_cat4), 'cat4'] = 'rest'
    df['is_eq_cat4'] = categories_1.apply(lambda x: x['4']).values == categories_2.apply(lambda x: x['4']).values
    return df.drop(['variantid1', 'variantid2'], axis=1)

In [12]:
%%time
train_df_categories = make_categories_features(train_pairs, train_data)
train_df_categories

CPU times: total: 2.41 s
Wall time: 2.4 s


Unnamed: 0,target,cat3,cat4,is_eq_cat4
0,0,"Сетевые фильтры, разветвители и удлинители","Сетевой фильтр, удлинитель, разветвитель",True
1,1,Расходник для печати,Картридж,True
2,1,Расходник для печати,Картридж,True
3,1,Смарт-часы,Умные часы,True
4,0,Батарейки и аккумуляторы,Аккумулятор для телефона,True
...,...,...,...,...
306535,0,"Смартфоны, планшеты, мобильные телефоны",Смартфон,True
306536,0,Карты памяти и флешки,USB Флеш-накопитель,True
306537,0,"Смартфоны, планшеты, мобильные телефоны",Смартфон,True
306538,0,Кабели и переходники,Кабель,True


In [13]:
rtrain_df_categories = make_categories_features(rtrain_pairs, train_data)

## Colors

In [14]:
colors_mapper = {
 'ярко-синий': 'ярко-синий',
 'ярко-розовый': 'ярко-розовый',
 'ярко-зеленый': 'ярко-зеленый',
 'ярко-желтый': 'ярко-желтый',
 'янтарный': 'янтарный',
 'электрик': 'электрик',
 'экрю': 'экрю',
 'шоколадный': 'шоколадный',
 'черный': 'черный',
 'черно-синий': 'черно-синий',
 'черно-серый': 'черно-серый',
 'черно-красный': 'черно-красный',
 'черно-зеленый': 'черно-зеленый',
 'черн': 'черный',
 'чер': 'черный',
 'циан': 'бирюзовый',
 'цементный': 'цементный',
 'хаки': 'хаки',
 'фуксия': 'фуксия',
 'фисташковый': 'фисташковый',
 'фиолетовый': 'фиолетовый',
 'фиолетово-синий': 'фиолетово-синий',
 'фиолет': 'фиолетовый',
 'фиол': 'фиолетовый',
 'фиалковый': 'фиалковый',
 'тыквенный': 'тыквенный',
 'тыква': 'тыквенный',
 'травяной': 'травяной',
 'томатный': 'томатный',
 'тиффани': 'тиффани',
 'терракотовый': 'терракотовый',
 'терракота': 'терракотовый',
 'темно-фиолетовый': 'темно-фиолетовый',
 'темно-синий': 'темно-синий',
 'темно-серый': 'темно-серый',
 'темно-розовый': 'темно-розовый',
 'темно-оранжевый': 'темно-оранжевый',
 'темно-оливковый': 'темно-оливковый',
 'темно-красный': 'темно-красный',
 'темно-коричневый': 'темно-коричневый',
 'темно-зеленый': 'темно-зеленый',
 'темно-голубой': 'темно-голубой',
 'темно-бирюзовый': 'темно-бирюзовый',
 'темно-бежевый': 'темно-бежевый',
 'сливовый': 'сливовый',
 'сиреневый': 'сиреневый',
 'синий': 'синий',
 'сине-зеленый': 'сине-зеленый',
 'син': 'синий',
 'серый': 'серый',
 'серовато-зеленый': 'серовато-зеленый',
 'серо-коричневый': 'серо-коричневый',
 'серо-зеленый': 'серо-зеленый',
 'серо-голубой': 'серо-голубой',
 'серо-бежевый': 'серо-бежевый',
 'серебряный': 'серебряный',
 'серебристый': 'серебристый',
 'серебристо-серый': 'серебристо-серый',
 'сер': 'серый',
 'сепия': 'сепия',
 'светло-фиолетовый': 'светло-фиолетовый',
 'светло-синий': 'светло-синий',
 'светло-серый': 'светло-серый',
 'светло-розовый': 'светло-розовый',
 'светло-пурпурный': 'светло-пурпурный',
 'светло-коричневый': 'светло-коричневый',
 'светло-золотистый': 'светло-золотистый',
 'светло-зеленый': 'светло-зеленый',
 'светло-желтый': 'светло-желтый',
 'светло-голубой': 'светло-голубой',
 'светло-бирюзовый': 'светло-бирюзовый',
 'светло-бежевый': 'светло-бежевый',
 'сапфировый': 'сапфировый',
 'салатовый': 'салатовый',
 'рыжий': 'рыжий',
 'розовый': 'розовый',
 'розово-фиолетовый': 'розово-фиолетовый',
 'розово-золотой': 'розово-золотой',
 'разноцветный': 'разноцветный',
 'пурпурный': 'пурпурный',
 'пурпурно-фиолетовый': 'пурпурно-фиолетовый',
 'песочный': 'песочный',
 'перу': 'перу',
 'персиковый': 'персиковый',
 'охра': 'охра',
 'орхидея': 'орхидея',
 'оранжевый': 'оранжевый',
 'оранжево-розовый': 'оранжево-розовый',
 'оливковый': 'оливковый',
 'огненно-красный': 'огненно-красный',
 'нефритовый': 'нефритовый',
 'небесный': 'небесный',
 'мятный': 'мятный',
 'мятно-зеленый': 'мятно-зеленый',
 'мята': 'мятный',
 'мультиколор': 'мультиколор',
 'морковный': 'морковный',
 'молочный': 'молочный',
 'многоцветный': 'многоцветный',
 'медный': 'медный',
 'марсала': 'марсала',
 'малиновый': 'малиновый',
 'малиново-красный': 'малиново-красный',
 'малахитовый': 'малахитовый',
 'льняной': 'льняной',
 'лимонный': 'лимонный',
 'лиловый': 'лиловый',
 'латунный': 'латунный',
 'лаймовый': 'лаймовый',
 'лайм': 'лаймовый',
 'лазурный': 'лазурный',
 'лавандовый': 'лавандовый',
 'лаванда': 'лавандовый',
 'кремовый': 'кремовый',
 'красный': 'красный',
 'красновато-коричневый': 'красновато-коричневый',
 'красно-оранжевый': 'красно-оранжевый',
 'красно-коричневый': 'красно-коричневый',
 'красн': 'красный',
 'крас': 'красный',
 'кофейный': 'кофейный',
 'космос': 'космос',
 'коричневый': 'коричневый',
 'коричнево-красный': 'коричнево-красный',
 'коричнево-бежевый': 'коричнево-бежевый',
 'коралловый': 'коралловый',
 'кораллово-красный': 'кораллово-красный',
 'кобальтовый': 'кобальтовый',
 'кирпичный': 'кирпичный',
 'кирпично-красный': 'кирпично-красный',
 'кварцевый': 'кварцевый',
 'кардинал': 'кардинал',
 'канареечный': 'канареечный',
 'камуфляжный': 'камуфляжный',
 'индиго': 'индиго',
 'изумрудный': 'изумрудный',
 'изумрудно-зеленый': 'изумрудно-зеленый',
 'изумруд': 'изумрудный',
 'золотой': 'золотой',
 'золотистый': 'золотистый',
 'зеленый': 'зеленый',
 'зелено-серый': 'зелено-серый',
 'зел': 'зеленый',
 'жемчужно-белый': 'жемчужно-белый',
 'желтый': 'желтый',
 'желто-розовый': 'желто-розовый',
 'желто-зеленый': 'желто-зеленый',
 'желт': 'желтый',
 'гусеница': 'гусеница',
 'грушевый': 'грушевый',
 'графит': 'графит',
 'гранитный': 'гранитный',
 'гранатовый': 'гранатовый',
 'горчичный': 'горчичный',
 'голубой': 'голубой',
 'голуб': 'голубой',
 'глициния': 'глициния',
 'вишня': 'вишневый',
 'вишневый': 'вишневый',
 'васильковый': 'васильковый',
 'ванильный': 'ванильный',
 'бурый': 'бурый',
 'бронзовый': 'бронзовый',
 'бордовый': 'бордовый',
 'бордо': 'бордовый',
 'болотный': 'болотный',
 'бледно-розовый': 'бледно-розовый',
 'бледно-пурпурный': 'бледно-пурпурный',
 'бледно-желтый': 'бледно-желтый',
 'бирюзовый': 'бирюзовый',
 'бирюзово-зеленый': 'бирюзово-зеленый',
 'белый': 'белый',
 'белоснежный': 'белоснежный',
 'бело-зеленый': 'бело-зеленый',
 'бел': 'белый',
 'бежевый': 'бежевый',
 'бежево-серый': 'бежево-серый',
 'бежево-розовый': 'бежево-розовый',
 'баклажановый': 'баклажановый',
 'антрацитовый': 'антрацитовый',
 'аметистовый': 'аметистовый',
 'алый': 'алый',
 'аквамариновый': 'аквамариновый',
 'аква': 'аква',
 'абрикосовый': 'абрикосовый',
 'yellow': 'желтый',
 'wine': 'wine',
 'white': 'белый',
 'violet': 'фиолетовый',
 'vanilla': 'ванильный',
 'ultramarine': 'ultramarine',
 'turquoise': 'бирюзовый',
 'tomato': 'томатный',
 'teal': 'teal',
 'tan': 'tan',
 'snow': 'snow',
 'silver': 'серебряный',
 'sapphire': 'сапфировый',
 'red': 'красный',
 'purple': 'фиолетовый',
 'pink': 'розовый',
 'peru': 'перу',
 'pear': 'грушевый',
 'peach': 'персиковый',
 'orchid': 'орхидея',
 'orange': 'оранжевый',
 'olive': 'оливковый',
 'navy': 'navy',
 'magenta': 'пурпурный',
 'linen': 'linen',
 'lime': 'лаймовый',
 'lilac': 'сиреневый',
 'lemon': 'lemon',
 'lavender': 'лавандовый',
 'khaki': 'хаки',
 'jade': 'нефритовый',
 'ivory': 'ivory',
 'indigo': 'индиго',
 'grey': 'серый',
 'green': 'зеленый',
 'gray': 'серый',
 'gold': 'золотой',
 'fuchsia': 'фуксия',
 'flax': 'flax',
 'emerald': 'emerald',
 'denim': 'denim',
 'cyan': 'бирюзовый',
 'cream': 'кремовый',
 'corn': 'corn',
 'coral': 'коралловый',
 'copper': 'медный',
 'cobalt': 'кобальтовый',
 'chocolate': 'шоколадный',
 'burgundy': 'бордовый',
 'buff': 'buff',
 'brown': 'коричневый',
 'bronze': 'бронзовый',
 'brass': 'латунный',
 'blue': 'голубой',
 'blond': 'blond',
 'black': 'черный',
 'beige': 'бежевый',
 'azure': 'лазурный',
 'aquamarine': 'аквамариновый',
 'aqua': 'аквамариновый',
 'amethyst': 'аметистовый',
 'amber': 'янтарный'
}

In [15]:
def make_colors_features(pairs, data):
    gc.collect()
    df = pairs.copy()
    colors_1 = data.loc[pairs.variantid1, 'color_parsed']
    colors_2 = data.loc[pairs.variantid2, 'color_parsed']
    same_colors = []
    all_colors = []
    for color_1, color_2 in zip(colors_1, colors_2):
        if color_1 is not None:
            color_1 = list(set([colors_mapper[c] for c in color_1]))
        if color_2 is not None:
            color_2 = list(set([colors_mapper[c] for c in color_2]))
        
        if color_1 is None or color_2 is None:
            same_colors.append(0)
            if color_1 is not None:
                all_colors.append(len(color_1))
            elif color_2 is not None:
                all_colors.append(len(color_2))
            else:
                all_colors.append(0)
        else:
            same_colors.append(
                len(set(color_1) & set(color_2))
            )
            all_colors.append(
                len(set(color_1) | set(color_2))
            )
    df['same_colors'] = same_colors
    df['all_colors'] = all_colors
    df['iou_colors'] = df['same_colors'] / df['all_colors']
    df.loc[df['all_colors']==0, 'iou_colors'] = 0
    df['not_same_colors'] = df['all_colors'] - df['same_colors']
    return df.drop(['variantid1', 'variantid2'], axis=1)

In [16]:
%%time
train_df_color = make_colors_features(train_pairs, train_data)
train_df_color

CPU times: total: 2.7 s
Wall time: 2.7 s


Unnamed: 0,target,same_colors,all_colors,iou_colors,not_same_colors
0,0,1,1,1.0,0
1,1,0,2,0.0,2
2,1,0,2,0.0,2
3,1,1,2,0.5,1
4,0,1,1,1.0,0
...,...,...,...,...,...
306535,0,1,1,1.0,0
306536,0,1,1,1.0,0
306537,0,0,2,0.0,2
306538,0,0,2,0.0,2


In [17]:
rtrain_df_color = make_colors_features(rtrain_pairs, train_data)

## Pictures

In [18]:
def make_pictures_features(pairs, data):
    gc.collect()
    df = pairs.copy()
    main_pics_1 = data.loc[df.variantid1, 'main_pic_embeddings_resnet_v1'].values
    main_pics_2 = data.loc[df.variantid2, 'main_pic_embeddings_resnet_v1'].values
    calc_dists(
        df, 'main_pic', 
        main_pics_1,
        main_pics_2
    )
    embs_1 = data.loc[df.variantid1, 'pic_embeddings_resnet_v1'].values
    embs_2 = data.loc[df.variantid2, 'pic_embeddings_resnet_v1'].values
    min_dists, mean_dists, max_dists, std_dists = [], [], [], []
    pic_cnts_1, pic_cnts_2 = [], []
    mean_dists_to_main_1, mean_dists_to_main_2 = [], []
    for main_pic_1, main_pic_2, emb_1, emb_2 in tqdm(zip(main_pics_1, main_pics_2, embs_1, embs_2), total=len(df)):
        dists = []
        pics_1 = [main_pic_1]
        pics_2 = [main_pic_2]
        if emb_1 is not None:
            pics_1.extend(list(emb_1))
            pic_cnts_1.append(len(emb_1))
        else:
            pic_cnts_1.append(0)
        if emb_2 is not None:
            pics_2.extend(list(emb_2))
            pic_cnts_2.append(len(emb_2))
        else:
            pic_cnts_2.append(0)
        for pic_1 in pics_1:
            for pic_2 in pics_2:
                dists.append(
                    (pic_1 @ pic_2) / (pic_1**2).sum()**0.5 / (pic_2**2).sum()**0.5
                )
        min_dists.append(np.min(dists))
        mean_dists.append(np.mean(dists))
        max_dists.append(np.max(dists))
        std_dists.append(np.std(dists))
        
        dists = []
        for pic_2 in pics_2:
            dists.append(
                (main_pic_1 @ pic_2) / (main_pic_1**2).sum()**0.5 / (pic_2**2).sum()**0.5
            )
        mean_dists_to_main_1.append(np.mean(dists))
        dists = []
        for pic_1 in pics_1:
            dists.append(
                (pic_1 @ main_pic_2) / (pic_1**2).sum()**0.5 / (main_pic_2**2).sum()**0.5
            )
        mean_dists_to_main_2.append(np.mean(dists))
    df['pic_cnt_1'] = pic_cnts_1
    df['pic_cnt_2'] = pic_cnts_2
    df['pic_cnt_diff'] = np.abs(df['pic_cnt_1'] - df['pic_cnt_2'])
    df['pics_min_dist'] = min_dists
    df['pics_mean_dist'] = mean_dists
    df['pics_max_dist'] = max_dists
    df['pics_std_dist'] = std_dists
    df['pics_diff_dist'] = df['pics_max_dist'] - df['pics_min_dist']
    df['mean_dist_to_main_1'] = mean_dists_to_main_1
    df['mean_dist_to_main_2'] = mean_dists_to_main_2
    df['mean_dist_to_main_diff'] = np.abs(df['mean_dist_to_main_1'] - df['mean_dist_to_main_2'])
    return df.drop(['variantid1', 'variantid2'], axis=1)

In [19]:
%%time
train_df_pictures = make_pictures_features(train_pairs, train_data)
train_df_pictures

  0%|          | 0/306540 [00:00<?, ?it/s]

CPU times: total: 2min 19s
Wall time: 2min 43s


Unnamed: 0,target,main_pic_l1_dist,main_pic_l2_dist,main_pic_cos_dist,pic_cnt_1,pic_cnt_2,pic_cnt_diff,pics_min_dist,pics_mean_dist,pics_max_dist,pics_std_dist,pics_diff_dist,mean_dist_to_main_1,mean_dist_to_main_2,mean_dist_to_main_diff
0,0,2.361205,0.259265,0.999613,0,0,0,0.999613,0.999613,0.999613,0.000000,0.000000,0.999613,0.999613,0.000000e+00
1,1,10.075069,1.078670,0.987629,0,0,0,0.987629,0.987629,0.987629,0.000000,0.000000,0.987629,0.987629,0.000000e+00
2,1,9.318584,1.008816,0.988570,0,0,0,0.988570,0.988570,0.988570,0.000000,0.000000,0.988570,0.988570,0.000000e+00
3,1,30.330664,3.335172,0.927439,10,4,6,0.588097,0.804703,0.985439,0.086139,0.397342,0.859577,0.826733,3.284451e-02
4,0,7.489506,0.834596,0.989531,0,0,0,0.989531,0.989531,0.989531,0.000000,0.000000,0.989531,0.989531,0.000000e+00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
306535,0,0.000000,0.000000,1.000000,11,11,0,0.048930,0.476243,1.000000,0.259810,0.951071,0.525988,0.525988,1.110223e-16
306536,0,0.000000,0.000000,1.000000,6,6,0,0.281870,0.681711,1.000000,0.202007,0.718130,0.769754,0.769754,1.110223e-16
306537,0,14.295962,1.579893,0.968501,14,14,0,-0.074328,0.568082,1.000000,0.261149,1.074328,0.671603,0.685545,1.394250e-02
306538,0,0.000000,0.000000,1.000000,0,0,0,1.000000,1.000000,1.000000,0.000000,0.000000,1.000000,1.000000,0.000000e+00


In [20]:
rtrain_df_pictures = make_pictures_features(rtrain_pairs, train_data)

  0%|          | 0/306540 [00:00<?, ?it/s]

# Names

In [21]:
def make_names_features(pairs, data):
    gc.collect()
    df = pairs.copy()
    calc_dists(
        df, 'name_bert', 
        data.loc[df.variantid1, 'name_bert_64'],
        data.loc[df.variantid2, 'name_bert_64']
    )
    calc_dists(
        df, 'name_labse', 
        data.loc[df.variantid1, 'name_labse_768'],
        data.loc[df.variantid2, 'name_labse_768']
    )
    
    names_1 = data.loc[pairs.variantid1, 'name']
    names_2 = data.loc[pairs.variantid2, 'name']
    dist, partial_dist, token_sort_dist, token_set_dist = [], [], [], []
    for name_1, name_2 in zip(names_1, names_2):
        dist.append(
            fuzz.ratio(name_1, name_2)
        )
        partial_dist.append(
            fuzz.partial_ratio(name_1, name_2)
        )
        token_sort_dist.append(
            fuzz.token_sort_ratio(name_1, name_2)
        )
        token_set_dist.append(
            fuzz.token_set_ratio(name_1, name_2)
        )
    df['name_dist'] = dist
    df['name_partial_dist'] = partial_dist
    df['name_token_sort_dist'] = token_sort_dist
    df['name_token_set_dist'] = token_set_dist
    
    same_words = []
    all_words = []
    for name_1, name_2 in zip(names_1, names_2):
        words_1 = set(name_1.split())
        words_2 = set(name_2.split())
        same_words.append(len(words_1 & words_2))
        all_words.append(len(words_1 | words_2))
    df['same_words'] = same_words
    df['all_words'] = all_words
    df['iou_words'] = df['same_words'] / df['all_words']
    df.loc[df['all_words']==0, 'iou_words'] = 0
    df['not_same_words'] = df['all_words'] - df['same_words']
    
    same_nums = []
    all_nums = []
    for name_1, name_2 in zip(names_1, names_2):
        # nums_1 = set(re.sub(r'\D+', ' ', name_1).split())
        # nums_2 = set(re.sub(r'\D+', ' ', name_2).split())
        nums_1 = set(nums_tokenizer.tokenize(name_1))
        nums_2 = set(nums_tokenizer.tokenize(name_2))
        same_nums.append(len(nums_1 & nums_2))
        all_nums.append(len(nums_1 | nums_2))
    df['same_nums'] = same_nums
    df['all_nums'] = all_nums
    df['iou_nums'] = df['same_nums'] / df['all_nums']
    df.loc[df['all_nums']==0, 'iou_nums'] = 1
    df['not_same_nums'] = df['all_nums'] - df['same_nums']
    
    df['name_len_1'] = names_1.apply(lambda x: len(x)).values
    df['name_len_2'] = names_2.apply(lambda x: len(x)).values
    df['name_words_1'] = names_1.apply(lambda x: len(x.split())).values
    df['name_words_2'] = names_2.apply(lambda x: len(x.split())).values
    df['name_digit_cnt_1'] = names_1.apply(lambda x: np.sum(['0' <= letter <= '9' for letter in x])).values
    df['name_digit_cnt_2'] = names_2.apply(lambda x: np.sum(['0' <= letter <= '9' for letter in x])).values
    df['name_eng_cnt_1'] = names_1.apply(lambda x: np.sum(['a' <= letter <= 'z' for letter in x.lower()])).values
    df['name_eng_cnt_2'] = names_2.apply(lambda x: np.sum(['a' <= letter <= 'z' for letter in x.lower()])).values
    df['name_rus_cnt_1'] = names_1.apply(lambda x: np.sum(['а' <= letter <= 'я' or letter=='ё' for letter in x.lower()])).values
    df['name_rus_cnt_2'] = names_2.apply(lambda x: np.sum(['а' <= letter <= 'я' or letter=='ё' for letter in x.lower()])).values
    df['name_upper_cnt_1'] = names_1.apply(lambda x: np.sum([letter.isupper() for letter in x])).values
    df['name_upper_cnt_2'] = names_2.apply(lambda x: np.sum([letter.isupper() for letter in x])).values
    for feature in ('len', 'words', 'digit_cnt', 'eng_cnt', 'rus_cnt', 'upper_cnt'):
        df[f'name_{feature}_diff'] = np.abs(df[f'name_{feature}_1'] - df[f'name_{feature}_2'])
    return df.drop(['variantid1', 'variantid2'], axis=1)

In [22]:
%%time
train_df_names = make_names_features(train_pairs, train_data)
train_df_names

CPU times: total: 2min
Wall time: 2min


Unnamed: 0,target,name_bert_l1_dist,name_bert_l2_dist,name_bert_cos_dist,name_labse_l1_dist,name_labse_l2_dist,name_labse_cos_dist,name_dist,name_partial_dist,name_token_sort_dist,name_token_set_dist,same_words,all_words,iou_words,not_same_words,same_nums,all_nums,iou_nums,not_same_nums,name_len_1,name_len_2,name_words_1,name_words_2,name_digit_cnt_1,name_digit_cnt_2,name_eng_cnt_1,name_eng_cnt_2,name_rus_cnt_1,name_rus_cnt_2,name_upper_cnt_1,name_upper_cnt_2,name_len_diff,name_words_diff,name_digit_cnt_diff,name_eng_cnt_diff,name_rus_cnt_diff,name_upper_cnt_diff
0,0,2.036843,3.384575e-01,0.996394,6.450904,0.289437,0.958113,96,96,96,96,6,10,0.600000,4,2,6,0.333333,4,52,54,8,8,11,12,13,13,18,18,11,11,2,0,1,0,0,0
1,1,3.462012,5.778676e-01,0.989412,7.236592,0.334692,0.943991,91,89,91,91,6,10,0.600000,4,4,6,0.666667,2,68,66,8,8,18,18,10,10,30,27,8,8,2,0,0,0,3,0
2,1,3.115397,4.912246e-01,0.992545,6.322046,0.277990,0.961361,89,85,89,89,6,10,0.600000,4,4,6,0.666667,2,71,66,8,8,18,18,10,10,33,27,8,8,5,0,0,0,6,0
3,1,1.128109,1.721761e-01,0.999034,9.309541,0.424357,0.909961,84,100,85,100,4,7,0.571429,3,1,1,1.000000,0,23,32,5,6,1,1,1,1,17,24,2,2,9,1,0,0,7,0
4,0,1.829911,2.948854e-01,0.997475,4.707582,0.216588,0.976545,87,91,90,90,10,16,0.625000,6,2,2,1.000000,0,68,74,14,14,4,4,24,24,23,29,8,9,6,0,0,0,6,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
306535,0,1.684155,2.589245e-01,0.998024,6.900630,0.315783,0.950141,91,90,88,91,5,7,0.714286,2,1,5,0.200000,4,39,40,6,6,6,7,7,7,17,17,5,5,1,0,1,0,0,0
306536,0,0.000005,8.380848e-07,1.000000,12.931353,0.589798,0.826069,82,79,77,83,3,9,0.333333,6,0,4,0.000000,4,39,41,6,6,3,5,6,6,23,23,9,9,2,0,2,0,0,0
306537,0,2.880129,4.998099e-01,0.992803,5.519185,0.249838,0.968791,91,88,78,90,6,10,0.600000,4,2,4,0.500000,2,44,42,8,8,7,6,11,11,16,15,8,8,2,0,1,0,1,0
306538,0,0.000005,8.097516e-07,1.000000,0.000000,0.000000,1.000000,100,100,100,100,19,19,1.000000,0,4,4,1.000000,0,132,132,19,19,6,6,2,2,102,102,5,5,0,0,0,0,0,0


In [23]:
rtrain_df_names = make_names_features(rtrain_pairs, train_data)

## Attributes

In [24]:
hard_pairs = train_pairs.copy()
hard_pairs = hard_pairs[hard_pairs.target==0]
mask = train_data.loc[hard_pairs.variantid1, 'name'].values==train_data.loc[hard_pairs.variantid2, 'name'].values
hard_pairs = hard_pairs[mask]
mask = []
pics_1 = train_data.loc[hard_pairs.variantid1, 'main_pic_embeddings_resnet_v1'].values
pics_2 = train_data.loc[hard_pairs.variantid2, 'main_pic_embeddings_resnet_v1'].values
for pic_1, pic_2 in zip(pics_1, pics_2):
    mask.append((pic_1 == pic_2).all())
hard_pairs = hard_pairs[mask]

imp_attrs = set()
attrs_1 = train_data.loc[hard_pairs.variantid1, 'characteristic_attributes_mapping'].values
attrs_2 = train_data.loc[hard_pairs.variantid2, 'characteristic_attributes_mapping'].values
error = []
for i, (attr_1, attr_2) in enumerate(zip(attrs_1, attrs_2)):
    assert len(attr_1.keys() & attr_2.keys()) > 0
    cnt = 0
    for key in attr_1.keys() & attr_2.keys():
        if len(set(attr_1[key]) & set(attr_2[key])) == 0:
            cnt += 1
            imp_attrs.add(key)
    if cnt == 0:
        error.append(i)
print(len(imp_attrs), imp_attrs)

121 {'Время автономной работы, ч', 'Длина, см', 'ОС (краткое название)', 'Количество камер, шт.', 'Материал проводника', 'Высота, см', 'Размер коврика', 'Общий объем SSD, ГБ', 'Партномер', 'Стандарты связи', 'Видеокарта', 'Оперативная память', 'Цвет клавиатуры', 'Модуль связи Bluetooth', 'Частота процессора, ГГц', 'Макс. разрешение видеозаписи', 'Версия Android', 'Конструкция наушников', 'Макс. время работы (музыка),  ч', 'Модуль связи WiFi', 'Вес товара, г', 'Диагональ экрана, дюймы', 'Покрытие', 'Модель устройства', 'Количество SSD', 'Объем встроенной памяти', 'Размер', 'Суммарный объем памяти', 'Мощность, Вт', 'Встроенные датчики', 'Версия iOS', 'Назначение слотов', 'Макс. объем карты памяти,  ГБ', 'Цвет тонера/чернил', 'Домашний регион', 'Интерфейсы', 'Тип карты памяти', 'Категория патч-корда и витой пары', 'Формат фона, м', 'Количество в упаковке, шт', 'Число портов USB 3.2 Gen 1', 'Бренд', 'Модель процессора', 'Материал', 'Вес с упаковкой, г', 'Название цвета', 'Тип аккумулятора'

In [25]:
train_attrs = set()
for attrs in tqdm(train_data.characteristic_attributes_mapping):
    train_attrs |= attrs.keys()
print(len(train_attrs))
test_attrs = set()
for attrs in tqdm(test_data.characteristic_attributes_mapping):
    test_attrs |= attrs.keys()
print(len(test_attrs))
all_attrs = train_attrs | test_attrs  
both_attrs = train_attrs & test_attrs    
len(all_attrs), len(both_attrs)

  0%|          | 0/457063 [00:00<?, ?it/s]

1447


  0%|          | 0/35730 [00:00<?, ?it/s]

1238


(1461, 1224)

In [26]:
num_attrs = all_attrs.copy()
for attrs in tqdm(train_data.characteristic_attributes_mapping):
    for key, value in attrs.items():
        nums = nums_tokenizer.tokenize(value[0])
        if len(nums) != 1 and key in num_attrs:
            num_attrs.remove(key)
for attrs in tqdm(test_data.characteristic_attributes_mapping):
    for key, value in attrs.items():
        nums = nums_tokenizer.tokenize(value[0])
        if len(nums) != 1 and key in num_attrs:
            num_attrs.remove(key)
len(num_attrs)

  0%|          | 0/457063 [00:00<?, ?it/s]

  0%|          | 0/35730 [00:00<?, ?it/s]

549

In [27]:
def make_df(pairs, data):
    gc.collect()
    df = pairs.copy()
    calc_dists(
        df, 'name_bert', 
        data.loc[df.variantid1, 'name_bert_64'],
        data.loc[df.variantid2, 'name_bert_64']
    )
    main_pics_1 = data.loc[df.variantid1, 'main_pic_embeddings_resnet_v1'].values
    main_pics_2 = data.loc[df.variantid2, 'main_pic_embeddings_resnet_v1'].values
    calc_dists(
        df, 'main_pic', 
        main_pics_1,
        main_pics_2
    )
    attrs_1 = data.loc[pairs.variantid1, 'characteristic_attributes_mapping']
    attrs_2 = data.loc[pairs.variantid2, 'characteristic_attributes_mapping']  
    for attr in tqdm(num_attrs & both_attrs): 
        values = []
        for attr_1, attr_2 in zip(attrs_1, attrs_2):
            if attr not in attr_1 or attr not in attr_2:
                values.append(-(attr not in attr_1)-(attr not in attr_2))
            else:
                values.append(np.abs(
                    float(nums_tokenizer.tokenize(attr_1[attr][0])[0]) - float(nums_tokenizer.tokenize(attr_2[attr][0])[0])
                ))
        df[f'diff_{attr}'] = values
    for attr in tqdm(both_attrs): 
        values = []
        for attr_1, attr_2 in zip(attrs_1, attrs_2):
            if attr not in attr_1 or attr not in attr_2:
                values.append(-(attr not in attr_1)-(attr not in attr_2))
            else:
                text_1 = ' '.join(attr_1[attr])
                text_1 = tokenizer.tokenize(text_1)
                text_1 = ' '.join(text_1).lower()
                text_2 = ' '.join(attr_2[attr])
                text_2 = tokenizer.tokenize(text_2)
                text_2 = ' '.join(text_2).lower()
                values.append(fuzz.token_set_ratio(text_1, text_2))
        df[f'fuzz_{attr}'] = values
    return df.drop(['variantid1', 'variantid2'], axis=1)

In [None]:
train_df = make_df(train_pairs, train_data)
train_df

In [29]:
train_pool = Pool(
    data=train_df.drop('target', axis = 1),
    label=train_df['target'],
)

In [30]:
params = {
    'loss_function': 'CrossEntropy',
    'task_type': 'CPU',
    'max_depth': 7
}

In [32]:
imp = 0
for seed in tqdm((7, 13, 19, 23, 31, 56)):
    model_cb = CatBoostClassifier(**params, random_seed=seed)
    model_cb.fit(train_pool, verbose=500, plot=False)
    imp += model_cb.get_feature_importance(prettified=True).set_index('Feature Id')
imp.sort_values(by=['Importances'], ascending=False, inplace=True)
imp

  0%|          | 0/6 [00:00<?, ?it/s]

0:	learn: 0.6833883	total: 117ms	remaining: 1m 56s
500:	learn: 0.4320365	total: 19.6s	remaining: 19.5s
999:	learn: 0.4122467	total: 38.8s	remaining: 0us
0:	learn: 0.6833291	total: 91.7ms	remaining: 1m 31s
500:	learn: 0.4328766	total: 19.7s	remaining: 19.6s
999:	learn: 0.4123343	total: 38.9s	remaining: 0us
0:	learn: 0.6834032	total: 109ms	remaining: 1m 49s
500:	learn: 0.4327123	total: 20.3s	remaining: 20.2s
999:	learn: 0.4123886	total: 40.1s	remaining: 0us
0:	learn: 0.6834789	total: 79.5ms	remaining: 1m 19s
500:	learn: 0.4324802	total: 20.1s	remaining: 20.1s
999:	learn: 0.4124774	total: 39.9s	remaining: 0us
0:	learn: 0.6831586	total: 110ms	remaining: 1m 50s
500:	learn: 0.4332078	total: 19.9s	remaining: 19.8s
999:	learn: 0.4126612	total: 39.7s	remaining: 0us
0:	learn: 0.6833669	total: 79.1ms	remaining: 1m 18s
500:	learn: 0.4330034	total: 20.2s	remaining: 20.1s
999:	learn: 0.4123441	total: 40.2s	remaining: 0us


Unnamed: 0_level_0,Importances
Feature Id,Unnamed: 1_level_1
fuzz_Гарантийный срок,46.475638
main_pic_l2_dist,32.037550
fuzz_Объем,27.306043
fuzz_Оперативная память,27.305963
fuzz_Цвет товара,25.172370
...,...
fuzz_Тип RFID считывателя,0.000000
fuzz_Версия MacOS,0.000000
fuzz_Способ подключения,0.000000
"fuzz_Мин. скорость затвора, сек.",0.000000


In [39]:
best_num_attrs, best_attrs = set(), set()

for feature in imp.index:
    if feature.startswith('fuzz_'):
        best_attrs.add(feature[5:])
    elif feature.startswith('diff_'):
        best_num_attrs.add(feature[5:])
    if len(best_num_attrs) + len(best_attrs) == 600:
        break

In [40]:
len(best_num_attrs), len(best_attrs)

(143, 457)

In [41]:
def make_attributes_features(pairs, data):
    gc.collect()
    df = pairs.copy()
    attrs_1 = data.loc[pairs.variantid1, 'characteristic_attributes_mapping']
    attrs_2 = data.loc[pairs.variantid2, 'characteristic_attributes_mapping']
    same_keys, all_keys = [], []
    same_values, same_values_dist = [], []
    for attr_1, attr_2 in tqdm(zip(attrs_1, attrs_2), total=len(df)):
        same_keys.append(len(attr_1.keys() & attr_2.keys()))
        all_keys.append(len(attr_1.keys() | attr_2.keys()))
        count, dist = 0, 0
        for key in attr_1.keys() & attr_2.keys():
            count += attr_1[key] == attr_2[key]
            dist += fuzz.ratio(attr_1[key], attr_2[key])
        same_values.append(count)
        same_values_dist.append(dist / 100.)
    df['same_keys'] = same_keys
    df['all_keys'] = all_keys
    df['iou_keys'] = df['same_keys'] / df['all_keys']
    df.loc[df['all_keys']==0, 'iou_keys'] = 0
    df['not_same_keys'] = df['all_keys'] - df['same_keys']
    df['same_values'] = same_values
    df['same_values_ratio'] = df['same_values'] / df['same_keys'] 
    df.loc[df['same_keys']==0, 'same_values_ratio'] = 0
    df['same_values_dist'] = same_values_dist
    df['same_values_dist_ratio'] = df['same_values_dist'] / df['same_keys'] 
    df.loc[df['same_keys']==0, 'same_values_dist_ratio'] = 0
    
    imp_neq_cnt, imp_cnt = [], []
    for attr_1, attr_2 in tqdm(zip(attrs_1, attrs_2), total=len(df)):
        keys = attr_1.keys() & attr_2.keys() & imp_attrs
        cnt = 0
        for key in keys:
            if len(set(attr_1[key]) & set(attr_2[key])) == 0:
                cnt += 100 - fuzz.token_set_ratio(attr_1[key], attr_2[key])
        imp_neq_cnt.append(cnt)
        imp_cnt.append(len(keys))
    df['imp_neq_cnt'] = imp_neq_cnt
    df['imp_cnt'] = imp_cnt
    df['imp_eq_cnt'] = df['imp_cnt'] * 100 - df['imp_neq_cnt']
    df['neq/imp_cnt'] = df['imp_neq_cnt'] / df['imp_cnt']
    df.loc[df['imp_cnt'] == 0, 'neq/imp_cnt'] = 0
    
    attr_same_nums = []
    attr_all_nums = []
    for attr_1, attr_2 in tqdm(zip(attrs_1, attrs_2), total=len(df)):
        p, q = 0, 0
        for key in attr_1.keys() & attr_2.keys():
            nums_1 = set(nums_tokenizer.tokenize(' '.join(attr_1[key])))
            nums_2 = set(nums_tokenizer.tokenize(' '.join(attr_2[key])))
            p += len(nums_1 & nums_2)
            q += len(nums_1 | nums_2)
        attr_same_nums.append(p)
        attr_all_nums.append(q)
    df['attr_same_nums'] = attr_same_nums
    df['attr_all_nums'] = attr_all_nums
    df['attr_iou_nums'] = df['attr_same_nums'] / df['attr_all_nums']
    df.loc[df['attr_all_nums']==0, 'attr_iou_nums'] = 1
    df['attr_not_same_nums'] = df['attr_all_nums'] - df['attr_same_nums']  
    
    attr_same_words = []
    attr_all_words = []
    for attr_1, attr_2 in tqdm(zip(attrs_1, attrs_2), total=len(df)):
        p, q = 0, 0
        for key in attr_1.keys() & attr_2.keys():
            words_1 = set(' '.join(attr_1[key]).split())
            words_2 = set(' '.join(attr_2[key]).split())
            p += len(words_1 & words_2)
            q += len(words_1 | words_2)
        attr_same_words.append(p)
        attr_all_words.append(q)
    df['attr_same_words'] = attr_same_words
    df['attr_all_words'] = attr_all_words
    df['attr_iou_words'] = df['attr_same_words'] / df['attr_all_words']
    df.loc[df['attr_all_words']==0, 'attr_iou_words'] = 1
    df['attr_not_same_words'] = df['attr_all_words'] - df['attr_same_words']  
    
    num_attrs_smape_sum = []
    num_attrs_total = []
    for attr_1, attr_2 in tqdm(zip(attrs_1, attrs_2), total=len(df)):
        cur = 0
        keys = attr_1.keys() & attr_2.keys() & num_attrs
        for key in keys:
            num_1 = float(nums_tokenizer.tokenize(attr_1[key][0])[0])
            num_2 = float(nums_tokenizer.tokenize(attr_2[key][0])[0])
            cur += 2 * np.abs(num_1 - num_2) / (num_1 + num_2 + 1e-9)
        num_attrs_smape_sum.append(cur)
        num_attrs_total.append(len(keys))
    df['num_attrs_smape_sum'] = num_attrs_smape_sum
    df['num_attrs_total'] = num_attrs_total
    df['num_attrs_smape_mean'] = df['num_attrs_smape_sum'] / df['num_attrs_total']
    df.loc[df['num_attrs_total']==0, 'num_attrs_smape_mean'] = 2
    
    for attr in tqdm(best_num_attrs): 
        values = []
        for attr_1, attr_2 in zip(attrs_1, attrs_2):
            if attr not in attr_1 or attr not in attr_2:
                values.append(-(attr not in attr_1)-(attr not in attr_2))
            else:
                values.append(np.abs(
                    float(nums_tokenizer.tokenize(attr_1[attr][0])[0]) - float(nums_tokenizer.tokenize(attr_2[attr][0])[0])
                ))
        df[f'diff_{attr}'] = values
        
    for attr in tqdm(best_attrs): 
        values = []
        for attr_1, attr_2 in zip(attrs_1, attrs_2):
            if attr not in attr_1 or attr not in attr_2:
                values.append(-(attr not in attr_1)-(attr not in attr_2))
            else:
                text_1 = ' '.join(attr_1[attr])
                text_1 = tokenizer.tokenize(text_1)
                text_1 = ' '.join(text_1).lower()
                text_2 = ' '.join(attr_2[attr])
                text_2 = tokenizer.tokenize(text_2)
                text_2 = ' '.join(text_2).lower()
                values.append(fuzz.token_set_ratio(text_1, text_2))
        df[f'fuzz_{attr}'] = values
    
    cat_features = [
        'Страна-изготовитель', 'Оперативная память', 'Бренд процессора', 'Модуль связи Bluetooth', 'Назначение'
    ]
    fill_values = ['не указана', '', '', '', 'нет', '']
    for feature, fill_value in tqdm(zip(cat_features, fill_values), total=len(cat_features)):
        train_values = set()
        for attrs in train_data.characteristic_attributes_mapping:
            train_values.add(attrs.get(feature, [fill_value])[0].lower())
        test_values = set()
        for attrs in test_data.characteristic_attributes_mapping:
            test_values.add(attrs.get(feature, [fill_value])[0].lower())
        both_values = train_values&test_values
        values_1 = []
        for attrs in attrs_1:
            value = attrs.get(feature, [fill_value])[0].lower()
            if value in both_values:
                values_1.append(value)
            else:
                values_1.append('другое')
        df[f'{feature}_1'] = values_1  
        values_2 = []
        for attrs in attrs_2:
            value = attrs.get(feature, [fill_value])[0].lower()
            if value in both_values:
                values_2.append(value)
            else:
                values_2.append('другое')
        df[f'{feature}_2'] = values_2
        
    return df.drop(['variantid1', 'variantid2'], axis=1)

In [None]:
%%time
train_df_attributes = make_attributes_features(train_pairs, train_data)
train_df_attributes

In [None]:
rtrain_df_attributes = make_attributes_features(rtrain_pairs, train_data)

## Embeddings

In [44]:
def make_embeddings_features(pairs, data):
    gc.collect()
    df = pairs.copy()
    df['main_pic_1'] = list(data.loc[pairs.variantid1, 'main_pic_embeddings_resnet_v1'])
    df['main_pic_2'] = list(data.loc[pairs.variantid2, 'main_pic_embeddings_resnet_v1'])
    df['name_bert_1'] = list(data.loc[pairs.variantid1, 'name_bert_64'])
    df['name_bert_2'] = list(data.loc[pairs.variantid2, 'name_bert_64'])
    df['name_labse_1'] = list(data.loc[pairs.variantid1, 'name_labse_768'])
    df['name_labse_2'] = list(data.loc[pairs.variantid2, 'name_labse_768'])
    return df.drop(['variantid1', 'variantid2'], axis=1)

In [45]:
%%time
train_df_embeddings = make_embeddings_features(train_pairs, train_data)
train_df_embeddings

CPU times: total: 2.52 s
Wall time: 2.51 s


Unnamed: 0,target,main_pic_1,main_pic_2,name_bert_1,name_bert_2,name_labse_1,name_labse_2
0,0,"[-0.4304909, -0.49474272, -0.46439183, -0.0609...","[-0.42941108, -0.5129398, -0.4753536, -0.06778...","[-0.5104684, 0.56158644, 0.58873796, -0.529718...","[-0.455473, 0.58157134, 0.5870387, -0.5325003,...","[0.056775797, -0.0154226115, 0.022237448, 0.07...","[0.050625164, -0.015585997, 0.013225925, 0.065..."
1,1,"[-0.0032773763, 0.32531193, -0.33156675, 0.410...","[-0.043616347, 0.49310583, -0.3069673, 0.48206...","[-0.53706163, 0.37264067, 0.44363657, -0.37289...","[-0.51572454, 0.40346462, 0.43528882, -0.34104...","[-0.009024617, -0.013692401, 0.022784136, 0.00...","[0.00334543, -0.0057770386, 0.025496263, 0.031..."
2,1,"[0.027616128, 0.33428708, -0.37326592, 0.41088...","[-0.041107245, 0.48485547, -0.2837791, 0.46370...","[-0.61162275, 0.4953002, 0.47400212, -0.429568...","[-0.5087511, 0.46164495, 0.42914906, -0.462508...","[0.0006934129, -0.009137327, 0.03223274, 0.011...","[0.003256556, 0.0072548394, 0.02573709, 0.0333..."
3,1,"[0.8777658, -0.42428812, 1.1310052, 0.42804056...","[0.6647656, -0.07456402, 1.2397044, 0.6757724,...","[-0.45303595, 0.35168907, 0.45074046, -0.29676...","[-0.43019646, 0.3099462, 0.45133576, -0.297509...","[-0.03378712, -0.013189872, 0.052684598, 0.020...","[-0.052225627, -0.0086708, 0.05642182, 0.02599..."
4,0,"[-0.2516712, -0.042763397, 0.29271233, 0.28473...","[-0.16422251, -0.089478336, 0.29960525, 0.2463...","[-0.637286, 0.5249721, 0.6592931, -0.5001871, ...","[-0.6326568, 0.5248482, 0.6736372, -0.5164456,...","[-0.060493078, -0.013913398, 0.045734614, -0.0...","[-0.0647266, -0.0018018482, 0.04374917, 0.0077..."
...,...,...,...,...,...,...,...
306535,0,"[0.12376598, -0.56147766, 0.8182319, 0.3562862...","[0.12376598, -0.56147766, 0.8182319, 0.3562862...","[-0.6552933, 0.43776116, 0.61501455, -0.564846...","[-0.63105303, 0.45106313, 0.63246083, -0.55864...","[-0.009680024, -0.03594739, 0.032366067, 0.015...","[-0.021700157, -0.018895453, 0.027215147, 0.00..."
306536,0,"[0.1316294, -0.050370954, 0.76977086, -0.76957...","[0.1316294, -0.050370954, 0.76977086, -0.76957...","[-0.6374895, 0.68722314, 0.58021176, -0.699589...","[-0.6374897, 0.68722296, 0.58021176, -0.699589...","[-0.009263517, -0.034818575, 0.0373524, 0.0037...","[0.026554149, -0.053082276, 0.026956635, 0.006..."
306537,0,"[0.43587458, -0.6701832, -0.032724172, 0.32214...","[0.34710243, -0.55791867, 0.15381934, 0.123870...","[-0.59312373, 0.42882127, 0.36561173, -0.54967...","[-0.6144494, 0.50512916, 0.40611482, -0.582776...","[-0.043391995, -0.016225116, 0.0608503, 0.0115...","[-0.024951177, -0.021576101, 0.06375695, 0.022..."
306538,0,"[-0.039506897, 0.13222088, 1.6788204, -0.01456...","[-0.039506897, 0.13222088, 1.6788204, -0.01456...","[-0.5437152, 0.36101848, 0.5552317, -0.5141926...","[-0.54371524, 0.36101842, 0.5552317, -0.514192...","[0.019028962, 0.02708066, 0.0255603, -0.061653...","[0.019028962, 0.02708066, 0.0255603, -0.061653..."


In [46]:
rtrain_df_embeddings = make_embeddings_features(rtrain_pairs, train_data)

## Unite

In [47]:
train_df = pd.concat([
    train_df_categories,
    train_df_color,
    train_df_pictures,
    train_df_names,
    train_df_attributes,
    train_df_embeddings
], axis=1)

rtrain_df = pd.concat([
    rtrain_df_categories,
    rtrain_df_color,
    rtrain_df_pictures,
    rtrain_df_names,
    rtrain_df_attributes,
    rtrain_df_embeddings
], axis=1)

# Model

In [48]:
val_df = train_df[pd.read_csv('./datasets/val_idx.csv', index_col=0).values].copy()
train_df = train_df[pd.read_csv('./datasets/train_idx.csv', index_col=0).values].copy()
rval_df = rtrain_df[pd.read_csv('./datasets/val_idx.csv', index_col=0).values].copy()
rtrain_df = rtrain_df[pd.read_csv('./datasets/train_idx.csv', index_col=0).values].copy()
all_train = pd.concat([train_df, rtrain_df], axis=0)
all_val = pd.concat([val_df, rval_df], axis=0)
all_train

Unnamed: 0,target,cat3,cat4,is_eq_cat4,target.1,same_colors,all_colors,iou_colors,not_same_colors,target.2,main_pic_l1_dist,main_pic_l2_dist,main_pic_cos_dist,pic_cnt_1,pic_cnt_2,pic_cnt_diff,pics_min_dist,pics_mean_dist,pics_max_dist,pics_std_dist,pics_diff_dist,mean_dist_to_main_1,mean_dist_to_main_2,mean_dist_to_main_diff,target.3,name_bert_l1_dist,name_bert_l2_dist,name_bert_cos_dist,name_labse_l1_dist,name_labse_l2_dist,name_labse_cos_dist,name_dist,name_partial_dist,name_token_sort_dist,name_token_set_dist,same_words,all_words,iou_words,not_same_words,same_nums,all_nums,iou_nums,not_same_nums,name_len_1,name_len_2,name_words_1,name_words_2,name_digit_cnt_1,name_digit_cnt_2,name_eng_cnt_1,name_eng_cnt_2,name_rus_cnt_1,name_rus_cnt_2,name_upper_cnt_1,name_upper_cnt_2,name_len_diff,name_words_diff,name_digit_cnt_diff,name_eng_cnt_diff,name_rus_cnt_diff,name_upper_cnt_diff,target.4,same_keys,all_keys,iou_keys,not_same_keys,same_values,same_values_ratio,same_values_dist,same_values_dist_ratio,imp_neq_cnt,imp_cnt,imp_eq_cnt,neq/imp_cnt,attr_same_nums,attr_all_nums,attr_iou_nums,attr_not_same_nums,attr_same_words,attr_all_words,attr_iou_words,attr_not_same_words,num_attrs_smape_sum,num_attrs_total,num_attrs_smape_mean,diff_Количество ядер,diff_RAS to CAS Delay (tRCD),"diff_Длина, см","diff_Стартовый баланс, руб","diff_Длина видеокарты, мм",diff_Кол-во подключаемых трубок,diff_CAS Latency (CL),diff_Количество антенн,"diff_Макс. скорость беспроводного соединения, Мбит/с","diff_Высота, см","diff_Общий объем SSD, ГБ","diff_Объем корзины, л",diff_Кол-во выходов Display Port,"diff_Кэш L3, МБ",diff_Количество USB портов,diff_Частоты Wi-Fi,diff_Модуль связи Bluetooth,"diff_Частота процессора, ГГц","diff_Емкость, А•ч",diff_Толщина пленки,"diff_Макс. расстояние от стены, мм",diff_Количество PCI-E x1,"diff_Кол-во входных аналоговых каналов, шт.","diff_Диаметр, мм","diff_Макс. время работы (музыка), ч",diff_Количество PCI-E x16,"diff_Теплопроводность, Вт/мК","diff_Макс. рабочая температура, °С",diff_Модуль связи WiFi,diff_Кол-во выходов HDMI,diff_Число подключаемых мониторов,diff_Количество SSD,diff_Кол-во встроенных игр,"diff_Скорость чтения, Мб/с",diff_Макс. поддерживаемая частота RAM,"diff_Диаметр динамика, мм",diff_Револьверное устройство,"diff_Число пикселей матрицы, Мпикс","diff_Мощность, Вт",diff_Кол-во внутренних разъемов SATA 6 Гбит/с,diff_Версия iOS,"diff_Макс. объем карты памяти, ГБ",diff_Стандарт защиты,...,fuzz_Соотношение сторон экрана,fuzz_Поддержка eSim,fuzz_Защита информации,fuzz_Материал фотофона,fuzz_Форм-фактор ноутбука,"fuzz_Время автономной работы, ч",fuzz_Переключение скоростей,fuzz_Место крепления,fuzz_Комплектация зарядного устройства,fuzz_Тип лампы,fuzz_Радиатор,fuzz_CAS Latency (CL),fuzz_Размер коврика,fuzz_Технология матрицы ТВ,fuzz_Емкость,fuzz_Емкость одного модуля,"fuzz_Потребляемая мощность, Вт",fuzz_Стекло,fuzz_Вид стекла,fuzz_Интерфейс подключения,"fuzz_Емкость, А•ч",fuzz_Толщина пленки,fuzz_Обратная связь,fuzz_Вид оплаты,fuzz_Технология HDR,fuzz_Совместимость с фотокамерами,fuzz_Навигационное ПО,"fuzz_Макс. рабочая температура, °С",fuzz_Подставка под запястье,"fuzz_Диагональ экрана, дюймы",fuzz_Наличие дисплея,"fuzz_Скорость чтения, Мб/с",fuzz_Модуль сотовой связи,fuzz_Особенности дисплея,"fuzz_Скорости, об/мин",fuzz_Кол-во внутренних разъемов SATA 6 Гбит/с,fuzz_Цвет тонера/чернил,fuzz_Стандарт защиты,fuzz_Разъем питания процессора,fuzz_Ресурс SSD (TBW),fuzz_Категория патч-корда и витой пары,fuzz_Блок питания,"fuzz_Максимальный вес ворот, кг","fuzz_Макс. частота графического процессора (Boost), МГц","fuzz_Макс. ускорение, G",fuzz_Уровни регулировки подставки,fuzz_Бренд,fuzz_Модель процессора,fuzz_Оригинальность расходника,fuzz_Тип коннектора 1,fuzz_Бесконтактная оплата,fuzz_Тип телескопа,fuzz_Интерфейсы и разъемы,fuzz_Тип аккумулятора,fuzz_Наличие автоответчика,fuzz_Подключение к Smart TV,fuzz_Монтировка,"fuzz_Полная выходная мощность звука, Вт",fuzz_Интерфейсы регистратора,fuzz_Интерфейс,fuzz_Тип насадки микроскопа,"fuzz_Макс. воздушный поток, CFM",fuzz_Мониторинг,fuzz_Рекомендовано для,"fuzz_MTBF, кликов","fuzz_Длина ремешка, мм","fuzz_Максимальная высота/длина, см",fuzz_Конфигурация,"fuzz_Сечение жилы, кв.мм",fuzz_Химический тип,fuzz_Вид микрофона,"fuzz_Размер без подставки (ШxВxГ), мм",fuzz_Настенная установка,fuzz_Назначение кулера,fuzz_Слот для карты памяти,fuzz_Список совместимых устройств,"fuzz_Макс. разрешение фронтальной камеры, Мпикс",fuzz_Линейка карты памяти,fuzz_Форма циферблата,fuzz_Выходное напряжение,fuzz_Входные интерфейсы,fuzz_Взаимодействие с носителями,fuzz_Работа в режиме телефона,"fuzz_Срок службы, г",fuzz_Тип матрицы,fuzz_Кол-во разъемов Molex,"fuzz_Длина кабеля, см",fuzz_Аудиокодек,fuzz_Вид модуля,fuzz_Питание от,fuzz_Модель браслета/умных часов,"fuzz_Длина, мм",fuzz_Назначение,fuzz_Тип подсветки,fuzz_Особенности коврика,fuzz_Общее количество пикселей,fuzz_Макс. разрешение фото,"fuzz_Межзрачковое расстояние, мм",fuzz_Раскладка клавиатуры,fuzz_Количество клавиш клавиатуры,fuzz_Тип датчика,fuzz_Функции зарядки,fuzz_Единиц в одном товаре,"fuzz_Время зарядки аккумулятора, мин",fuzz_Доп. комплектация,"fuzz_Яркость, кд/м2",fuzz_Тип связи,fuzz_Разрешение,fuzz_Серия графического процессора,"fuzz_Количество в упаковке, шт.",fuzz_Разъем на кабеле,Страна-изготовитель_1,Страна-изготовитель_2,Оперативная память_1,Оперативная память_2,Бренд процессора_1,Бренд процессора_2,Модуль связи Bluetooth_1,Модуль связи Bluetooth_2,Назначение_1,Назначение_2,target.5,main_pic_1,main_pic_2,name_bert_1,name_bert_2,name_labse_1,name_labse_2
0,0,"Сетевые фильтры, разветвители и удлинители","Сетевой фильтр, удлинитель, разветвитель",True,0,1,1,1.0,0,0,2.361205,0.259265,0.999613,0,0,0,0.999613,0.999613,0.999613,0.000000,0.000000,0.999613,0.999613,0.000000e+00,0,2.036843,3.384575e-01,0.996394,6.450904,0.289437,0.958113,96,96,96,96,6,10,0.600000,4,2,6,0.333333,4,52,54,8,8,11,12,13,13,18,18,11,11,2,0,1,0,0,0,0,18,18,1.000000,0,17,0.944444,17.83,0.990556,0,7,700,0.000000,10,12,0.833333,2,22,24,0.916667,2,1.076923,7,0.153846,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,0.0,...,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,100,-2,-2,-2,-2,-2,-2,-2,-2,100,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,100,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,не указана,не указана,,,,,,,нет,нет,0,"[-0.4304909, -0.49474272, -0.46439183, -0.0609...","[-0.42941108, -0.5129398, -0.4753536, -0.06778...","[-0.5104684, 0.56158644, 0.58873796, -0.529718...","[-0.455473, 0.58157134, 0.5870387, -0.5325003,...","[0.056775797, -0.0154226115, 0.022237448, 0.07...","[0.050625164, -0.015585997, 0.013225925, 0.065..."
2,1,Расходник для печати,Картридж,True,1,0,2,0.0,2,1,9.318584,1.008816,0.988570,0,0,0,0.988570,0.988570,0.988570,0.000000,0.000000,0.988570,0.988570,0.000000e+00,1,3.115397,4.912246e-01,0.992545,6.322046,0.277990,0.961361,89,85,89,89,6,10,0.600000,4,4,6,0.666667,2,71,66,8,8,18,18,10,10,33,27,8,8,5,0,0,0,6,0,1,2,13,0.153846,11,1,0.500000,1.35,0.675000,100,2,100,50.000000,0,0,1.000000,0,1,4,0.250000,3,0.000000,0,2.000000,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,...,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-1,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-1,-2,-2,-2,-2,-2,-2,-2,-2,-2,0,-2,-1,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-1,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,не указана,не указана,,,,,,,для лазерного принтера,нет,1,"[0.027616128, 0.33428708, -0.37326592, 0.41088...","[-0.041107245, 0.48485547, -0.2837791, 0.46370...","[-0.61162275, 0.4953002, 0.47400212, -0.429568...","[-0.5087511, 0.46164495, 0.42914906, -0.462508...","[0.0006934129, -0.009137327, 0.03223274, 0.011...","[0.003256556, 0.0072548394, 0.02573709, 0.0333..."
3,1,Смарт-часы,Умные часы,True,1,1,2,0.5,1,1,30.330664,3.335172,0.927439,10,4,6,0.588097,0.804703,0.985439,0.086139,0.397342,0.859577,0.826733,3.284451e-02,1,1.128109,1.721761e-01,0.999034,9.309541,0.424357,0.909961,84,100,85,100,4,7,0.571429,3,1,1,1.000000,0,23,32,5,6,1,1,1,1,17,24,2,2,9,1,0,0,7,0,1,19,31,0.612903,12,12,0.631579,16.59,0.873158,174,9,726,19.333333,3,9,0.333333,6,24,45,0.533333,21,1.333333,2,0.666667,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,...,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,100,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,100,-2,-2,-2,-1,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,100,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,100,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-1,-2,-2,-2,-2,-2,-2,-2,не указана,китай,,,,,,,нет,нет,1,"[0.8777658, -0.42428812, 1.1310052, 0.42804056...","[0.6647656, -0.07456402, 1.2397044, 0.6757724,...","[-0.45303595, 0.35168907, 0.45074046, -0.29676...","[-0.43019646, 0.3099462, 0.45133576, -0.297509...","[-0.03378712, -0.013189872, 0.052684598, 0.020...","[-0.052225627, -0.0086708, 0.05642182, 0.02599..."
4,0,Батарейки и аккумуляторы,Аккумулятор для телефона,True,0,1,1,1.0,0,0,7.489506,0.834596,0.989531,0,0,0,0.989531,0.989531,0.989531,0.000000,0.000000,0.989531,0.989531,0.000000e+00,0,1.829911,2.948854e-01,0.997475,4.707582,0.216588,0.976545,87,91,90,90,10,16,0.625000,6,2,2,1.000000,0,68,74,14,14,4,4,24,24,23,29,8,9,6,0,0,0,6,1,0,6,10,0.600000,4,2,0.333333,4.75,0.791667,145,5,355,29.000000,1,6,0.166667,5,8,20,0.400000,12,1.333333,1,1.333333,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,...,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,0,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-1,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,100,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,не указана,китай,,,,,,,нет,нет,0,"[-0.2516712, -0.042763397, 0.29271233, 0.28473...","[-0.16422251, -0.089478336, 0.29960525, 0.2463...","[-0.637286, 0.5249721, 0.6592931, -0.5001871, ...","[-0.6326568, 0.5248482, 0.6736372, -0.5164456,...","[-0.060493078, -0.013913398, 0.045734614, -0.0...","[-0.0647266, -0.0018018482, 0.04374917, 0.0077..."
5,0,SIM-карты,Тариф для телефона,True,0,0,0,0.0,0,0,0.798171,0.086939,0.999841,5,5,0,0.118992,0.493088,1.000000,0.275484,0.881008,0.605348,0.604105,1.242875e-03,0,1.488625,2.440297e-01,0.998104,7.358721,0.336890,0.943253,93,93,87,94,18,22,0.818182,4,4,4,1.000000,0,130,129,20,20,7,7,7,7,89,88,8,8,1,0,0,0,1,0,0,11,11,1.000000,0,8,0.727273,10.49,0.953636,28,4,372,7.000000,3,7,0.428571,4,14,20,0.700000,6,0.500000,2,0.250000,-2.0,-2.0,-2.0,0.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,...,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,100,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,100,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,100,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,не указана,не указана,,,,,,,мобильная связь,мобильная связь,0,"[-0.4402296, 0.08776277, 0.5703386, 0.46222636...","[-0.44197887, 0.097086996, 0.5637255, 0.450112...","[-0.54970133, 0.56535983, 0.5113189, -0.571969...","[-0.5275077, 0.5432708, 0.52933013, -0.5633142...","[0.0009698064, -0.047686934, 0.043287214, 0.01...","[-0.0070494404, -0.05083248, 0.052278094, 0.02..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
306534,0,Карты памяти и флешки,Карта памяти,True,0,0,0,0.0,0,0,7.070901,0.773892,0.990122,7,7,0,0.034431,0.431369,1.000000,0.260927,0.965569,0.519765,0.502102,1.766361e-02,0,0.000004,6.134073e-07,1.000000,17.236347,0.789134,0.688634,86,82,74,86,2,6,0.333333,4,0,2,0.000000,2,17,18,4,4,1,2,0,0,13,13,3,3,1,0,1,0,0,0,0,8,8,1.000000,0,7,0.875000,7.71,0.963750,56,6,544,9.333333,3,5,0.600000,2,15,19,0.789474,4,0.000000,1,0.000000,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,...,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,100,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,не указана,не указана,,,,,,,нет,нет,0,"[0.13195059, 0.55129564, 0.56075263, 0.5999450...","[0.19128616, 0.530021, 0.57061875, 0.59129846,...","[-0.41885418, 0.46867815, 0.48758543, -0.45227...","[-0.41885406, 0.4686782, 0.48758537, -0.452275...","[0.03807568, -0.01929693, 0.054270603, 0.01067...","[0.055627737, -0.03249889, 0.015624781, 0.0112..."
306535,0,"Смартфоны, планшеты, мобильные телефоны",Смартфон,True,0,1,1,1.0,0,0,0.000000,0.000000,1.000000,11,11,0,0.048930,0.476243,1.000000,0.259810,0.951071,0.525988,0.525988,1.110223e-16,0,1.684155,2.589245e-01,0.998024,6.900630,0.315783,0.950141,91,90,88,91,5,7,0.714286,2,1,5,0.200000,4,40,39,6,6,7,6,7,7,17,17,5,5,1,0,1,0,0,0,0,56,56,1.000000,0,54,0.964286,55.62,0.993214,66,45,4434,1.466667,36,40,0.900000,4,105,109,0.963303,4,0.666667,12,0.055556,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,0.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,0.0,-2.0,-2.0,-2.0,0.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,0.0,-2.0,...,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,100,-2,-2,-2,100,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,100,100,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,100,-2,100,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,100,-2,-2,-2,-2,-2,-2,-2,-2,-2,китай,китай,10 гб,8 гб,mediatek,mediatek,4.0,4.0,нет,нет,0,"[0.12376598, -0.56147766, 0.8182319, 0.3562862...","[0.12376598, -0.56147766, 0.8182319, 0.3562862...","[-0.63105303, 0.45106313, 0.63246083, -0.55864...","[-0.6552933, 0.43776116, 0.61501455, -0.564846...","[-0.021700157, -0.018895453, 0.027215147, 0.00...","[-0.009680024, -0.03594739, 0.032366067, 0.015..."
306536,0,Карты памяти и флешки,USB Флеш-накопитель,True,0,1,1,1.0,0,0,0.000000,0.000000,1.000000,6,6,0,0.281870,0.681711,1.000000,0.202007,0.718130,0.769754,0.769754,1.110223e-16,0,0.000005,8.380848e-07,1.000000,12.931353,0.589798,0.826069,82,79,77,83,3,9,0.333333,6,0,4,0.000000,4,41,39,6,6,5,3,6,6,23,23,9,9,2,0,2,0,0,0,0,11,11,1.000000,0,10,0.909091,10.67,0.970000,60,6,540,10.000000,2,4,0.500000,2,19,23,0.826087,4,0.000000,0,2.000000,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,...,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,100,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,не указана,не указана,,,,,,,нет,нет,0,"[0.1316294, -0.050370954, 0.76977086, -0.76957...","[0.1316294, -0.050370954, 0.76977086, -0.76957...","[-0.6374897, 0.68722296, 0.58021176, -0.699589...","[-0.6374895, 0.68722314, 0.58021176, -0.699589...","[0.026554149, -0.053082276, 0.026956635, 0.006...","[-0.009263517, -0.034818575, 0.0373524, 0.0037..."
306538,0,Кабели и переходники,Кабель,True,0,0,2,0.0,2,0,0.000000,0.000000,1.000000,0,0,0,1.000000,1.000000,1.000000,0.000000,0.000000,1.000000,1.000000,0.000000e+00,0,0.000005,8.097516e-07,1.000000,0.000000,0.000000,1.000000,100,100,100,100,19,19,1.000000,0,4,4,1.000000,0,132,132,19,19,6,6,2,2,102,102,5,5,0,0,0,0,0,0,0,17,17,1.000000,0,15,0.882353,16.37,0.962941,150,12,1050,12.500000,6,8,0.750000,2,20,24,0.833333,4,1.200000,3,0.400000,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,...,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,100,-2,-2,-2,-2,-2,100,-2,-2,100,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,100,-2,-2,-2,-2,-2,-2,-2,-2,-2,100,-2,-2,-2,-2,-2,-2,-2,-2,россия,россия,,,,,,,для сетевого оборудования,для сетевого оборудования,0,"[-0.039506897, 0.13222088, 1.6788204, -0.01456...","[-0.039506897, 0.13222088, 1.6788204, -0.01456...","[-0.54371524, 0.36101842, 0.5552317, -0.514192...","[-0.5437152, 0.36101848, 0.5552317, -0.5141926...","[0.019028962, 0.02708066, 0.0255603, -0.061653...","[0.019028962, 0.02708066, 0.0255603, -0.061653..."


In [49]:
gc.collect()

0

In [50]:
cat_features = [
    'cat3', 'cat4',
    'Страна-изготовитель_1', 'Страна-изготовитель_2', 
    'Оперативная память_1', 'Оперативная память_2',
    'Бренд процессора_1', 'Бренд процессора_2',
    'Модуль связи Bluetooth_1', 'Модуль связи Bluetooth_2',
    'Назначение_1', 'Назначение_2',
]
embedding_features = ['main_pic_1', 'main_pic_2', 'name_bert_1', 'name_bert_2', 'name_labse_1', 'name_labse_2']

In [60]:
train_pool = Pool(
    data=all_train.drop('target', axis = 1),
    label=all_train['target'].values[:, 0],
    cat_features=cat_features,
    embedding_features=embedding_features
)

val_pool = Pool(
    data=all_val.drop('target', axis = 1),
    label=all_val['target'].values[:, 0],
    cat_features=cat_features,
    embedding_features=embedding_features
)

In [61]:
params = {
    'loss_function': 'CrossEntropy',
    'eval_metric': 'PRAUC',
    'task_type': 'CPU',
    'max_depth': 9,
    'learning_rate': 0.05,
    'iterations': 5000
}

In [62]:
%%time
model_cb = CatBoostClassifier(**params, random_seed=56, cat_features=cat_features, embedding_features=embedding_features)
model_cb.fit(train_pool, eval_set=val_pool, verbose=250, plot=True, use_best_model=True, early_stopping_rounds=100)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.8417247	test: 0.8376910	best: 0.8376910 (0)	total: 804ms	remaining: 1h 6m 57s
250:	learn: 0.9038088	test: 0.8924785	best: 0.8924785 (250)	total: 2m 41s	remaining: 50m 49s
500:	learn: 0.9193248	test: 0.8985758	best: 0.8985758 (500)	total: 5m 20s	remaining: 47m 59s
750:	learn: 0.9291598	test: 0.9010977	best: 0.9010977 (750)	total: 7m 57s	remaining: 45m 4s
1000:	learn: 0.9364967	test: 0.9023008	best: 0.9023008 (1000)	total: 10m 31s	remaining: 42m 2s
1250:	learn: 0.9424123	test: 0.9029177	best: 0.9029183 (1249)	total: 13m 5s	remaining: 39m 15s
1500:	learn: 0.9475573	test: 0.9033260	best: 0.9033378 (1496)	total: 15m 41s	remaining: 36m 35s
1750:	learn: 0.9520252	test: 0.9036620	best: 0.9036620 (1750)	total: 18m 20s	remaining: 34m 1s
2000:	learn: 0.9556677	test: 0.9038658	best: 0.9038748 (1971)	total: 20m 55s	remaining: 31m 21s
2250:	learn: 0.9589368	test: 0.9040807	best: 0.9041006 (2232)	total: 23m 31s	remaining: 28m 43s
Stopped by overfitting detector  (100 iterations wait)

bes

<catboost.core.CatBoostClassifier at 0x1bbfaa82470>

In [63]:
np.max(model_cb.get_evals_result()['validation']['PRAUC']) # 0.9041600573676182

0.9041600573676182

In [64]:
model_cb.get_feature_importance(prettified=True).head(10)

Unnamed: 0,Feature Id,Importances
0,name_token_set_dist,5.724578
1,cat3,4.877425
2,not_same_nums,4.024933
3,not_same_keys,2.886084
4,pics_max_dist,2.851479
5,name_dist,2.593401
6,cat4,2.250511
7,num_attrs_smape_sum,2.229732
8,iou_nums,2.218133
9,iou_keys,2.084411


In [65]:
model_cb.save_model('./datasets/model_cb.cbm')

# Inference

In [None]:
test_df_categories = make_categories_features(test_pairs, test_data)
test_df_color = make_colors_features(test_pairs, test_data)
test_df_pictures = make_pictures_features(test_pairs, test_data)
test_df_names = make_names_features(test_pairs, test_data)
test_df_attributes = make_attributes_features(test_pairs, test_data)
test_df_embeddings = make_embeddings_features(test_pairs, test_data)

test_df = pd.concat([
    test_df_categories,
    test_df_color,
    test_df_pictures,
    test_df_names,
    test_df_attributes,
    test_df_embeddings
], axis=1)

In [67]:
test_pool = Pool(
    data=test_df,
    cat_features=cat_features,
    embedding_features=embedding_features
)

In [None]:
rtest_df_categories = make_categories_features(rtest_pairs, test_data)
rtest_df_color = make_colors_features(rtest_pairs, test_data)
rtest_df_pictures = make_pictures_features(rtest_pairs, test_data)
rtest_df_names = make_names_features(rtest_pairs, test_data)
rtest_df_attributes = make_attributes_features(rtest_pairs, test_data)
rtest_df_embeddings = make_embeddings_features(rtest_pairs, test_data)

rtest_df = pd.concat([
    rtest_df_categories,
    rtest_df_color,
    rtest_df_pictures,
    rtest_df_names,
    rtest_df_attributes,
    rtest_df_embeddings
], axis=1)

In [69]:
rtest_pool = Pool(
    data=rtest_df,
    cat_features=cat_features,
    embedding_features=embedding_features
)

In [74]:
preds = model_cb.predict_proba(test_pool)[:,1]
preds

array([0.07441111, 0.09179865, 0.28064555, ..., 0.6092132 , 0.86382119,
       0.41339669])

In [75]:
rpreds = model_cb.predict_proba(rtest_pool)[:,1]
rpreds

array([0.07134127, 0.10770924, 0.29795185, ..., 0.65539731, 0.8534858 ,
       0.41223574])

In [76]:
final_preds = (preds + rpreds) / 2.
final_preds

array([0.07287619, 0.09975395, 0.2892987 , ..., 0.63230526, 0.8586535 ,
       0.41281621])

In [77]:
test_pairs['target'] = final_preds
test_pairs.to_csv('./golden_submit.csv', index=False)
test_pairs.drop('target', axis=1, inplace=True)