In [1]:
import json
import pandas as pd

In [359]:
pd.set_option('display.max_columns', 512)
pd.set_option('display.max_rows', 128)

In [2]:
train_pairs = pd.read_parquet('./datasets/train_pairs_w_target.parquet')
train_pairs['target'] = train_pairs['target'].astype(int)
train_pairs

Unnamed: 0,target,variantid1,variantid2
0,0,51197862,51198054
1,1,53062686,536165289
2,1,53602615,587809782
3,1,53888651,89598677
4,0,56930698,551526166
...,...,...,...
306535,0,817327230,822083612
306536,0,817560551,818069912
306537,0,817854719,817857267
306538,0,820036017,820037019


In [3]:
test_pairs = pd.read_parquet('./datasets/test_pairs_wo_target.parquet')
test_pairs

Unnamed: 0,variantid1,variantid2
0,52076340,290590137
1,64525522,204128919
2,77243372,479860557
3,86065820,540678372
4,91566575,258840506
...,...,...
18079,666998614,667074522
18080,670036240,670048449
18081,670284509,684323809
18082,692172005,704805270


In [4]:
val_pairs = train_pairs[pd.read_csv('./datasets/val_idx.csv', index_col=0).values].copy()
hard_pairs = train_pairs[pd.read_csv('./datasets/hard_idx.csv', index_col=0).values].copy()
train_pairs = train_pairs[pd.read_csv('./datasets/train_idx.csv', index_col=0).values].copy()
len(train_pairs), len(val_pairs), len(hard_pairs)

(204376, 102164, 10388)

In [5]:
train_data = pd.read_parquet('./datasets/train_data.parquet').set_index('variantid')
train_data['characteristic_attributes_mapping'] = train_data['characteristic_attributes_mapping'].fillna('{}').apply(lambda x: json.loads(x))
train_data['categories'] = train_data['categories'].apply(lambda x: json.loads(x))
train_data['main_pic_embeddings_resnet_v1'] = train_data['main_pic_embeddings_resnet_v1'].apply(lambda x: x[0])
train_data = pd.concat([train_data, pd.read_parquet('./datasets/name_labse_embs_train.parquet').set_index('variantid')], axis=1)
train_data

Unnamed: 0_level_0,name,categories,color_parsed,pic_embeddings_resnet_v1,main_pic_embeddings_resnet_v1,name_bert_64,characteristic_attributes_mapping,name_labse_768
variantid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
51195767,"Удлинитель Партнер-Электро ПВС 2х0,75 ГОСТ,6A,...","{'1': 'EPG', '2': 'Электроника', '3': 'Сетевые...",[оранжевый],,"[0.04603629, 0.18839523, -0.09973055, -0.66368...","[-0.47045058, 0.67237014, 0.48984158, -0.54485...","{'Номинальный ток, А': ['10'], 'Цвет товара': ...","[-0.033874325, 0.03722446, 0.0029757991, 0.068..."
53565809,Магнитный кабель USB 2.0 A (m) - USB Type-C (m...,"{'1': 'EPG', '2': 'Электроника', '3': 'Кабели ...",[красный],"[[0.26863545, -0.3130674, 0.29023397, 0.073978...","[1.1471839, -0.665361, 0.7745614, 0.26716197, ...","[-0.6575592, 0.6522429, 0.5426037, -0.54347897...",{'Конструктивные особенности': ['Магнитная кон...,"[0.015568526, -0.03899538, 0.064447366, 0.0383..."
56763357,"Набор микропрепаратов Konus 25: ""Клетки и ткан...","{'1': 'EPG', '2': 'Электроника', '3': 'Оптичес...",,"[[0.66954195, 1.0643557, 0.78324044, -0.338267...","[-0.90570974, 1.0296293, 1.0769907, 0.27746, -...","[-0.7384308, 0.70784587, 0.3012653, -0.3583719...","{'Тип аксессуара': ['Набор микропрепаратов'], ...","[-0.033072222, -0.04237577, 0.020771954, 0.065..."
56961772,"Мобильный телефон BQ 1848 Step, черный","{'1': 'EPG', '2': 'Электроника', '3': 'Смартфо...",[черный],"[[0.6580482, -0.35763323, -0.16939065, -0.4249...","[0.13133773, -0.5577079, 0.32498044, 0.1917174...","[-0.44812852, 0.5283565, 0.28981736, -0.506841...","{'Тип карты памяти': ['microSD'], 'Число SIM-к...","[0.014727573, -0.025661988, 0.023943473, -0.00..."
61054740,"Штатив трипод Tripod 330A для фотоаппаратов, в...","{'1': 'EPG', '2': 'Электроника', '3': 'Штативы...",[черный],"[[-0.10406649, 0.080646515, -0.28668788, 0.739...","[0.21696381, 0.10989461, -0.08012986, 0.691861...","[-0.72692573, 0.75206333, 0.37740713, -0.52502...","{'Материал': ['Металл'], 'Количество секций, ш...","[0.043145332, -0.052424084, 0.017260496, 0.045..."
...,...,...,...,...,...,...,...,...
820128810,"Комплект 2 шт, Чернила Cactus CS-EPT6733B пурп...","{'1': 'EPG', '2': 'Электроника', '3': 'Расходн...",[пурпурный],,"[-1.4492652, -0.80129164, -0.12344764, 0.71945...","[-0.8253241, 0.6785133, 0.53978086, -0.4888316...","{'Тип': ['Чернила для принтера'], 'Бренд печат...","[-0.003678058, -0.031628493, 0.0065589263, 0.0..."
821135769,"Защитное стекло закаленное Xiaomi Redmi 7, Y3 ...","{'1': 'EPG', '2': 'Электроника', '3': 'Защитны...",[черный],"[[0.09564891, 0.27437285, -0.19054827, -0.7992...","[0.012127608, -0.8534423, 0.5415518, -0.449125...","[-0.7413257, 0.46105132, 0.5639801, -0.5462132...","{'Вид стекла': ['3D'], 'Тип': ['Защитное стекл...","[-0.06858361, 0.027011767, -0.016400583, -0.02..."
822095690,Системный блок ЮКОМС 9400-268 (AMD A6-9400 (3....,"{'1': 'EPG', '2': 'Электроника', '3': 'Компьют...",[черный],,"[0.4248176, -0.15944786, -0.22844064, 0.427686...","[-0.49261805, 0.56726897, 0.7037877, -0.697246...","{'Общий объем HDD, ГБ': ['10000'], 'Видеокарта...","[-0.04474233, -0.034224413, 0.026076552, 0.026..."
822101044,Системный блок ЮКОМС 9400-9 (AMD A6-9400 (3.4 ...,"{'1': 'EPG', '2': 'Электроника', '3': 'Компьют...",[черный],,"[0.4248176, -0.15944786, -0.22844064, 0.427686...","[-0.44051006, 0.54029673, 0.63768685, -0.68040...","{'Общий объем HDD, ГБ': ['8000'], 'Видеокарта'...","[-0.05541598, 0.000863006, 0.01093415, 0.02208..."


In [6]:
test_data = pd.read_parquet('./datasets/test_data.parquet').set_index('variantid')
test_data['characteristic_attributes_mapping'] = test_data['characteristic_attributes_mapping'].fillna('{}').apply(lambda x: json.loads(x))
test_data['categories'] = test_data['categories'].apply(lambda x: json.loads(x))
test_data['main_pic_embeddings_resnet_v1'] = test_data['main_pic_embeddings_resnet_v1'].apply(lambda x: x[0])
test_data = pd.concat([test_data, pd.read_parquet('./datasets/name_labse_embs_test.parquet').set_index('variantid')], axis=1)
test_data

Unnamed: 0_level_0,name,categories,color_parsed,pic_embeddings_resnet_v1,main_pic_embeddings_resnet_v1,name_bert_64,characteristic_attributes_mapping,name_labse_768
variantid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
51201254,Колодка TDM Electric четырехместная без заземл...,"{'1': 'EPG', '2': 'Электроника', '3': 'Сетевые...",[белый],"[[0.34383398, -0.2962618, 0.07987049, -0.08257...","[0.38310742, -0.7876679, 0.5018278, 0.20900711...","[-0.5060825, 0.5773388, 0.59435517, -0.4958292...","{'Страна-изготовитель': ['Китай'], 'Бренд': ['...","[-0.0058242553, 0.0010011946, 0.015051351, 0.0..."
77151532,Клавиатура черная с черной рамкой для 25-011879,"{'1': 'EPG', '2': 'Электроника', '3': 'Запчаст...",[черный],,"[0.50964713, 0.7958329, -1.4113188, 0.19993813...","[-0.43467724, 0.6614495, 0.48050267, -0.588880...","{'Страна-изготовитель': ['Китай'], 'Комплектац...","[0.0088402, -0.0050699823, 0.026550002, -0.015..."
89664856,"15.6"" Игровой ноутбук Acer Predator Helios 300...","{'1': 'EPG', '2': 'Электроника', '3': 'Компьют...",[черный],"[[0.7804302, -0.245446, -0.67754817, -0.614691...","[0.9958085, -0.113175124, -0.7623152, -0.91648...","[-0.70010763, 0.48152006, 0.47597092, -0.51727...",{'Видеокарта': ['NVIDIA GeForce RTX 2070 (8 Гб...,"[-0.026623247, -0.018851712, 0.011397564, -0.0..."
90701982,Портативная колонка Borofone BR7 Empyreal Spor...,"{'1': 'EPG', '2': 'Электроника', '3': 'Акустик...","[red, красный]","[[-0.24636984, -1.0719914, -0.49986655, 0.3423...","[-0.26596686, -1.143009, -0.5289628, 0.4285588...","[-0.73135185, -0.039796613, 0.38907066, -0.496...","{'Основной материал корпуса': ['Металл'], 'Мак...","[0.016539363, 0.03778109, 0.025718935, 0.08805..."
92484118,Аккумулятор для Meizu BA712 ( M6s ),"{'1': 'EPG', '2': 'Электроника', '3': 'Батарей...",,,"[0.42047608, 0.75828516, 0.5440093, -0.0068945...","[-0.600158, 0.13944691, 0.48706242, -0.5050975...","{'Рекомендовано для': ['Meizu'], 'Бренд': ['Me...","[-0.0024493372, 0.02346121, 0.068452105, 0.023..."
...,...,...,...,...,...,...,...,...
702785891,Кабель USB - Lightning HOCO X21 PLUS (черно-бе...,"{'1': 'EPG', '2': 'Электроника', '3': 'Кабели ...",[черный],"[[1.1820095, -0.16312826, 1.4916217, 0.0288323...","[0.3297959, -0.16444838, 0.9350716, 0.34787956...","[-0.66597974, 0.7140731, 0.43572947, -0.445908...","{'Бренд': ['hoco'], 'Тип': ['Кабель'], 'Цвет т...","[-0.031527344, -0.06875799, 0.03187686, -0.004..."
704096517,Блок питания для ноутбука Asus f5gl (19V 90W 4...,"{'1': 'EPG', '2': 'Электроника', '3': 'Зарядны...",[черный],"[[-0.013610864, -0.68512607, 0.77639246, -1.04...","[0.2785852, -0.16053033, 1.1653559, 1.0619084,...","[-0.7575411, 0.4196694, 0.46428213, -0.4916808...","{'Комплектация': ['Зарядное устройство, сетево...","[-0.023706086, -0.012301952, -0.014316322, -0...."
705874953,Оперативная память HyperX FURY Black DDR4 2666...,"{'1': 'EPG', '2': 'Электроника', '3': 'Операти...",[black],"[[0.34073856, 0.65070343, 0.31146732, 1.261663...","[0.31382418, 0.60041714, 0.3067428, 1.1233345,...","[-0.60506856, 0.4477128, 0.62255704, -0.720129...","{'Тайминги': ['16-18-18-29'], 'Пропускная спос...","[-0.028754005, -0.025122717, 0.048854, -0.0297..."
706965102,8 ТБ Внутренний жесткий диск Toshiba TOSHIBA N...,"{'1': 'EPG', '2': 'Электроника', '3': 'Жесткие...",,"[[-0.9360045, -0.43083164, -1.1651772, 1.23836...","[0.404035, -0.20071658, -0.44533533, 0.2038879...","[-0.62029105, 0.45747545, 0.6659858, -0.671704...","{'Комплектация': ['HDWG480UZSVA'], 'Форм-факто...","[-0.026827315, 0.032079216, 0.040149417, -0.01..."


In [239]:
train_cat3 = set()
for categories in tqdm(train_data.categories):
    train_cat3.add(categories['3'])
    
test_cat3 = set()
for categories in tqdm(test_data.categories):
    test_cat3.add(categories['3'])
    
both_cat3 = train_cat3 & test_cat3
len(train_cat3), len(test_cat3), len(both_cat3)

  0%|          | 0/457063 [00:00<?, ?it/s]

  0%|          | 0/35730 [00:00<?, ?it/s]

(127, 74, 74)

In [268]:
train_cat4 = set()
for categories in tqdm(train_data.categories):
    train_cat4.add(categories['4'])
    
test_cat4 = set()
for categories in tqdm(test_data.categories):
    test_cat4.add(categories['4'])
    
both_cat4 = train_cat4 & test_cat4
len(train_cat4), len(test_cat4), len(both_cat4)

  0%|          | 0/457063 [00:00<?, ?it/s]

  0%|          | 0/35730 [00:00<?, ?it/s]

(357, 236, 234)

In [101]:
train_attrs = set()
for attrs in tqdm(train_data.characteristic_attributes_mapping):
    train_attrs |= attrs.keys()
    
test_attrs = set()
for attrs in tqdm(test_data.characteristic_attributes_mapping):
    test_attrs |= attrs.keys()
    
len(train_attrs), len(test_attrs), len(train_attrs & test_attrs)

  0%|          | 0/457063 [00:00<?, ?it/s]

  0%|          | 0/35730 [00:00<?, ?it/s]

(1447, 1238, 1224)

In [285]:
colors_mapper = {
 'ярко-синий': 'ярко-синий',
 'ярко-розовый': 'ярко-розовый',
 'ярко-зеленый': 'ярко-зеленый',
 'ярко-желтый': 'ярко-желтый',
 'янтарный': 'янтарный',
 'электрик': 'электрик',
 'экрю': 'экрю',
 'шоколадный': 'шоколадный',
 'черный': 'черный',
 'черно-синий': 'черно-синий',
 'черно-серый': 'черно-серый',
 'черно-красный': 'черно-красный',
 'черно-зеленый': 'черно-зеленый',
 'черн': 'черный',
 'чер': 'черный',
 'циан': 'бирюзовый',
 'цементный': 'цементный',
 'хаки': 'хаки',
 'фуксия': 'фуксия',
 'фисташковый': 'фисташковый',
 'фиолетовый': 'фиолетовый',
 'фиолетово-синий': 'фиолетово-синий',
 'фиолет': 'фиолетовый',
 'фиол': 'фиолетовый',
 'фиалковый': 'фиалковый',
 'тыквенный': 'тыквенный',
 'тыква': 'тыквенный',
 'травяной': 'травяной',
 'томатный': 'томатный',
 'тиффани': 'тиффани',
 'терракотовый': 'терракотовый',
 'терракота': 'терракотовый',
 'темно-фиолетовый': 'темно-фиолетовый',
 'темно-синий': 'темно-синий',
 'темно-серый': 'темно-серый',
 'темно-розовый': 'темно-розовый',
 'темно-оранжевый': 'темно-оранжевый',
 'темно-оливковый': 'темно-оливковый',
 'темно-красный': 'темно-красный',
 'темно-коричневый': 'темно-коричневый',
 'темно-зеленый': 'темно-зеленый',
 'темно-голубой': 'темно-голубой',
 'темно-бирюзовый': 'темно-бирюзовый',
 'темно-бежевый': 'темно-бежевый',
 'сливовый': 'сливовый',
 'сиреневый': 'сиреневый',
 'синий': 'синий',
 'сине-зеленый': 'сине-зеленый',
 'син': 'синий',
 'серый': 'серый',
 'серовато-зеленый': 'серовато-зеленый',
 'серо-коричневый': 'серо-коричневый',
 'серо-зеленый': 'серо-зеленый',
 'серо-голубой': 'серо-голубой',
 'серо-бежевый': 'серо-бежевый',
 'серебряный': 'серебряный',
 'серебристый': 'серебристый',
 'серебристо-серый': 'серебристо-серый',
 'сер': 'серый',
 'сепия': 'сепия',
 'светло-фиолетовый': 'светло-фиолетовый',
 'светло-синий': 'светло-синий',
 'светло-серый': 'светло-серый',
 'светло-розовый': 'светло-розовый',
 'светло-пурпурный': 'светло-пурпурный',
 'светло-коричневый': 'светло-коричневый',
 'светло-золотистый': 'светло-золотистый',
 'светло-зеленый': 'светло-зеленый',
 'светло-желтый': 'светло-желтый',
 'светло-голубой': 'светло-голубой',
 'светло-бирюзовый': 'светло-бирюзовый',
 'светло-бежевый': 'светло-бежевый',
 'сапфировый': 'сапфировый',
 'салатовый': 'салатовый',
 'рыжий': 'рыжий',
 'розовый': 'розовый',
 'розово-фиолетовый': 'розово-фиолетовый',
 'розово-золотой': 'розово-золотой',
 'разноцветный': 'разноцветный',
 'пурпурный': 'пурпурный',
 'пурпурно-фиолетовый': 'пурпурно-фиолетовый',
 'песочный': 'песочный',
 'перу': 'перу',
 'персиковый': 'персиковый',
 'охра': 'охра',
 'орхидея': 'орхидея',
 'оранжевый': 'оранжевый',
 'оранжево-розовый': 'оранжево-розовый',
 'оливковый': 'оливковый',
 'огненно-красный': 'огненно-красный',
 'нефритовый': 'нефритовый',
 'небесный': 'небесный',
 'мятный': 'мятный',
 'мятно-зеленый': 'мятно-зеленый',
 'мята': 'мятный',
 'мультиколор': 'мультиколор',
 'морковный': 'морковный',
 'молочный': 'молочный',
 'многоцветный': 'многоцветный',
 'медный': 'медный',
 'марсала': 'марсала',
 'малиновый': 'малиновый',
 'малиново-красный': 'малиново-красный',
 'малахитовый': 'малахитовый',
 'льняной': 'льняной',
 'лимонный': 'лимонный',
 'лиловый': 'лиловый',
 'латунный': 'латунный',
 'лаймовый': 'лаймовый',
 'лайм': 'лаймовый',
 'лазурный': 'лазурный',
 'лавандовый': 'лавандовый',
 'лаванда': 'лавандовый',
 'кремовый': 'кремовый',
 'красный': 'красный',
 'красновато-коричневый': 'красновато-коричневый',
 'красно-оранжевый': 'красно-оранжевый',
 'красно-коричневый': 'красно-коричневый',
 'красн': 'красный',
 'крас': 'красный',
 'кофейный': 'кофейный',
 'космос': 'космос',
 'коричневый': 'коричневый',
 'коричнево-красный': 'коричнево-красный',
 'коричнево-бежевый': 'коричнево-бежевый',
 'коралловый': 'коралловый',
 'кораллово-красный': 'кораллово-красный',
 'кобальтовый': 'кобальтовый',
 'кирпичный': 'кирпичный',
 'кирпично-красный': 'кирпично-красный',
 'кварцевый': 'кварцевый',
 'кардинал': 'кардинал',
 'канареечный': 'канареечный',
 'камуфляжный': 'камуфляжный',
 'индиго': 'индиго',
 'изумрудный': 'изумрудный',
 'изумрудно-зеленый': 'изумрудно-зеленый',
 'изумруд': 'изумрудный',
 'золотой': 'золотой',
 'золотистый': 'золотистый',
 'зеленый': 'зеленый',
 'зелено-серый': 'зелено-серый',
 'зел': 'зеленый',
 'жемчужно-белый': 'жемчужно-белый',
 'желтый': 'желтый',
 'желто-розовый': 'желто-розовый',
 'желто-зеленый': 'желто-зеленый',
 'желт': 'желтый',
 'гусеница': 'гусеница',
 'грушевый': 'грушевый',
 'графит': 'графит',
 'гранитный': 'гранитный',
 'гранатовый': 'гранатовый',
 'горчичный': 'горчичный',
 'голубой': 'голубой',
 'голуб': 'голубой',
 'глициния': 'глициния',
 'вишня': 'вишневый',
 'вишневый': 'вишневый',
 'васильковый': 'васильковый',
 'ванильный': 'ванильный',
 'бурый': 'бурый',
 'бронзовый': 'бронзовый',
 'бордовый': 'бордовый',
 'бордо': 'бордовый',
 'болотный': 'болотный',
 'бледно-розовый': 'бледно-розовый',
 'бледно-пурпурный': 'бледно-пурпурный',
 'бледно-желтый': 'бледно-желтый',
 'бирюзовый': 'бирюзовый',
 'бирюзово-зеленый': 'бирюзово-зеленый',
 'белый': 'белый',
 'белоснежный': 'белоснежный',
 'бело-зеленый': 'бело-зеленый',
 'бел': 'белый',
 'бежевый': 'бежевый',
 'бежево-серый': 'бежево-серый',
 'бежево-розовый': 'бежево-розовый',
 'баклажановый': 'баклажановый',
 'антрацитовый': 'антрацитовый',
 'аметистовый': 'аметистовый',
 'алый': 'алый',
 'аквамариновый': 'аквамариновый',
 'аква': 'аква',
 'абрикосовый': 'абрикосовый',
 'yellow': 'желтый',
 'wine': 'wine',
 'white': 'белый',
 'violet': 'фиолетовый',
 'vanilla': 'ванильный',
 'ultramarine': 'ultramarine',
 'turquoise': 'бирюзовый',
 'tomato': 'томатный',
 'teal': 'teal',
 'tan': 'tan',
 'snow': 'snow',
 'silver': 'серебряный',
 'sapphire': 'сапфировый',
 'red': 'красный',
 'purple': 'фиолетовый',
 'pink': 'розовый',
 'peru': 'перу',
 'pear': 'грушевый',
 'peach': 'персиковый',
 'orchid': 'орхидея',
 'orange': 'оранжевый',
 'olive': 'оливковый',
 'navy': 'navy',
 'magenta': 'пурпурный',
 'linen': 'linen',
 'lime': 'лаймовый',
 'lilac': 'сиреневый',
 'lemon': 'lemon',
 'lavender': 'лавандовый',
 'khaki': 'хаки',
 'jade': 'нефритовый',
 'ivory': 'ivory',
 'indigo': 'индиго',
 'grey': 'серый',
 'green': 'зеленый',
 'gray': 'серый',
 'gold': 'золотой',
 'fuchsia': 'фуксия',
 'flax': 'flax',
 'emerald': 'emerald',
 'denim': 'denim',
 'cyan': 'бирюзовый',
 'cream': 'кремовый',
 'corn': 'corn',
 'coral': 'коралловый',
 'copper': 'медный',
 'cobalt': 'кобальтовый',
 'chocolate': 'шоколадный',
 'burgundy': 'бордовый',
 'buff': 'buff',
 'brown': 'коричневый',
 'bronze': 'бронзовый',
 'brass': 'латунный',
 'blue': 'голубой',
 'blond': 'blond',
 'black': 'черный',
 'beige': 'бежевый',
 'azure': 'лазурный',
 'aquamarine': 'аквамариновый',
 'aqua': 'аквамариновый',
 'amethyst': 'аметистовый',
 'amber': 'янтарный'
}

In [346]:
import gc
import re
from thefuzz import fuzz
import numpy as np
from tqdm.auto import tqdm

def calc_dists(df, prefix, embs_1, embs_2):
    l1_dists, l2_dists, cos_dists = [], [], []
    for emb_1, emb_2 in tqdm(zip(embs_1, embs_2), total=len(df)):
        len_1 = (emb_1**2).sum()**0.5
        len_2 = (emb_2**2).sum()**0.5
        l1_dists.append(
            np.abs(emb_1 - emb_2).sum()
        )
        l2_dists.append(
            ((emb_1 - emb_2)**2).sum()**0.5
        )
        cos_dists.append(
            (emb_1 @ emb_2) / len_1 / len_2
        )
    df[f'{prefix}_l1_dist'] = l1_dists
    df[f'{prefix}_l2_dist'] = l2_dists
    df[f'{prefix}_cos_dist'] = cos_dists

def make_df(pairs, data):
    gc.collect()
    
    df = pairs.copy()
    
    # categories
    categories_1 = data.loc[pairs.variantid1, 'categories']
    categories_2 = data.loc[pairs.variantid2, 'categories']
    df['cat3'] = categories_1.apply(lambda x: x['3']).values
    df.loc[~df.cat3.isin(both_cat3), 'cat3'] = 'rest'
    df['cat4'] = categories_1.apply(lambda x: x['4']).values
    df.loc[~df.cat4.isin(both_cat4), 'cat4'] = 'rest'
    df['is_eq_cat4'] = categories_1.apply(lambda x: x['4']).values == categories_2.apply(lambda x: x['4']).values
    
    # colors
    colors_1 = data.loc[pairs.variantid1, 'color_parsed']
    colors_2 = data.loc[pairs.variantid2, 'color_parsed']
    same_colors = []
    all_colors = []
    for color_1, color_2 in tqdm(zip(colors_1, colors_2), total=len(df)):
        if color_1 is not None:
            color_1 = list(set([colors_mapper[c] for c in color_1]))
        if color_2 is not None:
            color_2 = list(set([colors_mapper[c] for c in color_2]))
        
        if color_1 is None or color_2 is None:
            same_colors.append(0)
            if color_1 is not None:
                all_colors.append(len(color_1))
            elif color_2 is not None:
                all_colors.append(len(color_2))
            else:
                all_colors.append(0)
        else:
            same_colors.append(
                len(set(color_1) & set(color_2))
            )
            all_colors.append(
                len(set(color_1) | set(color_2))
            )
    df['same_colors'] = same_colors
    df['all_colors'] = all_colors
    df['iou_colors'] = df['same_colors'] / df['all_colors']
    df.loc[df['all_colors']==0, 'iou_colors'] = 0
    
    # pictures
    main_pics_1 = data.loc[df.variantid1, 'main_pic_embeddings_resnet_v1'].values
    main_pics_2 = data.loc[df.variantid2, 'main_pic_embeddings_resnet_v1'].values
    calc_dists(
        df, 'main_pic', 
        main_pics_1,
        main_pics_2
    )
    embs_1 = data.loc[df.variantid1, 'pic_embeddings_resnet_v1'].values
    embs_2 = data.loc[df.variantid2, 'pic_embeddings_resnet_v1'].values
    min_dists, mean_dists, max_dists, std_dists = [], [], [], []
    pic_cnts_1, pic_cnts_2 = [], []
    mean_dists_to_main_1, mean_dists_to_main_2 = [], []
    for main_pic_1, main_pic_2, emb_1, emb_2 in tqdm(zip(main_pics_1, main_pics_2, embs_1, embs_2), total=len(df)):
        dists = []
        pics_1 = [main_pic_1]
        pics_2 = [main_pic_2]
        if emb_1 is not None:
            pics_1.extend(list(emb_1))
            pic_cnts_1.append(len(emb_1))
        else:
            pic_cnts_1.append(0)
        if emb_2 is not None:
            pics_2.extend(list(emb_2))
            pic_cnts_2.append(len(emb_2))
        else:
            pic_cnts_2.append(0)
        for pic_1 in pics_1:
            for pic_2 in pics_2:
                dists.append(
                    (pic_1 @ pic_2) / (pic_1**2).sum()**0.5 / (pic_2**2).sum()**0.5
                )
        min_dists.append(np.min(dists))
        mean_dists.append(np.mean(dists))
        max_dists.append(np.max(dists))
        std_dists.append(np.std(dists))
        
        dists = []
        for pic_2 in pics_2:
            dists.append(
                (main_pic_1 @ pic_2) / (main_pic_1**2).sum()**0.5 / (pic_2**2).sum()**0.5
            )
        mean_dists_to_main_1.append(np.mean(dists))
        dists = []
        for pic_1 in pics_1:
            dists.append(
                (pic_1 @ main_pic_2) / (pic_1**2).sum()**0.5 / (main_pic_2**2).sum()**0.5
            )
        mean_dists_to_main_2.append(np.mean(dists))
    df['pic_cnt_1'] = pic_cnts_1
    df['pic_cnt_2'] = pic_cnts_2
    df['pic_cnt_diff'] = np.abs(df['pic_cnt_1'] - df['pic_cnt_2'])
    df['pics_min_dist'] = min_dists
    df['pics_mean_dist'] = mean_dists
    df['pics_max_dist'] = max_dists
    df['pics_std_dist'] = std_dists
    df['pics_diff_dist'] = df['pics_max_dist'] - df['pics_min_dist']
    df['mean_dist_to_main_1'] = mean_dists_to_main_1
    df['mean_dist_to_main_2'] = mean_dists_to_main_2
    df['mean_dist_to_main_diff'] = np.abs(df['mean_dist_to_main_1'] - df['mean_dist_to_main_2'])
    
    # names
    calc_dists(
        df, 'name_bert', 
        data.loc[df.variantid1, 'name_bert_64'],
        data.loc[df.variantid2, 'name_bert_64']
    )
    calc_dists(
        df, 'name_labse', 
        data.loc[df.variantid1, 'name_labse_768'],
        data.loc[df.variantid2, 'name_labse_768']
    )
    
    names_1 = data.loc[pairs.variantid1, 'name']
    names_2 = data.loc[pairs.variantid2, 'name']
    dist, partial_dist, token_sort_dist, token_set_dist = [], [], [], []
    for name_1, name_2 in tqdm(zip(names_1, names_2), total=len(df)):
        dist.append(
            fuzz.ratio(name_1, name_2)
        )
        partial_dist.append(
            fuzz.partial_ratio(name_1, name_2)
        )
        token_sort_dist.append(
            fuzz.token_sort_ratio(name_1, name_2)
        )
        token_set_dist.append(
            fuzz.token_set_ratio(name_1, name_2)
        )
    df['name_dist'] = dist
    df['name_partial_dist'] = partial_dist
    df['name_token_sort_dist'] = token_sort_dist
    df['name_token_set_dist'] = token_set_dist
    
    same_words = []
    all_words = []
    for name_1, name_2 in tqdm(zip(names_1, names_2), total=len(df)):
        words_1 = set(name_1.split())
        words_2 = set(name_2.split())
        same_words.append(len(words_1 & words_2))
        all_words.append(len(words_1 | words_2))
    df['same_words'] = same_words
    df['all_words'] = all_words
    df['iou_words'] = df['same_words'] / df['all_words']
    df.loc[df['all_words']==0, 'iou_words'] = 0
    
    same_nums = []
    all_nums = []
    for name_1, name_2 in tqdm(zip(names_1, names_2), total=len(df)):
        nums_1 = set(re.sub(r'\D+', ' ', name_1).split())
        nums_2 = set(re.sub(r'\D+', ' ', name_2).split())
        same_nums.append(len(nums_1 & nums_2))
        all_nums.append(len(nums_1 | nums_2))
    df['same_nums'] = same_nums
    df['all_nums'] = all_nums
    df['iou_nums'] = df['same_nums'] / df['all_nums']
    df.loc[df['all_nums']==0, 'iou_nums'] = 1
    
    df['name_len_1'] = names_1.apply(lambda x: len(x)).values
    df['name_len_2'] = names_2.apply(lambda x: len(x)).values
    df['name_words_1'] = names_1.apply(lambda x: len(x.split())).values
    df['name_words_2'] = names_2.apply(lambda x: len(x.split())).values
    df['name_digit_cnt_1'] = names_1.apply(lambda x: np.sum(['0' <= letter <= '9' for letter in x])).values
    df['name_digit_cnt_2'] = names_2.apply(lambda x: np.sum(['0' <= letter <= '9' for letter in x])).values
    df['name_eng_cnt_1'] = names_1.apply(lambda x: np.sum(['a' <= letter <= 'z' for letter in x.lower()])).values
    df['name_eng_cnt_2'] = names_2.apply(lambda x: np.sum(['a' <= letter <= 'z' for letter in x.lower()])).values
    df['name_rus_cnt_1'] = names_1.apply(lambda x: np.sum(['а' <= letter <= 'я' or letter=='ё' for letter in x.lower()])).values
    df['name_rus_cnt_2'] = names_2.apply(lambda x: np.sum(['а' <= letter <= 'я' or letter=='ё' for letter in x.lower()])).values
    df['name_upper_cnt_1'] = names_1.apply(lambda x: np.sum([letter.isupper() for letter in x])).values
    df['name_upper_cnt_2'] = names_2.apply(lambda x: np.sum([letter.isupper() for letter in x])).values
    for feature in ('len', 'words', 'digit_cnt', 'eng_cnt', 'rus_cnt', 'upper_cnt'):
        df[f'name_{feature}_diff'] = np.abs(df[f'name_{feature}_1'] - df[f'name_{feature}_2'])
        
    # attributes
    attrs_1 = data.loc[pairs.variantid1, 'characteristic_attributes_mapping']
    attrs_2 = data.loc[pairs.variantid2, 'characteristic_attributes_mapping']
    same_keys, all_keys = [], []
    same_values, same_values_dist = [], []
    for attr_1, attr_2 in tqdm(zip(attrs_1, attrs_2), total=len(df)):
        same_keys.append(len(attr_1.keys() & attr_2.keys()))
        all_keys.append(len(attr_1.keys() | attr_2.keys()))
        count, dist = 0, 0
        for key in attr_1.keys() & attr_2.keys():
            count += attr_1[key] == attr_2[key]
            dist += fuzz.ratio(attr_1[key], attr_2[key])
        same_values.append(count)
        same_values_dist.append(dist / 100.)
    df['same_keys'] = same_keys
    df['all_keys'] = all_keys
    df['iou_keys'] = df['same_keys'] / df['all_keys']
    df.loc[df['all_keys']==0, 'iou_keys'] = 0
    df['same_values'] = same_values
    df['same_values_ratio'] = df['same_values'] / df['same_keys'] 
    df.loc[df['same_keys']==0, 'same_values_ratio'] = 0
    df['same_values_dist'] = same_values_dist
    df['same_values_dist_ratio'] = df['same_values_dist'] / df['same_keys'] 
    df.loc[df['same_keys']==0, 'same_values_dist_ratio'] = 0
    
    # embeddings
    df['l1_dist_sum'] = df['main_pic_l1_dist'] + df['name_bert_l1_dist'] + df['name_labse_l1_dist']
    df['l2_dist_sum'] = (df['main_pic_l2_dist']**2 + df['name_bert_l2_dist']**2 + df['name_labse_l2_dist']**2)**0.5
    df['cos_dist_sum'] = df['main_pic_cos_dist'] + df['name_bert_cos_dist'] + df['name_labse_cos_dist']
    
    return df.drop(['variantid1', 'variantid2'], axis=1)

train_df = make_df(train_pairs, train_data)
val_df = make_df(val_pairs, train_data)

  0%|          | 0/204376 [00:00<?, ?it/s]

  0%|          | 0/204376 [00:00<?, ?it/s]

  0%|          | 0/204376 [00:00<?, ?it/s]

  0%|          | 0/204376 [00:00<?, ?it/s]

  0%|          | 0/204376 [00:00<?, ?it/s]

  0%|          | 0/204376 [00:00<?, ?it/s]

  0%|          | 0/204376 [00:00<?, ?it/s]

  0%|          | 0/204376 [00:00<?, ?it/s]

  0%|          | 0/204376 [00:00<?, ?it/s]

  0%|          | 0/102164 [00:00<?, ?it/s]

  0%|          | 0/102164 [00:00<?, ?it/s]

  0%|          | 0/102164 [00:00<?, ?it/s]

  0%|          | 0/102164 [00:00<?, ?it/s]

  0%|          | 0/102164 [00:00<?, ?it/s]

  0%|          | 0/102164 [00:00<?, ?it/s]

  0%|          | 0/102164 [00:00<?, ?it/s]

  0%|          | 0/102164 [00:00<?, ?it/s]

  0%|          | 0/102164 [00:00<?, ?it/s]

In [347]:
train_df

Unnamed: 0,target,cat3,cat4,is_eq_cat4,same_colors,all_colors,iou_colors,main_pic_l1_dist,main_pic_l2_dist,main_pic_cos_dist,pic_cnt_1,pic_cnt_2,pic_cnt_diff,pics_min_dist,pics_mean_dist,pics_max_dist,pics_std_dist,pics_diff_dist,mean_dist_to_main_1,mean_dist_to_main_2,mean_dist_to_main_diff,name_bert_l1_dist,name_bert_l2_dist,name_bert_cos_dist,name_labse_l1_dist,name_labse_l2_dist,name_labse_cos_dist,name_dist,name_partial_dist,name_token_sort_dist,name_token_set_dist,same_words,all_words,iou_words,same_nums,all_nums,iou_nums,name_len_1,name_len_2,name_words_1,name_words_2,name_digit_cnt_1,name_digit_cnt_2,name_eng_cnt_1,name_eng_cnt_2,name_rus_cnt_1,name_rus_cnt_2,name_upper_cnt_1,name_upper_cnt_2,name_len_diff,name_words_diff,name_digit_cnt_diff,name_eng_cnt_diff,name_rus_cnt_diff,name_upper_cnt_diff,same_keys,all_keys,iou_keys,same_values,same_values_ratio,same_values_dist,same_values_dist_ratio,l1_dist_sum,l2_dist_sum,cos_dist_sum
0,0,"Сетевые фильтры, разветвители и удлинители","Сетевой фильтр, удлинитель, разветвитель",True,1,1,1.0,2.361205,0.259265,0.999613,0,0,0,0.999613,0.999613,0.999613,0.000000,0.000000,0.999613,0.999613,0.000000,2.036843,3.384575e-01,0.996394,6.450904,0.289437,0.958113,96,96,96,96,6,10,0.600000,3,6,0.500000,52,54,8,8,11,12,13,13,18,18,11,11,2,0,1,0,0,0,18,18,1.000000,17,0.944444,17.83,0.990556,10.848953,5.153114e-01,2.954120
2,1,Расходник для печати,Картридж,True,0,2,0.0,9.318584,1.008816,0.988570,0,0,0,0.988570,0.988570,0.988570,0.000000,0.000000,0.988570,0.988570,0.000000,3.115397,4.912246e-01,0.992545,6.322046,0.277990,0.961361,89,85,89,89,6,10,0.600000,4,6,0.666667,71,66,8,8,18,18,10,10,33,27,8,8,5,0,0,0,6,0,2,13,0.153846,1,0.500000,1.35,0.675000,18.756027,1.155980e+00,2.942476
3,1,Смарт-часы,Умные часы,True,1,2,0.5,14.735336,1.650810,0.972319,10,4,6,0.588097,0.818230,0.985439,0.094691,0.397342,0.907136,0.876833,0.030303,1.128109,1.721761e-01,0.999034,9.309541,0.424357,0.909961,84,100,85,100,4,7,0.571429,1,1,1.000000,23,32,5,6,1,1,1,1,17,24,2,2,9,1,0,0,7,0,19,31,0.612903,12,0.631579,16.59,0.873158,25.172985,1.713154e+00,2.881314
4,0,Батарейки и аккумуляторы,Аккумулятор для телефона,True,1,1,1.0,7.489506,0.834596,0.989531,0,0,0,0.989531,0.989531,0.989531,0.000000,0.000000,0.989531,0.989531,0.000000,1.829911,2.948854e-01,0.997475,4.707582,0.216588,0.976545,87,91,90,90,10,16,0.625000,2,2,1.000000,68,74,14,14,4,4,24,24,23,29,8,9,6,0,0,0,6,1,6,10,0.600000,2,0.333333,4.75,0.791667,14.026999,9.112724e-01,2.963551
5,0,SIM-карты,Тариф для телефона,True,0,0,0.0,0.806894,0.089664,0.999626,5,5,0,0.118992,0.533862,1.000000,0.282415,0.881008,0.727376,0.726684,0.000692,1.488625,2.440297e-01,0.998104,7.358721,0.336890,0.943253,93,93,87,94,18,22,0.818182,4,4,1.000000,130,129,20,20,7,7,7,7,89,88,8,8,1,0,0,0,1,0,11,11,1.000000,8,0.727273,10.49,0.953636,9.654239,4.255405e-01,2.940983
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
306534,0,Карты памяти и флешки,Карта памяти,True,0,0,0.0,0.070692,0.007775,0.999997,7,7,0,0.034431,0.475521,1.000000,0.272442,0.965569,0.688161,0.688163,0.000002,0.000004,6.134073e-07,1.000000,17.236347,0.789134,0.688634,86,82,74,86,2,6,0.333333,0,2,0.000000,18,17,4,4,2,1,0,0,13,13,3,3,1,0,1,0,0,0,8,8,1.000000,7,0.875000,7.71,0.963750,17.307043,7.891720e-01,2.688631
306535,0,"Смартфоны, планшеты, мобильные телефоны",Смартфон,True,1,1,1.0,3.544394,0.378028,0.995789,11,11,0,0.048930,0.506668,1.000000,0.266272,0.951070,0.710206,0.706516,0.003690,1.684155,2.589245e-01,0.998024,6.900630,0.315783,0.950141,91,90,88,91,5,7,0.714286,1,5,0.200000,39,40,6,6,6,7,7,7,17,17,5,5,1,0,1,0,0,0,56,56,1.000000,54,0.964286,55.62,0.993214,12.129179,5.564767e-01,2.943953
306536,0,Карты памяти и флешки,USB Флеш-накопитель,True,1,1,1.0,4.310287,0.475840,0.997327,6,6,0,0.281870,0.699386,1.000000,0.197437,0.718130,0.833658,0.829189,0.004469,0.000005,8.380848e-07,1.000000,12.931353,0.589798,0.826069,82,79,77,83,3,9,0.333333,0,4,0.000000,39,41,6,6,3,5,6,6,23,23,9,9,2,0,2,0,0,0,11,11,1.000000,10,0.909091,10.67,0.970000,17.241646,7.578162e-01,2.823396
306538,0,Кабели и переходники,Кабель,True,0,2,0.0,0.000000,0.000000,1.000000,0,0,0,1.000000,1.000000,1.000000,0.000000,0.000000,1.000000,1.000000,0.000000,0.000005,8.097516e-07,1.000000,0.000000,0.000000,1.000000,100,100,100,100,19,19,1.000000,4,4,1.000000,132,132,19,19,6,6,2,2,102,102,5,5,0,0,0,0,0,0,17,17,1.000000,15,0.882353,16.37,0.962941,0.000005,8.097516e-07,3.000000


In [361]:
cat_features = ['cat3', 'cat4']

In [362]:
from catboost import CatBoostClassifier, Pool

In [363]:
train_pool = Pool(
    data=train_df.drop('target', axis = 1),
    label=train_df['target'],
    cat_features=cat_features
)

val_pool = Pool(
    data=val_df.drop('target', axis = 1),
    label=val_df['target'],
    cat_features=cat_features
)

In [364]:
params = {
    'loss_function': 'CrossEntropy',
    'eval_metric': 'PRAUC',
    'task_type': 'CPU',
    'max_depth': 9,
}

In [365]:
model_cb = CatBoostClassifier(**params, random_seed=56)
model_cb.fit(train_pool, eval_set=val_pool, verbose=False, plot=True, use_best_model=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<catboost.core.CatBoostClassifier at 0x152f8a4b9d0>

In [366]:
np.max(model_cb.get_evals_result()['validation']['PRAUC'])

0.8945271318261767

In [367]:
model_cb.get_feature_importance(prettified=True)

Unnamed: 0,Feature Id,Importances
0,cat3,10.008294
1,name_token_set_dist,8.627062
2,iou_nums,7.628546
3,iou_keys,5.719697
4,cat4,5.354227
5,name_dist,3.134371
6,pics_max_dist,2.892107
7,name_digit_cnt_diff,2.677014
8,all_keys,2.475117
9,same_values_ratio,2.389928


In [355]:
hard_df = make_df(hard_pairs, train_data)

  0%|          | 0/10388 [00:00<?, ?it/s]

  0%|          | 0/10388 [00:00<?, ?it/s]

  0%|          | 0/10388 [00:00<?, ?it/s]

  0%|          | 0/10388 [00:00<?, ?it/s]

  0%|          | 0/10388 [00:00<?, ?it/s]

  0%|          | 0/10388 [00:00<?, ?it/s]

  0%|          | 0/10388 [00:00<?, ?it/s]

  0%|          | 0/10388 [00:00<?, ?it/s]

  0%|          | 0/10388 [00:00<?, ?it/s]

In [356]:
hard_pool = Pool(
    data=hard_df.drop('target', axis = 1),
    label=hard_df['target'],
    cat_features=cat_features
)

In [357]:
model_cb = CatBoostClassifier(**params, random_seed=56)
model_cb.fit(train_pool, eval_set=hard_pool, verbose=False, plot=True, use_best_model=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<catboost.core.CatBoostClassifier at 0x152f8a4a9e0>

In [358]:
np.max(model_cb.get_evals_result()['validation']['PRAUC'])

0.8874122673146232

In [368]:
test_df = make_df(test_pairs, test_data)

  0%|          | 0/18084 [00:00<?, ?it/s]

  0%|          | 0/18084 [00:00<?, ?it/s]

  0%|          | 0/18084 [00:00<?, ?it/s]

  0%|          | 0/18084 [00:00<?, ?it/s]

  0%|          | 0/18084 [00:00<?, ?it/s]

  0%|          | 0/18084 [00:00<?, ?it/s]

  0%|          | 0/18084 [00:00<?, ?it/s]

  0%|          | 0/18084 [00:00<?, ?it/s]

  0%|          | 0/18084 [00:00<?, ?it/s]

In [369]:
test_df

Unnamed: 0,cat3,cat4,is_eq_cat4,same_colors,all_colors,iou_colors,main_pic_l1_dist,main_pic_l2_dist,main_pic_cos_dist,pic_cnt_1,pic_cnt_2,pic_cnt_diff,pics_min_dist,pics_mean_dist,pics_max_dist,pics_std_dist,pics_diff_dist,mean_dist_to_main_1,mean_dist_to_main_2,mean_dist_to_main_diff,name_bert_l1_dist,name_bert_l2_dist,name_bert_cos_dist,name_labse_l1_dist,name_labse_l2_dist,name_labse_cos_dist,name_dist,name_partial_dist,name_token_sort_dist,name_token_set_dist,same_words,all_words,iou_words,same_nums,all_nums,iou_nums,name_len_1,name_len_2,name_words_1,name_words_2,name_digit_cnt_1,name_digit_cnt_2,name_eng_cnt_1,name_eng_cnt_2,name_rus_cnt_1,name_rus_cnt_2,name_upper_cnt_1,name_upper_cnt_2,name_len_diff,name_words_diff,name_digit_cnt_diff,name_eng_cnt_diff,name_rus_cnt_diff,name_upper_cnt_diff,same_keys,all_keys,iou_keys,same_values,same_values_ratio,same_values_dist,same_values_dist_ratio,l1_dist_sum,l2_dist_sum,cos_dist_sum
0,Батарейки и аккумуляторы,Батарейка,True,0,0,0.000000,4.067985,0.449950,0.997061,1,1,0,0.660079,0.786204,0.997061,0.137094,0.336983,0.907901,0.832999,0.074902,4.895445,0.719428,0.984255,10.957082,0.492601,0.878672,84,88,84,93,7,13,0.538462,1,3,0.333333,60,74,9,12,6,5,26,30,19,27,11,11,14,3,1,4,8,0,10,10,1.000000,9,0.900000,9.73,0.973000,19.920513,0.981167,2.859988
1,"Смартфоны, планшеты, мобильные телефоны",Смартфон,True,2,2,1.000000,31.456032,3.567521,0.776353,4,1,3,0.511453,0.711695,0.857158,0.096836,0.345705,0.748979,0.671683,0.077296,3.237051,0.573571,0.990931,7.803628,0.358195,0.935848,96,96,96,96,6,10,0.600000,2,4,0.500000,50,50,8,8,4,4,13,13,23,23,6,6,0,0,0,0,0,0,56,56,1.000000,39,0.696429,51.83,0.925536,42.496712,3.631046,2.703132
2,Кабели и переходники,Кабель,True,2,2,1.000000,7.635074,0.835175,0.994726,3,0,3,0.662802,0.821279,0.994726,0.119334,0.331924,0.994726,0.821279,0.173447,8.271223,1.240927,0.947974,15.502422,0.714478,0.744760,47,52,62,66,2,20,0.100000,3,6,0.500000,75,60,14,8,11,7,22,16,21,19,19,11,15,6,4,6,2,8,11,20,0.550000,7,0.636364,9.29,0.844545,31.408718,1.657679,2.687461
3,Устройство ручного ввода,Мышь,True,0,2,0.000000,36.588371,3.889462,0.881459,14,4,10,-0.068885,0.365562,0.929330,0.247084,0.998215,0.476920,0.456566,0.020354,3.471641,0.550203,0.991292,11.931400,0.540656,0.853846,51,53,67,74,2,21,0.095238,3,5,0.600000,79,75,12,11,8,8,21,25,35,25,6,9,4,1,0,4,10,3,18,24,0.750000,16,0.888889,17.36,0.964444,51.991413,3.965217,2.726597
4,"Смартфоны, планшеты, мобильные телефоны",Смартфон,True,1,2,0.500000,41.610771,4.637042,0.787680,1,1,0,0.720894,0.817621,0.943340,0.080714,0.222446,0.754287,0.803125,0.048839,3.829588,0.596074,0.990265,10.472519,0.482182,0.883750,93,85,87,93,5,8,0.625000,2,5,0.400000,38,33,7,6,9,5,5,5,16,16,5,5,5,1,4,0,0,0,26,38,0.684211,15,0.576923,21.14,0.813077,55.912880,4.699996,2.661695
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18079,Оперативная память,Модуль оперативной памяти,True,0,1,0.000000,0.000000,0.000000,1.000000,3,3,0,0.683404,0.878309,1.000000,0.133374,0.316596,0.906774,0.906774,0.000000,3.971746,0.634790,0.988507,5.847257,0.268956,0.963831,79,81,73,87,12,26,0.461538,3,6,0.500000,173,140,33,25,22,27,92,64,19,19,62,56,33,8,5,28,0,6,12,12,1.000000,7,0.583333,10.73,0.894167,9.819004,0.689417,2.952338
18080,Зарядные устройства и док-станции,Компьютерный блок питания,True,0,0,0.000000,18.228937,2.019089,0.946200,0,0,0,0.946200,0.946200,0.946200,0.000000,0.000000,0.946200,0.946200,0.000000,3.425500,0.553783,0.991711,8.206299,0.380577,0.927581,95,94,95,95,9,11,0.818182,1,3,0.333333,79,81,11,11,5,6,6,7,57,57,3,3,2,0,1,1,0,0,7,7,1.000000,6,0.857143,6.75,0.964286,29.860737,2.127965,2.865491
18081,Смарт-часы,Ремешок для смарт-часов,True,1,2,0.500000,2.718232,0.310947,0.998958,0,0,0,0.998958,0.998958,0.998958,0.000000,0.000000,0.998958,0.998958,0.000000,10.553557,1.682896,0.923589,22.486010,1.025240,0.474442,41,43,59,60,4,27,0.148148,1,12,0.083333,93,102,18,15,12,16,30,10,33,53,6,7,9,3,4,20,20,1,6,7,0.857143,5,0.833333,5.57,0.928333,35.757797,1.994980,2.396989
18082,Проектор,Проектор,True,1,3,0.333333,0.000000,0.000000,1.000000,4,5,1,0.033292,0.559774,1.000000,0.244317,0.966708,0.615496,0.670634,0.055138,2.824259,0.460841,0.992948,7.256427,0.339631,0.942325,82,79,86,94,12,17,0.705882,1,1,1.000000,87,101,14,15,1,1,21,21,50,63,5,5,14,1,0,0,13,0,8,9,0.888889,8,1.000000,8.00,1.000000,10.080686,0.572472,2.935273


In [370]:
test_pool = Pool(
    data=test_df,
    cat_features=cat_features
)

In [372]:
preds = model_cb.predict_proba(test_pool)[:, 1]
preds

array([0.36007458, 0.12455223, 0.70191708, ..., 0.58349519, 0.79315132,
       0.23232204])

In [374]:
test_pairs['target'] = preds
test_pairs.to_csv('./submit.csv', index=False)
test_pairs.drop('target', axis=1, inplace=True)