In [12]:
import json
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from functools import partial
from typing import List
from catboost import CatBoostClassifier, Pool
from catboost.utils import eval_metric
from scipy.spatial.distance import cosine, euclidean
from sklearn.metrics import pairwise_distances
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score

### Load data

In [13]:
dataset = pd.read_parquet(r"C:\Users\druzh\Project_python\ozon_top_1\Datasets/train_pairs.parquet")
etl = pd.read_parquet(r"C:\Users\druzh\Project_python\ozon_top_1\Datasets/train_data.parquet")

In [14]:
etl.head(2)

Unnamed: 0,variantid,name,categories,color_parsed,pic_embeddings_resnet_v1,main_pic_embeddings_resnet_v1,name_bert_64,characteristic_attributes_mapping
0,51195767,"Удлинитель Партнер-Электро ПВС 2х0,75 ГОСТ,6A,...","{""1"": ""EPG"", ""2"": ""Электроника"", ""3"": ""Сетевые...",[оранжевый],,"[[0.04603629, 0.18839523, -0.09973055, -0.6636...","[-0.47045058, 0.67237014, 0.48984158, -0.54485...","{""Номинальный ток, А"":[""10""],""Цвет товара"":[""о..."
1,53565809,Магнитный кабель USB 2.0 A (m) - USB Type-C (m...,"{""1"": ""EPG"", ""2"": ""Электроника"", ""3"": ""Кабели ...",[красный],"[[0.26863545, -0.3130674, 0.29023397, 0.073978...","[[1.1471839, -0.665361, 0.7745614, 0.26716197,...","[-0.6575592, 0.6522429, 0.5426037, -0.54347897...","{""Конструктивные особенности"":[""Магнитная конс..."


In [15]:
dataset.head(2)

Unnamed: 0,target,variantid1,variantid2
0,0.0,51197862,51198054
1,1.0,53062686,536165289


In [16]:
etl[etl['variantid'] == 290590137]

Unnamed: 0,variantid,name,categories,color_parsed,pic_embeddings_resnet_v1,main_pic_embeddings_resnet_v1,name_bert_64,characteristic_attributes_mapping


Get raw data for each variantid.

In [17]:
features = (
    dataset
    .merge(
        etl
        .add_suffix('1'),
        on="variantid1"
    )
    .merge(
        etl
        .add_suffix('2'),
        on="variantid2"
    )
)

In [18]:
features.head(100)

Unnamed: 0,target,variantid1,variantid2,name1,categories1,color_parsed1,pic_embeddings_resnet_v11,main_pic_embeddings_resnet_v11,name_bert_641,characteristic_attributes_mapping1,name2,categories2,color_parsed2,pic_embeddings_resnet_v12,main_pic_embeddings_resnet_v12,name_bert_642,characteristic_attributes_mapping2
0,0.0,51197862,51198054,Удлинитель TDM Electric Люкс УЛ05В 5 м (SQ1303...,"{""1"": ""EPG"", ""2"": ""Электроника"", ""3"": ""Сетевые...",[белый],,"[[-0.4304909, -0.49474272, -0.46439183, -0.060...","[-0.5104684, 0.56158644, 0.58873796, -0.529718...","{""Число жил"":[""3""],""Макс. нагрузка, Вт"":[""3500...",Удлинитель TDM Electric Люкс УЛ05В 1.5 м (SQ13...,"{""1"": ""EPG"", ""2"": ""Электроника"", ""3"": ""Сетевые...",[белый],,"[[-0.42941108, -0.5129398, -0.4753536, -0.0677...","[-0.455473, 0.58157134, 0.5870387, -0.5325003,...","{""Электробезопасность"":[""Заземление""],""Длина к..."
1,0.0,51197862,51199884,Удлинитель TDM Electric Люкс УЛ05В 5 м (SQ1303...,"{""1"": ""EPG"", ""2"": ""Электроника"", ""3"": ""Сетевые...",[белый],,"[[-0.4304909, -0.49474272, -0.46439183, -0.060...","[-0.5104684, 0.56158644, 0.58873796, -0.529718...","{""Число жил"":[""3""],""Макс. нагрузка, Вт"":[""3500...",Удлинитель TDM Electric Люкс УЛ05В 3 м (SQ1303...,"{""1"": ""EPG"", ""2"": ""Электроника"", ""3"": ""Сетевые...",[белый],,"[[-0.43180764, -0.49580905, -0.5062628, -0.130...","[-0.5425725, 0.6415736, 0.51481575, -0.5687392...","{""Макс. нагрузка, Вт"":[""3500""],""Стандарт защит..."
2,1.0,53062686,536165289,Картридж лазерный Комус 729 (4370B002) черный ...,"{""1"": ""EPG"", ""2"": ""Электроника"", ""3"": ""Расходн...","[черный, чер]",,"[[-0.0032773763, 0.32531193, -0.33156675, 0.41...","[-0.53706163, 0.37264067, 0.44363657, -0.37289...","{""Количество в упаковке, шт"":[""1""],""Бренд"":[""К...",Картридж лазерный Комус 729 (4368B002) пур. дл...,"{""1"": ""EPG"", ""2"": ""Электроника"", ""3"": ""Расходн...",[пурпурный],,"[[-0.043616347, 0.49310583, -0.3069673, 0.4820...","[-0.51572454, 0.40346462, 0.43528882, -0.34104...","{""Бренд"":[""Комус""],""Тип"":[""Картридж""]}"
3,1.0,53602615,587809782,Картридж лазерный Комус 729 (4368B002) пурпурн...,"{""1"": ""EPG"", ""2"": ""Электроника"", ""3"": ""Расходн...",[пурпурный],,"[[0.027616128, 0.33428708, -0.37326592, 0.4108...","[-0.61162275, 0.4953002, 0.47400212, -0.429568...","{""Назначение"":[""для лазерного принтера""],""Совм...",Картридж лазерный Комус 729 (4370B002) чер. дл...,"{""1"": ""EPG"", ""2"": ""Электроника"", ""3"": ""Расходн...","[черный, чер]",,"[[-0.041107245, 0.48485547, -0.2837791, 0.4637...","[-0.5087511, 0.46164495, 0.42914906, -0.462508...","{""Тип"":[""Картридж""],""Бренд"":[""Нет бренда""]}"
4,1.0,53602615,615149925,Картридж лазерный Комус 729 (4368B002) пурпурн...,"{""1"": ""EPG"", ""2"": ""Электроника"", ""3"": ""Расходн...",[пурпурный],,"[[0.027616128, 0.33428708, -0.37326592, 0.4108...","[-0.61162275, 0.4953002, 0.47400212, -0.429568...","{""Назначение"":[""для лазерного принтера""],""Совм...",Картридж лазерный Комус 729 (4368B002) пур. дл...,"{""1"": ""EPG"", ""2"": ""Электроника"", ""3"": ""Расходн...",[пурпурный],,"[[-0.043616347, 0.49310583, -0.3069673, 0.4820...","[-0.51572454, 0.40346462, 0.43528882, -0.34104...","{""Тип"":[""Картридж""],""Бренд"":[""Комус""]}"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.0,89687730,158966648,"13.4"" Ноутбук Dell XPS 13 9310 (9310-8426), In...","{""1"": ""EPG"", ""2"": ""Электроника"", ""3"": ""Компьют...",[серебристый],"[[0.42248148, 0.06304483, -0.56757, -0.0625605...","[[0.88323873, -0.27722967, -1.0792097, -0.0134...","[-0.4066556, 0.61670595, 0.5630685, -0.4760342...","{""Бренд графического процессора"":[""Intel""],""Ко...","13.4"" Ноутбук Dell XPS 13 9310 (9310-8310), In...","{""1"": ""EPG"", ""2"": ""Электроника"", ""3"": ""Компьют...","[серебристый, white]","[[0.5050414, -0.28092867, -0.7889776, -0.22077...","[[0.88323873, -0.27722967, -1.0792097, -0.0134...","[-0.45031324, 0.6427112, 0.5345638, -0.4277093...","{""Бренд графического процессора"":[""Intel""],""Оп..."
96,0.0,89671917,89672456,"13.3"" Ноутбук Dell XPS 13 7390, Intel Core i5-...","{""1"": ""EPG"", ""2"": ""Электроника"", ""3"": ""Компьют...",[серебристый],"[[0.74860406, -0.011533398, -1.1187638, -0.071...","[[0.76096475, -0.2428196, -1.1639042, 0.148975...","[-0.42422917, 0.6684081, 0.65938085, -0.463662...","{""Число портов Thunderbolt"":[""3""],""Операционна...","13.3"" Ноутбук Dell XPS 13 7390, Intel Core i7-...","{""1"": ""EPG"", ""2"": ""Электроника"", ""3"": ""Компьют...",[серебристый],"[[0.16279157, -0.23271999, 0.05059591, -1.1455...","[[0.6814279, -0.2930943, -1.1856017, 0.1864997...","[-0.41573897, 0.6715048, 0.6641684, -0.4426181...","{""Операционная система"":[""Windows Home""],""Конф..."
97,0.0,89679683,89680759,"14"" Ноутбук Lenovo ThinkPad T14 Gen 1, AMD Ryz...","{""1"": ""EPG"", ""2"": ""Электроника"", ""3"": ""Компьют...",[черный],"[[0.20045407, -0.20176557, -0.605728, 0.106663...","[[0.61744195, -0.38676617, -1.6081505, 0.02291...","[-0.3394341, 0.33940527, 0.6974925, -0.4580184...","{""Модель процессора"":[""AMD Ryzen 5 Pro""],""Особ...","14"" Ноутбук Lenovo ThinkPad T14 Gen 1 20S00043...","{""1"": ""EPG"", ""2"": ""Электроника"", ""3"": ""Компьют...",[черный],"[[0.5051854, -0.50143117, -1.5124974, 0.203869...","[[0.61744195, -0.38676617, -1.6081505, 0.02291...","[-0.36984313, 0.35905156, 0.7319111, -0.450559...","{""Цвет товара"":[""черный""],""Конфигурация звука""..."
98,0.0,89679683,89689622,"14"" Ноутбук Lenovo ThinkPad T14 Gen 1, AMD Ryz...","{""1"": ""EPG"", ""2"": ""Электроника"", ""3"": ""Компьют...",[черный],"[[0.20045407, -0.20176557, -0.605728, 0.106663...","[[0.61744195, -0.38676617, -1.6081505, 0.02291...","[-0.3394341, 0.33940527, 0.6974925, -0.4580184...","{""Модель процессора"":[""AMD Ryzen 5 Pro""],""Особ...","14"" Ноутбук Lenovo ThinkPad T14 Gen 1 (20S0006...","{""1"": ""EPG"", ""2"": ""Электроника"", ""3"": ""Компьют...",[черный],"[[0.20045407, -0.20176557, -0.605728, 0.106663...","[[0.61744195, -0.38676617, -1.6081505, 0.02291...","[-0.4003623, 0.2355009, 0.7422754, -0.49587643...","{""Форм-фактор ноутбука"":[""Полноразмерный ноутб..."


In [19]:
features['name1'].values[:150]

array(['Удлинитель TDM Electric Люкс УЛ05В 5 м (SQ1303-0138)',
       'Удлинитель TDM Electric Люкс УЛ05В 5 м (SQ1303-0138)',
       'Картридж лазерный Комус 729 (4370B002) черный для CanonLBP7010C/7018',
       'Картридж лазерный Комус 729 (4368B002) пурпурный для CanonLBP7010C/7018',
       'Картридж лазерный Комус 729 (4368B002) пурпурный для CanonLBP7010C/7018',
       'Умные часы для детей S4', 'Умные часы для детей S4',
       'Аккумулятор для смартфона Xiaomi BN41 ( Redmi Note 4 / Note 4 Pro ) ',
       'SIM-карта Tele2 Тарифный план для смартфона Мой онлайн, со скидкой 30% на 6 месяцев, баланс 300 руб Калуж.обл. (Калужская область)',
       'Картридж для принтера Lomond C8i Photo Magenta (без чипа)',
       'ИБП Powercom Raptor RPT-800A EURO',
       'ИБП Powercom Raptor RPT-800A EURO',
       'ИБП Powercom Raptor RPT-800A EURO',
       'ИБП Powercom Raptor RPT-800A EURO',
       'ИБП Powercom Raptor RPT-800A EURO',
       'Беспроводная колонка Marshall Kilburn II, черный',
   

In [11]:
features['name2'].values[:150]

array(['Батарейка AAA щелочная Perfeo LR03/2BL mini Super Alkaline 2 шт 2 упаковки',
       'Смартфон Ulefone Armor X3 2/32 ГБ, черный, красный',
       'Кабель  HDMI 1.4 (Male/Male) (CG150S-1.5M), черный + подарок',
       'Мышь A4Tech Bloody P93s Bullet, серый, оптическая (8000dpi), USB (8 кнопок)',
       'Смартфон Vivo Y81 3/32 ГБ, черный',
       'Переходные кольца/адаптеры для объективов,M42-M42(17-31)',
       'Смартфон Blackview BV4900 3/32 ГБ, оранжевый, черный',
       'Картридж Brother TN2275, черный, для лазерного принтера',
       'Дисплей для Xiaomi Redmi Note 4X в сборе с тачскрином (белый) (у телефона отсутствуют винты снизу)',
       'Аккумулятор для Samsung NT300V 11.1V 4400mAh',
       'Видеокарта Colorful GeForce RTX 3070 Ti 8 ГБ (iGame GeForce RTX 3070 Ti Ultra W OC 8G L-V), LHR',
       'Ремешок силиконовый GSMIN Sport Band 20 для Huawei Watch GT Active (Дизайн 1)',
       'Ремешок силиконовый GSMIN Sport Band 20 для Huawei Watch GT Active (Дизайн 1)',
       'Рем

In [5]:
feats = ["main_pic_embeddings_resnet_v11", "name_bert_641", "main_pic_embeddings_resnet_v12", "name_bert_642"]

In [6]:
X_test = features[feats]

In [7]:
X_test.head(2)

Unnamed: 0,main_pic_embeddings_resnet_v11,name_bert_641,main_pic_embeddings_resnet_v12,name_bert_642
0,"[[0.04763528, -0.20136409, 0.29605597, 0.26453...","[-0.28437558, 0.60909724, 0.5972025, -0.523296...","[[0.06223978, -0.16145544, 0.26409012, 0.24271...","[-0.3380968, 0.6156224, 0.6428071, -0.57499236..."
1,"[[-0.27325493, -0.6696304, 0.027148303, 0.0785...","[-0.45766184, 0.5528555, 0.26298037, -0.663931...","[[-0.15358369, -0.8256463, -0.054863703, 0.453...","[-0.4489074, 0.6278857, 0.33072582, -0.6749875..."


In [12]:
print(X_test.shape)

(18084, 4)


Features functions.

In [13]:
X_test_final = []

for i in range(len(X_test)):
    row = []
    row.extend(list(X_test[feats[0]].iloc[i])[0])
    row.extend(list(X_test[feats[1]].iloc[i]))
    row.extend(list(X_test[feats[2]].iloc[i])[0])
    row.extend(list(X_test[feats[3]].iloc[i]))
    X_test_final.append(row)

In [14]:
from catboost import CatBoostClassifier


model = CatBoostClassifier()
model.load_model("catboost_with_embs_only_optune_epochs.cbm")

<catboost.core.CatBoostClassifier at 0x1cda945d850>

In [15]:
prediction = model.predict_proba(X_test_final)

In [16]:
print(prediction)

[[0.66423751 0.33576249]
 [0.83673476 0.16326524]
 [0.57523377 0.42476623]
 ...
 [0.58543175 0.41456825]
 [0.40325275 0.59674725]
 [0.56980887 0.43019113]]


In [19]:
prediction = prediction[::, 1]

In [22]:
import random
for i in range(len(prediction)):
    if prediction[i] > 0.5:
        prediction[i] = prediction[i] + 0.25
    if prediction[i] >= 1:
        prediction[i] = random.uniform(0.75, 0.99)

In [23]:
for i in prediction:
    print(i)

0.3357624885100628
0.16326524042638899
0.42476623408713166
0.7862457788819723
0.4954740859841572
0.49238361256337676
0.8755257318388066
0.8670655808559301
0.8529511149797899
0.4524610825851806
0.8686754764207241
0.358492463794926
0.3493535677841346
0.9710620230428328
0.8465078451872955
0.47591940641757774
0.4248259565641679
0.8509929273402552
0.06482607294378749
0.309225288301557
0.7656990381317285
0.9332310191938628
0.9955425301510924
0.480778993222951
0.4721715805317722
0.7520828721546502
0.24087399641138588
0.9368906230130464
0.9312254574938901
0.957308863663605
0.947541791011852
0.8085252298015865
0.7796010940707511
0.4568222878045567
0.7582328522069023
0.3281526305816969
0.7695334493107979
0.8808392196598647
0.8041063682416625
0.43354552714832456
0.9883034859518874
0.8890835079596675
0.1694626859708243
0.39945761038206995
0.4200881915698143
0.4007772012102398
0.9823157867995205
0.7970137552207092
0.4118031420652282
0.39453871951809716
0.4135955820248653
0.8945060326174983
0.295958

In [24]:
dataset['target'] = prediction

In [25]:
dataset.to_csv("sub_catboost_w_embs_only_optuna_boosted.csv")