In [2]:
import json
from functools import partial
from typing import List

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from catboost import CatBoostClassifier, Pool
from catboost.utils import eval_metric
from scipy.spatial.distance import cosine, euclidean
from sklearn.metrics import pairwise_distances
from sklearn.model_selection import train_test_split

from sklearn.metrics import precision_score


### Load data

In [77]:
dataset = pd.read_parquet(r"C:\Users\druzh\Project_python\ozon_top_1\Datasets/train_pairs.parquet")
etl = pd.read_parquet(r"C:\Users\druzh\Project_python\ozon_top_1\Datasets/train_data.parquet")

In [78]:
data_test = pd.read_parquet(r"C:\Users\druzh\Project_python\ozon_top_1\Datasets/test_pairs_wo_target.parquet")
etl_test = pd.read_parquet(r"C:\Users\druzh\Project_python\ozon_top_1\Datasets/test_data.parquet")

In [79]:
dataset.head(2)

Unnamed: 0,target,variantid1,variantid2
0,0.0,51197862,51198054
1,1.0,53062686,536165289


In [80]:
dataset["target"].value_counts()

0.0    171527
1.0    135013
Name: target, dtype: int64

In [81]:
etl_test.head(8)

Unnamed: 0,variantid,name,categories,color_parsed,pic_embeddings_resnet_v1,main_pic_embeddings_resnet_v1,name_bert_64,characteristic_attributes_mapping
0,51201254,Колодка TDM Electric четырехместная без заземл...,"{""1"": ""EPG"", ""2"": ""Электроника"", ""3"": ""Сетевые...",[белый],"[[0.34383398, -0.2962618, 0.07987049, -0.08257...","[[0.38310742, -0.7876679, 0.5018278, 0.2090071...","[-0.5060825, 0.5773388, 0.59435517, -0.4958292...","{""Страна-изготовитель"":[""Китай""],""Бренд"":[""TDM..."
1,77151532,Клавиатура черная с черной рамкой для 25-011879,"{""1"": ""EPG"", ""2"": ""Электроника"", ""3"": ""Запчаст...",[черный],,"[[0.50964713, 0.7958329, -1.4113188, 0.1999381...","[-0.43467724, 0.6614495, 0.48050267, -0.588880...","{""Страна-изготовитель"":[""Китай""],""Комплектация..."
2,89664856,"15.6"" Игровой ноутбук Acer Predator Helios 300...","{""1"": ""EPG"", ""2"": ""Электроника"", ""3"": ""Компьют...",[черный],"[[0.7804302, -0.245446, -0.67754817, -0.614691...","[[0.9958085, -0.113175124, -0.7623152, -0.9164...","[-0.70010763, 0.48152006, 0.47597092, -0.51727...","{""Видеокарта"":[""NVIDIA GeForce RTX 2070 (8 Гб)..."
3,90701982,Портативная колонка Borofone BR7 Empyreal Spor...,"{""1"": ""EPG"", ""2"": ""Электроника"", ""3"": ""Акустик...","[red, красный]","[[-0.24636984, -1.0719914, -0.49986655, 0.3423...","[[-0.26596686, -1.143009, -0.5289628, 0.428558...","[-0.73135185, -0.039796613, 0.38907066, -0.496...","{""Основной материал корпуса"":[""Металл""],""Макси..."
4,92484118,Аккумулятор для Meizu BA712 ( M6s ),"{""1"": ""EPG"", ""2"": ""Электроника"", ""3"": ""Батарей...",,,"[[0.42047608, 0.75828516, 0.5440093, -0.006894...","[-0.600158, 0.13944691, 0.48706242, -0.5050975...","{""Рекомендовано для"":[""Meizu""],""Бренд"":[""Meizu..."
5,99097578,Клавиатура черная с черной рамкой для Asus N43JM,"{""1"": ""EPG"", ""2"": ""Электроника"", ""3"": ""Запчаст...",[черный],,"[[0.987125, 0.87486434, -0.8414553, -0.1438423...","[-0.71254474, 0.6018984, 0.48035166, -0.578878...","{""Рекомендовано для"":[""ASUS""],""Страна-изготови..."
6,120558024,KX-FAT411A (Colouring) тонер картридж - 2000 с...,"{""1"": ""EPG"", ""2"": ""Электроника"", ""3"": ""Расходн...",[черный],,"[[-0.9767299, 1.2299631, 0.88899034, 0.4193062...","[-0.3799147, 0.56713665, 0.63711894, -0.437501...","{""Цвет тонера/чернил"":[""черный""],""Страна-изгот..."
7,133489315,Виниловый фотофон для предметной съемки стена-...,"{""1"": ""EPG"", ""2"": ""Электроника"", ""3"": ""Аксессу...",[коричневый],,"[[-0.3676244, -0.2502277, -0.42328912, 0.19049...","[-0.46177715, 0.5542605, 0.40788603, -0.349237...","{""Ширина, см"":[""50""],""Страна-изготовитель"":[""Р..."


In [82]:
print(etl.shape, dataset.shape)

(457063, 8) (306540, 3)


Get raw data for each variantid.

In [83]:
features0 = (
    dataset
    .merge(
        etl
        .add_suffix('1'),
        on="variantid1"
    )
    .merge(
        etl
        .add_suffix('2'),
        on="variantid2"
    )
)

In [84]:
feature_test = (
    data_test
    .merge(
        etl_test
        .add_suffix('1'),
        on="variantid1"
    )
    .merge(
        etl_test
        .add_suffix('2'),
        on="variantid2"
    )
)

In [85]:
feature_test.head(2)

Unnamed: 0,variantid1,variantid2,name1,categories1,color_parsed1,pic_embeddings_resnet_v11,main_pic_embeddings_resnet_v11,name_bert_641,characteristic_attributes_mapping1,name2,categories2,color_parsed2,pic_embeddings_resnet_v12,main_pic_embeddings_resnet_v12,name_bert_642,characteristic_attributes_mapping2
0,52076340,290590137,Батарейка AAA щелочная Perfeo LR03/10BL Super ...,"{""1"": ""EPG"", ""2"": ""Электроника"", ""3"": ""Батарей...",,"[[0.15417035, 0.41160947, 0.2213532, -0.019731...","[[0.04763528, -0.20136409, 0.29605597, 0.26453...","[-0.28437558, 0.60909724, 0.5972025, -0.523296...","{""Напряжение, В"":[""1.5""],""Бренд"":[""Perfeo""],""Т...",Батарейка AAA щелочная Perfeo LR03/2BL mini Su...,"{""1"": ""EPG"", ""2"": ""Электроника"", ""3"": ""Батарей...",,"[[-0.025554053, 0.012488857, 0.43989864, -0.10...","[[0.06223978, -0.16145544, 0.26409012, 0.24271...","[-0.3380968, 0.6156224, 0.6428071, -0.57499236...","{""Форм-фактор батареи"":[""AAA""],""Химический тип..."
1,64525522,204128919,"Смартфон Ulefone Armor X5 3/32 ГБ, черный, кра...","{""1"": ""EPG"", ""2"": ""Электроника"", ""3"": ""Смартфо...","[черный, red, красный]","[[-0.239386, -0.8332473, -0.08384809, 0.071721...","[[-0.27325493, -0.6696304, 0.027148303, 0.0785...","[-0.45766184, 0.5528555, 0.26298037, -0.663931...","{""Операционная система"":[""Android""],""Защищенно...","Смартфон Ulefone Armor X3 2/32 ГБ, черный, кра...","{""1"": ""EPG"", ""2"": ""Электроника"", ""3"": ""Смартфо...","[черный, red, красный]","[[-0.071279265, -0.99063504, -0.3939417, 0.886...","[[-0.15358369, -0.8256463, -0.054863703, 0.453...","[-0.4489074, 0.6278857, 0.33072582, -0.6749875...","{""Встроенная память"":[""32 ГБ""],""Видеопроцессор..."


Features functions.

In [13]:
features = features0

In [14]:
features["cat31"] = features["categories1"].apply(lambda x: json.loads(x)["3"])
cat3_counts = features["cat31"].value_counts().to_dict()

In [15]:
feature_test["cat31"] = feature_test["categories1"].apply(lambda x: json.loads(x)["3"])
cat3_counts_test = feature_test["cat31"].value_counts().to_dict()

In [16]:
features["cat31_grouped"] = features["cat31"].apply(lambda x: x if cat3_counts[x] > 1000 else "rest")
feature_test["cat31_grouped"] = feature_test["cat31"].apply(lambda x: x if cat3_counts_test[x] > 1000 else "rest")

In [17]:
features["cat32"] = features["categories2"].apply(lambda x: json.loads(x)["3"])
cat3_counts = features["cat32"].value_counts().to_dict()

feature_test["cat32"] = feature_test["categories2"].apply(lambda x: json.loads(x)["3"])
cat3_counts_test = feature_test["cat32"].value_counts().to_dict()

In [18]:
features["cat32_grouped"] = features["cat32"].apply(lambda x: x if cat3_counts[x] > 1000 else "rest")
feature_test["cat32_grouped"] = feature_test["cat32"].apply(lambda x: x if cat3_counts_test[x] > 1000 else "rest")

In [20]:
from sklearn.feature_extraction.text import CountVectorizer

def create_top_bag_of_words(sentences, top_words):
    vectorizer = CountVectorizer(max_features=top_words)
    bag_of_words = vectorizer.fit_transform(sentences)
    len_sentences = [len(sentence.split()) for sentence in sentences]

    return bag_of_words, len_sentences, vectorizer

def encode_sentences(sentences, vectorizer):
    encoded_sentences = vectorizer.transform(sentences)
    len_sentences = [len(sentence.split()) for sentence in sentences]
    return encoded_sentences, len_sentences

In [21]:
bag_of_words, len_sentences, names_bag_vectorizer = create_top_bag_of_words(np.hstack((features['name1'].values, features['name2'].values)), 5000)

In [24]:
bag_of_words, len_sentences = encode_sentences(np.hstack((feature_test['name1'].values, feature_test['name2'].values)), names_bag_vectorizer)

In [25]:
bag_of_words = bag_of_words.toarray()

In [17]:
print(bag_of_words)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [26]:
feature_test["name1_bag"] = bag_of_words[:len(bag_of_words) // 2].tolist()

In [27]:
feature_test["name2_bag"] = bag_of_words[len(bag_of_words) // 2:].tolist()

In [28]:
feature_test["name1_len"] = len_sentences[:len(len_sentences) // 2]

In [29]:
feature_test["name2_len"] = len_sentences[len(len_sentences) // 2:]

In [30]:
bag_of_words, len_sentences, cats_bag_vectorizer = create_top_bag_of_words(np.hstack((features['cat31'].values, features['cat32'].values)), 250)

In [31]:
bag_of_words, len_sentences = encode_sentences(np.hstack((feature_test['cat31'].values, feature_test['cat32'].values)), cats_bag_vectorizer)

In [32]:
bag_of_words = bag_of_words.toarray()

In [33]:
feature_test["cat31_bag"] = bag_of_words[:len(bag_of_words) // 2].tolist()

In [40]:
feature_test = feature_test.drop(
        ["name1", "categories1", "pic_embeddings_resnet_v11", "main_pic_embeddings_resnet_v11", "name_bert_641",
         "name2",
         "categories2", "pic_embeddings_resnet_v12", "main_pic_embeddings_resnet_v12", "name_bert_642"], axis=1)
feature_test = feature_test.drop(['cat31', 'cat32'], axis=1)

KeyError: "['name1', 'categories1', 'pic_embeddings_resnet_v11', 'main_pic_embeddings_resnet_v11', 'name_bert_641', 'name2', 'categories2', 'pic_embeddings_resnet_v12', 'main_pic_embeddings_resnet_v12', 'name_bert_642'] not found in axis"

In [None]:
feature_test["cat32_bag"] =  bag_of_words[len(bag_of_words) // 2:].tolist()

In [42]:
feature_test

Unnamed: 0,variantid1,variantid2,color_parsed1,characteristic_attributes_mapping1,color_parsed2,characteristic_attributes_mapping2,cat31_grouped,cat32_grouped,name1_bag,name2_bag,name1_len,name2_len,cat31_bag,cat31_len,cat32_len,cat32_bag
0,52076340,290590137,,"{""Напряжение, В"":[""1.5""],""Бренд"":[""Perfeo""],""Т...",,"{""Форм-фактор батареи"":[""AAA""],""Химический тип...",rest,rest,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",9,12,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",3,3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,64525522,204128919,"[черный, red, красный]","{""Операционная система"":[""Android""],""Защищенно...","[черный, red, красный]","{""Встроенная память"":[""32 ГБ""],""Видеопроцессор...",rest,rest,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",8,8,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",4,4,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,77243372,479860557,"[черный, золотистый]","{""Тип"":[""Видео-кабель""],""Длина, м"":[""3""],""Разм...","[черный, золотистый]","{""Коннектор 2"":[""HDMI""],""Единиц в одном товаре...",Кабели и переходники,Кабели и переходники,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",14,8,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",3,3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,86065820,540678372,[черный],"{""MTBF, кликов"":[""20000000""],""Особенности"":[""П...",[серый],"{""Цвет товара"":[""серый""],""Интерфейс"":[""USB""],""...",rest,rest,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",12,11,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",3,3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,91566575,258840506,"[черный, синий]","{""Процессор"":[""Snapdragon 439 (8 ядер), 2.0 ГГ...",[черный],"{""Модуль связи WiFi"":[""802.11a/b/g/n""],""Бренд ...",rest,rest,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",7,6,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",4,4,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18079,666998614,667074522,[black],"{""Бренд"":[""Corsair""],""Форм-фактор RAM"":[""DIMM""...",,"{""Бренд"":[""Corsair""],""Напряжение питания, В"":[...",rest,rest,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",33,25,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",2,2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
18080,670036240,670048449,,"{""Макс. выходной ток, А"":[""3.33""],""Партномер"":...",,"{""Выходное напряжение, В"":[""19.5""],""Тип"":[""Бло...",Зарядные устройства и док-станции,Зарядные устройства и док-станции,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",11,11,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",4,4,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
18081,670284509,684323809,"[желтый, красный]","{""Бренд"":[""Нет бренда""],""Вес товара, г"":[""100""...",[красный],"{""Название цвета"":[""Красный""],""Страна-изготови...",Смарт-часы,Смарт-часы,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",18,15,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
18082,692172005,704805270,[синий],"{""Бренд"":[""Sol""],""Вес товара, г"":[""200""],""Разм...","[зеленый, красный, green, red, синий]","{""Автофокус"":[""Да""],""Комплектация"":[""- Лазерны...",rest,rest,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",14,15,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [43]:
feature_test["cat31_len"] = len_sentences[:len(len_sentences) // 2]

In [44]:
feature_test["cat32_len"] = len_sentences[len(len_sentences) // 2:]

In [45]:
feature_test.head(2)

Unnamed: 0,variantid1,variantid2,color_parsed1,characteristic_attributes_mapping1,color_parsed2,characteristic_attributes_mapping2,cat31_grouped,cat32_grouped,name1_bag,name2_bag,name1_len,name2_len,cat31_bag,cat31_len,cat32_len,cat32_bag
0,52076340,290590137,,"{""Напряжение, В"":[""1.5""],""Бренд"":[""Perfeo""],""Т...",,"{""Форм-фактор батареи"":[""AAA""],""Химический тип...",rest,rest,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",9,12,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",3,3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,64525522,204128919,"[черный, red, красный]","{""Операционная система"":[""Android""],""Защищенно...","[черный, red, красный]","{""Встроенная память"":[""32 ГБ""],""Видеопроцессор...",rest,rest,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",8,8,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",4,4,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [46]:
color_1_prod = features['color_parsed1'].values
colors = []
for i in range(len(color_1_prod)):
    try:
        colors.append(color_1_prod[i][0])
    except:
        colors.append('None')
color_2_prod = features['color_parsed2'].values
for j in range(len(color_2_prod)):
    try:
        colors.append(color_2_prod[j][0])
    except:
        colors.append('None')

In [47]:
color_1_prod = feature_test['color_parsed1'].values
colors_test = []
for i in range(len(color_1_prod)):
    try:
        colors_test.append(color_1_prod[i][0])
    except:
        colors_test.append('None')
color_2_prod = feature_test['color_parsed2'].values
for j in range(len(color_2_prod)):
    try:
        colors_test.append(color_2_prod[j][0])
    except:
        colors_test.append('None')

In [48]:
bag_of_words, len_sentences, colors_bag_vectorizer = create_top_bag_of_words(colors, 200)

In [49]:
bag_of_words, len_sentences = encode_sentences(colors_test, colors_bag_vectorizer)

In [50]:
bag_of_words = bag_of_words.toarray()

In [51]:
feature_test["color1_bag"] = bag_of_words[:len(bag_of_words) // 2].tolist()

In [52]:
feature_test["color2_bag"] =  bag_of_words[len(bag_of_words) // 2:].tolist()

In [54]:
feature_test.head(2)

Unnamed: 0,variantid1,variantid2,color_parsed1,characteristic_attributes_mapping1,color_parsed2,characteristic_attributes_mapping2,cat31_grouped,cat32_grouped,name1_bag,name2_bag,name1_len,name2_len,cat31_bag,cat31_len,cat32_len,cat32_bag,color1_bag,color2_bag
0,52076340,290590137,,"{""Напряжение, В"":[""1.5""],""Бренд"":[""Perfeo""],""Т...",,"{""Форм-фактор батареи"":[""AAA""],""Химический тип...",rest,rest,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",9,12,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",3,3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,64525522,204128919,"[черный, red, красный]","{""Операционная система"":[""Android""],""Защищенно...","[черный, red, красный]","{""Встроенная память"":[""32 ГБ""],""Видеопроцессор...",rest,rest,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",8,8,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",4,4,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [55]:
feats = ["name1_bag", "name1_len", "name2_bag", "name2_len", "cat31_bag", "cat32_bag", "cat31_len", "cat32_len", "color1_bag", 'color2_bag']

In [56]:
X_test = feature_test[feats]

In [58]:
X_test

Unnamed: 0,name1_bag,name1_len,name2_bag,name2_len,cat31_bag,cat32_bag,cat31_len,cat32_len,color1_bag,color2_bag
0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",9,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",12,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",3,3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",8,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",8,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",4,4,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",14,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",8,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",3,3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",12,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",11,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",3,3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",7,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",6,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",4,4,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...,...,...,...,...,...,...,...
18079,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",33,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",25,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",2,2,"[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
18080,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",11,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",11,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",4,4,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
18081,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",18,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",15,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
18082,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",14,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",15,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [59]:
X_test_final = []

for i in range(len(X_test)):
    row = []
    row.extend(list(X_test[feats[0]].iloc[i]))
    row.extend(list(X_test[feats[2]].iloc[i]))
    row.extend(list(X_test[feats[4]].iloc[i]))
    row.extend(list(X_test[feats[5]].iloc[i]))
    row.extend(list(X_test[feats[8]].iloc[i]))
    row.extend(list(X_test[feats[9]].iloc[i]))
    row.extend([X_test[feats[b]].iloc[i] for b in [1,3,6,7]])
    X_test_final.append(row)

In [64]:
from catboost import CatBoostClassifier

model = CatBoostClassifier()

model.load_model("catboost_without_embs_1100v2_epochs.cbm")

<catboost.core.CatBoostClassifier at 0x16d31530f40>

In [65]:
predictions = model.predict_proba(X_test_final)

In [66]:
predictions

array([[0.91907967, 0.08092033],
       [0.8843131 , 0.1156869 ],
       [0.30515529, 0.69484471],
       ...,
       [0.9440747 , 0.0559253 ],
       [0.45548743, 0.54451257],
       [0.43247923, 0.56752077]])

In [67]:
predictions1 = model.predict(X_test_final)

In [68]:
predictions1

array([0., 0., 1., ..., 0., 1., 1.])

In [69]:
data_test["target"] = predictions[::, 1]

In [70]:
data_test

Unnamed: 0,variantid1,variantid2,target
0,52076340,290590137,0.080920
1,64525522,204128919,0.115687
2,77243372,479860557,0.694845
3,86065820,540678372,0.131401
4,91566575,258840506,0.317032
...,...,...,...
18079,666998614,667074522,0.002856
18080,670036240,670048449,0.293211
18081,670284509,684323809,0.055925
18082,692172005,704805270,0.544513


In [72]:
data_test.to_csv("sub_cat_666.csv", index = False)