In [1]:
import json
from functools import partial
from typing import List

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from catboost import CatBoostClassifier, Pool
from catboost.utils import eval_metric
from scipy.spatial.distance import cosine, euclidean
from sklearn.metrics import pairwise_distances
from sklearn.model_selection import train_test_split

from sklearn.metrics import precision_score


### Load data

In [2]:
# Training inputs: `dataset` holds the labelled variant pairs, `etl` the per-variant item data.
# NOTE(review): absolute, machine-specific Windows paths (with mixed separators) —
# as written the notebook only runs on the author's machine.
dataset = pd.read_parquet(r"C:\Users\druzh\Project_python\ozon_top_1\Datasets/train_pairs_w_target.parquet")
etl = pd.read_parquet(r"C:\Users\druzh\Project_python\ozon_top_1\Datasets/train_data.parquet")


In [3]:
# Test inputs: per-variant item data and the unlabelled pairs to score.
# NOTE(review): absolute, machine-specific Windows paths — not portable.
test_data = pd.read_parquet(r"C:\Users\druzh\Project_python\ozon_top_1\Datasets\test_data.parquet")
test_pairs_wo_target = pd.read_parquet(r"C:\Users\druzh\Project_python\ozon_top_1\Datasets\test_pairs_wo_target.parquet")

In [4]:
# Preview the first two labelled pairs.
dataset.head(2)

Unnamed: 0,target,variantid1,variantid2
0,0.0,51197862,51198054
1,1.0,53062686,536165289


In [5]:
# Preview the first two test items.
test_data.head(2)

Unnamed: 0,variantid,name,categories,color_parsed,pic_embeddings_resnet_v1,main_pic_embeddings_resnet_v1,name_bert_64,characteristic_attributes_mapping
0,51201254,Колодка TDM Electric четырехместная без заземл...,"{""1"": ""EPG"", ""2"": ""Электроника"", ""3"": ""Сетевые...",[белый],"[[0.34383398, -0.2962618, 0.07987049, -0.08257...","[[0.38310742, -0.7876679, 0.5018278, 0.2090071...","[-0.5060825, 0.5773388, 0.59435517, -0.4958292...","{""Страна-изготовитель"":[""Китай""],""Бренд"":[""TDM..."
1,77151532,Клавиатура черная с черной рамкой для 25-011879,"{""1"": ""EPG"", ""2"": ""Электроника"", ""3"": ""Запчаст...",[черный],,"[[0.50964713, 0.7958329, -1.4113188, 0.1999381...","[-0.43467724, 0.6614495, 0.48050267, -0.588880...","{""Страна-изготовитель"":[""Китай""],""Комплектация..."


In [6]:
# Preview the first two unlabelled test pairs.
test_pairs_wo_target.head(2)

Unnamed: 0,variantid1,variantid2
0,52076340,290590137
1,64525522,204128919


In [7]:
# Number of training pairs per target value (class balance).
dataset["target"].value_counts()

0.0    171527
1.0    135013
Name: target, dtype: int64

In [8]:
# Preview the first two training items.
etl.head(2)

Unnamed: 0,variantid,characteristic_attributes_mapping,name,categories,color_parsed,pic_embeddings_resnet_v1,main_pic_embeddings_resnet_v1,name_bert_64
0,51195767,"{""85"":[""Партнер-Электро""],""8229"":[""Удлинитель ...","Удлинитель Партнер-Электро ПВС 2х0,75 ГОСТ,6A,...","{""1"": ""EPG"", ""2"": ""Электроника"", ""3"": ""Сетевые...",[оранжевый],,"[[0.04603629, 0.18839523, -0.09973055, -0.6636...","[-0.47045058, 0.67237014, 0.48984158, -0.54485..."
1,51196903,"{""85"":[""TDM Electric""],""8229"":[""Удлинитель быт...",Колодка TDM Electric пятиместная без заземлени...,"{""1"": ""EPG"", ""2"": ""Электроника"", ""3"": ""Сетевые...",[белый],"[[0.42044494, -0.33794826, -0.037247024, 0.165...","[[0.16211876, -0.4455993, 0.6131705, 0.5954206...","[-0.48503304, 0.6264443, 0.6406273, -0.4696772..."


In [9]:
# (rows, columns) of the training item table and of the training pair table.
print(etl.shape, dataset.shape)

(457063, 8) (306540, 3)


Attach the raw item data to both variants (`variantid1`, `variantid2`) of every pair, for the train and the test pairs.

In [10]:
def _pair_with_items(pairs, items):
    """Join the item columns onto both sides of each pair.

    The item columns are suffixed with "1" and "2", which turns `variantid`
    into the join keys `variantid1` / `variantid2`. Both joins use pandas'
    default (inner) merge.
    """
    first_side = items.add_suffix('1')
    second_side = items.add_suffix('2')
    return pairs.merge(first_side, on="variantid1").merge(second_side, on="variantid2")


# Training pairs with the raw data of both items attached.
features0 = _pair_with_items(dataset, etl)

# Test pairs with the raw data of both items attached.
featurestest = _pair_with_items(test_pairs_wo_target, test_data)

In [53]:
# Preview the merged test frame.
# NOTE(review): the saved output lists columns (e.g. name1_bag, color2_bag) that are
# only created further down, so this cell was last executed after those cells; on a
# fresh top-to-bottom run it would show the raw merged columns instead.
featurestest.head(4)

Unnamed: 0,variantid1,variantid2,color_parsed1,characteristic_attributes_mapping1,color_parsed2,characteristic_attributes_mapping2,cat31_grouped,cat32_grouped,name1_bag,name2_bag,name1_len,name2_len,cat31_bag,cat32_bag,cat31_len,cat32_len,color1_bag,color2_bag
0,52076340,290590137,,"{""Напряжение, В"":[""1.5""],""Бренд"":[""Perfeo""],""Т...",,"{""Форм-фактор батареи"":[""AAA""],""Химический тип...",rest,rest,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",9,12,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",3,3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,64525522,204128919,"[черный, red, красный]","{""Операционная система"":[""Android""],""Защищенно...","[черный, red, красный]","{""Встроенная память"":[""32 ГБ""],""Видеопроцессор...",rest,rest,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",8,8,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",4,4,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,77243372,479860557,"[черный, золотистый]","{""Тип"":[""Видео-кабель""],""Длина, м"":[""3""],""Разм...","[черный, золотистый]","{""Коннектор 2"":[""HDMI""],""Единиц в одном товаре...",Кабели и переходники,Кабели и переходники,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",14,8,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",3,3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,86065820,540678372,[черный],"{""MTBF, кликов"":[""20000000""],""Особенности"":[""П...",[серый],"{""Цвет товара"":[""серый""],""Интерфейс"":[""USB""],""...",rest,rest,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",12,11,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",3,3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


Feature engineering: helper functions and the steps that apply them.

In [12]:
# NOTE: this binds a second name to the same DataFrame (no copy is made), so the
# columns added to `features` below also appear on `features0`.
features = features0

In [13]:
# Value stored under key "3" of the first item's JSON-encoded category mapping.
features["cat31"] = features["categories1"].map(lambda raw: json.loads(raw)["3"])
# Training frequency of each such category: {category: number of pairs}.
cat3_counts = features["cat31"].value_counts().to_dict()

In [14]:
# Same extraction for the first item of every test pair.
featurestest["cat31"] = featurestest["categories1"].map(lambda raw: json.loads(raw)["3"])
# Test-set frequency of each category: {category: number of pairs}.
cat3_counts_test = featurestest["cat31"].value_counts().to_dict()

In [15]:
# Collapse infrequent level-3 categories of the first item into a shared "rest"
# bucket: a category is kept only if it labels more than 1000 training pairs.
# The training counts decide the grouping for BOTH frames, so a given category is
# encoded the same way in train and test. (Grouping the test frame by its own,
# much smaller, counts would push categories kept in training into "rest".)
features["cat31_grouped"] = features["cat31"].apply(lambda x: x if cat3_counts[x] > 1000 else "rest")

# Categories that never occur in training count as 0 and therefore map to "rest".
featurestest["cat31_grouped"] = featurestest["cat31"].apply(lambda x: x if cat3_counts.get(x, 0) > 1000 else "rest")

In [16]:
# Value stored under key "3" of the second item's JSON-encoded category mapping (train).
features["cat32"] = features["categories2"].map(lambda raw: json.loads(raw)["3"])
# NOTE: rebinds `cat3_counts`, which until now held the counts for the first item.
cat3_counts = features["cat32"].value_counts().to_dict()

# Same extraction and counting for the second item of every test pair.
featurestest["cat32"] = featurestest["categories2"].map(lambda raw: json.loads(raw)["3"])
cat3_counts_test = featurestest["cat32"].value_counts().to_dict()

In [17]:
# Collapse infrequent level-3 categories of the second item into a shared "rest"
# bucket: a category is kept only if it labels more than 1000 training pairs
# (`cat3_counts` holds the second-item training counts at this point).
# The training counts decide the grouping for BOTH frames, so a given category is
# encoded the same way in train and test.
features["cat32_grouped"] = features["cat32"].apply(lambda x: x if cat3_counts[x] > 1000 else "rest")

# Categories that never occur in training count as 0 and therefore map to "rest".
featurestest["cat32_grouped"] = featurestest["cat32"].apply(lambda x: x if cat3_counts.get(x, 0) > 1000 else "rest")

In [18]:
from sklearn.feature_extraction.text import CountVectorizer

def create_top_bag_of_words(sentences, top_words):
    """Fit a bag-of-words vocabulary on `sentences` and encode them.

    Args:
        sentences: iterable of strings to fit on and encode.
        top_words: passed to CountVectorizer as `max_features`.

    Returns:
        A 3-tuple: the count matrix returned by `fit_transform`, a list with
        the number of whitespace-separated tokens of every sentence, and the
        fitted vectorizer (for encoding other data with the same vocabulary).
    """
    vectorizer = CountVectorizer(max_features=top_words)
    counts = vectorizer.fit_transform(sentences)

    token_counts = []
    for text in sentences:
        token_counts.append(len(text.split()))

    return counts, token_counts, vectorizer

def encode_sentences(sentences, vectorizer):
    """Encode `sentences` with an already fitted vectorizer.

    Args:
        sentences: iterable of strings to encode.
        vectorizer: object exposing `transform(sentences)`.

    Returns:
        A 2-tuple: whatever `vectorizer.transform` returns, and a list with
        the number of whitespace-separated tokens of every sentence.
    """
    encoded = vectorizer.transform(sentences)
    token_counts = []
    for text in sentences:
        token_counts.append(len(text.split()))
    return encoded, token_counts

In [19]:
# Fit the product-name vocabulary (max_features=5000) on the training names:
# all first-item names followed by all second-item names.
train_names = np.concatenate([features['name1'].values, features['name2'].values])
bag_of_words, len_sentences, names_bag_vectorizer = create_top_bag_of_words(train_names, 5000)

In [20]:
# Encode the test names (first items, then second items) with the vocabulary
# fitted on the training names. This rebinds `bag_of_words` / `len_sentences`,
# replacing the training-set results.
test_names = np.concatenate([featurestest['name1'].values, featurestest['name2'].values])
bag_of_words, len_sentences = encode_sentences(test_names, names_bag_vectorizer)

In [21]:
# Convert the encoded test names to a dense array so the cells below can slice
# it row-wise and call .tolist() on the halves.
bag_of_words = bag_of_words.toarray()

In [26]:
# Split the stacked results into the first-item half and the second-item half.
# NOTE(review): these four names are not referenced anywhere else in the visible
# notebook (the cells below re-slice `bag_of_words` / `len_sentences` directly),
# so this cell looks like leftover scratch work.
bag_of_words_1, bag_of_words_2 = bag_of_words[:len(bag_of_words) // 2], bag_of_words[len(bag_of_words) // 2:]
len_sentences_1, len_sentences_2 = len_sentences[:len(len_sentences) // 2], len_sentences[len(len_sentences) // 2:]

In [22]:
# The stacked matrix holds first-item rows followed by second-item rows;
# the first half therefore belongs to the first item of each test pair.
n_test_pairs = len(bag_of_words) // 2
featurestest["name1_bag"] = bag_of_words[:n_test_pairs].tolist()

In [23]:
# Second half of the stacked matrix: rows of the second item of each test pair.
split_at = len(bag_of_words) // 2
featurestest["name2_bag"] = bag_of_words[split_at:].tolist()

In [24]:
# Token counts of the first-item names (first half of the stacked list).
n_test_pairs = len(len_sentences) // 2
featurestest["name1_len"] = len_sentences[:n_test_pairs]

In [25]:
# Token counts of the second-item names (second half of the stacked list).
split_at = len(len_sentences) // 2
featurestest["name2_len"] = len_sentences[split_at:]

In [26]:
# Fit the category vocabulary (max_features=250) on the training level-3
# categories: all first-item values followed by all second-item values.
train_categories = np.concatenate([features['cat31'].values, features['cat32'].values])
bag_of_words, len_sentences, cats_bag_vectorizer = create_top_bag_of_words(train_categories, 250)

In [27]:
# Densify the training category counts.
# NOTE(review): the next cell rebinds `bag_of_words` to the encoded test data
# before this dense training matrix is read, so this conversion appears to be
# unused work (and a large allocation) — consider removing the cell.
bag_of_words = bag_of_words.toarray()

In [28]:
# Encode the test categories (first items, then second items) with the
# vocabulary fitted on the training categories; rebinds `bag_of_words` and
# `len_sentences`.
test_categories = np.concatenate([featurestest['cat31'].values, featurestest['cat32'].values])
bag_of_words, len_sentences = encode_sentences(test_categories, cats_bag_vectorizer)

In [29]:
# Convert the encoded test categories to a dense array so the cells below can
# slice it row-wise and call .tolist() on the halves.
bag_of_words = bag_of_words.toarray()

In [25]:
# Split the stacked category results into first-item and second-item halves.
# NOTE(review): these four names are not referenced anywhere else in the visible
# notebook (the cells below re-slice `bag_of_words` / `len_sentences` directly),
# so this cell looks like leftover scratch work.
bag_of_words_1, bag_of_words_2 = bag_of_words[:len(bag_of_words) // 2], bag_of_words[len(bag_of_words) // 2:]
len_sentences_1, len_sentences_2 = len_sentences[:len(len_sentences) // 2], len_sentences[len(len_sentences) // 2:]

In [30]:
# First half of the stacked category matrix: first item of each test pair.
n_test_pairs = len(bag_of_words) // 2
featurestest["cat31_bag"] = bag_of_words[:n_test_pairs].tolist()

In [31]:
# Drop the raw name / category / embedding columns of both items, then the
# intermediate level-3 category columns, from the test frame.
raw_item_columns = [
    "name1",
    "categories1",
    "pic_embeddings_resnet_v11",
    "main_pic_embeddings_resnet_v11",
    "name_bert_641",
    "name2",
    "categories2",
    "pic_embeddings_resnet_v12",
    "main_pic_embeddings_resnet_v12",
    "name_bert_642",
]
featurestest = featurestest.drop(columns=raw_item_columns)
featurestest = featurestest.drop(columns=['cat31', 'cat32'])

In [32]:
# Second half of the stacked category matrix: second item of each test pair.
split_at = len(bag_of_words) // 2
featurestest["cat32_bag"] = bag_of_words[split_at:].tolist()

In [33]:
# Token counts of the first-item categories (first half of the stacked list).
n_test_pairs = len(len_sentences) // 2
featurestest["cat31_len"] = len_sentences[:n_test_pairs]

In [34]:
# Token counts of the second-item categories (second half of the stacked list).
split_at = len(len_sentences) // 2
featurestest["cat32_len"] = len_sentences[split_at:]

In [35]:
# Check the test frame after adding the name and category features.
featurestest.head(2)

Unnamed: 0,variantid1,variantid2,color_parsed1,characteristic_attributes_mapping1,color_parsed2,characteristic_attributes_mapping2,cat31_grouped,cat32_grouped,name1_bag,name2_bag,name1_len,name2_len,cat31_bag,cat32_bag,cat31_len,cat32_len
0,52076340,290590137,,"{""Напряжение, В"":[""1.5""],""Бренд"":[""Perfeo""],""Т...",,"{""Форм-фактор батареи"":[""AAA""],""Химический тип...",rest,rest,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",9,12,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",3,3
1,64525522,204128919,"[черный, red, красный]","{""Операционная система"":[""Android""],""Защищенно...","[черный, red, красный]","{""Встроенная память"":[""32 ГБ""],""Видеопроцессор...",rest,rest,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",8,8,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",4,4


In [36]:
# Collect the first parsed colour of every item in the training pairs: all
# first-item colours followed by all second-item colours. Items without a usable
# colour get the placeholder string 'None'.
colors = []
for color_column in ("color_parsed1", "color_parsed2"):
    for parsed_colors in features[color_column].values:
        try:
            colors.append(parsed_colors[0])
        except (TypeError, IndexError):
            # TypeError: missing value (e.g. None is not subscriptable);
            # IndexError: empty colour list. Anything else is a real error and
            # is no longer swallowed, unlike with a bare `except:`.
            colors.append('None')

In [37]:
# Collect the first parsed colour of every item in the test pairs: all
# first-item colours followed by all second-item colours. Items without a usable
# colour get the placeholder string 'None'.
colorstest = []
for color_column in ("color_parsed1", "color_parsed2"):
    for parsed_colors in featurestest[color_column].values:
        try:
            colorstest.append(parsed_colors[0])
        except (TypeError, IndexError):
            # TypeError: missing value (e.g. None is not subscriptable);
            # IndexError: empty colour list. Anything else is a real error and
            # is no longer swallowed, unlike with a bare `except:`.
            colorstest.append('None')

In [38]:
# Fit the colour vocabulary (max_features=200) on the training colours.
bag_of_words, len_sentences, colors_bag_vectorizer = create_top_bag_of_words(
    sentences=colors,
    top_words=200,
)

In [39]:
# Densify the training colour counts.
# NOTE(review): the next cell rebinds `bag_of_words` to the encoded test colours
# before this dense training matrix is read, so this conversion appears to be
# unused work — consider removing the cell.
bag_of_words = bag_of_words.toarray()

In [40]:
# Encode the test colours with the vocabulary fitted on the training colours;
# rebinds `bag_of_words` and `len_sentences`.
bag_of_words, len_sentences = encode_sentences(
    sentences=colorstest,
    vectorizer=colors_bag_vectorizer,
)

In [41]:
# Convert the encoded test colours to a dense array so the cells below can
# slice it row-wise and call .tolist() on the halves.
bag_of_words = bag_of_words.toarray()

In [42]:
# `colorstest` lists first-item colours before second-item colours, so the
# first half of the encoded matrix belongs to the first item of each pair.
n_test_pairs = len(bag_of_words) // 2
featurestest["color1_bag"] = bag_of_words[:n_test_pairs].tolist()

In [43]:
# Second half of the encoded colour matrix: second item of each test pair.
split_at = len(bag_of_words) // 2
featurestest["color2_bag"] = bag_of_words[split_at:].tolist()

In [44]:
# Check the test frame after adding the colour features.
featurestest.head(2)

Unnamed: 0,variantid1,variantid2,color_parsed1,characteristic_attributes_mapping1,color_parsed2,characteristic_attributes_mapping2,cat31_grouped,cat32_grouped,name1_bag,name2_bag,name1_len,name2_len,cat31_bag,cat32_bag,cat31_len,cat32_len,color1_bag,color2_bag
0,52076340,290590137,,"{""Напряжение, В"":[""1.5""],""Бренд"":[""Perfeo""],""Т...",,"{""Форм-фактор батареи"":[""AAA""],""Химический тип...",rest,rest,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",9,12,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",3,3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,64525522,204128919,"[черный, red, красный]","{""Операционная система"":[""Android""],""Защищенно...","[черный, red, красный]","{""Встроенная память"":[""32 ГБ""],""Видеопроцессор...",rest,rest,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",8,8,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",4,4,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [45]:
# Feature columns fed to the model. Later cells pick entries from this list by
# position, so the order must not change.
feats = [
    "name1_bag",
    "name1_len",
    "name2_bag",
    "name2_len",
    "cat31_bag",
    "cat32_bag",
    "cat31_len",
    "cat32_len",
    "color1_bag",
    "color2_bag",
]

In [46]:
# Keep only the model feature columns, in the order given by `feats`.
X_test = featurestest.loc[:, feats]

        


In [47]:
# Display the selected feature columns (pandas truncates the middle rows).
X_test

Unnamed: 0,name1_bag,name1_len,name2_bag,name2_len,cat31_bag,cat32_bag,cat31_len,cat32_len,color1_bag,color2_bag
0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",9,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",12,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",3,3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",8,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",8,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",4,4,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",14,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",8,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",3,3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",12,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",11,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",3,3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",7,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",6,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",4,4,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...,...,...,...,...,...,...,...
18079,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",33,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",25,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",2,2,"[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
18080,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",11,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",11,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",4,4,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
18081,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",18,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",15,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
18082,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",14,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",15,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [48]:
# Flatten every test pair into one flat feature row. Value order within a row:
# name1_bag, name2_bag, cat31_bag, cat32_bag, color1_bag, color2_bag (each
# expanded element by element), then the scalars name1_len, name2_len,
# cat31_len, cat32_len.
bag_arrays = [X_test[feats[k]].to_numpy() for k in (0, 2, 4, 5, 8, 9)]
len_arrays = [X_test[feats[k]].to_numpy() for k in (1, 3, 6, 7)]

X_test_final = []
for pos in range(len(X_test)):
    flat_row = [value for bag in bag_arrays for value in bag[pos]]
    flat_row += [lengths[pos] for lengths in len_arrays]
    X_test_final.append(flat_row)

In [49]:
# Number of flattened test rows (one per test pair kept by the merges).
print(len(X_test_final))

18084


In [50]:
import joblib

# Load a previously saved model from the working directory.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code —
# only load model files from a trusted source.
# NOTE(review): the model is not trained in this notebook; presumably it was fitted
# on rows laid out exactly like X_test_final — confirm the feature order matches.
model = joblib.load('logistic_regression_model_500_epochs.pkl')

# Model output for every flattened test row.
predictions = model.predict(X_test_final)

In [77]:
# Remove helper columns that an earlier session may have left on the pairs frame.
# None of them is created anywhere in this notebook, so on a fresh kernel a plain
# drop raises KeyError; errors="ignore" turns the cell into a harmless no-op then.
test_pairs_wo_target = test_pairs_wo_target.drop(
        ['prediction', 'cat3_grouped_p', 'scores'], axis=1, errors="ignore")

In [51]:
# Store the model output as the `target` column of the submission frame.
# NOTE(review): presumably `predictions` is a plain array, in which case values are
# matched to pairs by position — this relies on the merges above having kept every
# test pair in its original order; confirm.
test_pairs_wo_target['target'] = predictions

In [52]:
# Write the submission file to the working directory, without the index column.
test_pairs_wo_target.to_csv("sub666.csv", index = False)