In [1]:
import json
from functools import partial
from typing import List

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from catboost import CatBoostClassifier, Pool
from catboost.utils import eval_metric
from scipy.spatial.distance import cosine, euclidean
from sklearn.metrics import pairwise_distances
from sklearn.model_selection import train_test_split

from sklearn.metrics import precision_score


### Load data

In [2]:
dataset = pd.read_parquet(r"C:\Users\druzh\Project_python\ozon_top_1\Datasets/train_pairs.parquet")
etl = pd.read_parquet(r"C:\Users\druzh\Project_python\ozon_top_1\Datasets/train_data.parquet")


In [3]:
test_data = pd.read_parquet(r"C:\Users\druzh\Project_python\ozon_top_1\Datasets\test_data.parquet")
test_pairs_wo_target = pd.read_parquet(r"C:\Users\druzh\Project_python\ozon_top_1\Datasets\test_pairs_wo_target.parquet")

Get raw data for each variantid.

In [4]:
features = (
    dataset
    .merge(
        etl
        .add_suffix('1'),
        on="variantid1"
    )
    .merge(
        etl
        .add_suffix('2'),
        on="variantid2"
    )
)

featurestest = (
    test_pairs_wo_target
    .merge(
        test_data
        .add_suffix('1'),
        on="variantid1"
    )
    .merge(
        test_data
        .add_suffix('2'),
        on="variantid2"
    )
)

In [5]:
featurestest.head(4)

Unnamed: 0,variantid1,variantid2,name1,categories1,color_parsed1,pic_embeddings_resnet_v11,main_pic_embeddings_resnet_v11,name_bert_641,characteristic_attributes_mapping1,name2,categories2,color_parsed2,pic_embeddings_resnet_v12,main_pic_embeddings_resnet_v12,name_bert_642,characteristic_attributes_mapping2
0,52076340,290590137,Батарейка AAA щелочная Perfeo LR03/10BL Super ...,"{""1"": ""EPG"", ""2"": ""Электроника"", ""3"": ""Батарей...",,"[[0.15417035, 0.41160947, 0.2213532, -0.019731...","[[0.04763528, -0.20136409, 0.29605597, 0.26453...","[-0.28437558, 0.60909724, 0.5972025, -0.523296...","{""Напряжение, В"":[""1.5""],""Бренд"":[""Perfeo""],""Т...",Батарейка AAA щелочная Perfeo LR03/2BL mini Su...,"{""1"": ""EPG"", ""2"": ""Электроника"", ""3"": ""Батарей...",,"[[-0.025554053, 0.012488857, 0.43989864, -0.10...","[[0.06223978, -0.16145544, 0.26409012, 0.24271...","[-0.3380968, 0.6156224, 0.6428071, -0.57499236...","{""Форм-фактор батареи"":[""AAA""],""Химический тип..."
1,64525522,204128919,"Смартфон Ulefone Armor X5 3/32 ГБ, черный, кра...","{""1"": ""EPG"", ""2"": ""Электроника"", ""3"": ""Смартфо...","[черный, red, красный]","[[-0.239386, -0.8332473, -0.08384809, 0.071721...","[[-0.27325493, -0.6696304, 0.027148303, 0.0785...","[-0.45766184, 0.5528555, 0.26298037, -0.663931...","{""Операционная система"":[""Android""],""Защищенно...","Смартфон Ulefone Armor X3 2/32 ГБ, черный, кра...","{""1"": ""EPG"", ""2"": ""Электроника"", ""3"": ""Смартфо...","[черный, red, красный]","[[-0.071279265, -0.99063504, -0.3939417, 0.886...","[[-0.15358369, -0.8256463, -0.054863703, 0.453...","[-0.4489074, 0.6278857, 0.33072582, -0.6749875...","{""Встроенная память"":[""32 ГБ""],""Видеопроцессор..."
2,77243372,479860557,Цифровой кабель TV-COM HDMI 1.4 (M/ M) Full HD...,"{""1"": ""EPG"", ""2"": ""Электроника"", ""3"": ""Кабели ...","[черный, золотистый]","[[0.28645578, -0.4411031, 0.9677321, 0.0880519...","[[0.262986, -0.48823145, 1.558289, -0.20182608...","[-0.59732205, 0.7067618, 0.488719, -0.34360138...","{""Тип"":[""Видео-кабель""],""Длина, м"":[""3""],""Разм...","Кабель HDMI 1.4 (Male/Male) (CG150S-1.5M), че...","{""1"": ""EPG"", ""2"": ""Электроника"", ""3"": ""Кабели ...","[черный, золотистый]",,"[[0.3385992, -0.45696405, 1.5419214, -0.185660...","[-0.37139902, 0.39780936, 0.5956383, -0.173539...","{""Коннектор 2"":[""HDMI""],""Единиц в одном товаре..."
3,86065820,540678372,"Игровая мышь проводная A4Tech Bloody P93, 8 кн...","{""1"": ""EPG"", ""2"": ""Электроника"", ""3"": ""Устройс...",[черный],"[[-0.16229028, -0.29124975, -0.025808021, 1.06...","[[0.08867453, -0.13375187, 0.11679518, 1.16906...","[-0.44171652, 0.55147576, 0.50624573, -0.31552...","{""MTBF, кликов"":[""20000000""],""Особенности"":[""П...","Мышь A4Tech Bloody P93s Bullet, серый, оптичес...","{""1"": ""EPG"", ""2"": ""Электроника"", ""3"": ""Устройс...",[серый],"[[-0.34419402, 0.46059757, 0.13175267, 0.16226...","[[0.2734869, 0.18559125, -0.30230802, 1.330367...","[-0.46880978, 0.4516948, 0.43816173, -0.466540...","{""Цвет товара"":[""серый""],""Интерфейс"":[""USB""],""..."


In [6]:
def get_pic_features(main_pic_embeddings_1,
                     main_pic_embeddings_2,
                     percentiles: List[int]):
    """Calculate distances percentiles for 
    pairwise pic distances. Percentiles are useful 
    when product has several pictures.
    """
    
    if main_pic_embeddings_1 is not None and main_pic_embeddings_2 is not None:
        main_pic_embeddings_1 = np.array([x for x in main_pic_embeddings_1])
        main_pic_embeddings_2 = np.array([x for x in main_pic_embeddings_2])
        
        dist_m = pairwise_distances(
            main_pic_embeddings_1, main_pic_embeddings_2
        )
    else:
        dist_m = np.array([[-1]])

    pair_features = []
    pair_features += np.percentile(dist_m, percentiles).tolist()

    return pair_features


def text_dense_distances(ozon_embedding, comp_embedding):
    """Calculate Euclidean and Cosine distances between
    ozon_embedding and comp_embedding.
    """
    pair_features = []
    if ozon_embedding is None or comp_embedding is None:
        pair_features = [-1, -1]
    elif len(ozon_embedding) == 0 or len(comp_embedding) == 0:
        pair_features = [-1, -1]
    else:
        pair_features.append(
            euclidean(ozon_embedding, comp_embedding)
        )
        cosine_value = cosine(ozon_embedding, comp_embedding)
        
        pair_features.append(cosine_value)

    return pair_features

In [7]:
get_pic_features_func = partial(
    get_pic_features,
    percentiles=[0, 25, 50]
)

In [8]:
featurestest[["pic_dist_0_perc", "pic_dist_25_perc", "pic_dist_50_perc"]] = (
    featurestest[["pic_embeddings_resnet_v11", "pic_embeddings_resnet_v12"]].apply(
        lambda x: pd.Series(get_pic_features_func(*x)), axis=1
    )
)


In [9]:
featurestest[["main_pic_dist_0_perc", "main_pic_dist_25_perc", "main_pic_dist_50_perc"]] = (
    featurestest[["main_pic_embeddings_resnet_v11", "main_pic_embeddings_resnet_v12"]].apply(
        lambda x: pd.Series(get_pic_features_func(*x)), axis=1
    )
)

In [10]:
featurestest[["euclidean_name_bert_dist", "cosine_name_bert_dist"]] = (
    featurestest[["name_bert_641", "name_bert_642"]].apply(
        lambda x: pd.Series(text_dense_distances(*x)), axis=1
    )
)


Features functions.

In [11]:
features["cat31"] = features["categories1"].apply(lambda x: json.loads(x)["3"])
cat3_counts = features["cat31"].value_counts().to_dict()

In [12]:
featurestest["cat31"] = featurestest["categories1"].apply(lambda x: json.loads(x)["3"])
cat3_counts_test = featurestest["cat31"].value_counts().to_dict()

In [13]:
features["cat31_grouped"] = features["cat31"].apply(lambda x: x if cat3_counts[x] > 1000 else "rest")

featurestest["cat31_grouped"] = featurestest["cat31"].apply(lambda x: x if cat3_counts_test[x] > 1000 else "rest")

In [14]:
features["cat32"] = features["categories2"].apply(lambda x: json.loads(x)["3"])
cat3_counts = features["cat32"].value_counts().to_dict()

featurestest["cat32"] = featurestest["categories2"].apply(lambda x: json.loads(x)["3"])
cat3_counts_test = featurestest["cat32"].value_counts().to_dict()

In [15]:
features["cat32_grouped"] = features["cat32"].apply(lambda x: x if cat3_counts[x] > 1000 else "rest")

featurestest["cat32_grouped"] = featurestest["cat32"].apply(lambda x: x if cat3_counts_test[x] > 1000 else "rest")

In [16]:
from sklearn.feature_extraction.text import CountVectorizer

def create_top_bag_of_words(sentences, top_words):
    vectorizer = CountVectorizer(max_features=top_words)
    bag_of_words = vectorizer.fit_transform(sentences)
    len_sentences = [len(sentence.split()) for sentence in sentences]

    return bag_of_words, len_sentences, vectorizer

def encode_sentences(sentences, vectorizer):
    encoded_sentences = vectorizer.transform(sentences)
    len_sentences = [len(sentence.split()) for sentence in sentences]
    return encoded_sentences, len_sentences

In [17]:
bag_of_words, len_sentences, names_bag_vectorizer = create_top_bag_of_words(np.hstack((features['name1'].values, features['name2'].values)), 5000)

In [18]:
bag_of_words, len_sentences = encode_sentences(np.hstack((featurestest['name1'].values, featurestest['name2'].values)), names_bag_vectorizer)

In [19]:
bag_of_words = bag_of_words.toarray()

In [20]:
featurestest["name1_bag"] = bag_of_words[:len(bag_of_words) // 2].tolist()

In [21]:
featurestest["name2_bag"] = bag_of_words[len(bag_of_words) // 2:].tolist()

In [22]:
featurestest["name1_len"] = len_sentences[:len(len_sentences) // 2]

In [23]:
featurestest["name2_len"] = len_sentences[len(len_sentences) // 2:]

In [24]:
bag_of_words, len_sentences, cats_bag_vectorizer = create_top_bag_of_words(np.hstack((features['cat31'].values, features['cat32'].values)), 250)

In [25]:
bag_of_words, len_sentences = encode_sentences(np.hstack((featurestest['cat31'].values, featurestest['cat32'].values)), cats_bag_vectorizer)

In [26]:
bag_of_words = bag_of_words.toarray()

In [27]:
featurestest["cat31_bag"] = bag_of_words[:len(bag_of_words) // 2].tolist()

In [28]:
featurestest = featurestest.drop(
        ["name1", "categories1", "pic_embeddings_resnet_v11", "main_pic_embeddings_resnet_v11", "name_bert_641",
         "name2",
         "categories2", "pic_embeddings_resnet_v12", "main_pic_embeddings_resnet_v12", "name_bert_642"], axis=1)
featurestest = featurestest.drop(['cat31', 'cat32'], axis=1)

In [29]:
featurestest["cat32_bag"] =  bag_of_words[len(bag_of_words) // 2:].tolist()

In [30]:
featurestest["cat31_len"] = len_sentences[:len(len_sentences) // 2]

In [31]:
featurestest["cat32_len"] = len_sentences[len(len_sentences) // 2:]

In [32]:
featurestest.head(2)

Unnamed: 0,variantid1,variantid2,color_parsed1,characteristic_attributes_mapping1,color_parsed2,characteristic_attributes_mapping2,pic_dist_0_perc,pic_dist_25_perc,pic_dist_50_perc,main_pic_dist_0_perc,...,cat31_grouped,cat32_grouped,name1_bag,name2_bag,name1_len,name2_len,cat31_bag,cat32_bag,cat31_len,cat32_len
0,52076340,290590137,,"{""Напряжение, В"":[""1.5""],""Бренд"":[""Perfeo""],""Т...",,"{""Форм-фактор батареи"":[""AAA""],""Химический тип...",5.33199,5.33199,5.33199,0.44995,...,rest,rest,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",9,12,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",3,3
1,64525522,204128919,"[черный, red, красный]","{""Операционная система"":[""Android""],""Защищенно...","[черный, red, красный]","{""Встроенная память"":[""32 ГБ""],""Видеопроцессор...",3.724136,4.690508,5.325581,3.567521,...,rest,rest,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",8,8,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",4,4


In [33]:
color_1_prod = features['color_parsed1'].values
colors = []
for i in range(len(color_1_prod)):
    try:
        colors.append(color_1_prod[i][0])
    except:
        colors.append('None')
color_2_prod = features['color_parsed2'].values
for j in range(len(color_2_prod)):
    try:
        colors.append(color_2_prod[j][0])
    except:
        colors.append('None')

In [34]:
color_1_prod = featurestest['color_parsed1'].values
colorstest = []
for i in range(len(color_1_prod)):
    try:
        colorstest.append(color_1_prod[i][0])
    except:
        colorstest.append('None')
color_2_prod = featurestest['color_parsed2'].values
for j in range(len(color_2_prod)):
    try:
        colorstest.append(color_2_prod[j][0])
    except:
        colorstest.append('None')

In [35]:
bag_of_words, len_sentences, colors_bag_vectorizer = create_top_bag_of_words(colors, 200)

In [36]:
bag_of_words, len_sentences = encode_sentences(colorstest, colors_bag_vectorizer)

In [37]:
bag_of_words = bag_of_words.toarray()

In [38]:
featurestest["color1_bag"] = bag_of_words[:len(bag_of_words) // 2].tolist()

In [39]:
featurestest["color2_bag"] =  bag_of_words[len(bag_of_words) // 2:].tolist()

In [40]:
featurestest.head(2)

Unnamed: 0,variantid1,variantid2,color_parsed1,characteristic_attributes_mapping1,color_parsed2,characteristic_attributes_mapping2,pic_dist_0_perc,pic_dist_25_perc,pic_dist_50_perc,main_pic_dist_0_perc,...,name1_bag,name2_bag,name1_len,name2_len,cat31_bag,cat32_bag,cat31_len,cat32_len,color1_bag,color2_bag
0,52076340,290590137,,"{""Напряжение, В"":[""1.5""],""Бренд"":[""Perfeo""],""Т...",,"{""Форм-фактор батареи"":[""AAA""],""Химический тип...",5.33199,5.33199,5.33199,0.44995,...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",9,12,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",3,3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,64525522,204128919,"[черный, red, красный]","{""Операционная система"":[""Android""],""Защищенно...","[черный, red, красный]","{""Встроенная память"":[""32 ГБ""],""Видеопроцессор...",3.724136,4.690508,5.325581,3.567521,...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",8,8,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",4,4,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [41]:
feats = ["name1_bag", "name1_len", "name2_bag", "name2_len", "cat31_bag", "cat32_bag", "cat31_len", "cat32_len", "color1_bag", 'color2_bag', "pic_dist_0_perc", "pic_dist_25_perc", "pic_dist_50_perc", "main_pic_dist_0_perc", "main_pic_dist_25_perc", "main_pic_dist_50_perc", "euclidean_name_bert_dist", "cosine_name_bert_dist"]

In [42]:
X_test = featurestest[feats]

In [43]:
X_test

Unnamed: 0,name1_bag,name1_len,name2_bag,name2_len,cat31_bag,cat32_bag,cat31_len,cat32_len,color1_bag,color2_bag,pic_dist_0_perc,pic_dist_25_perc,pic_dist_50_perc,main_pic_dist_0_perc,main_pic_dist_25_perc,main_pic_dist_50_perc,euclidean_name_bert_dist,cosine_name_bert_dist
0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",9,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",12,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",3,3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",5.331990,5.331990,5.331990,0.449950,0.449950,0.449950,0.719428,0.015745
1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",8,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",8,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",4,4,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",3.724136,4.690508,5.325581,3.567521,3.567521,3.567521,0.573571,0.009069
2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",14,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",8,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",3,3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",-1.000000,-1.000000,-1.000000,0.835175,0.835175,0.835175,1.240927,0.052026
3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",12,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",11,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",3,3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1.570454,4.767440,5.547379,3.889462,3.889462,3.889462,0.550203,0.008708
4,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",7,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",6,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",4,4,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",2.668072,2.668072,2.668072,4.637042,4.637042,4.637042,0.596074,0.009735
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18079,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",33,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",25,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",2,2,"[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.000000,0.000000,2.191005,0.000000,0.000000,0.000000,0.634790,0.011493
18080,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",11,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",11,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",4,4,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",-1.000000,-1.000000,-1.000000,2.019089,2.019089,2.019089,0.553783,0.008289
18081,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",18,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",15,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",-1.000000,-1.000000,-1.000000,0.310947,0.310947,0.310947,1.682896,0.076411
18082,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",14,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",15,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.000000,3.209469,5.253204,0.000000,0.000000,0.000000,0.460841,0.007052


In [44]:
X_test_final = []

for i in range(len(X_test)):
    row = []
    row.extend(list(X_test[feats[0]].iloc[i]))
    row.extend(list(X_test[feats[2]].iloc[i]))
    row.extend(list(X_test[feats[4]].iloc[i]))
    row.extend(list(X_test[feats[5]].iloc[i]))
    row.extend(list(X_test[feats[8]].iloc[i]))
    row.extend(list(X_test[feats[9]].iloc[i]))
    row.extend([X_test[feats[b]].iloc[i] for b in [1,3,6,7,10,11,12,13,14,15,16,17]])
    X_test_final.append(row)

In [45]:
print(len(X_test_final))

18084


In [54]:
import joblib

model = joblib.load('logistic_regression_w_embs_model_200_epochs.pkl')

predictions = model.predict(X_test_final)

In [47]:
predictions[::, 1]

array([0.78733842, 0.7389366 , 0.62069414, ..., 0.76547454, 0.45693213,
       0.56289734])

In [55]:
predictions

array([0., 0., 0., ..., 0., 1., 0.])

In [51]:
test_pairs_wo_target = test_pairs_wo_target.drop(
        ['target'], axis=1)

In [52]:
test_pairs_wo_target['target'] = predictions[::, 1]

In [53]:
test_pairs_wo_target.to_csv("submission_1.csv", index = False)