In [125]:
import json
from functools import partial
from typing import List

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from catboost import CatBoostClassifier, Pool
from catboost.utils import eval_metric
from scipy.spatial.distance import cosine, euclidean
from sklearn.metrics import pairwise_distances
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score


### Load data

In [126]:
dataset = pd.read_parquet("train_pairs_w_target.parquet")
etl = pd.read_parquet("train_data.parquet")

In [127]:
dataset.head(2)

Unnamed: 0,target,variantid1,variantid2
0,0.0,51197862,51198054
1,1.0,53062686,536165289


In [128]:
dataset["target"].value_counts()

0.0    171527
1.0    135013
Name: target, dtype: int64

In [129]:
etl.head(2)

Unnamed: 0,variantid,characteristic_attributes_mapping,name,categories,color_parsed,pic_embeddings_resnet_v1,main_pic_embeddings_resnet_v1,name_bert_64
0,51195767,"{""85"":[""Партнер-Электро""],""8229"":[""Удлинитель ...","Удлинитель Партнер-Электро ПВС 2х0,75 ГОСТ,6A,...","{""1"": ""EPG"", ""2"": ""Электроника"", ""3"": ""Сетевые...",[оранжевый],,"[[0.04603629, 0.18839523, -0.09973055, -0.6636...","[-0.47045058, 0.67237014, 0.48984158, -0.54485..."
1,51196903,"{""85"":[""TDM Electric""],""8229"":[""Удлинитель быт...",Колодка TDM Electric пятиместная без заземлени...,"{""1"": ""EPG"", ""2"": ""Электроника"", ""3"": ""Сетевые...",[белый],"[[0.42044494, -0.33794826, -0.037247024, 0.165...","[[0.16211876, -0.4455993, 0.6131705, 0.5954206...","[-0.48503304, 0.6264443, 0.6406273, -0.4696772..."


In [130]:
print(etl.shape, dataset.shape)

(457063, 8) (306540, 3)


Get raw data for each variantid.

In [131]:
features0 = (
    dataset
    .merge(
        etl
        .add_suffix('1'),
        on="variantid1"
    )
    .merge(
        etl
        .add_suffix('2'),
        on="variantid2"
    )
)

In [132]:
features0.head(2)

Unnamed: 0,target,variantid1,variantid2,characteristic_attributes_mapping1,name1,categories1,color_parsed1,pic_embeddings_resnet_v11,main_pic_embeddings_resnet_v11,name_bert_641,characteristic_attributes_mapping2,name2,categories2,color_parsed2,pic_embeddings_resnet_v12,main_pic_embeddings_resnet_v12,name_bert_642
0,0.0,51197862,51198054,"{""85"":[""TDM Electric""],""8229"":[""Удлинитель быт...",Удлинитель TDM Electric Люкс УЛ05В 5 м (SQ1303...,"{""1"": ""EPG"", ""2"": ""Электроника"", ""3"": ""Сетевые...",[белый],,"[[-0.4304909, -0.49474272, -0.46439183, -0.060...","[-0.5104684, 0.56158644, 0.58873796, -0.529718...","{""85"":[""TDM Electric""],""8229"":[""Удлинитель быт...",Удлинитель TDM Electric Люкс УЛ05В 1.5 м (SQ13...,"{""1"": ""EPG"", ""2"": ""Электроника"", ""3"": ""Сетевые...",[белый],,"[[-0.42941108, -0.5129398, -0.4753536, -0.0677...","[-0.455473, 0.58157134, 0.5870387, -0.5325003,..."
1,0.0,51197862,51199884,"{""85"":[""TDM Electric""],""8229"":[""Удлинитель быт...",Удлинитель TDM Electric Люкс УЛ05В 5 м (SQ1303...,"{""1"": ""EPG"", ""2"": ""Электроника"", ""3"": ""Сетевые...",[белый],,"[[-0.4304909, -0.49474272, -0.46439183, -0.060...","[-0.5104684, 0.56158644, 0.58873796, -0.529718...","{""85"":[""TDM Electric""],""8229"":[""Удлинитель быт...",Удлинитель TDM Electric Люкс УЛ05В 3 м (SQ1303...,"{""1"": ""EPG"", ""2"": ""Электроника"", ""3"": ""Сетевые...",[белый],,"[[-0.43180764, -0.49580905, -0.5062628, -0.130...","[-0.5425725, 0.6415736, 0.51481575, -0.5687392..."


Features functions.

In [133]:
features = features0[:3000]

In [134]:
features["cat31"] = features["categories1"].apply(lambda x: json.loads(x)["3"])
cat3_counts = features["cat31"].value_counts().to_dict()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features["cat31"] = features["categories1"].apply(lambda x: json.loads(x)["3"])


In [135]:
features["cat31_grouped"] = features["cat31"].apply(lambda x: x if cat3_counts[x] > 1000 else "rest")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features["cat31_grouped"] = features["cat31"].apply(lambda x: x if cat3_counts[x] > 1000 else "rest")


In [136]:
features["cat32"] = features["categories2"].apply(lambda x: json.loads(x)["3"])
cat3_counts = features["cat32"].value_counts().to_dict()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features["cat32"] = features["categories2"].apply(lambda x: json.loads(x)["3"])


In [137]:
features["cat32_grouped"] = features["cat32"].apply(lambda x: x if cat3_counts[x] > 1000 else "rest")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features["cat32_grouped"] = features["cat32"].apply(lambda x: x if cat3_counts[x] > 1000 else "rest")


In [138]:
from sklearn.feature_extraction.text import CountVectorizer


def create_bag_of_words(sentences):
    vectorizer = CountVectorizer()
    bag_of_words = vectorizer.fit_transform(sentences)
    len_sentences = []
    for sentence in sentences:
        len_sentences.append(len(sentence.split()))

    return bag_of_words.toarray(), len_sentences

In [139]:
bag_of_words, len_sentences = create_bag_of_words(np.hstack((features['name1'].values, features['name2'].values)))

In [140]:
print(bag_of_words.shape)

(6000, 3606)


In [141]:
bag_of_words_1, bag_of_words_2 = bag_of_words[:len(bag_of_words) // 2], bag_of_words[len(bag_of_words) // 2:]
len_sentences_1, len_sentences_2 = len_sentences[:len(len_sentences) // 2], len_sentences[len(len_sentences) // 2:]

In [142]:
features["name1_bag"] = bag_of_words_1.tolist()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features["name1_bag"] = bag_of_words_1.tolist()


In [143]:
features["name2_bag"] = bag_of_words_2.tolist()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features["name2_bag"] = bag_of_words_2.tolist()


In [144]:
features["name1_len"] = len_sentences_1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features["name1_len"] = len_sentences_1


In [145]:
features["name2_len"] = len_sentences_2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features["name2_len"] = len_sentences_2


In [146]:
bag_of_words, len_sentences = create_bag_of_words(np.hstack((features['cat31'].values, features['cat32'].values)))

In [147]:
print(bag_of_words.shape)

(6000, 113)


In [148]:
bag_of_words_1, bag_of_words_2 = bag_of_words[:len(bag_of_words) // 2], bag_of_words[len(bag_of_words) // 2:]
len_sentences_1, len_sentences_2 = len_sentences[:len(len_sentences) // 2], len_sentences[len(len_sentences) // 2:]

In [149]:
features["cat31_bag"] = bag_of_words_1.tolist()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features["cat31_bag"] = bag_of_words_1.tolist()


In [150]:
features["cat32_bag"] = bag_of_words_2.tolist()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features["cat32_bag"] = bag_of_words_2.tolist()


In [151]:
features["cat31_len"] = len_sentences_1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features["cat31_len"] = len_sentences_1


In [152]:
features["cat32_len"] = len_sentences_2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features["cat32_len"] = len_sentences_2


In [153]:
features.head(2)

Unnamed: 0,target,variantid1,variantid2,characteristic_attributes_mapping1,name1,categories1,color_parsed1,pic_embeddings_resnet_v11,main_pic_embeddings_resnet_v11,name_bert_641,...,cat32,cat32_grouped,name1_bag,name2_bag,name1_len,name2_len,cat31_bag,cat32_bag,cat31_len,cat32_len
0,0.0,51197862,51198054,"{""85"":[""TDM Electric""],""8229"":[""Удлинитель быт...",Удлинитель TDM Electric Люкс УЛ05В 5 м (SQ1303...,"{""1"": ""EPG"", ""2"": ""Электроника"", ""3"": ""Сетевые...",[белый],,"[[-0.4304909, -0.49474272, -0.46439183, -0.060...","[-0.5104684, 0.56158644, 0.58873796, -0.529718...",...,"Сетевые фильтры, разветвители и удлинители",rest,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",8,8,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",5,5
1,0.0,51197862,51199884,"{""85"":[""TDM Electric""],""8229"":[""Удлинитель быт...",Удлинитель TDM Electric Люкс УЛ05В 5 м (SQ1303...,"{""1"": ""EPG"", ""2"": ""Электроника"", ""3"": ""Сетевые...",[белый],,"[[-0.4304909, -0.49474272, -0.46439183, -0.060...","[-0.5104684, 0.56158644, 0.58873796, -0.529718...",...,"Сетевые фильтры, разветвители и удлинители",rest,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",8,8,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",5,5


In [154]:
gg = features['color_parsed1'].values
color_parsed1 = []
for g in range(len(gg)):
    try:
        color_parsed1.append(gg[g][0])
    except:
        color_parsed1.append('None')
gg1 = features['color_parsed2'].values
for g in range(len(gg1)):
    try:
        color_parsed1.append(gg1[g][0])
    except:
        color_parsed1.append('None')

In [155]:
bag_of_words, len_sentences = create_bag_of_words(color_parsed1)

In [156]:
print(bag_of_words.shape)

(6000, 53)


In [157]:
bag_of_words_1, bag_of_words_2 = bag_of_words[:len(bag_of_words) // 2], bag_of_words[len(bag_of_words) // 2:]
len_sentences_1, len_sentences_2 = len_sentences[:len(len_sentences) // 2], len_sentences[len(len_sentences) // 2:]

In [158]:
features["color1_bag"] = bag_of_words_1.tolist()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features["color1_bag"] = bag_of_words_1.tolist()


In [159]:
features["color2_bag"] = bag_of_words_2.tolist()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features["color2_bag"] = bag_of_words_2.tolist()


In [160]:
features.head(2)

Unnamed: 0,target,variantid1,variantid2,characteristic_attributes_mapping1,name1,categories1,color_parsed1,pic_embeddings_resnet_v11,main_pic_embeddings_resnet_v11,name_bert_641,...,name1_bag,name2_bag,name1_len,name2_len,cat31_bag,cat32_bag,cat31_len,cat32_len,color1_bag,color2_bag
0,0.0,51197862,51198054,"{""85"":[""TDM Electric""],""8229"":[""Удлинитель быт...",Удлинитель TDM Electric Люкс УЛ05В 5 м (SQ1303...,"{""1"": ""EPG"", ""2"": ""Электроника"", ""3"": ""Сетевые...",[белый],,"[[-0.4304909, -0.49474272, -0.46439183, -0.060...","[-0.5104684, 0.56158644, 0.58873796, -0.529718...",...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",8,8,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",5,5,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,0.0,51197862,51199884,"{""85"":[""TDM Electric""],""8229"":[""Удлинитель быт...",Удлинитель TDM Electric Люкс УЛ05В 5 м (SQ1303...,"{""1"": ""EPG"", ""2"": ""Электроника"", ""3"": ""Сетевые...",[белый],,"[[-0.4304909, -0.49474272, -0.46439183, -0.060...","[-0.5104684, 0.56158644, 0.58873796, -0.529718...",...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",8,8,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",5,5,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [161]:
feats = ["name1_bag", "name1_len", "name2_bag", "name2_len", "cat31_bag", "cat32_bag", "cat31_len", "cat32_len", "color1_bag", 'color2_bag']

In [179]:
X_train, X_test = train_test_split(
    features[feats + ["target"]], 
    test_size=0.1, random_state=42, stratify=features[["target"]])

X_train, X_val = train_test_split(
    X_train[feats + ["target"]], 
    test_size=0.1, random_state=42, stratify=X_train[["target"]])

y_test = X_test[["target"]]
X_test = X_test.drop(["target"], axis=1)

y_train = X_train["target"]
y_val = X_val["target"]

X_train = X_train.drop(["target"], axis=1)
X_val = X_val.drop(["target"], axis=1)

        


In [201]:
X_train_final = []

for i in range(len(X_train)):
    row = []
    row.extend(list(X_train[feats[0]].iloc[i]))
    row.extend(list(X_train[feats[2]].iloc[i]))
    row.extend(list(X_train[feats[4]].iloc[i]))
    row.extend(list(X_train[feats[5]].iloc[i]))
    row.extend(list(X_train[feats[8]].iloc[i]))
    row.extend(list(X_train[feats[9]].iloc[i]))
    row.extend([X_train[feats[b]].iloc[i] for b in [1,3,6,7]])
    X_train_final.append(row)

In [229]:
print(len(X_train_final))

2430


In [164]:
print(y_train)

428     0.0
1313    1.0
1969    1.0
1584    1.0
2442    1.0
       ... 
2719    1.0
1877    1.0
1776    0.0
1381    0.0
1186    1.0
Name: target, Length: 2430, dtype: float64


In [207]:
X_test_final = []

for i in range(len(X_test)):
    row = []
    row.extend(list(X_test[feats[0]].iloc[i]))
    row.extend(list(X_test[feats[2]].iloc[i]))
    row.extend(list(X_test[feats[4]].iloc[i]))
    row.extend(list(X_test[feats[5]].iloc[i]))
    row.extend(list(X_test[feats[8]].iloc[i]))
    row.extend(list(X_test[feats[9]].iloc[i]))
    row.extend([X_test[feats[b]].iloc[i] for b in [1,3,6,7]])
    X_test_final.append(row)

In [227]:
print(len(X_test_final))

300


In [209]:
print(y_test)

      target
36       0.0
196      0.0
1745     0.0
39       0.0
2334     1.0
...      ...
885      1.0
562      0.0
2959     1.0
2037     1.0
2303     1.0

[300 rows x 1 columns]


In [231]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(X_train_final, y_train)




STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [233]:
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score
prediction = model.predict(X_test_final)
mae = mean_absolute_error(y_test, prediction)
print("Mean Squared Error:", mae)

accuracy = accuracy_score(y_test, prediction)
precision = precision_score(y_test, prediction)
recall = recall_score(y_test, prediction)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)

Mean Squared Error: 0.13666666666666666
Accuracy: 0.8633333333333333
Precision: 0.8620689655172413
Recall: 0.9308510638297872


In [221]:
print(X_train_final[1])

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [225]:
print(y_train)

428     0.0
1313    1.0
1969    1.0
1584    1.0
2442    1.0
       ... 
2719    1.0
1877    1.0
1776    0.0
1381    0.0
1186    1.0
Name: target, Length: 2430, dtype: float64
