In [1]:
import json
from functools import partial
from typing import List

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from catboost import CatBoostClassifier, Pool
from catboost.utils import eval_metric
from scipy.spatial.distance import cosine, euclidean
from sklearn.metrics import pairwise_distances
from sklearn.model_selection import train_test_split

from sklearn.metrics import precision_score


### Load data

In [2]:
dataset = pd.read_parquet(r"C:\Users\druzh\Project_python\ozon_top_1\Datasets/train_pairs.parquet")
etl = pd.read_parquet(r"C:\Users\druzh\Project_python\ozon_top_1\Datasets/train_data.parquet")

Get raw data for each variantid.

In [3]:
features = (
    dataset
    .merge(
        etl
        .add_suffix('1'),
        on="variantid1"
    )
    .merge(
        etl
        .add_suffix('2'),
        on="variantid2"
    )
)

In [4]:
features.head(2)

Unnamed: 0,target,variantid1,variantid2,name1,categories1,color_parsed1,pic_embeddings_resnet_v11,main_pic_embeddings_resnet_v11,name_bert_641,characteristic_attributes_mapping1,name2,categories2,color_parsed2,pic_embeddings_resnet_v12,main_pic_embeddings_resnet_v12,name_bert_642,characteristic_attributes_mapping2
0,0.0,51197862,51198054,Удлинитель TDM Electric Люкс УЛ05В 5 м (SQ1303...,"{""1"": ""EPG"", ""2"": ""Электроника"", ""3"": ""Сетевые...",[белый],,"[[-0.4304909, -0.49474272, -0.46439183, -0.060...","[-0.5104684, 0.56158644, 0.58873796, -0.529718...","{""Число жил"":[""3""],""Макс. нагрузка, Вт"":[""3500...",Удлинитель TDM Electric Люкс УЛ05В 1.5 м (SQ13...,"{""1"": ""EPG"", ""2"": ""Электроника"", ""3"": ""Сетевые...",[белый],,"[[-0.42941108, -0.5129398, -0.4753536, -0.0677...","[-0.455473, 0.58157134, 0.5870387, -0.5325003,...","{""Электробезопасность"":[""Заземление""],""Длина к..."
1,0.0,51197862,51199884,Удлинитель TDM Electric Люкс УЛ05В 5 м (SQ1303...,"{""1"": ""EPG"", ""2"": ""Электроника"", ""3"": ""Сетевые...",[белый],,"[[-0.4304909, -0.49474272, -0.46439183, -0.060...","[-0.5104684, 0.56158644, 0.58873796, -0.529718...","{""Число жил"":[""3""],""Макс. нагрузка, Вт"":[""3500...",Удлинитель TDM Electric Люкс УЛ05В 3 м (SQ1303...,"{""1"": ""EPG"", ""2"": ""Электроника"", ""3"": ""Сетевые...",[белый],,"[[-0.43180764, -0.49580905, -0.5062628, -0.130...","[-0.5425725, 0.6415736, 0.51481575, -0.5687392...","{""Макс. нагрузка, Вт"":[""3500""],""Стандарт защит..."


In [16]:
features['main_pic_embeddings_resnet_v11'].values[0][0]

array([-0.4304909 , -0.49474272, -0.46439183, -0.06097448, -1.8553698 ,
        0.9934219 ,  0.38884374,  0.00946222,  0.19081306, -1.2679458 ,
        0.45550576,  1.3070302 , -0.8735641 , -1.860254  , -0.95965904,
       -0.16294205, -0.9591131 ,  0.1039326 , -0.27149314, -1.7488945 ,
       -0.36103576,  0.32591015, -0.6894629 ,  0.9465886 , -0.7641743 ,
        0.13324884,  1.3035853 , -0.96210086,  0.73029554, -0.45102447,
       -1.05415   ,  0.2828672 ,  1.3557287 , -0.43453777, -0.06004876,
        0.41944742, -1.0558593 ,  0.87374985, -0.3916652 , -1.3230326 ,
       -0.00288831, -1.3905232 , -1.2145977 , -0.9100479 , -0.2660876 ,
        0.52205855, -0.4524113 ,  0.13193643, -0.5352012 ,  0.29408264,
        0.12342283,  0.13545749,  0.18025269,  0.3071742 ,  0.01327318,
        0.5571765 ,  0.9504708 , -0.14764053,  1.4609265 , -0.15241341,
       -0.19463056,  0.24829568, -0.840411  ,  1.0239264 ,  0.76586485,
        0.10371237,  0.691278  ,  1.7055038 , -0.24888694, -1.04

In [34]:
embeddings = []
for i in features['main_pic_embeddings_resnet_v11'].values:
    embeddings.append(i[0])

In [None]:
embeddings = np.array(embeddings)

In [40]:
from sklearn.manifold import TSNE

tsne_model = TSNE(n_components=3, perplexity=30, learning_rate=200)

compressed_embeddings_main_pic = tsne_model.fit_transform(embeddings)

In [49]:
name_bert_641_embedding = np.array(features['name_bert_641'].values)

In [50]:
name_bert_641_embedding

array([array([-0.5104684 ,  0.56158644,  0.58873796, -0.5297187 ,  0.67423356,
               0.531697  , -0.04992381, -0.6909534 ,  0.39199966, -0.38739476,
              -0.4923953 ,  0.27993584,  0.6704811 , -0.39758343,  0.506643  ,
               0.6050151 ,  0.42997733,  0.41967082,  0.3626733 , -0.38260484,
               0.53740305, -0.63591284,  0.56744915, -0.61197364, -0.6526525 ,
              -0.48569676, -0.31523797, -0.62524575,  0.57858294, -0.25364733,
              -0.5563701 ,  0.61647224, -0.4231567 , -0.16108625,  0.61438966,
              -0.37515053,  0.35756475, -0.50881344,  0.74725235,  0.48903146,
               0.2457251 ,  0.5520097 ,  0.574529  , -0.28198302, -0.47757316,
              -0.43194747, -0.432521  ,  0.10753258, -0.27314606, -0.61970246,
              -0.38382018,  0.4471867 ,  0.3399269 , -0.11906805,  0.5892857 ,
              -0.5445462 ,  0.6403395 ,  0.34737936, -0.5550568 , -0.41882515,
              -0.31731957,  0.7378823 ,  0.50075114,

In [52]:
name_bert_641_embedding = []
for i in features['name_bert_641'].values:
    name_bert_641_embedding.append(i)

In [53]:
name_bert_641_embedding_final = np.array(name_bert_641_embedding)

In [54]:
tsne_model = TSNE(n_components=3, perplexity=30, learning_rate=200)

compressed_embeddings_name_bert_641 = tsne_model.fit_transform(name_bert_641_embedding_final)

In [55]:
embeddings_main_2 = []
for i in features['main_pic_embeddings_resnet_v12'].values:
    embeddings_main_2.append(i[0])

In [56]:
embeddings_main_2_final = np.array(embeddings_main_2)

In [57]:
tsne_model = TSNE(n_components=3, perplexity=30, learning_rate=200)

compressed_embeddings_main_pic_2 = tsne_model.fit_transform(embeddings_main_2_final)

In [58]:
name_bert_642_embedding = []
for i in features['name_bert_642'].values:
    name_bert_642_embedding.append(i)

In [59]:
name_bert_642_embedding_final = np.array(name_bert_642_embedding)

In [60]:
tsne_model = TSNE(n_components=3, perplexity=30, learning_rate=200)

compressed_embeddings_name_bert_642 = tsne_model.fit_transform(name_bert_642_embedding_final)

In [15]:
feats = ["main_pic_embeddings_resnet_v11", "name_bert_641", "main_pic_embeddings_resnet_v12", "name_bert_642"]

In [64]:
print("привет мир")

привет мир


In [22]:
X_train, X_test = train_test_split(
    features[feats + ["target", "variantid1", "variantid2"]], 
    test_size=0.1, random_state=42, stratify=features[["target"]])

y_test = X_test[["target"]]
X_test = X_test.drop(["target", "variantid1", "variantid2"], axis=1)

y_train = X_train["target"]
X_train = X_train.drop(["target","variantid1", "variantid2"], axis=1)

In [23]:
print(X_train.shape)

(275886, 4)


In [24]:
print(X_test.shape)

(30654, 4)


Features functions.

In [25]:
X_train_final = []

for i in range(len(X_train)):
    row = []
    row.extend(list(X_train[feats[0]].iloc[i])[0])
    row.extend(list(X_train[feats[1]].iloc[i]))
    row.extend(list(X_train[feats[2]].iloc[i])[0])
    row.extend(list(X_train[feats[3]].iloc[i]))
    X_train_final.append(row)

In [26]:
X_test_final = []

for i in range(len(X_test)):
    row = []
    row.extend(list(X_test[feats[0]].iloc[i])[0])
    row.extend(list(X_test[feats[1]].iloc[i]))
    row.extend(list(X_test[feats[2]].iloc[i])[0])
    row.extend(list(X_test[feats[3]].iloc[i]))
    X_test_final.append(row)

In [33]:
from catboost import CatBoostClassifier
import optuna
from sklearn.metrics import accuracy_score

def objective(trial):
    params = {
        'iterations': trial.suggest_int('iterations', 100, 1000),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.5),
        'depth': trial.suggest_int('depth', 3, 10),
        'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 0.1, 10.0),
        'border_count': trial.suggest_int('border_count', 5, 255),
        'random_seed': 42,
        'eval_metric': 'Accuracy',
        'loss_function': 'Logloss',
        'verbose': False
    }
    
    model = CatBoostClassifier(**params)
    model.fit(X_train_final, y_train, eval_set=(X_test_final, y_test), early_stopping_rounds=20, verbose=False)
    y_pred = model.predict(X_test_final)
    accuracy = accuracy_score(y_test, y_pred)
    return accuracy

# Создание объекта study и запуск оптимизации
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=20)

# Получение лучших параметров
best_params = study.best_params
best_accuracy = study.best_value

print('Best parameters:', best_params)
print('Best accuracy:', best_accuracy)

[32m[I 2023-05-25 20:04:50,080][0m A new study created in memory with name: no-name-553ed098-fe03-4bf4-bcea-9f9bb9ac04b0[0m
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.5),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 0.1, 10.0),
[32m[I 2023-05-25 20:06:19,139][0m Trial 0 finished with value: 0.7688066810204215 and parameters: {'iterations': 772, 'learning_rate': 0.4006577177062766, 'depth': 9, 'l2_leaf_reg': 3.9137870145489715, 'border_count': 133}. Best is trial 0 with value: 0.7688066810204215.[0m
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.5),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 0.1, 10.0),
[32m[I 2023-05-25 20:07:56,716][0m Trial 1 finished with value: 0.7695569909310367 and parameters: {'iterations': 514, 'learning_rate': 0.09104388685818902, 'depth': 9, 'l2_leaf_reg': 0.15320485409273238, 'border_count': 77}. Best is trial 1 with value: 0.7695569909310367.[0m
  'learning_rate': trial.sugge

[32m[I 2023-05-25 20:18:05,374][0m Trial 8 finished with value: 0.7368369543942063 and parameters: {'iterations': 882, 'learning_rate': 0.022525843063157382, 'depth': 6, 'l2_leaf_reg': 5.253986073515961, 'border_count': 41}. Best is trial 4 with value: 0.7712859659424545.[0m
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.5),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 0.1, 10.0),
[32m[I 2023-05-25 20:19:14,921][0m Trial 9 finished with value: 0.7307692307692307 and parameters: {'iterations': 877, 'learning_rate': 0.09316183929837007, 'depth': 3, 'l2_leaf_reg': 3.593008531681482, 'border_count': 37}. Best is trial 4 with value: 0.7712859659424545.[0m
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.5),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 0.1, 10.0),
[32m[I 2023-05-25 20:21:00,088][0m Trial 10 finished with value: 0.726430482155673 and parameters: {'iterations': 626, 'learning_rate': 0.016446477236453608,

[32m[I 2023-05-25 20:39:25,409][0m Trial 17 finished with value: 0.7684152149801005 and parameters: {'iterations': 973, 'learning_rate': 0.05904781409398694, 'depth': 8, 'l2_leaf_reg': 0.5107036424817228, 'border_count': 151}. Best is trial 11 with value: 0.7715795654726952.[0m
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.5),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 0.1, 10.0),
[32m[I 2023-05-25 20:43:08,553][0m Trial 18 finished with value: 0.7575846545312194 and parameters: {'iterations': 715, 'learning_rate': 0.02715052000183209, 'depth': 9, 'l2_leaf_reg': 0.20279815196908546, 'border_count': 240}. Best is trial 11 with value: 0.7715795654726952.[0m
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.5),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 0.1, 10.0),
[32m[I 2023-05-25 20:44:25,878][0m Trial 19 finished with value: 0.7003653683042996 and parameters: {'iterations': 563, 'learning_rate': 0.0100643976

Best parameters: {'iterations': 517, 'learning_rate': 0.047054063568355974, 'depth': 10, 'l2_leaf_reg': 0.11683807614991185, 'border_count': 75}
Best accuracy: 0.7715795654726952


In [34]:
model = CatBoostClassifier(iterations = 517, learning_rate = 0.047054063568355974, depth = 10, l2_leaf_reg = 0.11683807614991185, border_count = 75)

model.fit(X_train_final, y_train)

0:	learn: 0.6813710	total: 338ms	remaining: 2m 54s
1:	learn: 0.6707417	total: 706ms	remaining: 3m 1s
2:	learn: 0.6612430	total: 1.06s	remaining: 3m 2s
3:	learn: 0.6520319	total: 1.42s	remaining: 3m 1s
4:	learn: 0.6432654	total: 1.79s	remaining: 3m 3s
5:	learn: 0.6359395	total: 2.14s	remaining: 3m 2s
6:	learn: 0.6297434	total: 2.5s	remaining: 3m 1s
7:	learn: 0.6234839	total: 2.85s	remaining: 3m 1s
8:	learn: 0.6180071	total: 3.19s	remaining: 3m
9:	learn: 0.6124650	total: 3.54s	remaining: 2m 59s
10:	learn: 0.6078602	total: 3.88s	remaining: 2m 58s
11:	learn: 0.6031106	total: 4.25s	remaining: 2m 58s
12:	learn: 0.5987036	total: 4.61s	remaining: 2m 58s
13:	learn: 0.5948262	total: 4.98s	remaining: 2m 58s
14:	learn: 0.5913396	total: 5.32s	remaining: 2m 58s
15:	learn: 0.5876569	total: 5.68s	remaining: 2m 57s
16:	learn: 0.5846856	total: 6.02s	remaining: 2m 57s
17:	learn: 0.5818373	total: 6.38s	remaining: 2m 56s
18:	learn: 0.5790497	total: 6.74s	remaining: 2m 56s
19:	learn: 0.5762778	total: 7.12s	

158:	learn: 0.4780781	total: 59.5s	remaining: 2m 13s
159:	learn: 0.4777643	total: 59.8s	remaining: 2m 13s
160:	learn: 0.4774162	total: 1m	remaining: 2m 13s
161:	learn: 0.4770484	total: 1m	remaining: 2m 12s
162:	learn: 0.4766932	total: 1m	remaining: 2m 12s
163:	learn: 0.4764083	total: 1m 1s	remaining: 2m 11s
164:	learn: 0.4760485	total: 1m 1s	remaining: 2m 11s
165:	learn: 0.4756932	total: 1m 2s	remaining: 2m 11s
166:	learn: 0.4753714	total: 1m 2s	remaining: 2m 10s
167:	learn: 0.4749979	total: 1m 2s	remaining: 2m 10s
168:	learn: 0.4746559	total: 1m 3s	remaining: 2m 10s
169:	learn: 0.4743303	total: 1m 3s	remaining: 2m 9s
170:	learn: 0.4740349	total: 1m 3s	remaining: 2m 9s
171:	learn: 0.4736662	total: 1m 4s	remaining: 2m 8s
172:	learn: 0.4733699	total: 1m 4s	remaining: 2m 8s
173:	learn: 0.4730145	total: 1m 4s	remaining: 2m 8s
174:	learn: 0.4727385	total: 1m 5s	remaining: 2m 7s
175:	learn: 0.4724727	total: 1m 5s	remaining: 2m 7s
176:	learn: 0.4721149	total: 1m 6s	remaining: 2m 6s
177:	learn

311:	learn: 0.4326589	total: 1m 55s	remaining: 1m 15s
312:	learn: 0.4324203	total: 1m 55s	remaining: 1m 15s
313:	learn: 0.4321352	total: 1m 56s	remaining: 1m 15s
314:	learn: 0.4318723	total: 1m 56s	remaining: 1m 14s
315:	learn: 0.4315714	total: 1m 57s	remaining: 1m 14s
316:	learn: 0.4312535	total: 1m 57s	remaining: 1m 14s
317:	learn: 0.4310168	total: 1m 57s	remaining: 1m 13s
318:	learn: 0.4307031	total: 1m 58s	remaining: 1m 13s
319:	learn: 0.4304347	total: 1m 58s	remaining: 1m 12s
320:	learn: 0.4302003	total: 1m 58s	remaining: 1m 12s
321:	learn: 0.4299433	total: 1m 59s	remaining: 1m 12s
322:	learn: 0.4296804	total: 1m 59s	remaining: 1m 11s
323:	learn: 0.4294209	total: 2m	remaining: 1m 11s
324:	learn: 0.4291834	total: 2m	remaining: 1m 11s
325:	learn: 0.4289126	total: 2m	remaining: 1m 10s
326:	learn: 0.4286651	total: 2m 1s	remaining: 1m 10s
327:	learn: 0.4284163	total: 2m 1s	remaining: 1m 10s
328:	learn: 0.4281838	total: 2m 1s	remaining: 1m 9s
329:	learn: 0.4279331	total: 2m 2s	remaining

467:	learn: 0.3942137	total: 2m 53s	remaining: 18.2s
468:	learn: 0.3940023	total: 2m 53s	remaining: 17.8s
469:	learn: 0.3937752	total: 2m 54s	remaining: 17.4s
470:	learn: 0.3935683	total: 2m 54s	remaining: 17.1s
471:	learn: 0.3933421	total: 2m 55s	remaining: 16.7s
472:	learn: 0.3931428	total: 2m 55s	remaining: 16.3s
473:	learn: 0.3929225	total: 2m 55s	remaining: 15.9s
474:	learn: 0.3926973	total: 2m 56s	remaining: 15.6s
475:	learn: 0.3924986	total: 2m 56s	remaining: 15.2s
476:	learn: 0.3922794	total: 2m 56s	remaining: 14.8s
477:	learn: 0.3920828	total: 2m 57s	remaining: 14.5s
478:	learn: 0.3918566	total: 2m 57s	remaining: 14.1s
479:	learn: 0.3916466	total: 2m 58s	remaining: 13.7s
480:	learn: 0.3914239	total: 2m 58s	remaining: 13.3s
481:	learn: 0.3912163	total: 2m 58s	remaining: 13s
482:	learn: 0.3909786	total: 2m 59s	remaining: 12.6s
483:	learn: 0.3907540	total: 2m 59s	remaining: 12.2s
484:	learn: 0.3905109	total: 2m 59s	remaining: 11.9s
485:	learn: 0.3903243	total: 3m	remaining: 11.5s

<catboost.core.CatBoostClassifier at 0x208aa46bf10>

In [35]:
import random

from sklearn.metrics import mean_squared_error
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score
prediction = model.predict(X_test_final)

mse = mean_squared_error(y_test, prediction)
print("Mean Squared Error:", mse)

f1 = f1_score(y_test, prediction)
print("f1:", f1)

accuracy = accuracy_score(y_test, prediction)
precision = precision_score(y_test, prediction)
recall = recall_score(y_test, prediction)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)

Mean Squared Error: 0.228779278397599
f1: 0.7353085487827892
Accuracy: 0.771220721602401
Precision: 0.7496536863167615
Recall: 0.7215021109547441


In [36]:
model.save_model("catboost_with_embs_only_optune_epochs.cbm")