In [None]:
import pandas as pd
import os
import torch
from tqdm import tqdm
import json
from collections import defaultdict
from sklearn.metrics.pairwise import cosine_similarity, rbf_kernel
import math
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
import numpy as np
import optuna
import matplotlib.pyplot as plt

In [None]:
# Switch into the data directory so the relative file names used below resolve.
# os.path.join keeps the path portable: the previously hard-coded '\\'
# separator only works on Windows.
# NOTE: the path is relative, so running this cell twice in one kernel fails.
os.chdir(os.path.join('avitotech_data', 'avitotech_data'))

In [None]:
# Read every parquet shard (paths are relative to the current working directory).
df_train_1, df_train_2, df_train_3, df_train_4 = map(
    pd.read_parquet,
    (
        "train_part_0001.snappy.parquet",
        "train_part_0002.snappy.parquet",
        "train_part_0003.snappy.parquet",
        "train_part_0004.snappy.parquet",
    ),
)

df_test_1, df_test_2 = map(
    pd.read_parquet,
    (
        "test_part_0001.snappy.parquet",
        "test_part_0002.snappy.parquet",
    ),
)

# Stack the shards row-wise; each shard keeps its own index values.
train_parts = [df_train_1, df_train_2, df_train_3, df_train_4]
test_parts = [df_test_1, df_test_2]
df_train = pd.concat(train_parts)
df_test = pd.concat(test_parts)

In [None]:
# Columns removed from both frames before feature extraction.
_dropped_in_both = [
    'base_title', 'cand_title',
    'base_description', 'cand_description',
    'base_category_name', 'cand_category_name',
    'base_subcategory_name', 'cand_subcategory_name',
    'base_param1', 'cand_param1',
    'base_param2', 'cand_param2',
    'base_title_image', 'cand_title_image',
    'is_same_location', 'is_same_region',
]

# The train frame additionally loses 'group_id' and 'action_date'.
df_train = df_train.drop(columns=['group_id', 'action_date'] + _dropped_in_both)

df_test = df_test.drop(columns=_dropped_in_both)

In [None]:
# Load the saved embedding objects onto the CPU; they are indexed by item id
# further down in the notebook.
train_merged_embed, test_merged_embed = (
    torch.load(embed_path, map_location="cpu")
    for embed_path in ("train_merged_embed.pt", "test_merged_embed.pt")
)

# Model params range

In [None]:
def mean_average_precision(y_true, y_pred):
    """Average precision of the ranking induced by ``y_pred``.

    Items are ordered by descending score. At every rank holding a label equal
    to 1, precision@rank is weighted by the recall gained at that rank, and the
    weighted precisions are summed.

    Args:
        y_true: sequence of labels; entries equal to 1 count as positives.
        y_pred: sequence of scores, same length as ``y_true``.

    Returns:
        The average precision, or ``0.0`` when the labels sum to zero.
    """
    ranking = np.argsort(y_pred)[::-1]
    labels_ranked = np.array(y_true)[ranking]
    n_positives = np.sum(labels_ranked)
    if n_positives == 0:
        return 0.0

    # 1-based ranks at which a positive label shows up, and the running count
    # of positives found at each of those ranks.
    hit_ranks = np.flatnonzero(labels_ranked == 1) + 1
    hits_so_far = np.arange(1, len(hit_ranks) + 1)

    precisions = hits_so_far / hit_ranks
    recalls = hits_so_far / n_positives

    # Recall gained at each hit; the first gain is the first recall itself.
    delta_recalls = np.concatenate(([recalls[0]], np.diff(recalls)))
    return np.sum(precisions * delta_recalls)

def objective(trial):
    """Optuna objective: train CatBoost with sampled parameters, return validation mAP.

    Reads the module-level ``df_extracted_train``/``y_train`` (fit data) and
    ``df_extracted_val``/``y_val`` (evaluation data).

    Args:
        trial: the Optuna trial used to sample hyper-parameters.

    Returns:
        ``mean_average_precision`` of the positive-class probabilities on the
        validation set.
    """
    # Hyper-parameter search space.
    # suggest_float replaces the deprecated suggest_uniform.
    params = {
        'iterations': trial.suggest_int('iterations', 2200, 6000),
        'depth': trial.suggest_int('depth', 7, 15),
        'learning_rate': trial.suggest_float('learning_rate', 0.07, 0.25),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 5, 16),
        'random_strength': trial.suggest_float('random_strength', 0, 2),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0.1, 0.8),
        'eval_metric': 'Logloss',
        'loss_function': 'Logloss',
        'task_type': 'GPU',
        'verbose': 0
    }

    model = CatBoostClassifier(**params)
    model.fit(
        df_extracted_train, y_train,
        eval_set=(df_extracted_val, y_val),
        use_best_model=False,
        verbose=0
    )
    # Column 1 of predict_proba is the probability of the positive class.
    y_val_pred = model.predict_proba(df_extracted_val)[:, 1]
    # IMPORTANT: return mAP so that Optuna maximises it.
    return mean_average_precision(y_val, y_val_pred)

# Build per-item attribute lookups

In [None]:
def _collect_item_attributes(pairs_df):
    """Map every item id seen in ``pairs_df`` to its price, JSON params and image count.

    Each row of ``pairs_df`` describes a (base, cand) pair. The first occurrence
    of an item id wins; within a row the base item is registered before the
    candidate. Columns are iterated with ``zip`` instead of ``iterrows`` to
    avoid building a Series per row.

    Args:
        pairs_df: DataFrame with ``base_*``/``cand_*`` columns for ``item_id``,
            ``price``, ``json_params`` and ``count_images``.

    Returns:
        dict of item id -> {'price', 'json_params', 'count_images'}.
    """
    items = {}
    rows = zip(
        pairs_df['base_item_id'], pairs_df['base_price'],
        pairs_df['base_json_params'], pairs_df['base_count_images'],
        pairs_df['cand_item_id'], pairs_df['cand_price'],
        pairs_df['cand_json_params'], pairs_df['cand_count_images'],
    )

    for (base_item_id, base_price, base_json_params, base_count_images,
         cand_item_id, cand_price, cand_json_params, cand_count_images) in tqdm(rows, total=len(pairs_df)):
        if base_item_id not in items:
            items[base_item_id] = {
                'price':        base_price,
                'json_params':  base_json_params,
                'count_images': base_count_images,
            }

        if cand_item_id not in items:
            items[cand_item_id] = {
                'price':        cand_price,
                'json_params':  cand_json_params,
                'count_images': cand_count_images,
            }

    return items


# Plain dicts: the previous defaultdict() had no default factory, so it
# behaved exactly like a dict.
item_data = _collect_item_attributes(df_train)

item_data_test = _collect_item_attributes(df_test)

In [None]:
# Train items: keep only those that have an embedding and attach it.
# The attribute dict is shared with item_data (it is updated in place).
item_info = defaultdict()

for item_id, attrs in tqdm(item_data.items()):
    if item_id not in train_merged_embed.keys():
        continue
    attrs['embed'] = train_merged_embed[item_id]
    item_info[item_id] = attrs

# Test items: every item is looked up directly, so a missing embedding raises.
item_info_test = defaultdict()

for item_id, attrs in tqdm(item_data_test.items()):
    attrs['embed'] = test_merged_embed[item_id]
    item_info_test[item_id] = attrs

# Extract features

In [None]:
X = []
y= []

for row in tqdm(df_train.iterrows(), total=len(df_train)):
    row_element = row[1]
    
    base_item_id = row_element['base_item_id']
    cand_item_id = row_element['cand_item_id']

    if base_item_id in item_info.keys() and cand_item_id in item_info.keys():
        base_price = item_info[base_item_id]['price']
        cand_price = item_info[cand_item_id]['price']

        base_json_params = json.loads(item_info[base_item_id]['json_params'])
        cand_json_params = json.loads(item_info[cand_item_id]['json_params'])

        base_count_images = item_info[base_item_id]['count_images']
        cand_count_images = item_info[cand_item_id]['count_images']

        base_embed = item_info[base_item_id]['embed'].reshape(1, -1)
        cand_embed = item_info[cand_item_id]['embed'].reshape(1, -1)

        is_double = row_element['is_double']
        # price
        price_dif = 2 * abs(base_price - cand_price) / max((base_price + cand_price), 1)

        # json
        base_unique_keys = set(base_json_params.keys())
        cand_unique_keys = set(cand_json_params.keys())

        intersect = base_unique_keys.intersection(cand_unique_keys)
        union = base_unique_keys.union(cand_unique_keys)

        ## a. Jaccard
        jaccard = 1 if len(union) == 0 else len(intersect) / len(union)

        ## b. Ratio of intersect
        ratio = 1 if len(union) == 0 else len(intersect) / max(min(len(base_unique_keys), len(cand_unique_keys)), 1)

        ## c. shared
        shared_int = 0
        shared_float = 0
        shared_str = 0
        shared_list = 0
        
        intersect_int = 0
        intersect_float = 0
        intersect_str = 0
        intersect_list = 0

        for unique_key in intersect:
            # int
            if isinstance(base_json_params[unique_key], int) and isinstance(cand_json_params[unique_key], int):
                intersect_int += 1
                if base_json_params[unique_key] == cand_json_params[unique_key]:
                    shared_int += 1

            # float
            if isinstance(base_json_params[unique_key], float) and isinstance(cand_json_params[unique_key], float):
                intersect_float += 1
                if base_json_params[unique_key] == cand_json_params[unique_key]:
                    shared_float += 1

            # str
            if isinstance(base_json_params[unique_key], str) and isinstance(cand_json_params[unique_key], str):
                intersect_str += 1
                if base_json_params[unique_key] == cand_json_params[unique_key]:
                    shared_str += 1

            # list
            if isinstance(base_json_params[unique_key], list) and isinstance(cand_json_params[unique_key], list):
                intersect_list += 1
                if len(base_json_params[unique_key]) == 0 or len(cand_json_params[unique_key]) == 0:
                    continue
                
                if isinstance(base_json_params[unique_key][0], dict) or isinstance(cand_json_params[unique_key][0], dict):
                    if set(base_json_params[unique_key][0].keys()) == set(cand_json_params[unique_key][0].keys()):
                        shared_list += 1
                elif set(base_json_params[unique_key]) == set(cand_json_params[unique_key]):
                    shared_list += 1

        shared = shared_int + shared_float + shared_str + shared_list

        same_items_ratio       = shared / max(len(intersect), 1)
        same_items_ratio_int   = shared_int / max(intersect_int, 1)
        same_items_ratio_float = shared_float / max(intersect_float, 1)
        same_items_ratio_str   = shared_str / max(intersect_str, 1)
        same_items_ratio_list  = shared_list / max(intersect_list, 1)

        # jaccard per type
        union_int = set()
        union_float = set()
        union_str = set()
        union_list = set()

        for key, value in base_json_params.items():
            if isinstance(value, int):
                union_int.add(key)
            elif isinstance(value, float):
                union_float.add(key)
            elif isinstance(value, str):
                union_str.add(key)
            elif isinstance(value, list):
                union_list.add(key)

        for key, value in cand_json_params.items():
            if isinstance(value, int):
                union_int.add(key)
            elif isinstance(value, float):
                union_float.add(key)
            elif isinstance(value, str):
                union_str.add(key)
            elif isinstance(value, list):
                union_list.add(key)

        jaccard_int = 1 if len(union_int) == 0 else intersect_int / len(union_int)
        jaccard_float = 1 if len(union_float) == 0 else intersect_float / len(union_float)
        jaccard_str = 1 if len(union_str) == 0 else intersect_str / len(union_str)
        jaccard_list = 1 if len(union_list) == 0 else intersect_list / len(union_list)
        
        # img diff
        img_diff = abs(0 if math.isnan(base_count_images) else base_count_images - 0 if math.isnan(cand_count_images) else cand_count_images)

        # cosine_similarity
        cos_sim = cosine_similarity(base_embed, cand_embed).item()

        # rbf kernel
        rbf = rbf_kernel(base_embed, cand_embed).item()

        X.append(
            {
                'price_dif': round(price_dif),
                'jaccard': round(jaccard, 5),
                'jaccard_int': round(jaccard_int, 5),
                'jaccard_float': round(jaccard_float, 5),
                'jaccard_str': round(jaccard_str, 5),
                'jaccard_list': round(jaccard_list, 5),
                'ratio': round(ratio, 5),
                'same_items_ratio': round(same_items_ratio, 5),
                'same_items_ratio_int': round(same_items_ratio_int, 5),
                'same_items_ratio_float': round(same_items_ratio_float, 5),
                'same_items_ratio_str': round(same_items_ratio_str, 5),
                'same_items_ratio_list': round(same_items_ratio_list, 5),
                'img_diff': round(img_diff),
                'cos_sim': round(cos_sim, 5),
                'rbf': round(rbf, 5)
            }
        )

        y.append(is_double)

In [None]:
X_test = []

for row in tqdm(df_test.iterrows(), total=len(df_test)):
    row_element = row[1]
    
    base_item_id = row_element['base_item_id']
    cand_item_id = row_element['cand_item_id']

    if base_item_id in item_info_test.keys() and cand_item_id in item_info_test.keys():
        base_price = item_info_test[base_item_id]['price']
        cand_price = item_info_test[cand_item_id]['price']

        base_json_params = json.loads(item_info_test[base_item_id]['json_params'])
        cand_json_params = json.loads(item_info_test[cand_item_id]['json_params'])

        base_count_images = item_info_test[base_item_id]['count_images']
        cand_count_images = item_info_test[cand_item_id]['count_images']

        base_embed = item_info_test[base_item_id]['embed'].reshape(1, -1)
        cand_embed = item_info_test[cand_item_id]['embed'].reshape(1, -1)

        # price
        price_dif = 2 * abs(base_price - cand_price) / max((base_price + cand_price), 1)

        # json
        base_unique_keys = set(base_json_params.keys())
        cand_unique_keys = set(cand_json_params.keys())

        intersect = base_unique_keys.intersection(cand_unique_keys)
        union = base_unique_keys.union(cand_unique_keys)

        ## a. Jaccard
        jaccard = 1 if len(union) == 0 else len(intersect) / len(union)

        ## b. Ratio of intersect
        ratio = 1 if len(union) == 0 else len(intersect) / max(min(len(base_unique_keys), len(cand_unique_keys)), 1)

        ## c. shared
        shared_int = 0
        shared_float = 0
        shared_str = 0
        shared_list = 0
        
        intersect_int = 0
        intersect_float = 0
        intersect_str = 0
        intersect_list = 0

        for unique_key in intersect:
            # int
            if isinstance(base_json_params[unique_key], int) and isinstance(cand_json_params[unique_key], int):
                intersect_int += 1
                if base_json_params[unique_key] == cand_json_params[unique_key]:
                    shared_int += 1

            # float
            if isinstance(base_json_params[unique_key], float) and isinstance(cand_json_params[unique_key], float):
                intersect_float += 1
                if base_json_params[unique_key] == cand_json_params[unique_key]:
                    shared_float += 1

            # str
            if isinstance(base_json_params[unique_key], str) and isinstance(cand_json_params[unique_key], str):
                intersect_str += 1
                if base_json_params[unique_key] == cand_json_params[unique_key]:
                    shared_str += 1

            # list
            if isinstance(base_json_params[unique_key], list) and isinstance(cand_json_params[unique_key], list):
                intersect_list += 1
                if len(base_json_params[unique_key]) == 0 or len(cand_json_params[unique_key]) == 0:
                    continue
                
                if isinstance(base_json_params[unique_key][0], dict) or isinstance(cand_json_params[unique_key][0], dict):
                    if set(base_json_params[unique_key][0].keys()) == set(cand_json_params[unique_key][0].keys()):
                        shared_list += 1
                elif set(base_json_params[unique_key]) == set(cand_json_params[unique_key]):
                    shared_list += 1

        shared = shared_int + shared_float + shared_str + shared_list

        same_items_ratio       = shared / max(len(intersect), 1)
        same_items_ratio_int   = shared_int / max(intersect_int, 1)
        same_items_ratio_float = shared_float / max(intersect_float, 1)
        same_items_ratio_str   = shared_str / max(intersect_str, 1)
        same_items_ratio_list  = shared_list / max(intersect_list, 1)

        # jaccard per type
        union_int = set()
        union_float = set()
        union_str = set()
        union_list = set()

        for key, value in base_json_params.items():
            if isinstance(value, int):
                union_int.add(key)
            elif isinstance(value, float):
                union_float.add(key)
            elif isinstance(value, str):
                union_str.add(key)
            elif isinstance(value, list):
                union_list.add(key)

        for key, value in cand_json_params.items():
            if isinstance(value, int):
                union_int.add(key)
            elif isinstance(value, float):
                union_float.add(key)
            elif isinstance(value, str):
                union_str.add(key)
            elif isinstance(value, list):
                union_list.add(key)

        jaccard_int = 1 if len(union_int) == 0 else intersect_int / len(union_int)
        jaccard_float = 1 if len(union_float) == 0 else intersect_float / len(union_float)
        jaccard_str = 1 if len(union_str) == 0 else intersect_str / len(union_str)
        jaccard_list = 1 if len(union_list) == 0 else intersect_list / len(union_list)
        
        # img diff
        img_diff = abs(0 if math.isnan(base_count_images) else base_count_images - 0 if math.isnan(cand_count_images) else cand_count_images)

        # cosine_similarity
        cos_sim = cosine_similarity(base_embed, cand_embed).item()

        # rbf kernel
        rbf = rbf_kernel(base_embed, cand_embed).item()

        X_test.append(
            {
                'price_dif': round(price_dif),
                'jaccard': round(jaccard, 5),
                'jaccard_int': round(jaccard_int, 5),
                'jaccard_float': round(jaccard_float, 5),
                'jaccard_str': round(jaccard_str, 5),
                'jaccard_list': round(jaccard_list, 5),
                'ratio': round(ratio, 5),
                'same_items_ratio': round(same_items_ratio, 5),
                'same_items_ratio_int': round(same_items_ratio_int, 5),
                'same_items_ratio_float': round(same_items_ratio_float, 5),
                'same_items_ratio_str': round(same_items_ratio_str, 5),
                'same_items_ratio_list': round(same_items_ratio_list, 5),
                'img_diff': round(img_diff),
                'cos_sim': round(cos_sim, 5),
                'rbf': round(rbf, 5)
            }
        )


# Split and build the optim model

In [None]:
# Hold out 20% of the pairs for validation. A fixed random_state makes the
# split (and therefore the Optuna scores) reproducible across kernel restarts.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Turn each list of per-pair feature dicts into a DataFrame (one column per key).
df_extracted_train, df_extracted_val, df_extracted_test = (
    pd.DataFrame(feature_rows) for feature_rows in (X_train, X_val, X_test)
)

In [None]:
# Run 150 trials of the objective; the study maximises its return value (mAP).
study = optuna.create_study(direction="maximize")
study.optimize(func=objective, n_trials=150)

best_trial = study.best_trial
print("Best params:", best_trial.params)
print("Best mAP:", best_trial.value)

## Discover the best parameter ranges

In [None]:
# Only completed trials carry a metric value; a failed or interrupted trial
# has values=None and would make `t.values[0]` raise.
completed_trials = [
    t for t in study.trials
    if t.state == optuna.trial.TrialState.COMPLETE
]

# One list per sampled hyper-parameter, aligned with `metric` by position.
iterations = [t.params['iterations'] for t in completed_trials]
depth = [t.params['depth'] for t in completed_trials]
learning_rate = [t.params['learning_rate'] for t in completed_trials]
l2_leaf_reg = [t.params['l2_leaf_reg'] for t in completed_trials]
random_strength = [t.params['random_strength'] for t in completed_trials]
bagging_temperature = [t.params['bagging_temperature'] for t in completed_trials]

# Objective value (validation mAP) of each completed trial.
metric = [t.values[0] for t in completed_trials]

In [None]:
# Names of the tuned hyper-parameters, in the order they are binned and plotted.
cols = [
    'iterations', 'depth', 'learning_rate',
    'l2_leaf_reg', 'random_strength', 'bagging_temperature',
]

In [None]:
# Bin width used for each hyper-parameter when grouping the trials.
steps = dict(
    iterations=100,
    depth=1,
    learning_rate=0.01,
    l2_leaf_reg=1,
    random_strength=0.1,
    bagging_temperature=0.1,
)

In [None]:
# Min-max scale the trial metrics to [0, 1].
metric_array = np.array(metric)
metric_span = metric_array.max() - metric_array.min()
if metric_span == 0:
    # All trials scored the same (or there is a single trial): avoid 0/0 -> NaN.
    metric_norm = np.zeros_like(metric_array, dtype=float)
else:
    metric_norm = (metric_array - metric_array.min()) / metric_span

In [None]:
# One row per trial: the sampled hyper-parameters plus the normalised metric.
df_correction = pd.DataFrame(dict(
    iterations=iterations,
    depth=depth,
    learning_rate=learning_rate,
    l2_leaf_reg=l2_leaf_reg,
    random_strength=random_strength,
    bagging_temperature=bagging_temperature,
    metric_norm=metric_norm,
))

In [None]:
bins_dict = {}
labels_dict = {}

for col in cols:
    mn = df_correction[col].min()
    mx = df_correction[col].max()
    step = steps[col]
    # Build the bin edges. At least one bin is always produced (min == max
    # used to yield a single edge, which pd.cut rejects), and the last edge is
    # forced to reach the maximum so float round-off cannot leave it outside.
    n_bins = max(int(np.ceil((mx - mn) / step)), 1)
    bins = mn + step * np.arange(n_bins + 1)
    bins[-1] = max(bins[-1], mx)
    bins_dict[col] = bins
    # Pick the label format: whole numbers only for integer-valued columns, so
    # float edges (e.g. l2_leaf_reg) are not shown rounded to integers.
    fmt = ".0f" if np.issubdtype(df_correction[col].dtype, np.integer) else ".2f"
    # Build the "low-high" labels, one per bin.
    labels = [f"{bins[i]:{fmt}}-{bins[i+1]:{fmt}}" for i in range(len(bins)-1)]
    labels_dict[col] = labels

In [None]:
# Replace every hyper-parameter column by its labelled bin (closed on the
# right, with the lowest edge included).
df_correction = df_correction.assign(**{
    col: pd.cut(
        df_correction[col],
        bins=bins_dict[col],
        labels=labels_dict[col],
        include_lowest=True,
        right=True,
    )
    for col in cols
})

In [None]:
# One bar chart per hyper-parameter: the normalised metric summed per bin
# (observed=False keeps bins that received no trial).
for col in cols:
    per_bin_sum = df_correction.groupby(col, observed=False)['metric_norm'].sum()
    agg = per_bin_sum.reset_index()

    fig, ax = plt.subplots()
    ax.bar(agg[col].astype(str), agg['metric_norm'])
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
    ax.set_title(f'Sum of metric_norm by {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Sum of metric_norm')
    plt.show()

# Retrain the model with best params

In [None]:
# study.best_params only holds the values sampled through trial.suggest_*.
# Re-add the fixed settings the objective trained with, so the final model
# matches the configuration that was actually tuned.
best_params = {
    **study.best_params,
    'eval_metric': 'Logloss',
    'loss_function': 'Logloss',
    'task_type': 'GPU',
}
best_model = CatBoostClassifier(**best_params)

In [None]:
# Refit on train + validation together; progress is logged every 100 iterations.
X_train_val = pd.concat([df_extracted_train, df_extracted_val])
y_train_val = y_train + y_val

best_model.fit(X_train_val, y_train_val, verbose=100)

# Predict on the test set and build the submission

In [None]:
# Remove the raw attribute columns from the test frame.
_raw_attribute_columns = [
    'base_price', 'cand_price',
    'base_json_params', 'cand_json_params',
    'base_count_images', 'cand_count_images',
]
df_test = df_test.drop(columns=_raw_attribute_columns)

In [None]:
# Score the test pairs with the refitted model. y_test_proba was previously
# used here without ever being defined, so the cell failed on a fresh kernel.
# Column 1 of predict_proba is the positive-class probability, as in the objective.
y_test_proba = best_model.predict_proba(df_extracted_test)[:, 1]
y_test_proba = y_test_proba.astype(np.float32)

In [None]:
# Append the predictions as the last column of the test frame.
df_test = df_test.assign(y_test_proba=y_test_proba)

In [None]:
# Rename to the submission column names. Columns are addressed by name rather
# than by position, so the mapping stays correct whatever the column order is.
df_test = df_test.rename(columns={
    "base_item_id": "base_id",
    "cand_item_id": "cand_id",
    "y_test_proba": "probability"
})

In [None]:
# Write the submission as a comma-separated file without the index column.
df_test.to_csv(path_or_buf="submission.csv", sep=",", index=False)