In [None]:
#🔵
# Project root for a local run. The Colab cell below assigns main_path as well,
# so whichever of the two cells is executed last decides the value.
main_path = '/Users/stanislavkrupnov/Jup.Notebook'

In [None]:
#🟠
# Colab run: mount Google Drive, then point main_path at the project folder on it.
import gdown
from google.colab import drive

drive.mount('/content/drive')
main_path = '/content/drive/Othercomputers/Mac/Jup.Notebook'

# Описание ноутбука

Я провожу тюнинг гиперпараметров модели **CatBoost Ranker** в две попытки:
1. Используя функцию потерь **Pairlogit**
2. Используя функцию потерь **YetiRank** и метрику **PFound**

Подбор выполняю с помощью Optuna

# Функции

In [None]:
def metric_eval(predictions: "pd.DataFrame", valid: "pd.DataFrame",
                id2type: dict) -> tuple:
    """
    Compute the weighted recall score and the recall of every event type.

    The annotations are quoted so that this cell can be executed before the
    import cell of the notebook (pandas is only needed when the function runs).

    Parameters:
    - predictions (pd.DataFrame): one row per (session, type) with the columns
      `session`, `type` (event-type name) and `labels` (list of predicted aids).
    - valid (pd.DataFrame): ground-truth events with the columns `session`,
      `aid` and `type`, where `type` holds indices that `id2type` resolves to
      names. The frame is NOT modified.
    - id2type (dict): lookup from type index to type name; anything that
      supports `id2type[idx]` works.

    Returns:
    - tuple: (local_validation_score, recall_per_type), where the score is
      0.10 * recall(clicks) + 0.30 * recall(carts) + 0.60 * recall(orders) and
      recall_per_type is a Series indexed by type name.
    """
    # Translate type indices to names on a new frame, so the caller's `valid`
    # stays untouched and the function can be called repeatedly on it.
    valid = valid.assign(type=valid['type'].map(lambda idx: id2type[idx]))

    # One list of true aids per (session, type)
    ground_truth = (valid.groupby(['session', 'type'])['aid'].apply(list)
                    .reset_index()
                    .rename(columns={'aid': 'labels'}))

    # For clicks only the first true aid counts
    is_click = ground_truth['type'] == 'clicks'
    ground_truth.loc[is_click, 'labels'] = ground_truth.loc[is_click,
                                                            'labels'].str[:1]

    # Attach the true labels to the predictions: `labels_x` are the predicted
    # aids, `labels_y` the true ones.
    submission_with_gt = predictions.merge(
        ground_truth[['session', 'type', 'labels']],
        how='left',
        on=['session', 'type'])

    # Keep only rows that have ground truth; copy so that the column
    # assignments below do not write into a slice of the merged frame.
    submission_with_gt = submission_with_gt[
        submission_with_gt['labels_y'].notna()].copy()

    # Number of predicted aids that are also true aids (also fine for 0 rows)
    submission_with_gt['hits'] = [
        len(set(predicted).intersection(actual))
        for predicted, actual in zip(submission_with_gt['labels_x'],
                                     submission_with_gt['labels_y'])
    ]

    # Number of true aids per row, capped at 20
    submission_with_gt['gt_count'] = submission_with_gt['labels_y'].str.len(
    ).clip(0, 20)

    # Recall per type = total hits / total (capped) ground-truth count
    per_type = submission_with_gt.groupby('type')
    recall_per_type = per_type['hits'].sum() / per_type['gt_count'].sum()

    # Weighted sum of the per-type recalls
    local_validation_score = (recall_per_type * pd.Series({
        'clicks': 0.10,
        'carts': 0.30,
        'orders': 0.60
    })).sum()

    return local_validation_score, recall_per_type

In [None]:
def read_parquets(path):
    """
    Read every Parquet file matching a glob pattern into a single DataFrame.

    Parameters:
    - path (str): glob pattern of the Parquet files (e.g. 'folder/*').

    Returns:
    - pd.DataFrame: rows of all matched files, stacked in sorted file-name
      order with a fresh 0..n-1 index; an empty DataFrame if nothing matches.
    """
    # Sorted so the row order does not depend on the file-system listing order
    file_list = sorted(glob.glob(path))

    # Read all files first and concatenate once: concatenating inside the loop
    # re-copies the accumulated frame on every iteration.
    frames = [pd.read_parquet(file) for file in tqdm(file_list)]

    if not frames:
        # pd.concat raises on an empty list
        return pd.DataFrame()

    return pd.concat(frames, ignore_index=True)

# Import

In [None]:
!pip install optuna



In [None]:
import gc
import glob
import itertools
import os
import pickle
import shutil
import sys
from datetime import datetime as dt

import numpy as np
import optuna
import pandas as pd
import polars as pl
import pyarrow.parquet as pq
import xgboost as xgb
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import GroupKFold
from tqdm import tqdm

# Ver_8 CatB Tune Pairlogit

## Подбор гиперпараметров с Optuna

In [None]:
# Run configuration of the PairLogit tuning section.
# Name of the sub-folder the tuned models are saved to
ver_tun = 'catb_tun'
# Version of the candidate set; selects the `ver_<n>` data folder
ver = 6
ver_folder = f'ver_{ver}'
# Event type tuned in this run; switch by (un)commenting
typee = 'clicks'
# typee = 'carts'
# typee = 'orders'

# File name of the pickled type-index -> type-name lookup
id2type_name = 'id2type.pkl'

# Seed handed to the Optuna objective
random_state = 42

In [None]:
# Load all Parquet shards with the training candidates of the selected event
# type. The notebook's read_parquets helper replaces a copy of its loop here.
tr_candidates = read_parquets(
    f'{main_path}/tr/{ver_folder}/tr_candidates_{typee}/*')

100%|██████████| 5/5 [00:22<00:00,  4.49s/it]


**Downsampling**

In [None]:
from imblearn.under_sampling import RandomUnderSampler

# sampling_strategy is the desired class ratio after undersampling.
# The sampler draws rows at random, so it is seeded with the notebook's
# random_state to make the downsampled set reproducible.

# clicks
under_sampler = RandomUnderSampler(sampling_strategy=0.43,
                                   random_state=random_state)

# carts
# under_sampler = RandomUnderSampler(sampling_strategy=0.0624, random_state=random_state)

# orders
# under_sampler = RandomUnderSampler(sampling_strategy=0.0624, random_state=random_state)

In [None]:
# Every column except the last one goes into X; the target is taken by name.
FEATURES = tr_candidates.columns[:-1]
X = tr_candidates[FEATURES]
y = tr_candidates['target']

# Undersample, then put the resampled target back next to the resampled
# features by joining on the index.
X_train, y_train = under_sampler.fit_resample(X, y)
tr_candidates = X_train.merge(y_train, left_index=True, right_index=True)

In [None]:
# Order the rows by session and renumber the index from 0.
tr_candidates = (tr_candidates
                 .sort_values('session')
                 .reset_index(drop=True))

In [None]:
# Drop the intermediate frames; only tr_candidates is used from here on.
del X, y, X_train, y_train

In [None]:
# Checkpoint the downsampled candidates in the working directory.
tr_candidates.to_parquet('tr_candidates.parquet')

In [None]:
# Reload the checkpoint written by the previous cell
# (presumably to resume from here without repeating the downsampling).
tr_candidates = pd.read_parquet('tr_candidates.parquet')

**Optuna**

**Я не могу замерять recall@20 внутри Optuna, так как для recall@20 нужно изначально делить сессии на две части, а тут их придётся делить внутри фолдов, используя valid-части. Поскольку это уже не сессии юзеров, а их рекомендации (там только уникальные aid), и я не строю рекомендации внутри фолдов заново, а лишь могу ранжировать, то пересечений между рекомендациями в одной части и «истинными метками» в другой, конечно, не будет**. \
Опять же, recall@20 оценивает в целом мою возможность рекомендовать, а внутри Optuna моя цель — лишь хорошо ранжировать.

In [None]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.2-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m18.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.2


In [None]:
import catboost
from catboost import CatBoostRanker, Pool

In [None]:
import os
# Sets an environment flag for GPU use.
# NOTE(review): the training code in this notebook selects the GPU through
# task_type='GPU' in the CatBoost parameters; confirm that anything actually
# reads CATBOOST_GPU.
os.environ['CATBOOST_GPU'] = '1'

In [None]:
# Create the local folder for trained models (no error if it already exists).
folder_path = 'model'
os.makedirs(folder_path, exist_ok=True)

In [None]:
def objective(trial: optuna.trial.Trial, data: pd.DataFrame,
              random_state: int) -> float:
    """
    Optuna objective: 5-fold grouped cross-validation of a CatBoostRanker.

    Parameters:
    - trial
        Optuna trial that supplies the hyperparameter values.
    - data
        Training frame. The first column is used as the group id, the first
        two columns are excluded from the features, the last column is
        excluded as well, and the label is read from the `target` column.
        A `session` column defines the cross-validation groups.
    - random_state
        Seed passed to CatBoost as `random_seed`.

    Returns:
    - Mean over the folds of the best validation PairLogit value.
    """

    # Search space. Single-element categorical lists keep a value fixed while
    # still recording it among the trial parameters.
    catboost_params = {
        'loss_function':
        'PairLogit',
        'task_type':
        'GPU',
        'random_seed':
        trial.suggest_categorical('random_seed', [random_state]),
        'iterations':
        trial.suggest_categorical("iterations", [1515]),
        'learning_rate':
        trial.suggest_categorical("learning_rate", [0.013688644928808849]),
        'l2_leaf_reg':
        trial.suggest_float('l2_leaf_reg', 1e-5, 1e2, log=True),
        'border_count':
        trial.suggest_int('border_count', 32, 255),
        'random_strength':
        trial.suggest_float('random_strength', 0.0, 1.0),
        'bootstrap_type':
        trial.suggest_categorical('bootstrap_type',
                                  ['Bernoulli', 'MVS', 'Poisson']),
        "depth":
        trial.suggest_int("depth", 1, 10),
        "subsample":
        trial.suggest_float("subsample", 0.05, 1.0),
        "min_data_in_leaf":
        trial.suggest_int("min_data_in_leaf", 1, 100)
    }

    # Grouped folds: all rows of one session end up in the same fold
    gkf = GroupKFold(n_splits=5)
    FEATURES = data.columns[:-1]

    # Best validation PairLogit of every fold
    scores = []

    for train_idx, valid_idx in gkf.split(data,
                                          data['target'],
                                          groups=data['session']):

        # split() yields positional indices, so rows are selected with iloc;
        # this stays correct even if `data` does not carry a 0..n-1 index.
        train_part = data.iloc[train_idx]
        valid_part = data.iloc[valid_idx]

        y_train = train_part['target']
        y_valid = valid_part['target']

        # Column 0 is the group id; columns 0 and 1 are not model features
        # (presumably `session` and `aid` - verify against the candidate files)
        group_id_tr = train_part.iloc[:, 0]
        group_id_v = valid_part.iloc[:, 0]

        X_train = train_part[FEATURES].iloc[:, 2:]
        X_valid = valid_part[FEATURES].iloc[:, 2:]

        catboost_train = Pool(data=X_train,
                              label=y_train,
                              group_id=group_id_tr)
        catboost_valid = Pool(data=X_valid, label=y_valid, group_id=group_id_v)

        # Train with the trial's hyperparameters and evaluate on the fold
        ranker = CatBoostRanker(**catboost_params)
        ranker.fit(catboost_train, eval_set=catboost_valid, verbose_eval=100)

        # Best PairLogit reached on the validation part of this fold
        scores.append(ranker.best_score_['validation']['PairLogit'])

    # Average over the folds
    pairlogit_mean = np.mean(scores)

    return pairlogit_mean

In [None]:
# Optuna study for the search; the objective returns a loss, so it is minimized.
study = optuna.create_study(study_name="CatBoost_Ranker_Optimization",
                            direction="minimize")


def func(trial):
    """Call the objective with the training frame and the seed filled in."""
    return objective(trial, tr_candidates, random_state=random_state)

[I 2023-09-29 04:12:26,159] A new study created in memory with name: CatBoost_Ranker_Optimization


In [None]:
# Run the hyperparameter search.
# NOTE(review): the objective trains with task_type='GPU'; confirm that 6
# parallel jobs fit on the device before raising n_trials (with n_trials=1
# only a single trial is executed).
study.optimize(func, n_trials=1, show_progress_bar=True, n_jobs=6)

**Записываю варианты получившихся параметров и выбираю лучшие**

**Clicks** \
0.3941695 \
{'random_seed': 42, 'iterations': 1758, 'learning_rate': 0.053701000860254436} \   
**0.3936051** \
{'random_seed': 42,
 'iterations': 1758,
 'learning_rate': 0.053701000860254436,
 'l2_leaf_reg': 0.8881227642380258,
 'border_count': 92,
 'random_strength': 0.8903326222703105,
 'bootstrap_type': 'Poisson',
 'depth': 6,
 'subsample': 0.9931914618968313,
 'min_data_in_leaf': 7} \.
 **Carts** \
  0.3076646 \   
  {'random_seed': 42, 'iterations': 724, 'learning_rate': 0.04034712260216462} \
  **0.307172** \  
  {'random_seed': 42,
 'iterations': 724,
 'learning_rate': 0.04034712260216462,
 'l2_leaf_reg': 7.449337336120884,
 'border_count': 198,
 'random_strength': 0.3325166004315703,
 'depth': 8,
 'min_data_in_leaf': 12} \  

 **Orders** \
0.1920218 \  
{'random_seed': 42, 'iterations': 1515, 'learning_rate': 0.013688644928808849} \  
0.1920230 \  
{'random_seed': 42,
 'iterations': 1515,
 'learning_rate': 0.013688644928808849,
 'l2_leaf_reg': 15.665624071536678,
 'border_count': 246,
 'random_strength': 0.5474358984091794,
 'depth': 6,
 'min_data_in_leaf': 48} \.  
 **0.1910265** \  
{'random_seed': 42,
 'iterations': 1515,
 'learning_rate': 0.013688644928808849,
 'l2_leaf_reg': 0.03669716203608452,
 'border_count': 235,
 'random_strength': 0.526822845804525,
 'bootstrap_type': 'Poisson',
 'depth': 7,
 'subsample': 0.938154110187783,
 'min_data_in_leaf': 28} \

In [None]:
# Hyperparameters of the best trial of the study
study.best_params

{'random_seed': 42,
 'iterations': 1515,
 'learning_rate': 0.013688644928808849,
 'l2_leaf_reg': 0.03669716203608452,
 'border_count': 235,
 'random_strength': 0.526822845804525,
 'bootstrap_type': 'Poisson',
 'depth': 7,
 'subsample': 0.938154110187783,
 'min_data_in_leaf': 28}

**Формирую словари лучших гиперпараметров**

In [None]:
# PairLogit parameters chosen for the `clicks` ranker
# (the best clicks trial recorded in this notebook).
best_p_cl = dict(
    loss_function='PairLogit',
    task_type='GPU',
    random_seed=42,
    iterations=1758,
    learning_rate=0.053701000860254436,
    l2_leaf_reg=0.8881227642380258,
    border_count=92,
    random_strength=0.8903326222703105,
    bootstrap_type='Poisson',
    depth=6,
    subsample=0.9931914618968313,
    min_data_in_leaf=7,
)

In [None]:
# PairLogit parameters chosen for the `carts` ranker
# (the best carts trial recorded in this notebook).
best_p_carts = dict(
    loss_function='PairLogit',
    task_type='GPU',
    random_seed=42,
    iterations=724,
    learning_rate=0.04034712260216462,
    l2_leaf_reg=7.449337336120884,
    border_count=198,
    random_strength=0.3325166004315703,
    depth=8,
    min_data_in_leaf=12,
)

In [None]:
# PairLogit parameters chosen for the `orders` ranker
# (the best orders trial recorded in this notebook).
best_p_orders = dict(
    loss_function='PairLogit',
    task_type='GPU',
    random_seed=42,
    iterations=1515,
    learning_rate=0.013688644928808849,
    l2_leaf_reg=0.03669716203608452,
    border_count=235,
    random_strength=0.526822845804525,
    bootstrap_type='Poisson',
    depth=7,
    subsample=0.938154110187783,
    min_data_in_leaf=28,
)

## Обучение на best_params

In [None]:
# The negative class is undersampled until the positive/negative ratio is
# downsamp_k times the ratio found in the raw data.
downsamp_k = 16
params = [best_p_cl, best_p_carts, best_p_orders]

# All folds of all event types are saved to the same folder: create it once
folder_path = f'model/{ver_tun}'
os.makedirs(folder_path, exist_ok=True)

# One 5-fold model set per event type, each with that type's tuned parameters.
# `total` is given because zip() has no length for the progress bar.
for typee, best_params in tqdm(zip(['clicks', 'carts', 'orders'], params),
                               total=len(params),
                               desc='type'):

    # Load the training candidates of the current event type
    path = f'{main_path}/tr/ver_6/tr_candidates_{typee}/*'
    tr_candidates = read_parquets(path)
    print('reading_done')

    # Store the id columns as int32
    tr_candidates['session'] = tr_candidates['session'].astype(np.int32)
    tr_candidates['aid'] = tr_candidates['aid'].astype(np.int32)

    # Positive/negative ratio in the raw data
    target_counts = tr_candidates['target'].value_counts()
    class_ratio = target_counts[1] / target_counts[0]

    # Seeded so that the random undersampling is reproducible
    under_sampler = RandomUnderSampler(sampling_strategy=class_ratio *
                                       downsamp_k,
                                       random_state=random_state)

    # Undersample and rebuild one frame from the resampled features and target
    FEATURES = tr_candidates.columns[:-1]
    X = tr_candidates.loc[:, FEATURES]
    y = tr_candidates.loc[:, 'target']
    X_train, y_train = under_sampler.fit_resample(X, y)
    tr_candidates = pd.merge(X_train,
                             y_train,
                             left_index=True,
                             right_index=True)
    tr_candidates = tr_candidates.sort_values('session')
    tr_candidates = tr_candidates.reset_index(drop=True)
    del X, y
    print('sampling_done')

    # Grouped folds: all rows of one session end up in the same fold
    gkf = GroupKFold(n_splits=5)
    FEATURES = tr_candidates.columns[:-1]
    scores = []

    for fold, (train_idx, valid_idx) in enumerate(
            gkf.split(tr_candidates,
                      tr_candidates['target'],
                      groups=tr_candidates['session'])):

        # split() yields positional indices, hence iloc
        train_part = tr_candidates.iloc[train_idx]
        valid_part = tr_candidates.iloc[valid_idx]

        y_train = train_part['target']
        y_valid = valid_part['target']

        # Column 0 is the group id; columns 0 and 1 are not model features
        # (presumably `session` and `aid` - verify against the candidate files)
        group_id_tr = train_part.iloc[:, 0]
        group_id_v = valid_part.iloc[:, 0]

        X_train = train_part[FEATURES].iloc[:, 2:]
        X_valid = valid_part[FEATURES].iloc[:, 2:]

        catboost_train = Pool(data=X_train,
                              label=y_train,
                              group_id=group_id_tr)
        catboost_valid = Pool(data=X_valid, label=y_valid, group_id=group_id_v)

        # Train with the tuned parameters and evaluate on the validation part
        ranker = CatBoostRanker(**best_params)
        ranker.fit(catboost_train, eval_set=catboost_valid, verbose_eval=100)

        # Best validation PairLogit of this fold
        pairlogit = ranker.best_score_['validation']['PairLogit']
        scores.append(pairlogit)

        # One model file per fold and event type
        ranker.save_model(f'{folder_path}/catb_tun_{fold}_{typee}')

    # Average PairLogit over the folds
    pairlogit_mean = np.mean(scores)
    print('pairlogit_mean:', pairlogit_mean)
    print('training_done')

type: 0it [00:00, ?it/s]
  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:02<00:09,  2.46s/it][A
 40%|████      | 2/5 [00:06<00:09,  3.12s/it][A
 60%|██████    | 3/5 [00:10<00:07,  3.77s/it][A
 80%|████████  | 4/5 [00:15<00:04,  4.34s/it][A
100%|██████████| 5/5 [00:21<00:00,  4.27s/it]


reading_done
sampling_done
0:	learn: 0.6566164	test: 0.6566426	best: 0.6566426 (0)	total: 45.6ms	remaining: 1m 20s
100:	learn: 0.4008695	test: 0.4023338	best: 0.4023338 (100)	total: 4.31s	remaining: 1m 10s
200:	learn: 0.3970201	test: 0.3989531	best: 0.3989531 (200)	total: 8.66s	remaining: 1m 7s
300:	learn: 0.3952726	test: 0.3976329	best: 0.3976329 (300)	total: 13s	remaining: 1m 2s
400:	learn: 0.3941640	test: 0.3969386	best: 0.3969386 (400)	total: 17.2s	remaining: 58.2s
500:	learn: 0.3932715	test: 0.3964530	best: 0.3964530 (500)	total: 21.4s	remaining: 53.8s
600:	learn: 0.3925190	test: 0.3960877	best: 0.3960877 (600)	total: 25.6s	remaining: 49.3s
700:	learn: 0.3918837	test: 0.3958487	best: 0.3958487 (700)	total: 29.8s	remaining: 44.9s
800:	learn: 0.3912713	test: 0.3956112	best: 0.3956112 (800)	total: 34s	remaining: 40.6s
900:	learn: 0.3906794	test: 0.3954109	best: 0.3954090 (899)	total: 38.1s	remaining: 36.3s
1000:	learn: 0.3901777	test: 0.3952919	best: 0.3952915 (999)	total: 42.3s	rema

type: 1it [08:05, 485.44s/it]

pairlogit_mean: 0.3933920899477553
training_done



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:05<00:20,  5.16s/it][A
 40%|████      | 2/5 [00:11<00:18,  6.10s/it][A
 60%|██████    | 3/5 [00:18<00:12,  6.21s/it][A
 80%|████████  | 4/5 [00:25<00:06,  6.72s/it][A
100%|██████████| 5/5 [00:33<00:00,  6.73s/it]


reading_done
sampling_done
0:	learn: 0.6480888	test: 0.6481078	best: 0.6481078 (0)	total: 38.2ms	remaining: 27.6s
100:	learn: 0.3104180	test: 0.3138651	best: 0.3138651 (100)	total: 3.88s	remaining: 24s
200:	learn: 0.3039429	test: 0.3104232	best: 0.3104232 (200)	total: 7.78s	remaining: 20.2s
300:	learn: 0.3002566	test: 0.3094734	best: 0.3094734 (300)	total: 11.7s	remaining: 16.4s
400:	learn: 0.2970513	test: 0.3089306	best: 0.3089176 (399)	total: 15.6s	remaining: 12.5s
500:	learn: 0.2941111	test: 0.3086100	best: 0.3086059 (489)	total: 19.4s	remaining: 8.66s
600:	learn: 0.2913561	test: 0.3084390	best: 0.3084342 (596)	total: 23.3s	remaining: 4.76s
700:	learn: 0.2887533	test: 0.3083742	best: 0.3083742 (700)	total: 27.1s	remaining: 891ms
723:	learn: 0.2881246	test: 0.3083306	best: 0.3083244 (709)	total: 28s	remaining: 0us
bestTest = 0.3083244319
bestIteration = 709
Shrink model to first 710 iterations.
0:	learn: 0.6477988	test: 0.6479433	best: 0.6479433 (0)	total: 38.3ms	remaining: 27.7s
100

type: 2it [12:20, 349.95s/it]

723:	learn: 0.2887600	test: 0.3069296	best: 0.3069264 (721)	total: 28.2s	remaining: 0us
bestTest = 0.3069264141
bestIteration = 721
Shrink model to first 722 iterations.
pairlogit_mean: 0.30792093498124806
training_done



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:04<00:17,  4.46s/it][A
 40%|████      | 2/5 [00:11<00:18,  6.14s/it][A
 60%|██████    | 3/5 [00:18<00:12,  6.47s/it][A
 80%|████████  | 4/5 [00:25<00:06,  6.80s/it][A
100%|██████████| 5/5 [00:33<00:00,  6.76s/it]


reading_done
sampling_done
0:	learn: 0.6696325	test: 0.6697260	best: 0.6697260 (0)	total: 35.7ms	remaining: 54.1s
100:	learn: 0.2141224	test: 0.2150934	best: 0.2150934 (100)	total: 3.21s	remaining: 45s
200:	learn: 0.1973599	test: 0.1987737	best: 0.1987737 (200)	total: 6.51s	remaining: 42.6s
300:	learn: 0.1928804	test: 0.1952302	best: 0.1952302 (300)	total: 9.83s	remaining: 39.7s
400:	learn: 0.1901888	test: 0.1934892	best: 0.1934892 (400)	total: 13.2s	remaining: 36.6s
500:	learn: 0.1881997	test: 0.1924409	best: 0.1924409 (500)	total: 16.5s	remaining: 33.3s
600:	learn: 0.1865662	test: 0.1917943	best: 0.1917943 (600)	total: 19.8s	remaining: 30.1s
700:	learn: 0.1851920	test: 0.1913805	best: 0.1913804 (699)	total: 23.1s	remaining: 26.9s
800:	learn: 0.1839145	test: 0.1910064	best: 0.1910062 (799)	total: 26.4s	remaining: 23.6s
900:	learn: 0.1827501	test: 0.1907382	best: 0.1907382 (900)	total: 29.7s	remaining: 20.3s
1000:	learn: 0.1816374	test: 0.1904976	best: 0.1904941 (996)	total: 33s	remain

type: 3it [18:26, 368.79s/it]

1514:	learn: 0.1775457	test: 0.1860902	best: 0.1860873 (1513)	total: 49.9s	remaining: 0us
bestTest = 0.1860873187
bestIteration = 1513
Shrink model to first 1514 iterations.
pairlogit_mean: 0.18968429172051454
training_done





## Pairlogit's каждого типа Cross-val

- pairlogit_mean_clicks: **0.3933920** (Baseline : 0.395109)
- pairlogit_mean_carts: 0.30792093 (Baseline : **0.307252**)
- pairlogit_mean_orders: **0.1896842** (Baseline : 0.191810)

In [None]:
# Copy the trained model files to the project folder.
folder_path = f'{main_path}/models/{ver_tun}/'
os.makedirs(folder_path, exist_ok=True)

# Same relative location the training cell saves to ('model/<ver_tun>/'), so
# the copy also works when the working directory is not /content.
file_list = glob.glob(f'model/{ver_tun}/*')

# shutil.copy returns the destination path; the list is shown as cell output
copied_files = [shutil.copy(file, folder_path) for file in file_list]
copied_files

['/content/drive/Othercomputers/Mac/Jup.Notebook/models/catb_tun/catb_tun_0_orders',
 '/content/drive/Othercomputers/Mac/Jup.Notebook/models/catb_tun/catb_tun_1_clicks',
 '/content/drive/Othercomputers/Mac/Jup.Notebook/models/catb_tun/catb_tun_4_orders',
 '/content/drive/Othercomputers/Mac/Jup.Notebook/models/catb_tun/catb_tun_2_carts',
 '/content/drive/Othercomputers/Mac/Jup.Notebook/models/catb_tun/catb_tun_2_clicks',
 '/content/drive/Othercomputers/Mac/Jup.Notebook/models/catb_tun/catb_tun_3_clicks',
 '/content/drive/Othercomputers/Mac/Jup.Notebook/models/catb_tun/catb_tun_4_clicks',
 '/content/drive/Othercomputers/Mac/Jup.Notebook/models/catb_tun/catb_tun_0_carts',
 '/content/drive/Othercomputers/Mac/Jup.Notebook/models/catb_tun/catb_tun_2_orders',
 '/content/drive/Othercomputers/Mac/Jup.Notebook/models/catb_tun/catb_tun_1_carts',
 '/content/drive/Othercomputers/Mac/Jup.Notebook/models/catb_tun/catb_tun_1_orders',
 '/content/drive/Othercomputers/Mac/Jup.Notebook/models/catb_tun/cat

## Предсказание + оценка

In [None]:
# Per-type prediction frames; concatenated once after the loop
lv_predictions_parts = []

# Number of fold models that are averaged for every event type
n_folds = 5

# Iterate over the event types ('clicks', 'carts', 'orders')
for typee in tqdm(['clicks', 'carts', 'orders'], desc='type'):

    # Copy the candidates of this type to the working directory if missing
    if not os.path.exists(f'/content/test_candidates_{typee}'):
        shutil.copytree(f'{main_path}/lv/ver_6/test_candidates_{typee}/',
                        f'/content/test_candidates_{typee}/')

    # Load the candidates of the current event type
    path = f'test_candidates_{typee}/*'
    test_candidates = read_parquets(path)
    test_candidates = test_candidates.sort_values('session')
    test_candidates = test_candidates.reset_index(drop=True)

    # The first two columns are not model features
    FEATURES = test_candidates.columns[2:]

    # CatBoost data pool
    dtest = Pool(data=test_candidates[FEATURES])

    preds = np.zeros(len(test_candidates))

    # Average the predictions of the fold models
    for fold in tqdm(range(n_folds)):
        model = CatBoostRanker(random_state=42)
        model.load_model(f'model/{ver_tun}/catb_tun_{fold}_{typee}')
        fold_preds = model.predict(dtest)
        preds += fold_preds / n_folds

    # Rank the candidates of every session by predicted score
    lv_predictions = test_candidates[['session', 'aid']].copy()
    lv_predictions['pred'] = preds
    lv_predictions = lv_predictions.sort_values(
        ['session', 'pred'], ascending=[True, False]).reset_index(drop=True)

    # Keep the 20 best candidates per session. head(20) replaces an int8 rank
    # counter, which would wrap to negative values for sessions with more than
    # 127 candidates and let those rows through the `< 20` filter.
    lv_predictions = lv_predictions.groupby('session').head(20)
    lv_predictions = lv_predictions.groupby('session').aid.apply(list)
    lv_predictions = lv_predictions.to_frame().reset_index()
    lv_predictions = lv_predictions.rename(columns={'aid': 'labels'})
    lv_predictions['type'] = typee

    lv_predictions_parts.append(lv_predictions)
    del lv_predictions, preds, test_candidates

    # Saving the per-type predictions to Parquet (disabled)
    # lv_predictions.to_parquet(f'lv_predictions_{typee}.parquet')
    # shutil.copy2(f'lv_predictions_{typee}.parquet', f'{main_path}/lv/{ver_folder}/lv_predictions_{typee}.parquet')

# One frame with the predictions of all event types
lv_predictions_full = pd.concat(lv_predictions_parts, ignore_index=True)

type:   0%|          | 0/3 [00:00<?, ?it/s]
  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:00<00:02,  1.50it/s][A
 40%|████      | 2/5 [00:01<00:02,  1.02it/s][A
 60%|██████    | 3/5 [00:03<00:02,  1.17s/it][A
 80%|████████  | 4/5 [00:04<00:01,  1.36s/it][A
100%|██████████| 5/5 [00:06<00:00,  1.39s/it]

  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:15<01:03, 15.86s/it][A
 40%|████      | 2/5 [00:31<00:47, 15.95s/it][A
 60%|██████    | 3/5 [00:47<00:31, 15.92s/it][A
 80%|████████  | 4/5 [01:03<00:15, 15.83s/it][A
100%|██████████| 5/5 [01:19<00:00, 15.83s/it]
type:  33%|███▎      | 1/3 [02:22<04:45, 142.89s/it]
  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:00<00:02,  1.53it/s][A
 40%|████      | 2/5 [00:01<00:02,  1.09it/s][A
 60%|██████    | 3/5 [00:03<00:02,  1.13s/it][A
 80%|████████  | 4/5 [00:04<00:01,  1.36s/it][A
100%|██████████| 5/5 [00:06<00:00,  1.37s/it]

  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██    

## Score

In [None]:
# File name of the pickled type-index -> type-name lookup used by the scoring cell
id2type_name = 'id2type.pkl'
# version = 'ver_6.1'

In [None]:
# Lookup from type index to type name.
# NOTE: pickle.load can execute code stored in the file - only load trusted files.
with open(f'{main_path}/pkl/{id2type_name}', 'rb') as file:
    id2type = pickle.load(file)

# Ground-truth labels of the local validation split
valid = pd.read_parquet(f'{main_path}/s/test_labels_loc.parquet')

print('Model score :', metric_eval(lv_predictions_full, valid, id2type))

Model score : (0.5268101513955865, type
carts     0.367607
clicks    0.394414
orders    0.628478
dtype: float64)


- pairlogit_mean_clicks: **0.3933920** (Baseline : 0.395109)
- pairlogit_mean_carts: 0.30792093 (Baseline : **0.307252**)
- pairlogit_mean_orders: **0.1896842** (Baseline : 0.191810)

0.5268101 - если обучать готовую модель на кросс-валидации,  
но бейзлайн обучался без кросс-валидации.

## Обучение на best_params без кросс-валидации

In [None]:
# `clicks` parameters, declared again with the same values as in the
# cross-validated section so that this section can be run on its own.
best_p_cl = dict(
    loss_function='PairLogit',
    task_type='GPU',
    random_seed=42,
    iterations=1758,
    learning_rate=0.053701000860254436,
    l2_leaf_reg=0.8881227642380258,
    border_count=92,
    random_strength=0.8903326222703105,
    bootstrap_type='Poisson',
    depth=6,
    subsample=0.9931914618968313,
    min_data_in_leaf=7,
)

In [None]:
# `carts` parameters, declared again with the same values as in the
# cross-validated section so that this section can be run on its own.
best_p_carts = dict(
    loss_function='PairLogit',
    task_type='GPU',
    random_seed=42,
    iterations=724,
    learning_rate=0.04034712260216462,
    l2_leaf_reg=7.449337336120884,
    border_count=198,
    random_strength=0.3325166004315703,
    depth=8,
    min_data_in_leaf=12,
)

In [None]:
# `orders` parameters, declared again with the same values as in the
# cross-validated section so that this section can be run on its own.
best_p_orders = dict(
    loss_function='PairLogit',  # PairLogit loss function
    task_type='GPU',
    random_seed=42,
    iterations=1515,
    learning_rate=0.013688644928808849,
    l2_leaf_reg=0.03669716203608452,
    border_count=235,
    random_strength=0.526822845804525,
    bootstrap_type='Poisson',
    depth=7,
    subsample=0.938154110187783,
    min_data_in_leaf=28,
)

In [None]:
# The negative class is undersampled until the positive/negative ratio is
# downsamp_k times the ratio found in the raw data.
downsamp_k = 16
params = [best_p_cl, best_p_carts, best_p_orders]

# Output folder of the models trained on the full (downsampled) data
folder_path = f'model/{ver_tun}_no_kfold'
os.makedirs(folder_path, exist_ok=True)

# The file name used to embed `fold`, a variable left over from an earlier
# 5-fold loop (its last value is 4). The tag is now explicit, so the cell
# also runs on a fresh kernel and still writes the same file names.
model_tag = 4

# One model per event type, trained without cross-validation
for typee, best_params in tqdm(zip(['clicks', 'carts', 'orders'], params),
                               total=len(params),
                               desc='type'):

    # Load the training candidates of the current event type
    path = f'{main_path}/tr/ver_6/tr_candidates_{typee}/*'
    tr_candidates = read_parquets(path)
    print('reading_done')

    # Store the id columns as int32
    tr_candidates['session'] = tr_candidates['session'].astype(np.int32)
    tr_candidates['aid'] = tr_candidates['aid'].astype(np.int32)

    # Positive/negative ratio in the raw data
    target_counts = tr_candidates['target'].value_counts()
    class_ratio = target_counts[1] / target_counts[0]

    # Seeded so that the random undersampling is reproducible
    under_sampler = RandomUnderSampler(sampling_strategy=class_ratio *
                                       downsamp_k,
                                       random_state=random_state)

    # Undersample and rebuild one frame from the resampled features and target
    FEATURES = tr_candidates.columns[:-1]
    X = tr_candidates.loc[:, FEATURES]
    y = tr_candidates.loc[:, 'target']
    X_train, y_train = under_sampler.fit_resample(X, y)
    tr_candidates = pd.merge(X_train,
                             y_train,
                             left_index=True,
                             right_index=True)
    tr_candidates = tr_candidates.sort_values('session')
    tr_candidates = tr_candidates.reset_index(drop=True)
    del X, y
    print('sampling_done')

    # Train on the whole downsampled set
    X_train = tr_candidates[FEATURES]
    y_train = tr_candidates['target']

    # Column 0 is the group id; columns 0 and 1 are not model features
    # (presumably `session` and `aid` - verify against the candidate files)
    group_id_tr = X_train.iloc[:, 0]
    X_train = X_train.iloc[:, 2:]

    catboost_train = Pool(data=X_train, label=y_train, group_id=group_id_tr)

    ranker = CatBoostRanker(**best_params)
    # NOTE(review): no eval_set is passed here, so early_stopping_rounds
    # presumably has nothing to monitor - confirm against the CatBoost docs.
    ranker.fit(catboost_train, early_stopping_rounds=50, verbose=False)

    # PairLogit on the training data (there is no validation part here)
    pairlogit = ranker.best_score_['learn']['PairLogit']

    ranker.save_model(f'{folder_path}/catb_tun_{model_tag}_{typee}')

    print('pairlogit:', pairlogit)
    print('training_done')

type: 0it [00:00, ?it/s]
  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:03<00:13,  3.41s/it][A
 40%|████      | 2/5 [00:07<00:11,  3.92s/it][A
 60%|██████    | 3/5 [00:12<00:08,  4.13s/it][A
 80%|████████  | 4/5 [00:17<00:04,  4.50s/it][A
100%|██████████| 5/5 [00:23<00:00,  4.65s/it]


reading_done
sampling_done


type: 1it [02:51, 171.05s/it]

pairlogit: 0.3883519543825259
training_done



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:04<00:17,  4.28s/it][A
 40%|████      | 2/5 [00:10<00:16,  5.36s/it][A
 60%|██████    | 3/5 [00:16<00:11,  5.74s/it][A
 80%|████████  | 4/5 [00:23<00:06,  6.27s/it][A
100%|██████████| 5/5 [00:31<00:00,  6.22s/it]


reading_done
sampling_done


type: 2it [04:53, 142.55s/it]

pairlogit: 0.29182062795452757
training_done



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:04<00:17,  4.39s/it][A
 40%|████      | 2/5 [00:10<00:15,  5.18s/it][A
 60%|██████    | 3/5 [00:16<00:11,  5.63s/it][A
 80%|████████  | 4/5 [00:23<00:06,  6.15s/it][A
100%|██████████| 5/5 [00:30<00:00,  6.16s/it]


reading_done
sampling_done


type: 3it [07:23, 147.89s/it]

pairlogit: 0.18069645427941486
training_done





In [None]:
# Best-score dictionary of the ranker left over from the last loop iteration
ranker.best_score_

{'learn': {'PairLogit': 0.3890073754283778}}

## Предсказание + оценка (повторное)

In [None]:
# Per-type prediction frames; concatenated once after the loop
lv_predictions_parts = []

# Iterate over the event types ('clicks', 'carts', 'orders')
for typee in tqdm(['clicks', 'carts', 'orders'], desc='type'):

    # Copy the candidates of this type to the working directory if missing
    if not os.path.exists(f'/content/test_candidates_{typee}'):
        shutil.copytree(f'{main_path}/lv/ver_6/test_candidates_{typee}/',
                        f'/content/test_candidates_{typee}/')

    # Load the candidates of the current event type
    path = f'test_candidates_{typee}/*'
    test_candidates = read_parquets(path)
    test_candidates = test_candidates.sort_values('session')
    test_candidates = test_candidates.reset_index(drop=True)

    # The first two columns are not model features
    FEATURES = test_candidates.columns[2:]

    # CatBoost data pool
    dtest = Pool(data=test_candidates[FEATURES])

    # Load the model tagged `4` for the current event type.
    # NOTE(review): this path points at the cross-validated models
    # (model/<ver_tun>), not at model/<ver_tun>_no_kfold where the training
    # cell without cross-validation saves - confirm which model should be
    # scored here.
    model = CatBoostRanker(random_state=42)
    model.load_model(f'model/{ver_tun}/catb_tun_4_{typee}')

    preds = model.predict(dtest)

    # Rank the candidates of every session by predicted score
    lv_predictions = test_candidates[['session', 'aid']].copy()
    lv_predictions['pred'] = preds
    lv_predictions = lv_predictions.sort_values(
        ['session', 'pred'], ascending=[True, False]).reset_index(drop=True)

    # Keep the 20 best candidates per session. head(20) replaces an int8 rank
    # counter, which would wrap to negative values for sessions with more than
    # 127 candidates and let those rows through the `< 20` filter.
    lv_predictions = lv_predictions.groupby('session').head(20)
    lv_predictions = lv_predictions.groupby('session').aid.apply(list)
    lv_predictions = lv_predictions.to_frame().reset_index()
    lv_predictions = lv_predictions.rename(columns={'aid': 'labels'})
    lv_predictions['type'] = typee

    lv_predictions_parts.append(lv_predictions)
    del lv_predictions, preds, test_candidates

    # Saving the per-type predictions to Parquet (disabled)
    # lv_predictions.to_parquet(f'lv_predictions_{typee}.parquet')
    # shutil.copy2(f'lv_predictions_{typee}.parquet', f'{main_path}/lv/{ver_folder}/lv_predictions_{typee}.parquet')

# One frame with the predictions of all event types
lv_predictions_full = pd.concat(lv_predictions_parts, ignore_index=True)

type:   0%|          | 0/3 [00:00<?, ?it/s]
  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:02<00:09,  2.40s/it][A
 40%|████      | 2/5 [00:04<00:07,  2.45s/it][A
 60%|██████    | 3/5 [00:07<00:05,  2.55s/it][A
 80%|████████  | 4/5 [00:10<00:02,  2.84s/it][A
100%|██████████| 5/5 [00:14<00:00,  2.83s/it]
type:  33%|███▎      | 1/3 [01:26<02:53, 86.67s/it]
  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:01<00:04,  1.18s/it][A
 40%|████      | 2/5 [00:02<00:04,  1.49s/it][A
 60%|██████    | 3/5 [00:04<00:03,  1.70s/it][A
 80%|████████  | 4/5 [00:07<00:01,  1.92s/it][A
100%|██████████| 5/5 [00:09<00:00,  1.93s/it]
type:  67%|██████▋   | 2/3 [02:34<01:15, 75.49s/it]
  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:01<00:04,  1.17s/it][A
 40%|████      | 2/5 [00:02<00:04,  1.47s/it][A
 60%|██████    | 3/5 [00:04<00:03,  1.68s/it][A
 80%|████████  | 4/5 [00:07<00:01,  1.92s/it][A
100%|██████████| 5/5 [00:09<00:00,  1.95s/it]
typ

## Score

In [None]:
# File name of the pickled type-index -> type-name lookup used by the scoring cell
id2type_name = 'id2type.pkl'
# version = 'ver_6.1'

In [None]:
# Lookup from type index to type name.
# NOTE: pickle.load can execute code stored in the file - only load trusted files.
with open(f'{main_path}/pkl/{id2type_name}', 'rb') as file:
    id2type = pickle.load(file)

# Ground-truth labels of the local validation split
valid = pd.read_parquet(f'{main_path}/s/test_labels_loc.parquet')

print('Model score :', metric_eval(lv_predictions_full, valid, id2type))

Model score : (0.52662368294355, type
carts     0.367851
clicks    0.393613
orders    0.628179
dtype: float64)


0.5268101 - если обучать готовую модель на кросс-валидации,  
но бейзлайн обучался без кросс-валидации.


# Ver_8 CatB Tune PFound

## Подбор гиперпараметров с Optuna

In [None]:
# Run configuration of the PFound tuning section.
# Name of the sub-folder for saved models
ver_tun = 'catb_tun'
# Version of the candidate set; selects the `ver_<n>` data folder
ver = 6
ver_folder = f'ver_{ver}'
# Event type tuned in this run; switch by (un)commenting
# typee = 'clicks'
# typee = 'carts'
typee = 'orders'

# File name of the pickled type-index -> type-name lookup
id2type_name = 'id2type.pkl'

# Seed for the tuning run
random_state = 42

In [None]:
# Список файлов Parquet
file_list = glob.glob(f'{main_path}/tr/{ver_folder}/tr_candidates_{typee}/*')

# Создание пустого DataFrame для сбора данных
tr_candidates = pd.DataFrame()

# Цикл для чтения и объединения файлов
for file in tqdm(file_list):
    # Чтение файла Parquet во временный DataFrame
    df_ = pd.read_parquet(file)

    # Объединение временного DataFrame с основным датасетом
    tr_candidates = pd.concat([tr_candidates, df_], ignore_index=True)

100%|██████████| 5/5 [00:36<00:00,  7.25s/it]


**Downsampling**

In [None]:
from imblearn.under_sampling import RandomUnderSampler

# Desired class ratio after downsampling, per event type. These are the
# values that were previously switched by commenting lines in and out.
sampling_strategy_by_type = {
    'clicks': 0.43,
    'carts': 0.0624,
    'orders': 0.0624,
}

# Pick the ratio that matches the event type selected in the config cell and
# seed the sampler so the downsampled set is the same on every run.
under_sampler = RandomUnderSampler(
    sampling_strategy=sampling_strategy_by_type[typee],
    random_state=random_state)

In [None]:
# Every column except the last one goes into X; the label is 'target'.
FEATURES = tr_candidates.columns[:-1]
X = tr_candidates[FEATURES]
y = tr_candidates['target']

# Downsample, then glue the label back onto the sampled rows by index.
X_train, y_train = under_sampler.fit_resample(X, y)
tr_candidates = X_train.merge(y_train, left_index=True, right_index=True)

In [None]:
# Order the rows by session and rebuild a 0..n-1 index: the objective below
# selects fold rows with .loc using the positional indices from GroupKFold.
tr_candidates = tr_candidates.sort_values('session').reset_index(drop=True)

In [None]:
# Release the sampling inputs and outputs now that they are merged back
# into `tr_candidates`.
del X, y
del X_train, y_train

**Optuna**

In [None]:
!pip install catboost

Collecting catboost
  Obtaining dependency information for catboost from https://files.pythonhosted.org/packages/a2/aa/1f6a0ae3224f11bafb195035f56bb0fc99ed948cca3c052138665d73801c/catboost-1.2.2-cp311-cp311-macosx_11_0_universal2.whl.metadata
  Downloading catboost-1.2.2-cp311-cp311-macosx_11_0_universal2.whl.metadata (1.2 kB)
Collecting graphviz (from catboost)
  Downloading graphviz-0.20.1-py3-none-any.whl (47 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.0/47.0 kB[0m [31m423.8 kB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
Downloading catboost-1.2.2-cp311-cp311-macosx_11_0_universal2.whl (25.7 MB)
[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━[0m [32m18.0/25.7 MB[0m [31m10.7 MB/s[0m eta [36m0:00:01[0m

In [None]:
import catboost
from catboost import CatBoostRanker, Pool

In [None]:
import os

# Export CATBOOST_GPU=1 for this process.
# NOTE(review): GPU training in this notebook is requested through
# task_type='GPU' in the CatBoost params; confirm whether anything actually
# reads this environment variable.
os.environ.update({'CATBOOST_GPU': '1'})

In [None]:
# Folder (relative to the working directory) that will hold saved models;
# create it if it is not there yet.
folder_path = 'model'
os.makedirs(name=folder_path, exist_ok=True)

**Очень странно: чтобы вытащить метрику PFound для return в objective, нужно указать в catboost_params custom_metric = 'PairLogit', а не PFound. Но потом она всё-таки оказывается в словаре best_score_, и я могу её брать и выдавать для оптимизации. Написал запрос в поддержку CatBoost.**

In [None]:
def objective(trial, data, random_state):
    """
    Optuna objective: mean validation PFound of a CatBoostRanker over 5 group folds.

    Parameters:
    - trial: Optuna trial used to sample the hyperparameters.
    - data (pd.DataFrame): candidates table. It must contain the columns
      'session' and 'target'. By position, the first column is passed to
      CatBoost as the group id, the first two columns are excluded from the
      model features and the last column is excluded as the label
      (presumably session, aid and target -- verify against the candidate
      files).
    - random_state (int): seed forwarded to CatBoost as `random_seed`.

    Returns:
    - float: validation PFound averaged over the folds.
    """
    catboost_params = {
        'loss_function': 'YetiRank:hints=skip_train~false',
        # Only PairLogit is requested explicitly; the recorded training log
        # shows that PFound is nevertheless reported for the validation set
        # (presumably as the default eval metric of YetiRank -- confirm in
        # the CatBoost docs).
        'custom_metric': 'PairLogit',
        'task_type': 'GPU',  # train on GPU
        'metric_period': 250,
        # The seed, iterations and learning_rate are pinned to single values
        # through one-element categoricals so that they still show up in
        # study.best_params.
        'random_seed': trial.suggest_categorical('random_seed',
                                                 [random_state]),
        'iterations': trial.suggest_categorical("iterations", [828]),
        'learning_rate': trial.suggest_categorical("learning_rate",
                                                   [0.12676718423508196]),
        # L2 regularization coefficient
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-5, 1e2,
                                           log=True),
        # Number of bins for numerical features
        'border_count': trial.suggest_int('border_count', 32, 255),
        # Amount of randomness used when scoring splits
        'random_strength': trial.suggest_float('random_strength', 0.0, 1.0),
        "depth": trial.suggest_int("depth", 1, 10),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 100)
    }

    # Folds are split by session, so a session never appears in both the
    # training and the validation part.
    group_kfold = GroupKFold(n_splits=5)

    scores = []

    for train_idx, valid_idx in group_kfold.split(data,
                                                  data['target'],
                                                  groups=data['session']):
        # GroupKFold yields positional indices, so rows are selected with
        # .iloc; label-based .loc is only correct for a default RangeIndex.
        train_part = data.iloc[train_idx]
        valid_part = data.iloc[valid_idx]

        # Column layout by position:
        # [group id, <skipped column>, features..., last column].
        catboost_train = Pool(data=train_part.iloc[:, 2:-1],
                              label=train_part['target'],
                              group_id=train_part.iloc[:, 0])
        catboost_valid = Pool(data=valid_part.iloc[:, 2:-1],
                              label=valid_part['target'],
                              group_id=valid_part.iloc[:, 0])

        ranker = CatBoostRanker(**catboost_params)
        ranker.fit(catboost_train, eval_set=catboost_valid, verbose_eval=250)

        # Log every best score, then keep the validation PFound of the fold.
        print(ranker.best_score_)
        scores.append(ranker.best_score_['validation']['PFound'])

    # Average PFound over all folds.
    return np.mean(scores)

In [None]:
# Create the Optuna study that maximizes the objective (mean PFound).
# TPE is Optuna's default sampler; passing it explicitly with a seed makes
# the sequence of suggested hyperparameters repeatable between runs.
study = optuna.create_study(
    direction="maximize",
    study_name="CatBoost_Ranker_Optimization",
    sampler=optuna.samplers.TPESampler(seed=random_state))


def func(trial):
    """Call the objective with the training data and the seed bound in."""
    return objective(trial, tr_candidates, random_state=random_state)

[I 2023-10-03 03:08:48,011] A new study created in memory with name: CatBoost_Ranker_Optimization


In [None]:
# Run the search with a single worker: one trial cannot be parallelised, and
# concurrent trials would compete for the same GPU and make the order of
# sampled parameters depend on thread timing.
study.optimize(func, n_trials=1, show_progress_bar=True, n_jobs=1)

  0%|          | 0/1 [00:00<?, ?it/s]

Metric PFound is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric PFound is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	test: 0.0921486	best: 0.0921486 (0)	total: 91.7ms	remaining: 1m 15s
250:	test: 0.0938066	best: 0.0938066 (250)	total: 9.21s	remaining: 21.2s
500:	test: 0.0938388	best: 0.0938388 (500)	total: 18.4s	remaining: 12s
750:	test: 0.0938526	best: 0.0938526 (750)	total: 27.7s	remaining: 2.83s
827:	test: 0.0938592	best: 0.0938592 (827)	total: 30.5s	remaining: 0us
bestTest = 0.09385917151
bestIteration = 827
{'learn': {'PairLogit': 0.19596942640881637}, 'validation': {'PFound': 0.093859171511442, 'PairLogit': 0.1967569390450675}}


Metric PFound is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric PFound is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	test: 0.0926297	best: 0.0926297 (0)	total: 92.6ms	remaining: 1m 16s
250:	test: 0.0941957	best: 0.0941957 (250)	total: 9.21s	remaining: 21.2s
500:	test: 0.0942279	best: 0.0942279 (500)	total: 18.4s	remaining: 12s
750:	test: 0.0942356	best: 0.0942356 (750)	total: 27.6s	remaining: 2.83s
827:	test: 0.0942496	best: 0.0942496 (827)	total: 30.4s	remaining: 0us
bestTest = 0.09424962198
bestIteration = 827
{'learn': {'PairLogit': 0.19594987703196293}, 'validation': {'PFound': 0.09424962198143552, 'PairLogit': 0.19638691812244805}}


Metric PFound is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric PFound is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	test: 0.0921410	best: 0.0921410 (0)	total: 87.6ms	remaining: 1m 12s
250:	test: 0.0937905	best: 0.0937905 (250)	total: 9.17s	remaining: 21.1s
500:	test: 0.0938374	best: 0.0938374 (500)	total: 18.3s	remaining: 12s
750:	test: 0.0938671	best: 0.0938671 (750)	total: 27.5s	remaining: 2.82s
827:	test: 0.0938646	best: 0.0938671 (750)	total: 30.4s	remaining: 0us
bestTest = 0.09386705738
bestIteration = 750
Shrink model to first 751 iterations.
{'learn': {'PairLogit': 0.19430743095612654}, 'validation': {'PFound': 0.09386705738080897, 'PairLogit': 0.2034723882991134}}


Metric PFound is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric PFound is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	test: 0.0917060	best: 0.0917060 (0)	total: 93.5ms	remaining: 1m 17s
250:	test: 0.0933459	best: 0.0933459 (250)	total: 9.2s	remaining: 21.1s
500:	test: 0.0933894	best: 0.0933894 (500)	total: 18.4s	remaining: 12s
750:	test: 0.0934004	best: 0.0934004 (750)	total: 27.6s	remaining: 2.83s
827:	test: 0.0934061	best: 0.0934061 (827)	total: 30.4s	remaining: 0us
bestTest = 0.09340612441
bestIteration = 827
{'learn': {'PairLogit': 0.19562670159185774}, 'validation': {'PFound': 0.0934061244060777, 'PairLogit': 0.19730977283221143}}


Metric PFound is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric PFound is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	test: 0.0922445	best: 0.0922445 (0)	total: 87.5ms	remaining: 1m 12s
250:	test: 0.0938771	best: 0.0938771 (250)	total: 9.16s	remaining: 21.1s
500:	test: 0.0939175	best: 0.0939175 (500)	total: 18.3s	remaining: 12s
750:	test: 0.0939342	best: 0.0939342 (750)	total: 27.5s	remaining: 2.82s
827:	test: 0.0939437	best: 0.0939437 (827)	total: 30.4s	remaining: 0us
bestTest = 0.09394374069
bestIteration = 827
{'learn': {'PairLogit': 0.1958391153037997}, 'validation': {'PFound': 0.0939437406903854, 'PairLogit': 0.19701879633920297}}
[I 2023-10-02 08:35:23,152] Trial 0 finished with value: 0.09386514319402992 and parameters: {'random_seed': 42, 'iterations': 828, 'learning_rate': 0.12676718423508196, 'l2_leaf_reg': 0.002270370592919817, 'border_count': 37, 'random_strength': 0.9389551921967575, 'depth': 6, 'min_data_in_leaf': 73}. Best is trial 0 with value: 0.09386514319402992.


**Записываю варианты получившихся параметров и выбираю лучшие**

**Clicks** \
0.6159558 \
{'random_seed': 42, 'iterations': 800, 'learning_rate': 0.09237673064613201} \   
**0.616068** \
{'random_seed': 42,
 'iterations': 800,
 'learning_rate': 0.09237673064613201,
 'l2_leaf_reg': 0.0019442470574252018,
 'border_count': 97,
 'random_strength': 0.5971699972467241,
 'depth': 7,
 'min_data_in_leaf': 100} \
 **Carts** \
  0.150190 \   
{'random_seed': 42, 'iterations': 551, 'learning_rate': 0.010372549720642329} \
  **0.1502257** \  
{'random_seed': 42,
 'iterations': 551,
 'learning_rate': 0.010372549720642329,
 'l2_leaf_reg': 0.019335935779919735,
 'border_count': 175,
 'random_strength': 0.9145153914960099,
 'bootstrap_type': 'MVS',
 'depth': 10,
 'subsample': 0.2884983050620958,
 'min_data_in_leaf': 66} \

 **Orders** \
0.093870 \  
{'random_seed': 42, 'iterations': 828, 'learning_rate': 0.12676718423508196} \
**0.093865**  \   
{'random_seed': 42,
 'iterations': 828,
 'learning_rate': 0.12676718423508196,
 'l2_leaf_reg': 0.002270370592919817,
 'border_count': 37,
 'random_strength': 0.9389551921967575,
 'depth': 6,
 'min_data_in_leaf': 73} \



In [None]:
# Best hyperparameters found by the study (shown as the cell output).
study.best_params

{'random_seed': 42,
 'iterations': 828,
 'learning_rate': 0.12676718423508196,
 'l2_leaf_reg': 0.002270370592919817,
 'border_count': 37,
 'random_strength': 0.9389551921967575,
 'depth': 6,
 'min_data_in_leaf': 73}

**Формирую словари лучших гиперпараметров**

In [None]:
# Selected CatBoostRanker parameters for the 'clicks' model.
best_p_cl = dict(
    loss_function='YetiRank:hints=skip_train~false',
    custom_metric='PairLogit',
    task_type='GPU',
    random_seed=42,
    iterations=800,
    learning_rate=0.09237673064613201,
    l2_leaf_reg=0.0019442470574252018,
    border_count=97,
    random_strength=0.5971699972467241,
    depth=7,
    min_data_in_leaf=100,
)

In [None]:
# Selected CatBoostRanker parameters for the 'carts' model.
best_p_carts = dict(
    loss_function='YetiRank:hints=skip_train~false',
    custom_metric='PairLogit',
    task_type='GPU',
    random_seed=42,
    iterations=551,
    learning_rate=0.010372549720642329,
    l2_leaf_reg=0.019335935779919735,
    border_count=175,
    random_strength=0.9145153914960099,
    bootstrap_type='MVS',
    depth=10,
    subsample=0.2884983050620958,
    min_data_in_leaf=66,
)

In [None]:
# Selected CatBoostRanker parameters for the 'orders' model.
# The 'random_seed' key used to be listed twice in this literal; a repeated
# key is silently collapsed by Python, so it is now given once.
best_p_orders = {
    'loss_function': 'YetiRank:hints=skip_train~false',
    'custom_metric': 'PairLogit',
    'task_type': 'GPU',
    'random_seed': 42,
    'iterations': 828,
    'learning_rate': 0.12676718423508196,
    'l2_leaf_reg': 0.002270370592919817,
    'border_count': 37,
    'random_strength': 0.9389551921967575,
    'depth': 6,
    'min_data_in_leaf': 73
}

## Обучение на best_params

In [None]:
# The downsampled class ratio is the natural ratio multiplied by this factor.
downsamp_k = 16
params = [best_p_cl, best_p_carts, best_p_orders]
event_types = ['clicks', 'carts', 'orders']

# Output folder for the trained models; created once, before any training.
folder_path = f'model/{ver_tun}'
os.makedirs(folder_path, exist_ok=True)

# Iterate over the event types and their selected parameters. `total` is
# passed explicitly because zip() has no length for tqdm to read.
for typee, best_params in tqdm(zip(event_types, params),
                               total=len(params),
                               desc='type'):

    # Load the candidates for the current event type.
    path = f'{main_path}/tr/ver_6/tr_candidates_{typee}/*'
    tr_candidates = read_parquets(path)
    print('reading_done')

    # Cast the session and aid columns to int32.
    tr_candidates['session'] = tr_candidates['session'].astype(np.int32)
    tr_candidates['aid'] = tr_candidates['aid'].astype(np.int32)

    # Ratio of rows with target == 1 to rows with target == 0.
    target_counts = tr_candidates['target'].value_counts()
    class_ratio = target_counts[1] / target_counts[0]

    # Downsample to `downsamp_k` times the natural ratio. A float
    # sampling_strategy must not exceed 1, hence the clamp; the sampler is
    # seeded so the same rows are kept on every run.
    under_sampler = RandomUnderSampler(
        sampling_strategy=min(class_ratio * downsamp_k, 1.0),
        random_state=random_state)
    FEATURES = tr_candidates.columns[:-1]
    X = tr_candidates.loc[:, FEATURES]
    y = tr_candidates.loc[:, 'target']
    X_res, y_res = under_sampler.fit_resample(X, y)
    tr_candidates = pd.merge(X_res,
                             y_res,
                             left_index=True,
                             right_index=True)
    tr_candidates = tr_candidates.sort_values('session')
    tr_candidates = tr_candidates.reset_index(drop=True)
    del X, y, X_res, y_res
    print('sampling_done')

    # Cross-validated training; folds are split by session.
    group_kfold = GroupKFold(n_splits=5)

    scores = []

    for fold, (train_idx, valid_idx) in enumerate(
            group_kfold.split(tr_candidates,
                              tr_candidates['target'],
                              groups=tr_candidates['session'])):

        # GroupKFold yields positional indices, so rows are selected with
        # .iloc rather than label-based .loc.
        train_part = tr_candidates.iloc[train_idx]
        valid_part = tr_candidates.iloc[valid_idx]

        # Column layout by position:
        # [group id, <skipped column>, features..., last column].
        catboost_train = Pool(data=train_part.iloc[:, 2:-1],
                              label=train_part['target'],
                              group_id=train_part.iloc[:, 0])
        catboost_valid = Pool(data=valid_part.iloc[:, 2:-1],
                              label=valid_part['target'],
                              group_id=valid_part.iloc[:, 0])

        # Build and fit a CatBoostRanker with the selected parameters.
        ranker = CatBoostRanker(**best_params)
        ranker.fit(catboost_train, eval_set=catboost_valid, verbose_eval=100)

        # Keep the validation PFound of this fold.
        PFound = ranker.best_score_['validation']['PFound']
        scores.append(PFound)

        # Save the fold model to disk.
        ranker.save_model(f'{folder_path}/catb_tun_{fold}_{typee}')

    # Average PFound over all folds.
    PFound_mean = np.mean(scores)
    print('PFound_mean:', PFound_mean)
    print('training_done')

type: 0it [00:00, ?it/s]
  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:04<00:19,  4.88s/it][A
 40%|████      | 2/5 [00:11<00:17,  5.78s/it][A
 60%|██████    | 3/5 [00:18<00:12,  6.32s/it][A
 80%|████████  | 4/5 [00:25<00:06,  6.80s/it][A
100%|██████████| 5/5 [00:33<00:00,  6.79s/it]


reading_done
sampling_done


Default metric period is 5 because PFound is/are not implemented for GPU
Metric PFound is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric PFound is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	test: 0.5977711	best: 0.5977711 (0)	total: 173ms	remaining: 2m 17s
100:	test: 0.6157592	best: 0.6157592 (100)	total: 6.61s	remaining: 45.8s
200:	test: 0.6164299	best: 0.6164299 (200)	total: 13s	remaining: 38.7s
300:	test: 0.6167142	best: 0.6167147 (295)	total: 19.4s	remaining: 32.1s
400:	test: 0.6169380	best: 0.6169380 (400)	total: 25.8s	remaining: 25.7s
500:	test: 0.6170515	best: 0.6170515 (500)	total: 32.3s	remaining: 19.3s
600:	test: 0.6171167	best: 0.6171167 (600)	total: 38.9s	remaining: 12.9s
700:	test: 0.6171755	best: 0.6171823 (675)	total: 45.5s	remaining: 6.42s
799:	test: 0.6172082	best: 0.6172082 (799)	total: 52s	remaining: 0us
bestTest = 0.6172081932
bestIteration = 799
{'learn': {'PairLogit': 0.4010464509089207}, 'validation': {'PFound': 0.6172081932040321, 'PairLogit': 0.40130148153235745}}


Default metric period is 5 because PFound is/are not implemented for GPU
Metric PFound is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric PFound is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	test: 0.5986438	best: 0.5986438 (0)	total: 168ms	remaining: 2m 14s
100:	test: 0.6157640	best: 0.6157640 (100)	total: 6.62s	remaining: 45.8s
200:	test: 0.6164187	best: 0.6164187 (200)	total: 13s	remaining: 38.7s
300:	test: 0.6166983	best: 0.6166983 (300)	total: 19.4s	remaining: 32.1s
400:	test: 0.6168643	best: 0.6168643 (400)	total: 25.8s	remaining: 25.7s
500:	test: 0.6169644	best: 0.6169694 (490)	total: 32.4s	remaining: 19.3s
600:	test: 0.6170731	best: 0.6170731 (600)	total: 38.9s	remaining: 12.9s
700:	test: 0.6171608	best: 0.6171668 (690)	total: 45.5s	remaining: 6.42s
799:	test: 0.6171916	best: 0.6172014 (775)	total: 52s	remaining: 0us
bestTest = 0.6172013882
bestIteration = 775
Shrink model to first 776 iterations.
{'learn': {'PairLogit': 0.400788287958621}, 'validation': {'PFound': 0.6172013882075678, 'PairLogit': 0.4024062141626345}}


Default metric period is 5 because PFound is/are not implemented for GPU
Metric PFound is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric PFound is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	test: 0.5979084	best: 0.5979084 (0)	total: 164ms	remaining: 2m 10s
100:	test: 0.6159632	best: 0.6159632 (100)	total: 6.63s	remaining: 45.9s
200:	test: 0.6166606	best: 0.6166606 (200)	total: 13s	remaining: 38.8s
300:	test: 0.6169369	best: 0.6169369 (300)	total: 19.4s	remaining: 32.2s
400:	test: 0.6171247	best: 0.6171247 (400)	total: 25.9s	remaining: 25.8s
500:	test: 0.6172664	best: 0.6172664 (500)	total: 32.4s	remaining: 19.4s
600:	test: 0.6173403	best: 0.6173451 (595)	total: 39s	remaining: 12.9s
700:	test: 0.6173900	best: 0.6173900 (700)	total: 45.6s	remaining: 6.44s
799:	test: 0.6174375	best: 0.6174375 (799)	total: 52.2s	remaining: 0us
bestTest = 0.6174374765
bestIteration = 799
{'learn': {'PairLogit': 0.40087928287827235}, 'validation': {'PFound': 0.6174374764650251, 'PairLogit': 0.4020474384605616}}


Default metric period is 5 because PFound is/are not implemented for GPU
Metric PFound is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric PFound is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	test: 0.5982555	best: 0.5982555 (0)	total: 178ms	remaining: 2m 21s
100:	test: 0.6151419	best: 0.6151419 (100)	total: 6.65s	remaining: 46s
200:	test: 0.6157586	best: 0.6157586 (200)	total: 13s	remaining: 38.8s
300:	test: 0.6160218	best: 0.6160218 (300)	total: 19.4s	remaining: 32.2s
400:	test: 0.6162243	best: 0.6162243 (400)	total: 25.9s	remaining: 25.8s
500:	test: 0.6163900	best: 0.6163923 (495)	total: 32.4s	remaining: 19.3s
600:	test: 0.6164684	best: 0.6164684 (600)	total: 38.9s	remaining: 12.9s
700:	test: 0.6165459	best: 0.6165492 (690)	total: 45.5s	remaining: 6.43s
799:	test: 0.6165975	best: 0.6166001 (795)	total: 52.1s	remaining: 0us
bestTest = 0.6166000926
bestIteration = 795
Shrink model to first 796 iterations.
{'learn': {'PairLogit': 0.40110802611798246}, 'validation': {'PFound': 0.6166000926250224, 'PairLogit': 0.4011575889229596}}


Default metric period is 5 because PFound is/are not implemented for GPU
Metric PFound is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric PFound is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	test: 0.5972295	best: 0.5972295 (0)	total: 176ms	remaining: 2m 20s
100:	test: 0.6151225	best: 0.6151225 (100)	total: 6.64s	remaining: 46s
200:	test: 0.6157201	best: 0.6157201 (200)	total: 13s	remaining: 38.9s
300:	test: 0.6160742	best: 0.6160742 (300)	total: 19.4s	remaining: 32.2s
400:	test: 0.6162120	best: 0.6162120 (400)	total: 25.9s	remaining: 25.8s
500:	test: 0.6162854	best: 0.6162854 (500)	total: 32.5s	remaining: 19.4s
600:	test: 0.6163664	best: 0.6163728 (590)	total: 39s	remaining: 12.9s
700:	test: 0.6164660	best: 0.6164660 (700)	total: 45.7s	remaining: 6.45s
799:	test: 0.6164958	best: 0.6165022 (785)	total: 52.2s	remaining: 0us
bestTest = 0.6165021532
bestIteration = 785
Shrink model to first 786 iterations.


type: 1it [06:23, 383.80s/it]

{'learn': {'PairLogit': 0.40122482185511466}, 'validation': {'PFound': 0.6165021531808076, 'PairLogit': 0.4007950995467958}}
PFound_mean: 0.616989860736491
training_done



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:04<00:17,  4.29s/it][A
 40%|████      | 2/5 [00:10<00:15,  5.21s/it][A
 60%|██████    | 3/5 [00:17<00:12,  6.03s/it][A
 80%|████████  | 4/5 [00:24<00:06,  6.43s/it][A
100%|██████████| 5/5 [00:31<00:00,  6.34s/it]


reading_done
sampling_done


Default metric period is 5 because PFound is/are not implemented for GPU
Metric PFound is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric PFound is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	test: 0.1475553	best: 0.1475553 (0)	total: 130ms	remaining: 1m 11s
100:	test: 0.1495660	best: 0.1495660 (100)	total: 6.37s	remaining: 28.4s
200:	test: 0.1496232	best: 0.1496235 (190)	total: 12.7s	remaining: 22.1s
300:	test: 0.1496692	best: 0.1496692 (300)	total: 19s	remaining: 15.8s
400:	test: 0.1496817	best: 0.1496866 (385)	total: 25.4s	remaining: 9.5s
500:	test: 0.1497126	best: 0.1497139 (495)	total: 31.7s	remaining: 3.16s
550:	test: 0.1497358	best: 0.1497358 (550)	total: 34.9s	remaining: 0us
bestTest = 0.1497358244
bestIteration = 550
{'learn': {'PairLogit': 0.3798946634378345}, 'validation': {'PFound': 0.14973582438566518, 'PairLogit': 0.38204677175091195}}


Default metric period is 5 because PFound is/are not implemented for GPU
Metric PFound is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric PFound is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	test: 0.1480744	best: 0.1480744 (0)	total: 125ms	remaining: 1m 8s
100:	test: 0.1496775	best: 0.1496827 (95)	total: 6.32s	remaining: 28.1s
200:	test: 0.1497926	best: 0.1497926 (200)	total: 12.5s	remaining: 21.8s
300:	test: 0.1498572	best: 0.1498580 (295)	total: 18.8s	remaining: 15.6s
400:	test: 0.1499174	best: 0.1499174 (400)	total: 25s	remaining: 9.34s
500:	test: 0.1499382	best: 0.1499382 (500)	total: 31.2s	remaining: 3.11s
550:	test: 0.1499385	best: 0.1499406 (525)	total: 34.3s	remaining: 0us
bestTest = 0.149940585
bestIteration = 525
Shrink model to first 526 iterations.
{'learn': {'PairLogit': 0.3795309222066723}, 'validation': {'PFound': 0.14994058497180032, 'PairLogit': 0.3829858433553736}}


Default metric period is 5 because PFound is/are not implemented for GPU
Metric PFound is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric PFound is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	test: 0.1495706	best: 0.1495706 (0)	total: 123ms	remaining: 1m 7s
100:	test: 0.1504424	best: 0.1504424 (100)	total: 6.34s	remaining: 28.3s
200:	test: 0.1505059	best: 0.1505142 (190)	total: 12.6s	remaining: 22s
300:	test: 0.1505732	best: 0.1505791 (285)	total: 18.9s	remaining: 15.7s
400:	test: 0.1506053	best: 0.1506053 (400)	total: 25.1s	remaining: 9.41s
500:	test: 0.1506425	best: 0.1506425 (500)	total: 31.4s	remaining: 3.13s
550:	test: 0.1506622	best: 0.1506622 (550)	total: 34.5s	remaining: 0us
bestTest = 0.1506622116
bestIteration = 550
{'learn': {'PairLogit': 0.3801132038275998}, 'validation': {'PFound': 0.15066221160428203, 'PairLogit': 0.3814032979554807}}


Default metric period is 5 because PFound is/are not implemented for GPU
Metric PFound is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric PFound is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	test: 0.1490935	best: 0.1490935 (0)	total: 125ms	remaining: 1m 8s
100:	test: 0.1503651	best: 0.1503651 (100)	total: 6.34s	remaining: 28.2s
200:	test: 0.1504431	best: 0.1504431 (200)	total: 12.5s	remaining: 21.8s
300:	test: 0.1505056	best: 0.1505056 (300)	total: 18.7s	remaining: 15.6s
400:	test: 0.1505488	best: 0.1505493 (395)	total: 24.9s	remaining: 9.32s
500:	test: 0.1506003	best: 0.1506003 (500)	total: 31.1s	remaining: 3.1s
550:	test: 0.1506198	best: 0.1506198 (550)	total: 34.2s	remaining: 0us
bestTest = 0.1506198485
bestIteration = 550
{'learn': {'PairLogit': 0.380480192410921}, 'validation': {'PFound': 0.150619848470535, 'PairLogit': 0.3812295421104451}}


Default metric period is 5 because PFound is/are not implemented for GPU
Metric PFound is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric PFound is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	test: 0.1496186	best: 0.1496186 (0)	total: 121ms	remaining: 1m 6s
100:	test: 0.1507157	best: 0.1507157 (100)	total: 6.26s	remaining: 27.9s
200:	test: 0.1508003	best: 0.1508003 (200)	total: 12.5s	remaining: 21.8s
300:	test: 0.1508455	best: 0.1508490 (290)	total: 18.7s	remaining: 15.6s
400:	test: 0.1508979	best: 0.1508979 (400)	total: 25s	remaining: 9.35s
500:	test: 0.1509375	best: 0.1509375 (500)	total: 31.2s	remaining: 3.12s
550:	test: 0.1509534	best: 0.1509534 (550)	total: 34.4s	remaining: 0us
bestTest = 0.1509534429
bestIteration = 550
{'learn': {'PairLogit': 0.38047749698186933}, 'validation': {'PFound': 0.15095344290256676, 'PairLogit': 0.38096935178211494}}


type: 2it [11:09, 325.99s/it]

PFound_mean: 0.15038238246696986
training_done



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:04<00:16,  4.06s/it][A
 40%|████      | 2/5 [00:09<00:14,  4.90s/it][A
 60%|██████    | 3/5 [00:15<00:11,  5.56s/it][A
 80%|████████  | 4/5 [00:22<00:06,  6.01s/it][A
100%|██████████| 5/5 [00:30<00:00,  6.08s/it]


reading_done
sampling_done


Default metric period is 5 because PFound is/are not implemented for GPU
Metric PFound is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric PFound is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	test: 0.0873237	best: 0.0873237 (0)	total: 115ms	remaining: 1m 35s
100:	test: 0.0891171	best: 0.0891171 (100)	total: 4.87s	remaining: 35.1s
200:	test: 0.0891690	best: 0.0891690 (200)	total: 9.66s	remaining: 30.1s
300:	test: 0.0891915	best: 0.0891915 (300)	total: 14.4s	remaining: 25.3s
400:	test: 0.0892036	best: 0.0892036 (400)	total: 19.2s	remaining: 20.5s
500:	test: 0.0892196	best: 0.0892201 (495)	total: 24.1s	remaining: 15.7s
600:	test: 0.0892317	best: 0.0892350 (590)	total: 28.9s	remaining: 10.9s
700:	test: 0.0892508	best: 0.0892509 (695)	total: 33.8s	remaining: 6.12s
800:	test: 0.0892556	best: 0.0892566 (775)	total: 38.6s	remaining: 1.3s
827:	test: 0.0892539	best: 0.0892566 (775)	total: 39.9s	remaining: 0us
bestTest = 0.08925660344
bestIteration = 775
Shrink model to first 776 iterations.
{'learn': {'PairLogit': 0.1951745063623902}, 'validation': {'PFound': 0.08925660344323434, 'PairLogit': 0.19509601139659383}}


Default metric period is 5 because PFound is/are not implemented for GPU
Metric PFound is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric PFound is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	test: 0.0872365	best: 0.0872365 (0)	total: 114ms	remaining: 1m 34s
100:	test: 0.0890043	best: 0.0890043 (100)	total: 4.89s	remaining: 35.2s
200:	test: 0.0890484	best: 0.0890495 (195)	total: 9.67s	remaining: 30.2s
300:	test: 0.0890689	best: 0.0890689 (300)	total: 14.5s	remaining: 25.3s
400:	test: 0.0890745	best: 0.0890817 (375)	total: 19.3s	remaining: 20.5s
500:	test: 0.0890952	best: 0.0890952 (500)	total: 24.1s	remaining: 15.7s
600:	test: 0.0891135	best: 0.0891135 (600)	total: 29s	remaining: 10.9s
700:	test: 0.0891176	best: 0.0891219 (660)	total: 33.8s	remaining: 6.12s
800:	test: 0.0891177	best: 0.0891219 (660)	total: 38.6s	remaining: 1.3s
827:	test: 0.0891140	best: 0.0891219 (660)	total: 39.9s	remaining: 0us
bestTest = 0.08912186241
bestIteration = 660
Shrink model to first 661 iterations.
{'learn': {'PairLogit': 0.19478412923777674}, 'validation': {'PFound': 0.08912186241084247, 'PairLogit': 0.19753783014193846}}


Default metric period is 5 because PFound is/are not implemented for GPU
Metric PFound is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric PFound is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	test: 0.0886701	best: 0.0886701 (0)	total: 115ms	remaining: 1m 34s
100:	test: 0.0904022	best: 0.0904022 (100)	total: 4.87s	remaining: 35.1s
200:	test: 0.0904431	best: 0.0904438 (195)	total: 9.65s	remaining: 30.1s
300:	test: 0.0904807	best: 0.0904818 (295)	total: 14.5s	remaining: 25.3s
400:	test: 0.0904889	best: 0.0904900 (390)	total: 19.3s	remaining: 20.5s
500:	test: 0.0905002	best: 0.0905008 (490)	total: 24.1s	remaining: 15.7s
600:	test: 0.0905232	best: 0.0905232 (600)	total: 29s	remaining: 10.9s
700:	test: 0.0905316	best: 0.0905326 (695)	total: 33.8s	remaining: 6.12s
800:	test: 0.0905476	best: 0.0905483 (790)	total: 38.6s	remaining: 1.3s
827:	test: 0.0905481	best: 0.0905485 (825)	total: 39.9s	remaining: 0us
bestTest = 0.09054848581
bestIteration = 825
Shrink model to first 826 iterations.
{'learn': {'PairLogit': 0.19394053876850326}, 'validation': {'PFound': 0.09054848580853264, 'PairLogit': 0.2001117071378343}}


Default metric period is 5 because PFound is/are not implemented for GPU
Metric PFound is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric PFound is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	test: 0.0885629	best: 0.0885629 (0)	total: 111ms	remaining: 1m 32s
100:	test: 0.0897535	best: 0.0897535 (100)	total: 4.88s	remaining: 35.1s
200:	test: 0.0898064	best: 0.0898083 (195)	total: 9.69s	remaining: 30.2s
300:	test: 0.0898464	best: 0.0898484 (295)	total: 14.5s	remaining: 25.4s
400:	test: 0.0898633	best: 0.0898652 (390)	total: 19.3s	remaining: 20.6s
500:	test: 0.0898809	best: 0.0898809 (500)	total: 24.1s	remaining: 15.7s
600:	test: 0.0898950	best: 0.0898968 (590)	total: 28.9s	remaining: 10.9s
700:	test: 0.0899014	best: 0.0899021 (695)	total: 33.8s	remaining: 6.13s
800:	test: 0.0899032	best: 0.0899064 (780)	total: 38.7s	remaining: 1.3s
827:	test: 0.0899071	best: 0.0899078 (820)	total: 40s	remaining: 0us
bestTest = 0.08990775654
bestIteration = 820
Shrink model to first 821 iterations.
{'learn': {'PairLogit': 0.1956116568691615}, 'validation': {'PFound': 0.08990775653754342, 'PairLogit': 0.19384133155460914}}


Default metric period is 5 because PFound is/are not implemented for GPU
Metric PFound is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric PFound is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	test: 0.0870828	best: 0.0870828 (0)	total: 118ms	remaining: 1m 37s
100:	test: 0.0888922	best: 0.0888922 (100)	total: 4.87s	remaining: 35.1s
200:	test: 0.0889489	best: 0.0889489 (200)	total: 9.66s	remaining: 30.1s
300:	test: 0.0889819	best: 0.0889819 (300)	total: 14.5s	remaining: 25.3s
400:	test: 0.0890005	best: 0.0890005 (400)	total: 19.3s	remaining: 20.5s
500:	test: 0.0890093	best: 0.0890103 (490)	total: 24.1s	remaining: 15.7s
600:	test: 0.0890150	best: 0.0890181 (580)	total: 28.9s	remaining: 10.9s
700:	test: 0.0890157	best: 0.0890199 (685)	total: 33.7s	remaining: 6.11s
800:	test: 0.0890323	best: 0.0890323 (800)	total: 38.5s	remaining: 1.3s


type: 3it [16:15, 325.33s/it]

827:	test: 0.0890326	best: 0.0890341 (825)	total: 39.8s	remaining: 0us
bestTest = 0.08903407522
bestIteration = 825
Shrink model to first 826 iterations.
{'learn': {'PairLogit': 0.19447849352610785}, 'validation': {'PFound': 0.08903407522394335, 'PairLogit': 0.19846317343904588}}
PFound_mean: 0.08957375668481923
training_done





## PairLogit для каждого типа (кросс-валидация, меньше — лучше)

- pairlogit_mean_clicks: **0.3933920** (Baseline : 0.395109)
- pairlogit_mean_carts: 0.30792093 (Baseline : **0.307252**)
- pairlogit_mean_orders: **0.1896842** (Baseline : 0.191810)

In [None]:
# Destination folder for the tuned fold models inside the project directory.
folder_path = f'{main_path}/models/{ver_tun}/'
os.makedirs(folder_path, exist_ok=True)

# Copy every model file produced under /content/model/<ver_tun>/ into the
# destination folder. The list of paths returned by shutil.copy is the last
# expression of the cell, so the notebook displays it.
file_list = glob.glob(f'/content/model/{ver_tun}/*')
[shutil.copy(model_file, folder_path) for model_file in file_list]

['/content/drive/Othercomputers/Mac/Jup.Notebook/models/catb_tun/catb_tun_0_orders',
 '/content/drive/Othercomputers/Mac/Jup.Notebook/models/catb_tun/catb_tun_1_clicks',
 '/content/drive/Othercomputers/Mac/Jup.Notebook/models/catb_tun/catb_tun_4_orders',
 '/content/drive/Othercomputers/Mac/Jup.Notebook/models/catb_tun/catb_tun_2_carts',
 '/content/drive/Othercomputers/Mac/Jup.Notebook/models/catb_tun/catb_tun_2_clicks',
 '/content/drive/Othercomputers/Mac/Jup.Notebook/models/catb_tun/catb_tun_3_clicks',
 '/content/drive/Othercomputers/Mac/Jup.Notebook/models/catb_tun/catb_tun_4_clicks',
 '/content/drive/Othercomputers/Mac/Jup.Notebook/models/catb_tun/catb_tun_0_carts',
 '/content/drive/Othercomputers/Mac/Jup.Notebook/models/catb_tun/catb_tun_2_orders',
 '/content/drive/Othercomputers/Mac/Jup.Notebook/models/catb_tun/catb_tun_1_carts',
 '/content/drive/Othercomputers/Mac/Jup.Notebook/models/catb_tun/catb_tun_1_orders',
 '/content/drive/Othercomputers/Mac/Jup.Notebook/models/catb_tun/cat

## Предсказание + оценка

**На тестовых данных**

In [None]:
# Number of top-ranked candidates kept per session.
TOP_K = 20
# Number of cross-validation fold models whose scores are averaged.
N_FOLDS = 5

# Per-type prediction frames; they are concatenated once after the loop
# instead of repeatedly appending to an initially empty DataFrame.
lv_predictions_parts = []

# Iterate over the event types ('clicks', 'carts', 'orders')
for typee in tqdm(['clicks', 'carts', 'orders'], desc='type'):

    # Local copy of the candidates for this type; fetched from the project
    # folder only when it is not already present.
    candidates_dir = f'/content/test_candidates_{typee}'
    if not os.path.exists(candidates_dir):
        shutil.copytree(f'{main_path}/lv/ver_6/test_candidates_{typee}/',
                        f'{candidates_dir}/')

    # Read from the same absolute directory that was checked/copied above,
    # so the cell does not depend on the current working directory.
    path = f'{candidates_dir}/*'
    test_candidates = read_parquets(path)
    test_candidates = test_candidates.sort_values('session')
    test_candidates = test_candidates.reset_index(drop=True)

    # Every column after the first two is passed to the model as a feature
    # (the first two are presumably 'session' and 'aid' - verify against the
    # candidate files).
    FEATURES = test_candidates.columns[2:]
    dtest = Pool(data=test_candidates[FEATURES])
    preds = np.zeros(len(test_candidates))

    # Average the scores of the fold models. They are loaded from the folder
    # the models were copied to ({main_path}/models/{ver_tun}/).
    for fold in tqdm(range(N_FOLDS)):
        model = CatBoostRanker(random_state=42)
        model.load_model(
            f'{main_path}/models/{ver_tun}/catb_tun_{fold}_{typee}')
        preds += model.predict(dtest) / N_FOLDS

    # Rank candidates inside each session by averaged score (best first) and
    # keep the TOP_K best. groupby().head() replaces the earlier
    # cumcount().astype('int8') rank, which could wrap to negative values for
    # sessions with more than 128 candidates and let low-ranked rows through
    # the `< 20` filter.
    lv_predictions = test_candidates[['session', 'aid']].assign(pred=preds)
    lv_predictions = lv_predictions.sort_values(
        ['session', 'pred'], ascending=[True, False])
    lv_predictions = lv_predictions.groupby('session').head(TOP_K)

    # One row per session: the ordered list of recommended aids.
    lv_predictions = lv_predictions.groupby('session').aid.apply(list)
    lv_predictions = lv_predictions.rename('labels').reset_index()
    lv_predictions['type'] = typee

    lv_predictions_parts.append(lv_predictions)
    del lv_predictions, preds, test_candidates

# Combine the per-type frames into the final prediction table.
lv_predictions_full = pd.concat(lv_predictions_parts, ignore_index=True)

type:   0%|          | 0/3 [00:00<?, ?it/s]
  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:00<00:02,  1.45it/s][A
 40%|████      | 2/5 [00:01<00:03,  1.03s/it][A
 60%|██████    | 3/5 [00:03<00:02,  1.22s/it][A
 80%|████████  | 4/5 [00:05<00:01,  1.42s/it][A
100%|██████████| 5/5 [00:07<00:00,  1.42s/it]

  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:10<00:40, 10.15s/it][A
 40%|████      | 2/5 [00:18<00:28,  9.38s/it][A
 60%|██████    | 3/5 [00:29<00:19,  9.78s/it][A
 80%|████████  | 4/5 [00:38<00:09,  9.59s/it][A
100%|██████████| 5/5 [00:47<00:00,  9.44s/it]
type:  33%|███▎      | 1/3 [01:47<03:34, 107.39s/it]
  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:00<00:02,  1.52it/s][A
 40%|████      | 2/5 [00:01<00:02,  1.05it/s][A
 60%|██████    | 3/5 [00:03<00:02,  1.16s/it][A
 80%|████████  | 4/5 [00:04<00:01,  1.37s/it][A
100%|██████████| 5/5 [00:06<00:00,  1.38s/it]

  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██    

**На тренировочных данных**

In [None]:
# Number of top-ranked candidates kept per session.
TOP_K = 20
# Number of cross-validation fold models whose scores are averaged.
N_FOLDS = 5

# Per-type prediction frames; they are concatenated once after the loop
# instead of repeatedly appending to an initially empty DataFrame.
train_predictions_parts = []

# Iterate over the event types ('clicks', 'carts', 'orders')
for typee in tqdm(['clicks', 'carts', 'orders'], desc='type'):

    path = f"{main_path}/tr/ver_6/tr_candidates_{typee}/*"
    train_candidates = read_parquets(path)
    train_candidates = train_candidates.sort_values('session')
    train_candidates = train_candidates.reset_index(drop=True)

    # Features are all columns except the first two and the last one
    # (presumably 'session', 'aid' and the training target - verify against
    # the candidate files).
    FEATURES = train_candidates.columns[2:-1]
    dtrain = Pool(data=train_candidates[FEATURES])
    preds = np.zeros(len(train_candidates))

    # Average the scores of the fold models. They are loaded from the folder
    # the models were copied to ({main_path}/models/{ver_tun}/), so the cell
    # does not depend on the current working directory.
    for fold in tqdm(range(N_FOLDS)):
        model = CatBoostRanker(random_state=42)
        model.load_model(
            f'{main_path}/models/{ver_tun}/catb_tun_{fold}_{typee}')
        preds += model.predict(dtrain) / N_FOLDS

    # Rank candidates inside each session by averaged score (best first) and
    # keep the TOP_K best. groupby().head() replaces the earlier
    # cumcount().astype('int8') rank, which could wrap to negative values for
    # sessions with more than 128 candidates and let low-ranked rows through
    # the `< 20` filter.
    train_predictions = train_candidates[['session', 'aid']].assign(pred=preds)
    train_predictions = train_predictions.sort_values(
        ['session', 'pred'], ascending=[True, False])
    train_predictions = train_predictions.groupby('session').head(TOP_K)

    # One row per session: the ordered list of recommended aids.
    train_predictions = train_predictions.groupby('session').aid.apply(list)
    train_predictions = train_predictions.rename('labels').reset_index()
    train_predictions['type'] = typee

    train_predictions_parts.append(train_predictions)
    del train_predictions, preds, train_candidates

# Combine the per-type frames into the final prediction table.
train_predictions_full = pd.concat(train_predictions_parts, ignore_index=True)

type:   0%|                                               | 0/3 [00:00<?, ?it/s]
  0%|                                                     | 0/5 [00:00<?, ?it/s][A
 20%|█████████                                    | 1/5 [00:02<00:09,  2.47s/it][A
 40%|██████████████████                           | 2/5 [00:06<00:09,  3.16s/it][A
 60%|███████████████████████████                  | 3/5 [00:10<00:07,  3.84s/it][A
 80%|████████████████████████████████████         | 4/5 [00:15<00:04,  4.32s/it][A
100%|█████████████████████████████████████████████| 5/5 [00:20<00:00,  4.17s/it][A

  0%|                                                     | 0/5 [00:00<?, ?it/s][A
 20%|█████████                                    | 1/5 [00:35<02:23, 35.86s/it][A
 40%|██████████████████                           | 2/5 [01:10<01:45, 35.01s/it][A
 60%|███████████████████████████                  | 3/5 [01:43<01:08, 34.24s/it][A
 80%|████████████████████████████████████         | 4/5 [02:16<00:33, 33.63s/i

## Score

**Test**

In [None]:
# File name of the pickled mapping that is loaded as `id2type` and passed to
# metric_eval in the scoring cells.
id2type_name = 'id2type.pkl'

In [None]:
# Ground-truth labels for the local test split.
valid = pd.read_parquet(f'{main_path}/s/test_labels_loc.parquet')

# Mapping used by metric_eval to turn type ids into type names.
with open(f'{main_path}/pkl/{id2type_name}', 'rb') as file:
    id2type = pickle.load(file)

# Score the test predictions: (overall metric, per-type recall).
test_score = metric_eval(lv_predictions_full, valid, id2type)
print('Model score :', test_score)

Model score : (0.5272565979917239, type
carts     0.367094
clicks    0.398173
orders    0.628852
dtype: float64)


**Train**

In [None]:
# The targets built for training on this train dataset (the one predicted
# above) play the role of test_labels here.
test_labels = pd.read_parquet(f'{main_path}/s/targets.parquet')

# Mapping used by metric_eval to turn type ids into type names.
with open(f'{main_path}/pkl/{id2type_name}', 'rb') as file:
    id2type = pickle.load(file)

# Score the train predictions: (overall metric, per-type recall).
train_score = metric_eval(train_predictions_full, test_labels, id2type)
print('Model score :', train_score)

Model score : (0.5272965939417337, type
carts     0.378590
clicks    0.394375
orders    0.624489
dtype: float64)


In [None]:
# Overall metric values copied from the two "Model score" outputs above.
train_metric = 0.5272965939417337
test_metric = 0.5272565979917239

# Relative train/test gap, expressed in percent of the test metric.
print(abs(train_metric - test_metric) / test_metric * 100, '%')

0.007585670840753659 %


**Модель не переобучилась: относительная разница в метрике между train и test ≈ 0.008%**

# Вывод

Метрика baseline CatBoost Ranker = 0.527116  
**YetiRank (PFound)** обошёл PairLogit по метрике при тюнинге  
Метрика после тюнинга (PFound в качестве прокси-метрики) = **0.527256**  
Отлично, двигаюсь дальше — к стекингу