In [None]:
#🔵
# Project root when working on the local machine.
# NOTE(review): hard-coded absolute user path; the next cell assigns main_path
# again for Colab, so running both cells in order leaves the Colab value.
main_path = '/Users/stanislavkrupnov/Jup.Notebook'

In [None]:
#🟠
# Project root inside the mounted Google Drive (Colab run); overrides the
# local value assigned in the previous cell.
main_path = '/content/drive/Othercomputers/Mac/Jup.Notebook'

# Цель ноутбука

Цель — выбрать лучшую бейзлайн-модель для второго этапа рекомендательной системы: ранжирования  
первичных кандидатов внутри каждой сессии на основе признаков и таргетов.  
Выбор делается между **XGBoost Ranker, CatBoost Ranker и LGBM Ranker**.

# Функции

In [None]:
def metric_eval(predictions: pd.DataFrame, valid: pd.DataFrame,
                id2type: dict) -> tuple:
    """
    Compute the weighted recall score and the recall of each event type.

    Parameters:
    - predictions (pd.DataFrame): one row per (session, type) with columns
      'session', 'type' (type name) and 'labels' (iterable of predicted aids).
    - valid (pd.DataFrame): ground-truth events with columns 'session',
      'type' (a key of ``id2type``) and 'aid'. The frame is not modified.
    - id2type (dict): maps the values stored in ``valid['type']`` to the type
      names used in ``predictions['type']``.

    Returns:
    - tuple: ``(local_validation_score, recall_per_type)`` where
      ``recall_per_type`` is a Series indexed by type name and
      ``local_validation_score`` is the sum of the recalls weighted by
      clicks=0.10, carts=0.30, orders=0.60.

    Raises:
    - KeyError: if ``valid['type']`` holds a value that is not in ``id2type``.
    """
    # Translate type ids on a derived frame: assigning back into `valid`
    # would make a second call with the same frame fail on the lookup.
    valid_named = valid.assign(
        type=valid['type'].map(lambda idx: id2type[idx]))

    # One row per (session, type) holding the list of true aids
    ground_truth = (valid_named.groupby(['session', 'type'])['aid']
                    .apply(list)
                    .reset_index()
                    .rename(columns={'aid': 'labels'}))

    # For 'clicks' only the first true aid is kept
    is_click = ground_truth['type'] == 'clicks'
    ground_truth.loc[is_click, 'labels'] = ground_truth.loc[is_click,
                                                            'labels'].str[:1]

    # Both frames carry a 'labels' column, so the merge produces
    # labels_x (predicted) and labels_y (true)
    submission_with_gt = predictions.merge(
        ground_truth[['session', 'type', 'labels']],
        how='left',
        on=['session', 'type'])

    # Drop (session, type) pairs without ground truth; copy so the column
    # assignments below do not write into a slice of the merged frame
    submission_with_gt = submission_with_gt[
        submission_with_gt['labels_y'].notna()].copy()

    # Number of predicted aids that are among the true aids
    # (a plain list keeps this well-defined for an empty frame too)
    submission_with_gt['hits'] = [
        len(set(predicted).intersection(set(truth)))
        for predicted, truth in zip(submission_with_gt['labels_x'],
                                    submission_with_gt['labels_y'])
    ]

    # Number of true aids, capped at 20 per row
    submission_with_gt['gt_count'] = submission_with_gt['labels_y'].str.len(
    ).clip(0, 20)

    # Recall per type = total hits / total (capped) ground-truth count
    totals = submission_with_gt.groupby('type')[['hits', 'gt_count']].sum()
    recall_per_type = totals['hits'] / totals['gt_count']

    # Weighted sum of the per-type recalls
    type_weights = pd.Series({'clicks': 0.10, 'carts': 0.30, 'orders': 0.60})
    local_validation_score = (recall_per_type * type_weights).sum()

    return local_validation_score, recall_per_type

In [None]:
def read_parquets(path):
    """
    Read every Parquet file matching a glob pattern into one DataFrame.

    Parameters:
    - path (str): glob pattern selecting the Parquet files.

    Returns:
    - tr_candidates (pd.DataFrame): rows of all matched files stacked in
      sorted file-name order with a fresh 0..n-1 index; an empty DataFrame
      when the pattern matches nothing.
    """
    # glob gives no ordering guarantee; sort so the row order is reproducible
    file_list = sorted(glob.glob(path))

    # Read all files first and concatenate once, instead of re-copying the
    # accumulated frame on every iteration
    frames = [pd.read_parquet(file) for file in tqdm(file_list)]

    # pd.concat raises on an empty list; keep returning an empty frame
    if not frames:
        return pd.DataFrame()

    tr_candidates = pd.concat(frames, ignore_index=True)
    return tr_candidates

# Import

In [None]:
import os
import sys
from tqdm import tqdm
import glob
import pyarrow.parquet as pq
import pickle
import pandas as pd
import gc
import numpy as np
import itertools
from datetime import datetime as dt
import polars as pl
import shutil
import xgboost as xgb
from sklearn.model_selection import GroupKFold
from imblearn.under_sampling import RandomUnderSampler
import joblib

In [None]:
import lightgbm as lgb
from lightgbm.sklearn import LGBMRanker

In [None]:
# Colab-only setup: mount Google Drive at /content/drive, the location the
# Colab value of main_path points into.
import gdown
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


# XGBoost Baseline

## Тренировка

In [None]:
# Sub-folder name under model/baseline/ for the XGBoost baseline artifacts
ver_folder = 'xgb_base'
# Seed for stochastic steps — presumably meant for the undersampler/model;
# verify it is actually passed where needed.
random_state = 42
# slice_value = 1
# Multiplier applied to the current class ratio to obtain the undersampler's
# sampling_strategy in the training cell
downsamp_k = 16

In [None]:
# Train one XGBoost pairwise ranker per event type: 'clicks', 'carts', 'orders'
for typee in tqdm(['clicks', 'carts', 'orders'], desc='type'):

    # Glob pattern of the training-candidate Parquet files for this type
    path = f'{main_path}/tr/ver_6/tr_candidates_{typee}/*'

    # Load all candidate files into one frame
    tr_candidates = read_parquets(path)
    print('Чтение данных завершено')

    # Downcast the id columns
    tr_candidates.session = tr_candidates.session.astype(np.int32)
    tr_candidates.aid = tr_candidates.aid.astype(np.int32)

    # Current ratio of target==1 rows to target==0 rows (counted once)
    target_counts = tr_candidates['target'].value_counts()
    class_ratio = target_counts[1] / target_counts[0]

    # Requested class ratio after resampling is downsamp_k times the current
    # one. The sampler is seeded with the configured random_state so that the
    # sampled training set is the same on every run.
    under_sampler = RandomUnderSampler(sampling_strategy=class_ratio *
                                       downsamp_k,
                                       random_state=random_state)

    # Undersample; every column except the last one ('target') goes into X
    FEATURES = tr_candidates.columns[:-1]
    X = tr_candidates.loc[:, FEATURES]
    y = tr_candidates.loc[:, 'target']
    X_train, y_train = under_sampler.fit_resample(X, y)
    tr_candidates = pd.merge(X_train,
                             y_train,
                             left_index=True,
                             right_index=True)
    del X, y
    print('Undersampling классов завершено')

    # Output folder for the trained models
    folder_path = f'model/baseline/{ver_folder}'
    os.makedirs(folder_path, exist_ok=True)

    # Rows of one session must be contiguous for the group sizes below
    tr_candidates = tr_candidates.sort_values('session')
    tr_candidates = tr_candidates.reset_index(drop=True)

    FEATURES = tr_candidates.columns[:-1]

    X_train = tr_candidates[FEATURES]
    y_train = tr_candidates['target']

    # Number of candidate rows per session, in ascending session order
    group_sizes_tr = X_train.groupby('session')[['aid']].count()

    # Drop the first two columns (assumed to be 'session' and 'aid', as the
    # prediction cell does) so that only features are fed to the model
    X_train = X_train.iloc[:, 2:]

    # Ranking DMatrix: labels plus per-session group sizes
    dtrain = xgb.DMatrix(X_train, y_train, group=group_sizes_tr.aid.tolist())

    # Pairwise ranking objective on GPU
    xgb_params = {'objective': 'rank:pairwise', 'tree_method': 'gpu_hist'}

    # NOTE(review): no num_boost_round and no evals are passed, so the
    # library default number of rounds is used and verbose_eval has nothing
    # to report — confirm this is intended for the baseline.
    model = xgb.train(xgb_params, dtrain=dtrain, verbose_eval=100)

    # Persist the model for the prediction cell
    model.save_model(f'{folder_path}/XGB_base_{typee}.xgb')
    print('training_done')

    # Сохранение модели на диск
    # folder_path = f'{main_path}/models/{ver_folder}/'
    # os.makedirs(folder_path, exist_ok=True)

    # file_list = glob.glob(f'/content/model/*{typee}.xgb')
    # [shutil.copy(file, f'{main_path}/models/{ver_folder}/') for file in file_list]

type:   0%|          | 0/3 [00:00<?, ?it/s]
  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:02<00:08,  2.24s/it][A
 40%|████      | 2/5 [00:05<00:09,  3.07s/it][A
 60%|██████    | 3/5 [00:10<00:07,  3.68s/it][A
 80%|████████  | 4/5 [00:15<00:04,  4.24s/it][A
100%|██████████| 5/5 [00:21<00:00,  4.25s/it]


reading_done
sampling_done


type:  33%|███▎      | 1/3 [01:16<02:33, 76.56s/it]

training_done



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:06<00:25,  6.49s/it][A
 40%|████      | 2/5 [00:14<00:22,  7.46s/it][A
 60%|██████    | 3/5 [00:22<00:15,  7.63s/it][A
 80%|████████  | 4/5 [00:32<00:08,  8.72s/it][A
100%|██████████| 5/5 [00:44<00:00,  8.93s/it]


reading_done
sampling_done


type:  67%|██████▋   | 2/3 [02:59<01:32, 92.04s/it]

training_done



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:08<00:34,  8.65s/it][A
 40%|████      | 2/5 [00:16<00:24,  8.28s/it][A
 60%|██████    | 3/5 [00:26<00:17,  8.88s/it][A
 80%|████████  | 4/5 [00:35<00:08,  8.88s/it][A
100%|██████████| 5/5 [00:46<00:00,  9.20s/it]


reading_done
sampling_done


type: 100%|██████████| 3/3 [04:43<00:00, 94.57s/it]

training_done





## Предсказание + оценка

In [None]:
lv_predictions_full = pd.DataFrame()

# Score the local-validation candidates of each type with its XGBoost model
for typee in tqdm(['clicks', 'carts', 'orders'], desc='type'):

    # Local copy of the candidates; the same absolute folder is used for the
    # existence check, the copy target and the read, so the cell does not
    # depend on the current working directory for its input data.
    local_dir = f'/content/test_candidates_{typee}'
    if not os.path.exists(local_dir):
        # Copy the candidates of this type from the project folder
        shutil.copytree(f'{main_path}/lv/ver_6/test_candidates_{typee}/',
                        f'{local_dir}/')

    # Glob pattern of the candidate Parquet files for this type
    path = f'{local_dir}/*'

    # Load and order by session
    test_candidates = read_parquets(path)
    test_candidates = test_candidates.sort_values('session')
    test_candidates = test_candidates.reset_index(drop=True)

    # Everything after the first two columns (used below as 'session', 'aid')
    FEATURES = test_candidates.columns[2:]

    # Load the model trained for this type
    model = xgb.Booster()
    model.load_model(f'model/baseline/{ver_folder}/XGB_base_{typee}.xgb')
    model.set_param({'predictor': 'gpu_predictor'})

    # Predict a ranking score for every candidate row
    dtest = xgb.DMatrix(data=test_candidates[FEATURES])
    preds = model.predict(dtest)

    # Keep the 20 best-scored aids of each session.
    # groupby().head(20) replaces the former int8 rank counter: ranks >= 128
    # wrapped to negative values there and slipped through the `n < 20`
    # filter, letting more than 20 aids per session into the submission.
    lv_predictions = test_candidates[['session', 'aid']].copy()
    lv_predictions['pred'] = preds
    lv_predictions = lv_predictions.sort_values(
        ['session', 'pred'], ascending=[True, False]).reset_index(drop=True)
    lv_predictions = lv_predictions.groupby('session').head(20)

    # One row per session with the ordered list of predicted aids
    lv_predictions = lv_predictions.groupby('session').aid.apply(list)
    lv_predictions = lv_predictions.to_frame().reset_index()
    lv_predictions.rename(columns={'aid': 'labels'}, inplace=True)
    lv_predictions['type'] = typee

    # Append this type's predictions to the combined frame
    lv_predictions_full = pd.concat([lv_predictions_full, lv_predictions],
                                    ignore_index=True)
    del lv_predictions, preds, dtest, test_candidates

# lv_predictions.to_parquet(f'lv_predictions_{typee}.parquet')
# shutil.copy2(f'lv_predictions_{typee}.parquet', f'{main_path}/lv/{ver_folder}/lv_predictions_{typee}.parquet' )

type:   0%|          | 0/3 [00:00<?, ?it/s]
  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:00<00:03,  1.31it/s][A
 40%|████      | 2/5 [00:02<00:03,  1.11s/it][A
 60%|██████    | 3/5 [00:03<00:02,  1.34s/it][A
 80%|████████  | 4/5 [00:05<00:01,  1.54s/it][A
100%|██████████| 5/5 [00:07<00:00,  1.53s/it]
type:  33%|███▎      | 1/3 [00:50<01:41, 50.98s/it]
  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:00<00:03,  1.26it/s][A
 40%|████      | 2/5 [00:02<00:03,  1.08s/it][A
 60%|██████    | 3/5 [00:03<00:02,  1.32s/it][A
 80%|████████  | 4/5 [00:05<00:01,  1.52s/it][A
100%|██████████| 5/5 [00:07<00:00,  1.53s/it]
type:  67%|██████▋   | 2/3 [01:41<00:50, 50.76s/it]
  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:00<00:03,  1.32it/s][A
 40%|████      | 2/5 [00:02<00:03,  1.06s/it][A
 60%|██████    | 3/5 [00:03<00:02,  1.30s/it][A
 80%|████████  | 4/5 [00:05<00:01,  1.57s/it][A
100%|██████████| 5/5 [00:07<00:00,  1.55s/it]
typ

In [None]:
# File name of the pickled id -> type-name mapping, read from {main_path}/pkl/
id2type_name = 'id2type.pkl'
# version = 'ver_6.1'

In [None]:
# Ground-truth labels of the local validation split
valid = pd.read_parquet(f'{main_path}/s/test_labels_loc.parquet')

# Mapping from type id to type name.
# NOTE(review): pickle.load can execute arbitrary code — only load trusted files.
with open(f'{main_path}/pkl/{id2type_name}', 'rb') as file:
    id2type = pickle.load(file)

# metric_eval returns (weighted score, per-type recall)
print('Model score :', metric_eval(lv_predictions_full, valid, id2type))

Model score : (0.5220552152693079, type
carts     0.365164
clicks    0.386253
orders    0.623134
dtype: float64)


In [None]:
# Weighted score of the XGBoost baseline, copied by hand from the output above
best_score_base = {'xgb': 0.5220552152693079}

# LGBM Baseline

## Тренировка

In [None]:
# Sub-folder name under model/baseline/ for the LGBM baseline artifacts
ver_folder = 'lgbm_base'
# Seed for stochastic steps — presumably meant for the undersampler/model;
# verify it is actually passed where needed.
random_state = 42
# slice_value = 1
# Class-ratio multiplier for undersampling; in the LGBM training cell it only
# appears in commented-out code because pre-downsampled data is loaded there
downsamp_k = 16

In [None]:
# Train one LightGBM lambdarank model per event type
for typee in tqdm(['clicks', 'carts', 'orders'], desc='type'):

    # Load the candidates that were already downsampled and saved to disk
    tr_candidates = pd.read_parquet(
        f'{main_path}/tr/ver_6/downsampled/tr_candidates_{typee}.parquet')
    print('reading_done')

    # No int32 cast and no RandomUnderSampler step here: the file read above
    # comes from the 'downsampled' folder (presumably produced with the same
    # class_ratio * downsamp_k undersampling used for the other models —
    # TODO confirm).
    print('sampling_done')

    # Output folder for the trained models (created once)
    folder_path = f'model/baseline/{ver_folder}'
    os.makedirs(folder_path, exist_ok=True)

    # Rows of one session must be contiguous for the group sizes below
    tr_candidates = tr_candidates.sort_values('session')
    tr_candidates = tr_candidates.reset_index(drop=True)

    # Every column except the last one ('target')
    FEATURES = tr_candidates.columns[:-1]

    # Seeded with the configured random_state for reproducible training
    ranker = LGBMRanker(objective="lambdarank", random_state=random_state)

    X_train = tr_candidates[FEATURES]
    y_train = tr_candidates['target']

    # Number of candidate rows per session, in ascending session order
    group_id_tr = X_train.groupby('session').size().tolist()
    X_train = X_train.iloc[:, 2:]  # drop ['session', 'aid'] for training

    # Fit the ranker on the features, targets and per-session group sizes
    ranker = ranker.fit(X_train, y_train, group=group_id_tr)

    # Persist the model for the prediction cell
    joblib.dump(ranker, f'{folder_path}/LGB_base_{typee}.joblib')
    print('training_done')

# Сохранение модели на диск
# folder_path = f'{main_path}/models/{ver_folder}/'
# os.makedirs(folder_path, exist_ok=True)

# file_list = glob.glob(f'/content/model/*{typee}.xgb')
# [shutil.copy(file, f'{main_path}/models/{ver_folder}/') for file in file_list]

## Предсказание + оценка

In [None]:
lv_predictions_full = pd.DataFrame()

# Score the local-validation candidates of each type with its LGBM model
for typee in tqdm(['clicks', 'carts', 'orders'], desc='type'):

    # Local copy of the candidates; the same absolute folder is used for the
    # existence check, the copy target and the read, so the cell does not
    # depend on the current working directory for its input data.
    local_dir = f'/content/test_candidates_{typee}'
    if not os.path.exists(local_dir):
        # Copy the candidates of this type from the project folder
        shutil.copytree(f'{main_path}/lv/ver_6/test_candidates_{typee}/',
                        f'{local_dir}/')

    path = f'{local_dir}/*'
    test_candidates = read_parquets(path)
    test_candidates = test_candidates.sort_values('session')
    test_candidates = test_candidates.reset_index(drop=True)

    # Predict a ranking score for every candidate row; features are
    # everything after the first two columns (used below as 'session', 'aid')
    FEATURES = test_candidates.columns[2:]
    model = joblib.load(f'model/baseline/{ver_folder}/LGB_base_{typee}.joblib')
    preds = model.predict(test_candidates[FEATURES])

    # Keep the 20 best-scored aids of each session.
    # groupby().head(20) replaces the former int8 rank counter: ranks >= 128
    # wrapped to negative values there and slipped through the `n < 20`
    # filter, letting more than 20 aids per session into the submission.
    lv_predictions = test_candidates[['session', 'aid']].copy()
    lv_predictions['pred'] = preds
    lv_predictions = lv_predictions.sort_values(
        ['session', 'pred'], ascending=[True, False]).reset_index(drop=True)
    lv_predictions = lv_predictions.groupby('session').head(20)

    # One row per session with the ordered list of predicted aids
    lv_predictions = lv_predictions.groupby('session').aid.apply(list)
    lv_predictions = lv_predictions.to_frame().reset_index()
    lv_predictions.rename(columns={'aid': 'labels'}, inplace=True)
    lv_predictions['type'] = typee

    # Append this type's predictions to the combined frame
    lv_predictions_full = pd.concat([lv_predictions_full, lv_predictions],
                                    ignore_index=True)
    del lv_predictions, preds, test_candidates

    # lv_predictions.to_parquet(f'lv_predictions_{typee}.parquet')
    # shutil.copy2(f'lv_predictions_{typee}.parquet', f'{main_path}/lv/{ver_folder}/lv_predictions_{typee}.parquet' )

type:   0%|          | 0/3 [00:00<?, ?it/s]
  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:01<00:04,  1.16s/it][A
 40%|████      | 2/5 [00:04<00:07,  2.57s/it][A
 60%|██████    | 3/5 [00:08<00:06,  3.13s/it][A
 80%|████████  | 4/5 [00:12<00:03,  3.62s/it][A
100%|██████████| 5/5 [00:17<00:00,  3.57s/it]
type:  33%|███▎      | 1/3 [01:57<03:54, 117.34s/it]
  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:00<00:03,  1.16it/s][A
 40%|████      | 2/5 [00:02<00:03,  1.23s/it][A
 60%|██████    | 3/5 [00:04<00:03,  1.52s/it][A
 80%|████████  | 4/5 [00:06<00:01,  1.89s/it][A
100%|██████████| 5/5 [00:09<00:00,  1.90s/it]
type:  67%|██████▋   | 2/3 [03:55<01:58, 118.02s/it]
  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:00<00:03,  1.23it/s][A
 40%|████      | 2/5 [00:02<00:04,  1.35s/it][A
 60%|██████    | 3/5 [00:04<00:03,  1.60s/it][A
 80%|████████  | 4/5 [00:06<00:01,  1.92s/it][A
100%|██████████| 5/5 [00:09<00:00,  1.91s/it]
t

In [None]:
# File name of the pickled id -> type-name mapping, read from {main_path}/pkl/
id2type_name = 'id2type.pkl'
# version = 'ver_6.1'

In [None]:
# Ground-truth labels of the local validation split
valid = pd.read_parquet(f'{main_path}/s/test_labels_loc.parquet')

# Mapping from type id to type name.
# NOTE(review): pickle.load can execute arbitrary code — only load trusted files.
with open(f'{main_path}/pkl/{id2type_name}', 'rb') as file:
    id2type = pickle.load(file)

# metric_eval returns (weighted score, per-type recall)
print('Model score :', metric_eval(lv_predictions_full, valid, id2type))

Model score : (0.5248555778378172, type
carts     0.364534
clicks    0.391417
orders    0.627256
dtype: float64)


In [None]:
# Weighted score of the LGBM baseline, copied by hand from the output above
best_score_base['lgbm'] = 0.5248555778378172
best_score_base

{'xgb': 0.5220552152693079, 'lgbm': 0.5248555778378172}

# Catboost Baseline

## Тренировка

In [None]:
# Sub-folder name under model/baseline/ for the CatBoost baseline artifacts
# (the cross-validation cell appends '_kfolds' to it)
ver_folder = 'catb_base'
# Seed for stochastic steps — presumably meant for the undersampler/model;
# verify it is actually passed where needed.
random_state = 42
# slice_value = 1
# Multiplier applied to the current class ratio to obtain the undersampler's
# sampling_strategy in the training cells
downsamp_k = 16

In [None]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.2-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m17.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.2


In [None]:
import catboost
from catboost import CatBoostRanker, Pool

In [None]:
import os
# NOTE(review): unclear whether CatBoost reads this variable; the training
# cells request the GPU explicitly via task_type='GPU' — confirm and drop
# this line if it has no effect.
os.environ['CATBOOST_GPU'] = '1'

In [None]:
# Train one CatBoost PairLogit ranker per event type
for typee in tqdm(['clicks', 'carts', 'orders'], desc='type'):

    # Glob pattern of the training-candidate Parquet files for this type
    path = f'{main_path}/tr/ver_6/tr_candidates_{typee}/*'

    # Load all candidate files into one frame
    tr_candidates = read_parquets(path)
    print('reading_done')

    # Downcast the id columns
    tr_candidates.session = tr_candidates.session.astype(np.int32)
    tr_candidates.aid = tr_candidates.aid.astype(np.int32)

    # Current ratio of target==1 rows to target==0 rows (counted once)
    target_counts = tr_candidates['target'].value_counts()
    class_ratio = target_counts[1] / target_counts[0]

    # Requested class ratio after resampling is downsamp_k times the current
    # one. The sampler is seeded with the configured random_state so that the
    # sampled training set is the same on every run.
    under_sampler = RandomUnderSampler(sampling_strategy=class_ratio *
                                       downsamp_k,
                                       random_state=random_state)

    # Every column except the last one ('target') goes into X
    FEATURES = tr_candidates.columns[:-1]
    X = tr_candidates.loc[:, FEATURES]
    y = tr_candidates.loc[:, 'target']

    # Undersample and glue features and target back together by index
    X_train, y_train = under_sampler.fit_resample(X, y)
    tr_candidates = pd.merge(X_train,
                             y_train,
                             left_index=True,
                             right_index=True)
    del X, y
    print('sampling_done')

    # Output folder for the trained models (created once)
    folder_path = f'model/baseline/{ver_folder}'
    os.makedirs(folder_path, exist_ok=True)

    # Rows of one session must be contiguous for the group ids below
    tr_candidates = tr_candidates.sort_values('session')
    tr_candidates = tr_candidates.reset_index(drop=True)

    FEATURES = tr_candidates.columns[:-1]
    X_train = tr_candidates[FEATURES]
    y_train = tr_candidates['target']

    # The first column (session) identifies the ranking group of each row
    group_id_tr = X_train.iloc[:, 0]

    # Drop the first two columns (assumed to be 'session' and 'aid', as the
    # prediction cell does) so that only features are fed to the model
    X_train = X_train.iloc[:, 2:]

    catboost_train = Pool(data=X_train, label=y_train, group_id=group_id_tr)

    # Seeded from the configured random_state.
    # early_stopping_rounds is no longer passed: there is no eval_set here,
    # so it had nothing to monitor (the captured log runs through the final
    # iteration, 999).
    ranker = CatBoostRanker(loss_function='PairLogit',
                            task_type='GPU',
                            random_state=random_state)
    ranker.fit(catboost_train)

    # Persist the model for the prediction cell
    ranker.save_model(f'{folder_path}/catb_base_{typee}')
    print('training_done')

type:   0%|          | 0/3 [00:00<?, ?it/s]
  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:05<00:23,  5.94s/it][A
 40%|████      | 2/5 [00:15<00:24,  8.17s/it][A
 60%|██████    | 3/5 [00:23<00:16,  8.07s/it][A
 80%|████████  | 4/5 [00:31<00:07,  7.85s/it][A
100%|██████████| 5/5 [00:39<00:00,  7.94s/it]


reading_done
sampling_done
0:	learn: 0.6722801	total: 52.4ms	remaining: 52.3s
1:	learn: 0.6521243	total: 105ms	remaining: 52.6s
2:	learn: 0.6345881	total: 156ms	remaining: 51.8s
3:	learn: 0.6183704	total: 207ms	remaining: 51.6s
4:	learn: 0.6027706	total: 257ms	remaining: 51.2s
5:	learn: 0.5885391	total: 304ms	remaining: 50.4s
6:	learn: 0.5755264	total: 352ms	remaining: 49.9s
7:	learn: 0.5636078	total: 400ms	remaining: 49.6s
8:	learn: 0.5530807	total: 448ms	remaining: 49.4s
9:	learn: 0.5431993	total: 496ms	remaining: 49.1s
10:	learn: 0.5343235	total: 544ms	remaining: 49s
11:	learn: 0.5259114	total: 592ms	remaining: 48.8s
12:	learn: 0.5179888	total: 639ms	remaining: 48.5s
13:	learn: 0.5109231	total: 687ms	remaining: 48.4s
14:	learn: 0.5044133	total: 734ms	remaining: 48.2s
15:	learn: 0.4982586	total: 782ms	remaining: 48.1s
16:	learn: 0.4923669	total: 830ms	remaining: 48s
17:	learn: 0.4870210	total: 878ms	remaining: 47.9s
18:	learn: 0.4823054	total: 924ms	remaining: 47.7s
19:	learn: 0.4779

type:  33%|███▎      | 1/3 [02:20<04:40, 140.46s/it]

training_done



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:05<00:21,  5.32s/it][A
 40%|████      | 2/5 [00:11<00:17,  5.68s/it][A
 60%|██████    | 3/5 [00:17<00:11,  5.99s/it][A
 80%|████████  | 4/5 [00:24<00:06,  6.53s/it][A
100%|██████████| 5/5 [00:32<00:00,  6.52s/it]


reading_done
sampling_done
0:	learn: 0.6607767	total: 36.3ms	remaining: 36.2s
1:	learn: 0.6314695	total: 73.1ms	remaining: 36.5s
2:	learn: 0.6048352	total: 109ms	remaining: 36.3s
3:	learn: 0.5812487	total: 145ms	remaining: 36s
4:	learn: 0.5592517	total: 179ms	remaining: 35.7s
5:	learn: 0.5397051	total: 214ms	remaining: 35.5s
6:	learn: 0.5222933	total: 249ms	remaining: 35.3s
7:	learn: 0.5055903	total: 284ms	remaining: 35.2s
8:	learn: 0.4911379	total: 319ms	remaining: 35.1s
9:	learn: 0.4782727	total: 352ms	remaining: 34.8s
10:	learn: 0.4661905	total: 387ms	remaining: 34.8s
11:	learn: 0.4549628	total: 421ms	remaining: 34.7s
12:	learn: 0.4458867	total: 457ms	remaining: 34.7s
13:	learn: 0.4374398	total: 491ms	remaining: 34.6s
14:	learn: 0.4291698	total: 525ms	remaining: 34.5s
15:	learn: 0.4219986	total: 560ms	remaining: 34.4s
16:	learn: 0.4146095	total: 595ms	remaining: 34.4s
17:	learn: 0.4085940	total: 629ms	remaining: 34.3s
18:	learn: 0.4030823	total: 665ms	remaining: 34.3s
19:	learn: 0.3

type:  67%|██████▋   | 2/3 [04:16<02:06, 126.35s/it]

999:	learn: 0.3026234	total: 35.1s	remaining: 0us
training_done



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:04<00:16,  4.15s/it][A
 40%|████      | 2/5 [00:11<00:17,  5.75s/it][A
 60%|██████    | 3/5 [00:16<00:11,  5.79s/it][A
 80%|████████  | 4/5 [00:23<00:06,  6.30s/it][A
100%|██████████| 5/5 [00:32<00:00,  6.42s/it]


reading_done
sampling_done
0:	learn: 0.6463423	total: 36.2ms	remaining: 36.2s
1:	learn: 0.6018590	total: 71.2ms	remaining: 35.5s
2:	learn: 0.5639152	total: 106ms	remaining: 35.3s
3:	learn: 0.5284154	total: 142ms	remaining: 35.4s
4:	learn: 0.4974022	total: 177ms	remaining: 35.2s
5:	learn: 0.4694077	total: 212ms	remaining: 35.1s
6:	learn: 0.4444772	total: 246ms	remaining: 34.9s
7:	learn: 0.4217797	total: 280ms	remaining: 34.7s
8:	learn: 0.4027890	total: 314ms	remaining: 34.6s
9:	learn: 0.3850198	total: 347ms	remaining: 34.4s
10:	learn: 0.3682255	total: 380ms	remaining: 34.2s
11:	learn: 0.3541375	total: 414ms	remaining: 34.1s
12:	learn: 0.3411670	total: 447ms	remaining: 33.9s
13:	learn: 0.3297538	total: 479ms	remaining: 33.7s
14:	learn: 0.3194655	total: 512ms	remaining: 33.6s
15:	learn: 0.3102211	total: 543ms	remaining: 33.4s
16:	learn: 0.3019609	total: 576ms	remaining: 33.3s
17:	learn: 0.2943388	total: 608ms	remaining: 33.2s
18:	learn: 0.2876094	total: 640ms	remaining: 33s
19:	learn: 0.2

type: 100%|██████████| 3/3 [06:12<00:00, 124.17s/it]

training_done





## Предсказание + оценка

In [None]:
lv_predictions_full = pd.DataFrame()

# Score the local-validation candidates of each type with its CatBoost model
for typee in tqdm(['clicks', 'carts', 'orders'], desc='type'):

    # Local copy of the candidates; the same absolute folder is used for the
    # existence check, the copy target and the read, so the cell does not
    # depend on the current working directory for its input data.
    local_dir = f'/content/test_candidates_{typee}'
    if not os.path.exists(local_dir):
        # Copy the candidates of this type from the project folder
        shutil.copytree(f'{main_path}/lv/ver_6/test_candidates_{typee}/',
                        f'{local_dir}/')

    path = f'{local_dir}/*'
    test_candidates = read_parquets(path)
    test_candidates = test_candidates.sort_values('session')
    test_candidates = test_candidates.reset_index(drop=True)

    # Predict a ranking score for every candidate row; features are
    # everything after the first two columns (used below as 'session', 'aid')
    FEATURES = test_candidates.columns[2:]
    dtest = Pool(data=test_candidates[FEATURES])
    model = CatBoostRanker(random_state=42)
    model.load_model(f'model/baseline/{ver_folder}/catb_base_{typee}')
    preds = model.predict(dtest)

    # Keep the 20 best-scored aids of each session.
    # groupby().head(20) replaces the former int8 rank counter: ranks >= 128
    # wrapped to negative values there and slipped through the `n < 20`
    # filter, letting more than 20 aids per session into the submission.
    lv_predictions = test_candidates[['session', 'aid']].copy()
    lv_predictions['pred'] = preds
    lv_predictions = lv_predictions.sort_values(
        ['session', 'pred'], ascending=[True, False]).reset_index(drop=True)
    lv_predictions = lv_predictions.groupby('session').head(20)

    # One row per session with the ordered list of predicted aids
    lv_predictions = lv_predictions.groupby('session').aid.apply(list)
    lv_predictions = lv_predictions.to_frame().reset_index()
    lv_predictions.rename(columns={'aid': 'labels'}, inplace=True)
    lv_predictions['type'] = typee

    # Append this type's predictions to the combined frame
    lv_predictions_full = pd.concat([lv_predictions_full, lv_predictions],
                                    ignore_index=True)
    del lv_predictions, preds, test_candidates

    # lv_predictions.to_parquet(f'lv_predictions_{typee}.parquet')
    # shutil.copy2(f'lv_predictions_{typee}.parquet', f'{main_path}/lv/{ver_folder}/lv_predictions_{typee}.parquet' )

type:   0%|          | 0/3 [00:00<?, ?it/s]
  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:00<00:02,  1.69it/s][A
 40%|████      | 2/5 [00:01<00:02,  1.12it/s][A
 60%|██████    | 3/5 [00:03<00:02,  1.09s/it][A
 80%|████████  | 4/5 [00:04<00:01,  1.31s/it][A
100%|██████████| 5/5 [00:06<00:00,  1.30s/it]
type:  33%|███▎      | 1/3 [01:21<02:43, 81.83s/it]
  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:00<00:02,  1.75it/s][A
 40%|████      | 2/5 [00:01<00:02,  1.15it/s][A
 60%|██████    | 3/5 [00:02<00:02,  1.06s/it][A
 80%|████████  | 4/5 [00:04<00:01,  1.28s/it][A
100%|██████████| 5/5 [00:06<00:00,  1.27s/it]
type:  67%|██████▋   | 2/3 [02:28<01:12, 72.71s/it]
  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:00<00:02,  1.76it/s][A
 40%|████      | 2/5 [00:01<00:02,  1.15it/s][A
 60%|██████    | 3/5 [00:02<00:02,  1.07s/it][A
 80%|████████  | 4/5 [00:04<00:01,  1.29s/it][A
100%|██████████| 5/5 [00:06<00:00,  1.29s/it]
typ

In [None]:
# File name of the pickled id -> type-name mapping, read from {main_path}/pkl/
id2type_name = 'id2type.pkl'
# version = 'ver_6.1'

In [None]:
# Ground-truth labels of the local validation split
valid = pd.read_parquet(f'{main_path}/s/test_labels_loc.parquet')

# Mapping from type id to type name.
# NOTE(review): pickle.load can execute arbitrary code — only load trusted files.
with open(f'{main_path}/pkl/{id2type_name}', 'rb') as file:
    id2type = pickle.load(file)

# metric_eval returns (weighted score, per-type recall)
print('Model score :', metric_eval(lv_predictions_full, valid, id2type))

Model score : (0.5271976703740328, type
carts     0.368052
clicks    0.398249
orders    0.628262
dtype: float64)


In [None]:
# Weighted scores of the three baselines, copied by hand from the evaluation
# outputs above; this rebuilds the whole dict rather than adding one key
best_score_base = {
    'xgb': 0.5220552152693079,
    'lgbm': 0.5248555778378172,
    'catb': 0.5271976703740328
}

## Тренировка с кросс-валидацией

In [None]:
# 5-fold grouped cross-validation of the CatBoost PairLogit ranker per type
for typee in tqdm(['clicks', 'carts', 'orders'], desc='type'):

    path = f'{main_path}/tr/ver_6/tr_candidates_{typee}/*'
    tr_candidates = read_parquets(path)
    print('reading_done')

    # Downcast the id columns
    tr_candidates.session = tr_candidates.session.astype(np.int32)
    tr_candidates.aid = tr_candidates.aid.astype(np.int32)

    # Current ratio of target==1 rows to target==0 rows (counted once)
    target_counts = tr_candidates['target'].value_counts()
    class_ratio = target_counts[1] / target_counts[0]

    # Requested class ratio after resampling is downsamp_k times the current
    # one. The sampler is seeded with the configured random_state so that the
    # sampled training set, and therefore the folds, are the same on every run.
    under_sampler = RandomUnderSampler(sampling_strategy=class_ratio *
                                       downsamp_k,
                                       random_state=random_state)

    # Undersample; every column except the last one ('target') goes into X
    FEATURES = tr_candidates.columns[:-1]
    X = tr_candidates.loc[:, FEATURES]
    y = tr_candidates.loc[:, 'target']
    X_train, y_train = under_sampler.fit_resample(X, y)
    tr_candidates = pd.merge(X_train,
                             y_train,
                             left_index=True,
                             right_index=True)
    # Sorted by session with a fresh 0..n-1 index, so the positional fold
    # indices below can be used with .loc and sessions stay contiguous
    tr_candidates = tr_candidates.sort_values('session')
    tr_candidates = tr_candidates.reset_index(drop=True)

    del X, y
    print('sampling_done')

    # Folds are split by session, passed as `groups`
    skf = GroupKFold(n_splits=5)
    FEATURES = tr_candidates.columns[:-1]

    # Output folder for the per-fold models (created once per type instead of
    # once per fold)
    folder_path = f'model/baseline/{ver_folder}_kfolds'
    os.makedirs(folder_path, exist_ok=True)

    scores = []

    for fold, (train_idx, valid_idx) in enumerate(
            skf.split(tr_candidates,
                      tr_candidates['target'],
                      groups=tr_candidates['session'])):

        X_train = tr_candidates.loc[train_idx, FEATURES]
        y_train = tr_candidates.loc[train_idx, 'target']
        X_valid = tr_candidates.loc[valid_idx, FEATURES]
        y_valid = tr_candidates.loc[valid_idx, 'target']

        # The first column (session) identifies the ranking group of each row
        group_id_tr = X_train.iloc[:, 0]
        group_id_v = X_valid.iloc[:, 0]

        # Drop the first two columns (assumed 'session' and 'aid')
        X_train = X_train.iloc[:, 2:]
        X_valid = X_valid.iloc[:, 2:]

        catboost_train = Pool(data=X_train,
                              label=y_train,
                              group_id=group_id_tr)
        catboost_valid = Pool(data=X_valid, label=y_valid, group_id=group_id_v)

        # Seeded from the configured random_state; early stopping monitors
        # the validation fold passed as eval_set
        ranker = CatBoostRanker(loss_function='PairLogit',
                                task_type='GPU',
                                random_state=random_state)
        ranker.fit(catboost_train,
                   eval_set=catboost_valid,
                   early_stopping_rounds=50,
                   verbose=False)

        # Best PairLogit value reached on the validation fold
        pairlogit = ranker.best_score_['validation']['PairLogit']
        scores.append(pairlogit)

        ranker.save_model(f'{folder_path}/catb_base_{fold}_{typee}')

    # Mean validation PairLogit over the folds
    pairlogit_mean = np.mean(scores)
    print(f'pairlogit_mean_{typee}:', pairlogit_mean)

    # Сохранение модели на диск
    # folder_path = f'{main_path}/models/{ver_folder}/'
    # os.makedirs(folder_path, exist_ok=True)

    # file_list = glob.glob(f'/content/model/*{typee}.xgb')
    # [shutil.copy(file, f'{main_path}/models/{ver_folder}/') for file in file_list]

type:   0%|          | 0/3 [00:00<?, ?it/s]
  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:02<00:09,  2.29s/it][A
 40%|████      | 2/5 [00:05<00:08,  2.98s/it][A
 60%|██████    | 3/5 [00:09<00:07,  3.53s/it][A
 80%|████████  | 4/5 [00:14<00:03,  3.97s/it][A
100%|██████████| 5/5 [00:20<00:00,  4.00s/it]


reading_done
sampling_done


type:  33%|███▎      | 1/3 [05:14<10:28, 314.04s/it]

pairlogit_mean_clicks: 0.39510939463998984



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:04<00:18,  4.63s/it][A
 40%|████      | 2/5 [00:10<00:16,  5.61s/it][A
 60%|██████    | 3/5 [00:17<00:12,  6.24s/it][A
 80%|████████  | 4/5 [00:25<00:06,  6.91s/it][A
100%|██████████| 5/5 [00:34<00:00,  6.94s/it]


reading_done
sampling_done


type:  67%|██████▋   | 2/3 [09:37<04:44, 284.56s/it]

pairlogit_mean_carts: 0.3072526368101608



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:05<00:20,  5.13s/it][A
 40%|████      | 2/5 [00:12<00:18,  6.21s/it][A
 60%|██████    | 3/5 [00:18<00:12,  6.15s/it][A
 80%|████████  | 4/5 [00:25<00:06,  6.62s/it][A
100%|██████████| 5/5 [00:33<00:00,  6.79s/it]


reading_done
sampling_done


type: 100%|██████████| 3/3 [13:55<00:00, 278.63s/it]

pairlogit_mean_orders: 0.19181008929776722





## PairLogit по каждому типу (кросс-валидация)

 - pairlogit_mean_clicks: 0.395109
 - pairlogit_mean_carts: 0.307252
 - pairlogit_mean_orders: 0.191810

## Предсказание + оценка (кросс-валидация)

In [None]:
# Score the test candidates of every event type with the saved per-fold
# CatBoost models, average the fold predictions and keep the best-ranked
# aids of each session as that session's recommendation list.
n_folds = 5  # number of per-fold models loaded and averaged below
top_k = 20  # maximum number of aids kept per session

lv_predictions_parts = []

for typee in tqdm(['clicks', 'carts', 'orders'], desc='type'):

    if not os.path.exists(f'/content/test_candidates_{typee}'):
        # Copy the candidates of the selected type to the local disk
        shutil.copytree(f'{main_path}/lv/ver_6/test_candidates_{typee}/',
                        f'/content/test_candidates_{typee}/')

    # NOTE(review): this relative path only matches the copy target above
    # when the working directory is /content — confirm.
    path = f'test_candidates_{typee}/*'
    test_candidates = read_parquets(path)
    test_candidates = test_candidates.sort_values('session')
    test_candidates = test_candidates.reset_index(drop=True)

    # Everything after the first two columns is fed to the model
    # (assumes the first two columns are the session/aid ids — TODO confirm).
    FEATURES = test_candidates.columns[2:]
    dtest = Pool(data=test_candidates[FEATURES])

    # Average the predictions of the fold models
    preds = np.zeros(len(test_candidates))
    for fold in tqdm(range(n_folds)):
        model = CatBoostRanker(random_state=42)
        model.load_model(
            f'model/baseline/{ver_folder}_kfolds/catb_base_{fold}_{typee}')
        fold_preds = model.predict(dtest)
        preds += fold_preds / n_folds

    lv_predictions = test_candidates[['session', 'aid']].copy()
    lv_predictions['pred'] = preds
    lv_predictions = lv_predictions.sort_values(
        ['session', 'pred'], ascending=[True, False]).reset_index(drop=True)

    # Keep the top_k highest-scored rows of every session. head() is used
    # instead of an int8 rank counter: an int8 counter wraps around for
    # sessions with more than 128 candidates and lets low-ranked rows pass
    # a `rank < 20` filter.
    lv_predictions = lv_predictions.groupby('session').head(top_k)
    lv_predictions = lv_predictions.groupby('session').aid.apply(list)
    lv_predictions = lv_predictions.to_frame().reset_index()
    lv_predictions = lv_predictions.rename(columns={'aid': 'labels'})
    lv_predictions['type'] = typee

    # Collect the per-type frame; all parts are concatenated once after the loop
    lv_predictions_parts.append(lv_predictions)

    # Optional: persist the per-type predictions
    # lv_predictions.to_parquet(f'lv_predictions_{typee}.parquet')
    # shutil.copy2(f'lv_predictions_{typee}.parquet', f'{main_path}/lv/{ver_folder}/lv_predictions_{typee}.parquet' )

    del lv_predictions, preds, test_candidates

# Predictions of all event types in one frame: session, labels, type
lv_predictions_full = pd.concat(lv_predictions_parts, ignore_index=True)

type:   0%|          | 0/3 [00:00<?, ?it/s]
  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:00<00:02,  1.53it/s][A
 40%|████      | 2/5 [00:01<00:03,  1.01s/it][A
 60%|██████    | 3/5 [00:03<00:02,  1.20s/it][A
 80%|████████  | 4/5 [00:05<00:01,  1.44s/it][A
100%|██████████| 5/5 [00:07<00:00,  1.43s/it]

  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:11<00:44, 11.07s/it][A
 40%|████      | 2/5 [00:21<00:32, 10.89s/it][A
 60%|██████    | 3/5 [00:32<00:21, 10.63s/it][A
 80%|████████  | 4/5 [00:42<00:10, 10.58s/it][A
100%|██████████| 5/5 [00:53<00:00, 10.70s/it]
type:  33%|███▎      | 1/3 [01:55<03:51, 115.91s/it]
  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:00<00:02,  1.41it/s][A
 40%|████      | 2/5 [00:01<00:02,  1.04it/s][A
 60%|██████    | 3/5 [00:03<00:02,  1.21s/it][A
 80%|████████  | 4/5 [00:05<00:01,  1.45s/it][A
100%|██████████| 5/5 [00:07<00:00,  1.44s/it]

  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██    

In [None]:
# File name of the pickled id2type mapping
id2type_name = 'id2type.pkl'
# version = 'ver_6.1'

In [None]:
# Validation data that the predictions are scored against
valid = pd.read_parquet(f'{main_path}/s/test_labels_loc.parquet')

# Mapping handed to metric_eval as id2type.
# NOTE: pickle.load can execute arbitrary code — only load trusted files.
id2type_path = f'{main_path}/pkl/{id2type_name}'
with open(id2type_path, 'rb') as id2type_file:
    id2type = pickle.load(id2type_file)

# metric_eval returns a tuple; print it as-is
model_score = metric_eval(lv_predictions_full, valid, id2type)
print('Model score :', model_score)

Model score : (0.5271169524492465, type
carts     0.368120
clicks    0.397638
orders    0.628195
dtype: float64)


# Вывод

In [None]:
# Scores recorded for each baseline ranker, keyed by a short model name
# (the 'catb' value is the "Model score" printed by the evaluation cell).
best_score_base = dict(
    xgb=0.5220552152693079,
    lgbm=0.5248555778378172,
    catb=0.5271169524492465,
)

**Лучшую метрику показал CatBoost Ranker**  
Продолжу его тюнинг