**About:** Trains XGBoost models on the candidate features (k-fold training, with optional hyper-parameter optimization).

**TODO**:
- Find a better negative-sampling technique.

In [2]:
cd ../src

/workspace/kaggle_otto_rs/src


In [3]:
# Reload edited project modules automatically before each cell execution.
%load_ext autoreload
%autoreload 2

In [4]:
# Expose only GPU 0 to CUDA. This lives in its own cell, ahead of the cell
# that imports cudf / xgboost, so the variable is set before those imports run.
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [5]:
import os
import gc
import cudf
import json
import glob
import numba
import xgboost
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from tqdm import tqdm
from pandarallel import pandarallel
from numerize.numerize import numerize

# Notebook-wide settings: silence FutureWarning noise, raise the pandas
# display limits and start the pandarallel worker pool.
warnings.simplefilter("ignore", FutureWarning)
pd.set_option("display.max_columns", 500)
pd.set_option("display.max_rows", 500)
pandarallel.initialize(progress_bar=False, nb_workers=32)

INFO: Pandarallel will run on 32 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [6]:
from params import *

from utils.metrics import get_coverage
from utils.plot import plot_importances
from utils.load import *
from utils.logger import save_config, prepare_log_folder, create_logger
from utils.torch import seed_everything

from training.xgb import kfold, optimize

### Load

In [7]:
# Version tag of the feature set to load; interpolated into the
# `fts_val_{VERSION}` / `fts_test_{VERSION}` folder patterns further down.
# VERSION = "clicks_cv3-tv5.10"
VERSION = "cv3-tv5.10"

# Version tag interpolated into the `fts_val_{GT_VERSION}` pattern (GT_REGEX).
GT_VERSION = "gt.7"

#### Train data
- Negative sampling could use candidates from lower (earlier) versions.

In [8]:
# Forwarded to the training code as `Config.pos_ratio`
# (presumably the positive/negative sampling ratio — confirm in the loader).
POS_RATIO = 0.2
# Label column to train on; forwarded as `Config.target`.
TARGET = "gt_carts"   # "gt_clicks", "gt_carts", "gt_orders"

In [9]:
# Log folders per target, used as the `probs_file` argument of the
# (commented-out) loader call and as the optional `Config.probs_file`.
# NOTE: there is no "gt_clicks" entry, so `PROBS_PATHS[TARGET]` would raise
# KeyError when TARGET is "gt_clicks".
PROBS_PATHS = {
    "gt_orders": "../logs/2023-01-14/9/",
    "gt_carts": "../logs/2023-01-14/8/",
}

In [10]:
# def load_sessions(regex):
#     dfs = []
#     for idx, chunk_file in enumerate(glob.glob(regex)):
#         df = cudf.read_parquet(chunk_file, columns=["session"])
#         dfs.append(df.drop_duplicates(keep="first"))

#     return cudf.concat(dfs).reset_index(drop=True)

# sessions = load_sessions( f"../output/features/fts_val_{VERSION}/*")

# from sklearn.model_selection import KFold
# K = 4

# kf = KFold(n_splits=K, shuffle=True, random_state=42)
# splits = kf.split(sessions)

# sessions['fold'] = -1
# for i, (_, val_idx) in enumerate(splits):
#     sessions.loc[val_idx, "fold"] = i

# sessions.to_csv(f"../input/folds_{K}.csv", index=False)

In [11]:
# df_train = load_parquets_cudf_folds(
#     f"../output/features/fts_val_{VERSION}/*",
#     "../input/folds_4.csv",
#     fold=0,
#     pos_ratio=POS_RATIO,
#     target=TARGET,
#     max_n=1,
#     train_only=True,
#     probs_file=PROBS_PATHS[TARGET]
# )

In [12]:
# df_train = load_parquets_cudf_folds(
#     f"../output/features/fts_val_{VERSION}/*",
#     "../input/folds_4.csv",
#     fold=0,
#     pos_ratio=0.1,
#     target=TARGET,
#     train_only=True,
#     columns=['session', 'candidates', 'gt_clicks', 'gt_carts', 'gt_orders'],
#     max_n=1,
# )

In [13]:
# df_train = df_train.sort_values('session', ignore_index=True)
# group = df_train[['session', 'candidates']].groupby('session').size().values

In [14]:
def _base_features():
    """Build the base feature names in the same order as the original hand-written list.

    The names are combinations of a few fragments (event type, weighting,
    matrix name, aggregation), so they are generated rather than spelled out.
    """
    events = ("clicks", "carts", "orders")
    weights = ("pos-log", "type-163", "lastday", "time", "recsys")
    aggs = ("mean", "sum", "max")
    matrices = (
        "matrix_123_temporal_20", "matrix_123_type136_20", "matrix_12__20", "matrix_123_type0.590.5_20",
        "matrix_cpu-90", "matrix_cpu-95", "matrix_cpu-99",
        "matrix_gpu-116", "matrix_gpu-115", "matrix_gpu-93", "matrix_gpu-217",
        "matrix_gpu-226", "matrix_gpu-232", "matrix_gpu-239", "matrix_gpu-700", "matrix_gpu-701",
    )

    def score_columns(matrix):
        # Plain matrix name first, then one "<matrix>_<weight>" variant per weighting.
        return [matrix] + [f"{matrix}_{w}" for w in weights]

    names = []

    # "<event>_popularity_w_<weight>": the "time" weighting is not part of this family.
    names += [f"{e}_popularity_w_{w}" for e in events for w in weights if w != "time"]
    # "<event>_popularity_w_<weight>_w": only three weightings exist in this family.
    names += [f"{e}_popularity_w_{w}_w" for e in events for w in ("pos-log", "type-163", "recsys")]
    names += [f"w_{w}" for w in weights]

    # Every matrix contributes mean / sum / max of each of its score columns.
    for matrix in matrices:
        names += [f"{col}_{agg}" for col in score_columns(matrix) for agg in aggs]

    names += [
        "candidate_clicks_before", "candidate_carts_before", "candidate_orders_before", "candidate_*_before",
        "n_views", "n_clicks", "n_carts", "n_orders",
    ]

    # "_rank" counterparts; the popularity families here cover all five weightings.
    names += [f"{e}_popularity_w_{w}_rank" for e in events for w in weights]
    names += [f"{e}_popularity_w_{w}_w_rank" for e in events for w in weights]
    names += [f"w_{w}_rank" for w in weights]
    # Only the "mean" aggregation of each matrix score column has a rank feature.
    for matrix in matrices:
        names += [f"{col}_mean_rank" for col in score_columns(matrix)]

    return names


FEATURES = _base_features()  # REMOVE CORRELATED

In [15]:
def _popularity_and_embedding_features():
    """Build the time-window popularity and embedding feature names.

    Produces the same 75 names, in the same order, as the original literal list:
    15 "popularity_<period>_<event>" names followed by 20 names per embedding.
    """
    periods = ("week", "day", "hour", "hour/day", "day/week")
    events = ("clicks", "carts", "orders")
    embeddings = ("embed_1-9_64_cartbuy", "embed_1_64", "embed_1-5_64")
    weights = ("pos-log", "type-163", "lastday", "time", "recsys")
    aggs = ("mean", "sum", "max")

    names = [f"popularity_{p}_{e}" for e in events for p in periods]
    for emb in embeddings:
        names += [f"{emb}_last_{i}" for i in range(5)]
        names += [f"{emb}_{w}_{agg}" for w in weights for agg in aggs]
    return names


# `+=` extends the list in place, so re-running this cell used to append every
# name a second time. Only names that are not present yet are added, which
# makes the cell idempotent; on a fresh kernel the result is unchanged.
FEATURES += [ft for ft in _popularity_and_embedding_features() if ft not in FEATURES]


In [16]:
def _popularity_and_embedding_rank_features():
    """Build the "_rank" counterparts of the popularity and embedding features.

    Produces the same 45 names, in the same order, as the original literal list:
    15 "popularity_<period>_<event>_rank" names followed by 10 names per
    embedding (the five "last_<i>" columns and the "mean" of each weighting).
    """
    periods = ("week", "day", "hour", "hour/day", "day/week")
    events = ("clicks", "carts", "orders")
    embeddings = ("embed_1-9_64_cartbuy", "embed_1_64", "embed_1-5_64")
    weights = ("pos-log", "type-163", "lastday", "time", "recsys")

    names = [f"popularity_{p}_{e}_rank" for e in events for p in periods]
    for emb in embeddings:
        names += [f"{emb}_last_{i}_rank" for i in range(5)]
        names += [f"{emb}_{w}_mean_rank" for w in weights]
    return names


# `+=` extends the list in place, so re-running this cell used to append every
# name a second time. Only names that are not present yet are added, which
# makes the cell idempotent; on a fresh kernel the result is unchanged.
FEATURES += [ft for ft in _popularity_and_embedding_rank_features() if ft not in FEATURES]

In [17]:
# Collect every feature whose name contains one of the excluded substrings
# (matches of the first pattern come first, then those of the second) ...
TO_REMOVE = [
    name
    for pattern in ("popularity_w_time", "popularity_w_lastday_w")
    for name in FEATURES
    if pattern in name
]

# ... and rebuild FEATURES without them, keeping the original order.
FEATURES = [name for name in FEATURES if name not in TO_REMOVE]

In [18]:
# Sanity check: number of features left after the TO_REMOVE filtering.
len(FEATURES)

564

In [19]:
# df_train = cudf.from_pandas(df_train)
# corr = df_train[FEATURES].corr()
# corr = corr.to_pandas()
# corr = corr.values

# mask = np.zeros_like(corr, dtype=bool)
# mask[np.triu_indices_from(mask)] = True
# corr[mask] = 0

In [20]:
# TH = 0.99

# for i in range(len(corr)):
#     for j in range(len(corr)):
#         if corr[i, j] > TH:
#             if FEATURES[i] in TO_REMOVE or FEATURES[j] in TO_REMOVE:
#                 continue
#             print(FEATURES[i], FEATURES[j], f'{corr[i, j] :.3f}')

In [21]:
# df = cudf.read_parquet(glob.glob(REGEX)[0])
# df = df.rename(columns={"clicks_popularity_w_pos-log_rank" : "clicks_popularity_w_pos-log_rank_ref"})
# df = cudf.read_parquet(glob.glob(TEST_REGEX)[0])

# from data.fe import add_rank_feature
# for c in ['clicks_popularity_w_pos-log', 'clicks_popularity_w_type-163', 'clicks_popularity_w_lastday']:
#     if c + "_rank" not in df.columns:
#         print(f'Add rank ft for {c}')
#         df = df.reset_index(drop=True)
#         add_rank_feature(df, c)
# (df['clicks_popularity_w_pos-log_rank'] == df['clicks_popularity_w_pos-log_rank_ref']).all()

# for f in tqdm(glob.glob(TEST_REGEX)):
#     dft = cudf.read_parquet(f, columns=['clicks_popularity_w_pos-log_rank'])

In [22]:
# Glob pattern for the `fts_val_*` (presumably validation) feature files of the
# selected VERSION; the cell displays how many files match.
REGEX = f"../output/features/fts_val_{VERSION}/*"
len(glob.glob(REGEX))

91

In [23]:
# Glob pattern for the `fts_test_*` feature files of the selected VERSION;
# the cell displays how many files match.
TEST_REGEX = f"../output/features/fts_test_{VERSION}/*"
len(glob.glob(TEST_REGEX))

85

In [24]:
# Glob pattern for the `fts_val_{GT_VERSION}` feature files, passed on as `Config.gt_regex`.
# NOTE(review): the recorded run matched 0 files for this pattern. Check the
# folder exists before enabling `Config.use_gt_pos`.
GT_REGEX = f"../output/features/fts_val_{GT_VERSION}/*"
len(glob.glob(GT_REGEX))

0

### Params

In [25]:
# Hyper-parameters per model type, keyed by model name
# (`Config.params` selects the entry matching `Config.model`).
PARAMS = {
    "xgb": {
        "learning_rate": 0.01,
        "max_depth": 8,
        "subsample": 0.9,  # candidate values: 0.7 / 0.8 / 0.9
        "colsample_bytree": 0.7,  # candidate values: 0.7 / 0.8 / 0.9
        "reg_alpha": 0.01,
        "reg_lambda": 0.1,
        "min_child_weight": 0,
        # "gamma": 0.01,
        # "scale_pos_weight": 1,
        "eval_metric": "auc",
        "objective": "binary:logistic",
        # NOTE(review): "gpu_hist" and the "predictor" parameter are deprecated in
        # XGBoost >= 2.0 (replaced by device="cuda" with tree_method="hist").
        # They are kept as-is for the XGBoost version this notebook was run with.
        "tree_method": "gpu_hist",
        "predictor": "gpu_predictor",
        "random_state": 42,
    },
}

In [26]:
# TO_REMOVE = [
#     'candidate_*_before', 'matrix_gpu-700_lastday_max', 'matrix_12__20_lastday_max', 'matrix_gpu-226_lastday_max', 'matrix_cpu-90_lastday_max', 'matrix_gpu-700_sum', 'matrix_gpu-700_pos-log_sum', 'matrix_12__20_lastday_sum',
#     'matrix_gpu-700_pos-log_max', 'matrix_gpu-226_lastday_sum', 'matrix_gpu-700_max', 'matrix_gpu-700_time_sum', 'matrix_123_type136_20_lastday_max', 'matrix_cpu-90_lastday_sum', 'matrix_cpu-90_type-163_max', 'matrix_gpu-700_time_max',
#     'matrix_12__20_time_sum', 'matrix_gpu-700_type-163_sum', 'matrix_gpu-700_lastday_sum', 'matrix_gpu-700_type-163_max', 'matrix_cpu-90_time_sum', 'matrix_123_type136_20_time_sum', 'matrix_gpu-217_lastday_max', 'matrix_12__20_pos-log_sum',
#     'matrix_12__20_type-163_max', 'matrix_12__20_time_max', 'matrix_cpu-90_max', 'matrix_cpu-90_type-163_sum', 'matrix_cpu-99_lastday_max', 'matrix_cpu-90_sum', 'matrix_gpu-226_sum', 'matrix_gpu-226_time_sum', 'matrix_12__20_time_mean',
#     'matrix_12__20_type-163_mean', 'matrix_gpu-700_pos-log_mean', 'matrix_123_type0.590.5_20_lastday_max', 'matrix_gpu-700_time_mean', 'matrix_12__20_type-163_sum', 'matrix_12__20_pos-log_max', 'matrix_123_type136_20_lastday_sum',
#     'matrix_cpu-90_time_mean', 'matrix_gpu-226_max', 'matrix_123_type136_20_type-163_max', 'matrix_gpu-226_type-163_max', 'matrix_gpu-226_lastday_mean', 'matrix_gpu-226_type-163_sum', 'matrix_cpu-99_time_sum', 'matrix_12__20_lastday_mean',
#     'matrix_gpu-700_type-163_mean','matrix_123_type136_20_type-163_sum'
# ][:50]

In [27]:
class Config:
    # Run configuration handed to the project helpers (see the commented-out
    # `save_config(Config, ...)` / `optimize(REGEX, Config, ...)` calls below).
    # Documented with `#` comments instead of a docstring so the class
    # namespace keeps exactly the same attributes as before.
    seed = 42
    version = VERSION

    # Cross-validation setup.
    folds_file = "../input/folds_4.csv"
    k = 4  # presumably has to match the number of folds in `folds_file` — verify
    mode = ""

    # Shallow copy: `FEATURES += [...]` extends the global list in place, so
    # aliasing it would let a re-run feature cell silently change this config.
    features = list(FEATURES)
#     features = [ft for ft in features if ft not in TO_REMOVE]

    cat_features = []

    target = TARGET
    pos_ratio = POS_RATIO

    use_gt_sessions = True  # filter out sessions with no gt
    use_gt_pos = False  # add candidates from gt
    gt_regex = GT_REGEX

    model = "xgb"

    # Shallow copy for the same reason as `features`: changes made to the
    # config's parameters must not leak back into the global PARAMS table.
    params = dict(PARAMS[model])

    use_es = True  # presumably "use early stopping" — confirm in training.xgb
    num_boost_round = 10000

    probs_file = None  # PROBS_PATHS[target]
    probs_mode = ""  # "head"  "rank_40"
    restrict_all = False

    selected_folds = [0, 1, 2, 3]
    folds_optimize = [0, 1, 2, 3]
    n_trials = 20

### Main

In [28]:
# Default: no log folder. The (commented-out) optimize cell below replaces it
# with `prepare_log_folder(LOG_PATH)` when DEBUG is off.
log_folder = None

#### Optimize

In [29]:
# DEBUG = False
# DEBUG_MORE = True

In [None]:
# %%time

# if not DEBUG:
#     log_folder = prepare_log_folder(LOG_PATH)
#     print(f'Logging results to {log_folder}')
#     save_config(Config, log_folder + 'config')
#     create_logger(directory=log_folder, name="logs.txt")

# study = optimize(REGEX, Config, log_folder=log_folder, n_trials=1, debug=DEBUG_MORE)

100%|██████████| 91/91 [16:35<00:00, 10.94s/it]
[32m[I 2023-01-19 20:47:14,680][0m A new study created in memory with name: no-name-d66a27c2-af33-4add-ad25-efe001e20f1e[0m



    -> 1.32M training candidates
    -> 6.29M validation candidates

[0]	val-auc:0.90221
[100]	val-auc:0.91410
[200]	val-auc:0.91558
[300]	val-auc:0.91732
[400]	val-auc:0.91873
[500]	val-auc:0.91992
[600]	val-auc:0.92081
[700]	val-auc:0.92141
[800]	val-auc:0.92182
[900]	val-auc:0.92213
[1000]	val-auc:0.92235
[1100]	val-auc:0.92255
[1200]	val-auc:0.92270
[1300]	val-auc:0.92281
[1400]	val-auc:0.92291
[1500]	val-auc:0.92299
[1600]	val-auc:0.92305
[1700]	val-auc:0.92309
[1800]	val-auc:0.92312
[1900]	val-auc:0.92315
[2000]	val-auc:0.92317
[2100]	val-auc:0.92318
[2200]	val-auc:0.92317
[2252]	val-auc:0.92318

-> gt_carts  -  Recall : 0.4416



[32m[I 2023-01-19 20:55:28,234][0m Trial 0 finished with value: 0.44159312344829027 and parameters: {'max_depth': 10, 'subsample': 0.5535625113178915, 'colsample_bytree': 0.6173362783584253, 'reg_alpha': 0.0001192762091281622, 'reg_lambda': 0.00444264752975915}. Best is trial 0 with value: 0.44159312344829027.[0m


Params : {'max_depth': 10, 'subsample': '0.554', 'colsample_bytree': '0.617', 'reg_alpha': '1.19e-04', 'reg_lambda': '4.44e-03'},

[0]	val-auc:0.89871
[100]	val-auc:0.90960
[200]	val-auc:0.91182
[300]	val-auc:0.91412
[400]	val-auc:0.91613
[500]	val-auc:0.91772
[600]	val-auc:0.91891
[700]	val-auc:0.91969
[800]	val-auc:0.92026
[900]	val-auc:0.92070
[1000]	val-auc:0.92103
[1100]	val-auc:0.92132
[1200]	val-auc:0.92154
[1300]	val-auc:0.92173
[1400]	val-auc:0.92190
[1500]	val-auc:0.92207
[1600]	val-auc:0.92222
[1700]	val-auc:0.92234
[1800]	val-auc:0.92245
[1900]	val-auc:0.92256
[2000]	val-auc:0.92264
[2100]	val-auc:0.92272
[2200]	val-auc:0.92279
[2300]	val-auc:0.92285
[2400]	val-auc:0.92292
[2500]	val-auc:0.92297
[2600]	val-auc:0.92301
[2700]	val-auc:0.92307
[2800]	val-auc:0.92311
[2900]	val-auc:0.92315
[3000]	val-auc:0.92319
[3100]	val-auc:0.92323
[3200]	val-auc:0.92326
[3300]	val-auc:0.92330
[3400]	val-auc:0.92332
[3500]	val-auc:0.92334
[3600]	val-auc:0.92338
[3700]	val-auc:0.92341
[3800]	

[32m[I 2023-01-19 21:03:39,383][0m Trial 1 finished with value: 0.4418643466649976 and parameters: {'max_depth': 7, 'subsample': 0.6139978815685576, 'colsample_bytree': 0.9216640381046171, 'reg_alpha': 0.0003006375012855398, 'reg_lambda': 0.0038668608487277215}. Best is trial 1 with value: 0.4418643466649976.[0m


Params : {'max_depth': 7, 'subsample': '0.614', 'colsample_bytree': '0.922', 'reg_alpha': '3.01e-04', 'reg_lambda': '3.87e-03'},

[0]	val-auc:0.89392
[100]	val-auc:0.90731
[200]	val-auc:0.91011
[300]	val-auc:0.91243
[400]	val-auc:0.91452
[500]	val-auc:0.91625
[600]	val-auc:0.91765
[700]	val-auc:0.91858
[800]	val-auc:0.91921
[900]	val-auc:0.91967
[1000]	val-auc:0.92007
[1100]	val-auc:0.92037
[1200]	val-auc:0.92064
[1300]	val-auc:0.92086
[1400]	val-auc:0.92107
[1500]	val-auc:0.92125
[1600]	val-auc:0.92141
[1700]	val-auc:0.92156
[1800]	val-auc:0.92169
[1900]	val-auc:0.92180
[2000]	val-auc:0.92190
[2100]	val-auc:0.92201
[2200]	val-auc:0.92209
[2300]	val-auc:0.92218
[2400]	val-auc:0.92226
[2500]	val-auc:0.92233
[2600]	val-auc:0.92239
[2700]	val-auc:0.92246
[2800]	val-auc:0.92251
[2900]	val-auc:0.92257
[3000]	val-auc:0.92263
[3100]	val-auc:0.92268
[3200]	val-auc:0.92273
[3300]	val-auc:0.92278
[3400]	val-auc:0.92282
[3500]	val-auc:0.92286
[3600]	val-auc:0.92289
[3700]	val-auc:0.92292
[3800]	v

[32m[I 2023-01-19 21:15:28,763][0m Trial 2 finished with value: 0.4422885675936937 and parameters: {'max_depth': 6, 'subsample': 0.9297443412770525, 'colsample_bytree': 0.8366556995786414, 'reg_alpha': 2.155208707858932e-05, 'reg_lambda': 1.4533938654939735e-06}. Best is trial 2 with value: 0.4422885675936937.[0m


Params : {'max_depth': 6, 'subsample': '0.930', 'colsample_bytree': '0.837', 'reg_alpha': '2.16e-05', 'reg_lambda': '1.45e-06'},

[0]	val-auc:0.89707
[100]	val-auc:0.90960
[200]	val-auc:0.91180
[300]	val-auc:0.91400
[400]	val-auc:0.91594
[500]	val-auc:0.91753
[600]	val-auc:0.91878
[700]	val-auc:0.91962
[800]	val-auc:0.92020
[900]	val-auc:0.92064
[1000]	val-auc:0.92099
[1100]	val-auc:0.92127
[1200]	val-auc:0.92152
[1300]	val-auc:0.92172
[1400]	val-auc:0.92190
[1500]	val-auc:0.92206
[1600]	val-auc:0.92218
[1700]	val-auc:0.92232
[1800]	val-auc:0.92243
[1900]	val-auc:0.92253
[2000]	val-auc:0.92261
[2100]	val-auc:0.92270
[2200]	val-auc:0.92278
[2300]	val-auc:0.92285
[2400]	val-auc:0.92291
[2500]	val-auc:0.92297
[2600]	val-auc:0.92303
[2700]	val-auc:0.92308
[2800]	val-auc:0.92312
[2900]	val-auc:0.92317
[3000]	val-auc:0.92320
[3100]	val-auc:0.92324
[3200]	val-auc:0.92328
[3300]	val-auc:0.92330
[3400]	val-auc:0.92333
[3500]	val-auc:0.92336
[3600]	val-auc:0.92339
[3700]	val-auc:0.92343
[3800]	v

[32m[I 2023-01-19 21:24:54,942][0m Trial 3 finished with value: 0.4419130277551758 and parameters: {'max_depth': 7, 'subsample': 0.8498047777660901, 'colsample_bytree': 0.6580670707585281, 'reg_alpha': 0.00264008630935056, 'reg_lambda': 8.238565404045005e-05}. Best is trial 2 with value: 0.4422885675936937.[0m


Params : {'max_depth': 7, 'subsample': '0.850', 'colsample_bytree': '0.658', 'reg_alpha': '2.64e-03', 'reg_lambda': '8.24e-05'},

[0]	val-auc:0.90169
[100]	val-auc:0.91161
[200]	val-auc:0.91337
[300]	val-auc:0.91536
[400]	val-auc:0.91708
[500]	val-auc:0.91856
[600]	val-auc:0.91968
[700]	val-auc:0.92041
[800]	val-auc:0.92094
[900]	val-auc:0.92133
[1000]	val-auc:0.92165
[1100]	val-auc:0.92191
[1200]	val-auc:0.92212
[1300]	val-auc:0.92229
[1400]	val-auc:0.92244
[1500]	val-auc:0.92257
[1600]	val-auc:0.92270
[1700]	val-auc:0.92279
[1800]	val-auc:0.92290
[1900]	val-auc:0.92297
[2000]	val-auc:0.92304
[2100]	val-auc:0.92311
[2200]	val-auc:0.92316
[2300]	val-auc:0.92322
[2400]	val-auc:0.92328
[2500]	val-auc:0.92332
[2600]	val-auc:0.92336
[2700]	val-auc:0.92340
[2800]	val-auc:0.92343
[2900]	val-auc:0.92347
[3000]	val-auc:0.92350
[3100]	val-auc:0.92353
[3200]	val-auc:0.92356
[3300]	val-auc:0.92358
[3400]	val-auc:0.92360
[3500]	val-auc:0.92362
[3600]	val-auc:0.92363
[3700]	val-auc:0.92365
[3800]	v

[32m[I 2023-01-19 21:33:56,968][0m Trial 4 finished with value: 0.4420660254671646 and parameters: {'max_depth': 8, 'subsample': 0.637685760858498, 'colsample_bytree': 0.5012036884547091, 'reg_alpha': 0.001009962734750557, 'reg_lambda': 0.024423693147837537}. Best is trial 2 with value: 0.4422885675936937.[0m


Params : {'max_depth': 8, 'subsample': '0.638', 'colsample_bytree': '0.501', 'reg_alpha': '1.01e-03', 'reg_lambda': '2.44e-02'},

[0]	val-auc:0.90075
[100]	val-auc:0.90976
[200]	val-auc:0.91184
[300]	val-auc:0.91403
[400]	val-auc:0.91597
[500]	val-auc:0.91758
[600]	val-auc:0.91882
[700]	val-auc:0.91962
[800]	val-auc:0.92021
[900]	val-auc:0.92066
[1000]	val-auc:0.92100
[1100]	val-auc:0.92130
[1200]	val-auc:0.92153
[1300]	val-auc:0.92173
[1400]	val-auc:0.92192
[1500]	val-auc:0.92206
[1600]	val-auc:0.92219
[1700]	val-auc:0.92230
[1800]	val-auc:0.92241
[1900]	val-auc:0.92251
[2000]	val-auc:0.92259
[2100]	val-auc:0.92266
[2200]	val-auc:0.92273
[2300]	val-auc:0.92280
[2400]	val-auc:0.92287
[2500]	val-auc:0.92292
[2600]	val-auc:0.92296
[2700]	val-auc:0.92301
[2800]	val-auc:0.92305
[2900]	val-auc:0.92308
[3000]	val-auc:0.92312
[3100]	val-auc:0.92315
[3200]	val-auc:0.92319
[3300]	val-auc:0.92322
[3400]	val-auc:0.92323
[3500]	val-auc:0.92326
[3600]	val-auc:0.92328
[3700]	val-auc:0.92331
[3800]	v

[32m[I 2023-01-19 21:43:28,150][0m Trial 5 finished with value: 0.44207297990861866 and parameters: {'max_depth': 7, 'subsample': 0.6625966473126667, 'colsample_bytree': 0.6401266921280526, 'reg_alpha': 0.0007863837131432546, 'reg_lambda': 4.7827763522273124e-05}. Best is trial 2 with value: 0.4422885675936937.[0m


Params : {'max_depth': 7, 'subsample': '0.663', 'colsample_bytree': '0.640', 'reg_alpha': '7.86e-04', 'reg_lambda': '4.78e-05'},

[0]	val-auc:0.90132
[100]	val-auc:0.91109
[200]	val-auc:0.91317
[300]	val-auc:0.91540
[400]	val-auc:0.91722
[500]	val-auc:0.91861
[600]	val-auc:0.91975
[700]	val-auc:0.92051
[800]	val-auc:0.92100
[900]	val-auc:0.92137
[1000]	val-auc:0.92168
[1100]	val-auc:0.92193
[1200]	val-auc:0.92213
[1300]	val-auc:0.92230
[1400]	val-auc:0.92244
[1500]	val-auc:0.92258
[1600]	val-auc:0.92268
[1700]	val-auc:0.92278
[1800]	val-auc:0.92288
[1900]	val-auc:0.92295
[2000]	val-auc:0.92302
[2100]	val-auc:0.92310
[2200]	val-auc:0.92316
[2300]	val-auc:0.92322
[2400]	val-auc:0.92327
[2500]	val-auc:0.92331
[2600]	val-auc:0.92335
[2700]	val-auc:0.92337
[2800]	val-auc:0.92340
[2900]	val-auc:0.92343
[3000]	val-auc:0.92345
[3100]	val-auc:0.92347
[3200]	val-auc:0.92349
[3300]	val-auc:0.92351
[3400]	val-auc:0.92353
[3500]	val-auc:0.92356
[3600]	val-auc:0.92358
[3700]	val-auc:0.92359
[3800]	v

[32m[I 2023-01-19 21:52:10,260][0m Trial 6 finished with value: 0.4419686632868081 and parameters: {'max_depth': 8, 'subsample': 0.8637023807335716, 'colsample_bytree': 0.7970244874100325, 'reg_alpha': 7.031856679597856e-05, 'reg_lambda': 1.4677442037314857e-06}. Best is trial 2 with value: 0.4422885675936937.[0m


Params : {'max_depth': 8, 'subsample': '0.864', 'colsample_bytree': '0.797', 'reg_alpha': '7.03e-05', 'reg_lambda': '1.47e-06'},

[0]	val-auc:0.89432
[100]	val-auc:0.90758
[200]	val-auc:0.91021
[300]	val-auc:0.91250
[400]	val-auc:0.91457
[500]	val-auc:0.91630
[600]	val-auc:0.91767
[700]	val-auc:0.91857
[800]	val-auc:0.91922
[900]	val-auc:0.91971
[1000]	val-auc:0.92009
[1100]	val-auc:0.92040
[1200]	val-auc:0.92067
[1300]	val-auc:0.92089
[1400]	val-auc:0.92110
[1500]	val-auc:0.92127
[1600]	val-auc:0.92143
[1700]	val-auc:0.92157
[1800]	val-auc:0.92169
[1900]	val-auc:0.92181
[2000]	val-auc:0.92192
[2100]	val-auc:0.92202
[2200]	val-auc:0.92212
[2300]	val-auc:0.92220
[2400]	val-auc:0.92228
[2500]	val-auc:0.92236
[2600]	val-auc:0.92243
[2700]	val-auc:0.92249
[2800]	val-auc:0.92255
[2900]	val-auc:0.92262
[3000]	val-auc:0.92266
[3100]	val-auc:0.92270
[3200]	val-auc:0.92275
[3300]	val-auc:0.92280
[3400]	val-auc:0.92283
[3500]	val-auc:0.92287
[3600]	val-auc:0.92291
[3700]	val-auc:0.92295
[3800]	v

[32m[I 2023-01-19 22:03:48,298][0m Trial 7 finished with value: 0.4418295744577274 and parameters: {'max_depth': 6, 'subsample': 0.6163809141040459, 'colsample_bytree': 0.6586927618521798, 'reg_alpha': 0.016603637997881445, 'reg_lambda': 7.299505262612437e-06}. Best is trial 2 with value: 0.4422885675936937.[0m


Params : {'max_depth': 6, 'subsample': '0.616', 'colsample_bytree': '0.659', 'reg_alpha': '1.66e-02', 'reg_lambda': '7.30e-06'},

[0]	val-auc:0.89809
[100]	val-auc:0.90958
[200]	val-auc:0.91189
[300]	val-auc:0.91417
[400]	val-auc:0.91610
[500]	val-auc:0.91768
[600]	val-auc:0.91889
[700]	val-auc:0.91969
[800]	val-auc:0.92024
[900]	val-auc:0.92067
[1000]	val-auc:0.92103
[1100]	val-auc:0.92130
[1200]	val-auc:0.92155
[1300]	val-auc:0.92176
[1400]	val-auc:0.92193
[1500]	val-auc:0.92208
[1600]	val-auc:0.92222
[1700]	val-auc:0.92233
[1800]	val-auc:0.92244
[1900]	val-auc:0.92251
[2000]	val-auc:0.92261
[2100]	val-auc:0.92268
[2200]	val-auc:0.92276
[2300]	val-auc:0.92282
[2400]	val-auc:0.92289
[2500]	val-auc:0.92294
[2600]	val-auc:0.92298
[2700]	val-auc:0.92303
[2800]	val-auc:0.92308
[2900]	val-auc:0.92314
[3000]	val-auc:0.92317
[3100]	val-auc:0.92320
[3200]	val-auc:0.92324
[3300]	val-auc:0.92326
[3400]	val-auc:0.92328
[3500]	val-auc:0.92329
[3600]	val-auc:0.92333
[3700]	val-auc:0.92336
[3800]	v

[32m[I 2023-01-19 22:12:27,411][0m Trial 8 finished with value: 0.44171134895300884 and parameters: {'max_depth': 7, 'subsample': 0.5039939527377342, 'colsample_bytree': 0.9048930827740393, 'reg_alpha': 0.005446833408455922, 'reg_lambda': 0.06356916421789417}. Best is trial 2 with value: 0.4422885675936937.[0m


Params : {'max_depth': 7, 'subsample': '0.504', 'colsample_bytree': '0.905', 'reg_alpha': '5.45e-03', 'reg_lambda': '6.36e-02'},

[0]	val-auc:0.90156
[100]	val-auc:0.91143
[200]	val-auc:0.91329
[300]	val-auc:0.91528
[400]	val-auc:0.91709
[500]	val-auc:0.91851
[600]	val-auc:0.91966
[700]	val-auc:0.92041
[800]	val-auc:0.92093
[900]	val-auc:0.92131
[1000]	val-auc:0.92161
[1100]	val-auc:0.92186
[1200]	val-auc:0.92205
[1300]	val-auc:0.92221
[1400]	val-auc:0.92237
[1500]	val-auc:0.92250
[1600]	val-auc:0.92261
[1700]	val-auc:0.92272
[1800]	val-auc:0.92282
[1900]	val-auc:0.92289
[2000]	val-auc:0.92295
[2100]	val-auc:0.92302
[2200]	val-auc:0.92308
[2300]	val-auc:0.92313
[2400]	val-auc:0.92318
[2500]	val-auc:0.92323
[2600]	val-auc:0.92329
[2700]	val-auc:0.92333
[2800]	val-auc:0.92336
[2900]	val-auc:0.92340
[3000]	val-auc:0.92342
[3100]	val-auc:0.92345
[3200]	val-auc:0.92347
[3300]	val-auc:0.92349
[3400]	val-auc:0.92350
[3500]	val-auc:0.92352
[3600]	val-auc:0.92354
[3700]	val-auc:0.92356
[3800]	v

[32m[I 2023-01-19 22:20:06,858][0m Trial 9 finished with value: 0.4418852099893597 and parameters: {'max_depth': 8, 'subsample': 0.9685299219501908, 'colsample_bytree': 0.5183526001682017, 'reg_alpha': 0.008105291097785749, 'reg_lambda': 7.349449532631487e-06}. Best is trial 2 with value: 0.4422885675936937.[0m


Params : {'max_depth': 8, 'subsample': '0.969', 'colsample_bytree': '0.518', 'reg_alpha': '8.11e-03', 'reg_lambda': '7.35e-06'},

[0]	val-auc:0.90293
[100]	val-auc:0.91350
[200]	val-auc:0.91545
[300]	val-auc:0.91721
[400]	val-auc:0.91876
[500]	val-auc:0.91993
[600]	val-auc:0.92080
[700]	val-auc:0.92138
[800]	val-auc:0.92178
[900]	val-auc:0.92204
[1000]	val-auc:0.92224
[1100]	val-auc:0.92239
[1200]	val-auc:0.92252
[1300]	val-auc:0.92262
[1400]	val-auc:0.92269
[1500]	val-auc:0.92274
[1600]	val-auc:0.92281
[1700]	val-auc:0.92285
[1800]	val-auc:0.92288
[1900]	val-auc:0.92290
[2000]	val-auc:0.92291
[2100]	val-auc:0.92291
[2200]	val-auc:0.92292
[2267]	val-auc:0.92292

-> gt_carts  -  Recall : 0.4419



[32m[I 2023-01-19 22:28:33,817][0m Trial 10 finished with value: 0.4418573922235436 and parameters: {'max_depth': 10, 'subsample': 0.9819484957041292, 'colsample_bytree': 0.9865005128946852, 'reg_alpha': 1.1701657054446882e-05, 'reg_lambda': 0.00032520439216438354}. Best is trial 2 with value: 0.4422885675936937.[0m


Params : {'max_depth': 10, 'subsample': '0.982', 'colsample_bytree': '0.987', 'reg_alpha': '1.17e-05', 'reg_lambda': '3.25e-04'},

[0]	val-auc:0.89387
[100]	val-auc:0.90739
[200]	val-auc:0.91013
[300]	val-auc:0.91242
[400]	val-auc:0.91451
[500]	val-auc:0.91633
[600]	val-auc:0.91770
[700]	val-auc:0.91861
[800]	val-auc:0.91925
[900]	val-auc:0.91973
[1000]	val-auc:0.92010
[1100]	val-auc:0.92042
[1200]	val-auc:0.92072
[1300]	val-auc:0.92096
[1400]	val-auc:0.92116
[1500]	val-auc:0.92134
[1600]	val-auc:0.92150
[1700]	val-auc:0.92164
[1800]	val-auc:0.92178
[1900]	val-auc:0.92190
[2000]	val-auc:0.92200
[2100]	val-auc:0.92210
[2200]	val-auc:0.92220
[2300]	val-auc:0.92228
[2400]	val-auc:0.92236
[2500]	val-auc:0.92243
[2600]	val-auc:0.92251
[2700]	val-auc:0.92257
[2800]	val-auc:0.92263
[2900]	val-auc:0.92268
[3000]	val-auc:0.92273
[3100]	val-auc:0.92280
[3200]	val-auc:0.92285
[3300]	val-auc:0.92289
[3400]	val-auc:0.92294
[3500]	val-auc:0.92298
[3600]	val-auc:0.92300
[3700]	val-auc:0.92304
[3800]	

[32m[I 2023-01-19 22:40:05,402][0m Trial 11 finished with value: 0.4420451621428025 and parameters: {'max_depth': 6, 'subsample': 0.7496889095353529, 'colsample_bytree': 0.7779711746076128, 'reg_alpha': 1.8227222977147326e-05, 'reg_lambda': 5.007654074085801e-05}. Best is trial 2 with value: 0.4422885675936937.[0m


Params : {'max_depth': 6, 'subsample': '0.750', 'colsample_bytree': '0.778', 'reg_alpha': '1.82e-05', 'reg_lambda': '5.01e-05'},

[0]	val-auc:0.89481
[100]	val-auc:0.90743
[200]	val-auc:0.91020
[300]	val-auc:0.91244
[400]	val-auc:0.91454
[500]	val-auc:0.91628
[600]	val-auc:0.91769
[700]	val-auc:0.91861
[800]	val-auc:0.91925
[900]	val-auc:0.91972
[1000]	val-auc:0.92012
[1100]	val-auc:0.92045
[1200]	val-auc:0.92073
[1300]	val-auc:0.92095
[1400]	val-auc:0.92114
[1500]	val-auc:0.92133
[1600]	val-auc:0.92149
[1700]	val-auc:0.92164
[1800]	val-auc:0.92177
[1900]	val-auc:0.92187
[2000]	val-auc:0.92200
[2100]	val-auc:0.92209
[2200]	val-auc:0.92219
[2300]	val-auc:0.92227
[2400]	val-auc:0.92235
[2500]	val-auc:0.92242
[2600]	val-auc:0.92250
[2700]	val-auc:0.92256
[2800]	val-auc:0.92261
[2900]	val-auc:0.92268
[3000]	val-auc:0.92272
[3100]	val-auc:0.92277
[3200]	val-auc:0.92282
[3300]	val-auc:0.92286
[3400]	val-auc:0.92290
[3500]	val-auc:0.92294
[3600]	val-auc:0.92298
[3700]	val-auc:0.92302
[3800]	v

[32m[I 2023-01-19 22:50:56,810][0m Trial 12 finished with value: 0.4420799343500727 and parameters: {'max_depth': 6, 'subsample': 0.7076663949145994, 'colsample_bytree': 0.7351985898432332, 'reg_alpha': 0.0004695635726627124, 'reg_lambda': 1.2504608672945654e-06}. Best is trial 2 with value: 0.4422885675936937.[0m


Params : {'max_depth': 6, 'subsample': '0.708', 'colsample_bytree': '0.735', 'reg_alpha': '4.70e-04', 'reg_lambda': '1.25e-06'},

[0]	val-auc:0.89387
[100]	val-auc:0.90730
[200]	val-auc:0.91015
[300]	val-auc:0.91244
[400]	val-auc:0.91455
[500]	val-auc:0.91633
[600]	val-auc:0.91771
[700]	val-auc:0.91860
[800]	val-auc:0.91923
[900]	val-auc:0.91971
[1000]	val-auc:0.92010
[1100]	val-auc:0.92041
[1200]	val-auc:0.92068
[1300]	val-auc:0.92093
[1400]	val-auc:0.92111
[1500]	val-auc:0.92130
[1600]	val-auc:0.92146
[1700]	val-auc:0.92160
[1800]	val-auc:0.92175
[1900]	val-auc:0.92187
[2000]	val-auc:0.92197
[2100]	val-auc:0.92209
[2200]	val-auc:0.92217
[2300]	val-auc:0.92225
[2400]	val-auc:0.92234
[2500]	val-auc:0.92240
[2600]	val-auc:0.92247
[2700]	val-auc:0.92252
[2800]	val-auc:0.92259
[2900]	val-auc:0.92264
[3000]	val-auc:0.92270
[3100]	val-auc:0.92276
[3200]	val-auc:0.92281
[3300]	val-auc:0.92286
[3400]	val-auc:0.92289
[3500]	val-auc:0.92292
[3600]	val-auc:0.92296
[3700]	val-auc:0.92300
[3800]	v

[32m[I 2023-01-19 23:01:44,324][0m Trial 13 finished with value: 0.44199648105262423 and parameters: {'max_depth': 6, 'subsample': 0.7476836189149149, 'colsample_bytree': 0.8176799754181346, 'reg_alpha': 0.057774342826650976, 'reg_lambda': 1.0511913444868173e-06}. Best is trial 2 with value: 0.4422885675936937.[0m


Params : {'max_depth': 6, 'subsample': '0.748', 'colsample_bytree': '0.818', 'reg_alpha': '5.78e-02', 'reg_lambda': '1.05e-06'},

[0]	val-auc:0.90300
[100]	val-auc:0.91263
[200]	val-auc:0.91455
[300]	val-auc:0.91660
[400]	val-auc:0.91829
[500]	val-auc:0.91961
[600]	val-auc:0.92063
[700]	val-auc:0.92130
[800]	val-auc:0.92175
[900]	val-auc:0.92207
[1000]	val-auc:0.92233
[1100]	val-auc:0.92254
[1200]	val-auc:0.92271
[1300]	val-auc:0.92285
[1400]	val-auc:0.92297
[1500]	val-auc:0.92307
[1600]	val-auc:0.92316
[1700]	val-auc:0.92321
[1800]	val-auc:0.92327
[1900]	val-auc:0.92332
[2000]	val-auc:0.92336
[2100]	val-auc:0.92338
[2200]	val-auc:0.92342
[2300]	val-auc:0.92344
[2400]	val-auc:0.92347
[2500]	val-auc:0.92348
[2600]	val-auc:0.92350
[2700]	val-auc:0.92350
[2800]	val-auc:0.92351
[2900]	val-auc:0.92352
[3000]	val-auc:0.92354
[3100]	val-auc:0.92354
[3189]	val-auc:0.92354

-> gt_carts  -  Recall : 0.4421



[32m[I 2023-01-19 23:10:02,830][0m Trial 14 finished with value: 0.4421286154402509 and parameters: {'max_depth': 9, 'subsample': 0.8292977837802313, 'colsample_bytree': 0.7325328665085745, 'reg_alpha': 4.397893600300813e-05, 'reg_lambda': 0.33605643347498065}. Best is trial 2 with value: 0.4422885675936937.[0m


Params : {'max_depth': 9, 'subsample': '0.829', 'colsample_bytree': '0.733', 'reg_alpha': '4.40e-05', 'reg_lambda': '3.36e-01'},

[0]	val-auc:0.90318
[100]	val-auc:0.91254
[200]	val-auc:0.91457
[300]	val-auc:0.91660
[400]	val-auc:0.91832
[500]	val-auc:0.91963
[600]	val-auc:0.92065
[700]	val-auc:0.92133
[800]	val-auc:0.92178
[900]	val-auc:0.92209
[1000]	val-auc:0.92235
[1100]	val-auc:0.92258
[1200]	val-auc:0.92274
[1300]	val-auc:0.92289
[1400]	val-auc:0.92298
[1500]	val-auc:0.92308
[1600]	val-auc:0.92315
[1700]	val-auc:0.92322
[1800]	val-auc:0.92328
[1900]	val-auc:0.92333
[2000]	val-auc:0.92336
[2100]	val-auc:0.92340
[2200]	val-auc:0.92343
[2300]	val-auc:0.92347
[2400]	val-auc:0.92348
[2500]	val-auc:0.92351
[2600]	val-auc:0.92351
[2700]	val-auc:0.92352
[2800]	val-auc:0.92353
[2900]	val-auc:0.92353
[3000]	val-auc:0.92354
[3030]	val-auc:0.92354

-> gt_carts  -  Recall : 0.4419



[32m[I 2023-01-19 23:18:08,299][0m Trial 15 finished with value: 0.4418573922235436 and parameters: {'max_depth': 9, 'subsample': 0.8449563539611286, 'colsample_bytree': 0.8527949156491711, 'reg_alpha': 3.9181267112778005e-05, 'reg_lambda': 0.31367002916690134}. Best is trial 2 with value: 0.4422885675936937.[0m


Params : {'max_depth': 9, 'subsample': '0.845', 'colsample_bytree': '0.853', 'reg_alpha': '3.92e-05', 'reg_lambda': '3.14e-01'},

[0]	val-auc:0.90456
[100]	val-auc:0.91260
[200]	val-auc:0.91454
[300]	val-auc:0.91654
[400]	val-auc:0.91825
[500]	val-auc:0.91957
[600]	val-auc:0.92057
[700]	val-auc:0.92127
[800]	val-auc:0.92170
[900]	val-auc:0.92203
[1000]	val-auc:0.92230
[1100]	val-auc:0.92252
[1200]	val-auc:0.92269
[1300]	val-auc:0.92283
[1400]	val-auc:0.92293
[1500]	val-auc:0.92303
[1600]	val-auc:0.92310
[1700]	val-auc:0.92317
[1800]	val-auc:0.92323
[1900]	val-auc:0.92329
[2000]	val-auc:0.92333
[2100]	val-auc:0.92336
[2200]	val-auc:0.92341
[2300]	val-auc:0.92343
[2400]	val-auc:0.92346
[2500]	val-auc:0.92347
[2600]	val-auc:0.92349
[2700]	val-auc:0.92350
[2800]	val-auc:0.92350
[2900]	val-auc:0.92351
[3000]	val-auc:0.92352
[3092]	val-auc:0.92352

-> gt_carts  -  Recall : 0.4418



[32m[I 2023-01-19 23:26:11,528][0m Trial 16 finished with value: 0.4417878478090032 and parameters: {'max_depth': 9, 'subsample': 0.9136570496655704, 'colsample_bytree': 0.7157660740125938, 'reg_alpha': 2.4467952080933123e-05, 'reg_lambda': 0.40275912080412446}. Best is trial 2 with value: 0.4422885675936937.[0m


Params : {'max_depth': 9, 'subsample': '0.914', 'colsample_bytree': '0.716', 'reg_alpha': '2.45e-05', 'reg_lambda': '4.03e-01'},

[0]	val-auc:0.90211
[100]	val-auc:0.91260
[200]	val-auc:0.91450
[300]	val-auc:0.91647
[400]	val-auc:0.91815
[500]	val-auc:0.91943
[600]	val-auc:0.92041
[700]	val-auc:0.92112
[800]	val-auc:0.92157
[900]	val-auc:0.92190
[1000]	val-auc:0.92217
[1100]	val-auc:0.92240
[1200]	val-auc:0.92257
[1300]	val-auc:0.92270
[1400]	val-auc:0.92282
[1500]	val-auc:0.92293
[1600]	val-auc:0.92301
[1700]	val-auc:0.92309
[1800]	val-auc:0.92315
[1900]	val-auc:0.92322
[2000]	val-auc:0.92328
[2100]	val-auc:0.92331
[2200]	val-auc:0.92336
[2300]	val-auc:0.92340
[2400]	val-auc:0.92343
[2500]	val-auc:0.92345
[2600]	val-auc:0.92347
[2700]	val-auc:0.92348
[2800]	val-auc:0.92351
[2900]	val-auc:0.92352
[3000]	val-auc:0.92353
[3100]	val-auc:0.92355
[3200]	val-auc:0.92355
[3287]	val-auc:0.92355

-> gt_carts  -  Recall : 0.4418



[32m[I 2023-01-19 23:34:48,291][0m Trial 17 finished with value: 0.44179480225045725 and parameters: {'max_depth': 9, 'subsample': 0.8058970033475875, 'colsample_bytree': 0.8451251614903262, 'reg_alpha': 0.00010128914756459993, 'reg_lambda': 0.001139718370253406}. Best is trial 2 with value: 0.4422885675936937.[0m


Params : {'max_depth': 9, 'subsample': '0.806', 'colsample_bytree': '0.845', 'reg_alpha': '1.01e-04', 'reg_lambda': '1.14e-03'},

[0]	val-auc:0.90267
[100]	val-auc:0.91255
[200]	val-auc:0.91451
[300]	val-auc:0.91653
[400]	val-auc:0.91828
[500]	val-auc:0.91961
[600]	val-auc:0.92065
[700]	val-auc:0.92133
[800]	val-auc:0.92176


In [None]:
# Overlay the best hyper-parameters found by the search onto the base XGBoost
# settings. `.update` mutates PARAMS["xgb"] in place, so tuned values replace
# any defaults that share a key (assuming PARAMS["xgb"] is a plain dict).
PARAMS["xgb"].update(study.best_params)

# The training config then points at that same, now-tuned, parameter object.
Config.params = PARAMS["xgb"]

### Train

In [None]:
# DEBUG      : when True, the training cell skips log-folder creation and config saving.
# DEBUG_MORE : forwarded to `kfold` as its `debug` argument.
# Both are off for a regular run.
DEBUG = DEBUG_MORE = False

In [None]:
%%time

# Outside debug mode, make sure the run has somewhere to log to and that the
# exact configuration used for this training is saved next to the results.
if not DEBUG:
    # Only create a folder (and attach the file logger) when none is set yet,
    # so an already-assigned `log_folder` is reused as is.
    if log_folder is None:
        log_folder = prepare_log_folder(LOG_PATH)
        print(f'Logging results to {log_folder}')
        create_logger(directory=log_folder, name="logs.txt")

    # NOTE(review): plain string concatenation - this relies on `log_folder`
    # already ending with a path separator; confirm against prepare_log_folder.
    save_config(Config, log_folder + 'config')

# `kfold` returns a pair; the second element (`ft_imp`) is what the
# `plot_importances` cell below takes - presumably feature importances,
# with `df_val` presumably holding the validation predictions (TODO confirm).
df_val, ft_imp = kfold(
    REGEX,
    TEST_REGEX,
    Config,
    log_folder=log_folder,
    debug=DEBUG_MORE,
)

In [None]:
# plot_importances(ft_imp)

**Fold 0**
- orders	-  Found 52.04K GTs	-  Recall : 0.6672

Done