**About :** Trains XGBoost models.

**TODO**:
- Investigate a better negative-sampling technique.

In [None]:
cd ../src

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [4]:
import os
import gc
import cudf
import json
import glob
import numba
import xgboost
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from tqdm import tqdm
from pandarallel import pandarallel
from numerize.numerize import numerize

warnings.simplefilter(action="ignore", category=FutureWarning)
pd.options.display.max_columns = 500
pd.options.display.max_rows = 500
pandarallel.initialize(nb_workers=32, progress_bar=False)

INFO: Pandarallel will run on 32 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [5]:
from params import *

from utils.metrics import get_coverage
from utils.plot import plot_importances
from utils.load import *
from utils.logger import save_config, prepare_log_folder, create_logger

### Load

In [6]:
VERSION = "c-orders-v4.7"
GT_VERSION = "gt.7"

#### Train data
- Negative sampling could also draw candidates from lower (earlier) candidate versions.

In [7]:
POS_RATIO = 0.2
TARGET = "gt_orders"   # "gt_clicks", "gt_carts", "gt_orders"

In [8]:
# def load_sessions(regex):
#     dfs = []
#     for idx, chunk_file in enumerate(glob.glob(regex)):
#         df = cudf.read_parquet(chunk_file, columns=["session"])
#         dfs.append(df.drop_duplicates(keep="first"))

#     return cudf.concat(dfs).reset_index(drop=True)

# sessions = load_sessions( f"../output/features/fts_val_{VERSION}/*")

# from sklearn.model_selection import KFold
# K = 4

# kf = KFold(n_splits=K, shuffle=True, random_state=42)
# splits = kf.split(sessions)

# sessions['fold'] = -1
# for i, (_, val_idx) in enumerate(splits):
#     sessions.loc[val_idx, "fold"] = i

# sessions.to_csv(f"../input/folds_{K}.csv", index=False)

In [9]:
# TODO: REMOVE CORRELATED
# The feature names follow a regular naming grid, so the list is generated
# rather than written out by hand. The order matches the hand-written list.
_FT_EVENTS = ("clicks", "carts", "orders")
_FT_WEIGHTS = ("pos-log", "type-163", "lastday", "time", "recsys")
_FT_AGGS = ("mean", "sum", "max")
_FT_MATRICES = (
    "123_temporal_20", "123_type136_20", "12__20", "123_type0.590.5_20",
    "cpu-90", "cpu-95", "cpu-99",
    "gpu-116", "gpu-115", "gpu-93", "gpu-217", "gpu-226", "gpu-232", "gpu-239", "gpu-700", "gpu-701",
)
# Every matrix is aggregated once unweighted ("") and once per weighting.
_FT_MATRIX_SUFFIXES = ("",) + tuple(f"_{w}" for w in _FT_WEIGHTS)

FEATURES = (
    # Popularity per event type (the "time" weighting is not used here).
    [
        f"{evt}_popularity_w_{w}"
        for evt in _FT_EVENTS
        for w in ("pos-log", "type-163", "lastday", "recsys")
    ]
    # "_w" popularity variants (neither "lastday" nor "time" here).
    + [
        f"{evt}_popularity_w_{w}_w"
        for evt in _FT_EVENTS
        for w in ("pos-log", "type-163", "recsys")
    ]
    # Raw weights.
    + [f"w_{w}" for w in _FT_WEIGHTS]
    # Matrix features: 16 matrices x 6 weightings x 3 aggregations.
    + [
        f"matrix_{mat}{suffix}_{agg}"
        for mat in _FT_MATRICES
        for suffix in _FT_MATRIX_SUFFIXES
        for agg in _FT_AGGS
    ]
    # Candidate / session counters.
    + [
        'candidate_clicks_before', 'candidate_carts_before', 'candidate_orders_before', 'candidate_*_before',
        'n_views', 'n_clicks', 'n_carts', 'n_orders',
    ]
)

In [10]:
def _name_contains(token):
    """Builds a predicate that is true for feature names containing `token`."""
    return lambda name: token in name


# Exclusion rules, evaluated in this order. Each rule is a predicate on the feature name.
_DROP_RULES = (
    _name_contains("type-191"),
    _name_contains("matrix_gpu-220"),
    _name_contains("matrix_gpu-235"),
    _name_contains("lasthour"),
    _name_contains("lastmin"),
    lambda name: name.startswith("popularity"),  # prefix match, not substring
    _name_contains("_old"),
    _name_contains("popularity_w_time"),
    _name_contains("popularity_w_lastday_w"),
)

# Features hit by at least one rule (a feature matching several rules appears several times).
TO_REMOVE = [ft for rule in _DROP_RULES for ft in FEATURES if rule(ft)]

FEATURES = [ft for ft in FEATURES if ft not in TO_REMOVE]

In [11]:
len(FEATURES)

322

In [12]:
# df_train = load_parquets_cudf_folds(
#     f"../output/features/fts_val_{VERSION}/*",
#     "../input/folds_4.csv",
#     fold=0,
#     pos_ratio=POS_RATIO,
#     target=TARGET,
#     use_gt=USE_GT_SESSIONS,
# #     max_n=5,
#     train_only=True,
#     columns=['session','candidates','gt_clicks','gt_carts','gt_orders'] + FEATURES,
# )

In [13]:
# df_train = cudf.from_pandas(df_train)
# corr = df_train[FEATURES].corr()
# corr = corr.to_pandas()
# corr = corr.values

# mask = np.zeros_like(corr, dtype=bool)
# mask[np.triu_indices_from(mask)] = True
# corr[mask] = 0

In [14]:
# TH = 0.99

# for i in range(len(corr)):
#     for j in range(len(corr)):
#         if corr[i, j] > TH:
#             if FEATURES[i] in TO_REMOVE or FEATURES[j] in TO_REMOVE:
#                 continue
#             print(FEATURES[i], FEATURES[j], f'{corr[i, j] :.3f}')

In [15]:
REGEX = f"../output/features/fts_val_{VERSION}/*"
len(glob.glob(REGEX))

82

In [16]:
TEST_REGEX = f"../output/features/fts_test_{VERSION}/*"
len(glob.glob(TEST_REGEX))

77

In [17]:
GT_REGEX = f"../output/features/fts_val_{GT_VERSION}/*"
len(glob.glob(GT_REGEX))

21

### Train

In [18]:
# import optuna
import cuml
from sklearn.metrics import roc_auc_score
from numerize.numerize import numerize
from utils.torch import seed_everything

from model_zoo import TRAIN_FCTS, PREDICT_FCTS

def train(df_train, val_regex, config, log_folder=None, optimize=False, fold=0, debug=False):
    """
    Trains one model on a single fold, scores it on the validation files and optionally saves it.

    Args:
        df_train (DataFrame): Training candidates, forwarded to the model-specific training function.
        val_regex (str): Glob pattern of the parquet files used for validation.
        config: Configuration object. The attributes read here are seed, model, features, target,
            params, use_es, num_boost_round, folds_file and mode.
        log_folder (str or None, optional): Folder prefix (concatenated as-is with the file name)
            the model is saved under. Nothing is saved when None. Defaults to None.
        optimize (bool, optional): Run the (unfinished) Optuna search instead of training.
            Defaults to False.
        fold (int, optional): Fold index, forwarded to the training function and used in the
            saved model file name. Defaults to 0.
        debug (bool, optional): Forwarded to the training function. Defaults to False.

    Returns:
        tuple: (df_val, ft_imp, model) where df_val is the frame returned by the training function
            (it holds a "pred" column), ft_imp is a one-column "importance" DataFrame indexed by
            feature name (or None if it could not be built) and model is the trained model.
            When optimize is True, the dict of best parameters found by Optuna is returned instead.
    """
    seed_everything(config.seed)

    txt = f"{'Optimizing' if optimize else 'Training'} {config.model.upper()} Model"
    print(f"\n-------------   {txt}   -------------\n")

    if optimize:  # TODO
        # Imported locally: the notebook-level optuna import is commented out.
        import optuna

        # NOTE(review): `objective_xgb` is not defined in this notebook. It has to be
        # provided by one of the star imports - confirm before using this branch.
        study = optuna.create_study(direction="minimize")
        objective = lambda x: objective_xgb(x, df_train, val_regex, config.features, config.target)
        study.optimize(objective, n_trials=50)
        print(study.best_params)
        return study.best_params

    # Only the row count is needed, so a single column is read from each validation file.
    val_candids = sum([len(cudf.read_parquet(f, columns=['gt_orders'])) for f in glob.glob(val_regex)])
    print(f"    -> {numerize(len(df_train))} training candidates")
    print(f"    -> {numerize(val_candids)} validation candidates\n")

    train_fct = TRAIN_FCTS[config.model]
    df_val, model = train_fct(
        df_train,
        val_regex,
        features=config.features,
        target=config.target,
        params=config.params,
        use_es=config.use_es,
        num_boost_round=config.num_boost_round,
        folds_file=config.folds_file,
        fold=fold,
        debug=debug,
    )

    # Feature importance
    if config.model == "xgb":
        ft_imp = model.get_score()
    else:
        ft_imp = model.feature_importances_  # TODO
    try:
        ft_imp = pd.DataFrame(
            pd.Series(ft_imp, index=config.features), columns=["importance"]
        )
    except Exception:
        # Importances are informative only: training must not fail because of them.
        ft_imp = None

    if config.mode == "test":
        return df_val, ft_imp, model

    # Score
    try:
        auc = roc_auc_score(df_val[config.target], df_val["pred"])
    except Exception:
        # Fallback to the cuml implementation when sklearn cannot handle df_val's columns.
        auc = cuml.metrics.roc_auc_score(df_val[config.target].astype('int32'), df_val["pred"].values)

    print(f'\n -> AUC : {auc:.4f}\n')

    if log_folder is None:
        return df_val, ft_imp, model

    # Save model
    if config.model == "xgb":
        model.save_model(log_folder + f"{config.model}_{fold}.json")
    elif config.model == "lgbm":
        try:
            model.booster_.save_model(log_folder + f"{config.model}_{fold}.txt")
        except Exception:
            # The model has no usable `booster_`: save it directly.
            model.save_model(log_folder + f"{config.model}_{fold}.txt")
    else:   # catboost, verif
        model.save_model(log_folder + f"{config.model}_{fold}.txt")

    return df_val, ft_imp, model

In [19]:
def kfold(regex, test_regex, config, log_folder, debug=False):
    """
    Runs the k-fold training loop: loads the fold's training data, trains, predicts the test files.

    Args:
        regex (str): Glob pattern of the feature files. It is used both to load the training
            candidates and, through train(), as the validation pattern.
        test_regex (str): Glob pattern of the test feature files.
        config: Configuration object. Besides what train() reads, this function reads
            k, folds_file, pos_ratio, target, use_gt_sessions, use_gt_pos, gt_regex, features and model.
        log_folder (str or None): Folder prefix (concatenated as-is with file names) for saved
            models and predictions. When None, only the first fold is trained and nothing is
            saved or predicted on the test files.
        debug (bool, optional): Loads only a few files per fold and is forwarded to the
            training / prediction functions. Defaults to False.

    Returns:
        tuple: (dfs_val, ft_imps, dfs_test) - the concatenated validation predictions, the
            feature importances averaged over folds (None if no fold produced any) and the test
            predictions averaged over folds. When log_folder is None, returns
            (df_val, ft_imp, None) for the first fold only.
    """
    columns = ['session', 'candidates', 'gt_clicks', 'gt_carts', 'gt_orders'] + config.features

    dfs_val, ft_imps, dfs_test = [], [], []
    for fold in range(config.k):
        print(f"\n-------------   Fold {fold + 1} / {config.k}  -------------\n")

        df_train = load_parquets_cudf_folds(
            regex,
            config.folds_file,
            fold=fold,
            pos_ratio=config.pos_ratio,
            target=config.target,
            use_gt=config.use_gt_sessions,
            train_only=True,
            columns=columns,
            max_n=5 if debug else 0
        )

        if config.use_gt_pos:
            # Add the candidates loaded from the gt files, without negative sampling (pos_ratio=-1).
            df_train_gt = load_parquets_cudf_folds(
                config.gt_regex,
                config.folds_file,
                fold=fold,
                pos_ratio=-1,
                target=config.target,
                use_gt=config.use_gt_sessions,
                train_only=True,
                columns=columns,
                max_n=3 if debug else 0
            )
            df_train = pd.concat([df_train, df_train_gt], ignore_index=True)
            df_train = df_train.drop_duplicates(subset=['session', 'candidates'], keep="first").reset_index(drop=True)

        df_val, ft_imp, model = train(df_train, regex, config, log_folder=log_folder, fold=fold, debug=debug)
        dfs_val.append(df_val)
        ft_imps.append(ft_imp)

        # Leakage sanity check: train and validation sessions should not overlap.
        # It is best-effort, a failure here is reported but does not stop the run.
        try:
            train_sessions = set(list(df_train["session"].unique()))
            val_sessions = set(list(df_val["session"].unique().to_pandas()))
            print('Train / val sess inter', len(train_sessions.intersection(val_sessions)))
        except Exception as error:
            print(f'Train / val sess inter check skipped ({error!r})')

        # Without a log folder (debug runs), stop after the first fold.
        if log_folder is None:
            return df_val, ft_imp, None

        predict_fct = PREDICT_FCTS[config.model]
        df_test = predict_fct(model, test_regex, config.features, debug=debug)
        dfs_test.append(df_test)

        print('\n -> Saving predictions \n')
        df_val[['session', 'candidates', 'pred']].to_parquet(log_folder + f"df_val_{fold}.parquet")

        df_test[['session', 'candidates']] = df_test[['session', 'candidates']].astype('int32')
        df_test['pred'] = df_test['pred'].astype('float32')
        df_test[['session', 'candidates', 'pred']].to_parquet(log_folder + f"df_test_{fold}.parquet")

        del df_train, df_val, ft_imp, df_test, model
        numba.cuda.current_context().deallocations.clear()
        gc.collect()

    dfs_test = cudf.concat(dfs_test).groupby(['session', 'candidates']).mean().reset_index()
    dfs_val =  cudf.concat(dfs_val).sort_values(['session', 'candidates'], ignore_index=True)

    # train() returns None when importances could not be built. pd.concat raises if every
    # entry is None, which would lose the whole run after all folds have been trained.
    ft_imps = [imp for imp in ft_imps if imp is not None]
    if ft_imps:
        ft_imps = pd.concat(ft_imps).reset_index().groupby('index').mean()
        ft_imps.to_csv(log_folder + "ft_imp.csv")
    else:
        ft_imps = None

    # log_folder cannot be None here: the loop returns early in that case.
    dfs_test[['session', 'candidates', 'pred']].to_parquet(log_folder + "df_test.parquet")

    return dfs_val, ft_imps, dfs_test

### Params

In [20]:
# Hyper-parameters per model name (only "xgb" is configured).
PARAMS = {
    "xgb": {
        # Boosting
        "learning_rate": 0.01,
        "max_depth": 8,  # or 6
        # Row / column sampling
        "subsample": 0.75,
        "colsample_bytree": 0.9,
        # Regularization
        "reg_alpha": 0.01,
        "reg_lambda": 0.1,
        # Disabled options, kept for reference:
        # "min_child_weight": 0.01,
        # "gamma": 0.01,
        # "scale_pos_weight": 1,
        # Objective & metric
        "eval_metric": "auc",
        "objective": "rank:pairwise",  # binary:logistic
        # GPU training / inference
        "tree_method": "gpu_hist",
        "predictor": "gpu_predictor",
    },
}

In [21]:
# TO_REMOVE = [
#     'candidate_*_before', 'matrix_gpu-700_lastday_max', 'matrix_12__20_lastday_max', 'matrix_gpu-226_lastday_max', 'matrix_cpu-90_lastday_max', 'matrix_gpu-700_sum', 'matrix_gpu-700_pos-log_sum', 'matrix_12__20_lastday_sum',
#     'matrix_gpu-700_pos-log_max', 'matrix_gpu-226_lastday_sum', 'matrix_gpu-700_max', 'matrix_gpu-700_time_sum', 'matrix_123_type136_20_lastday_max', 'matrix_cpu-90_lastday_sum', 'matrix_cpu-90_type-163_max', 'matrix_gpu-700_time_max',
#     'matrix_12__20_time_sum', 'matrix_gpu-700_type-163_sum', 'matrix_gpu-700_lastday_sum', 'matrix_gpu-700_type-163_max', 'matrix_cpu-90_time_sum', 'matrix_123_type136_20_time_sum', 'matrix_gpu-217_lastday_max', 'matrix_12__20_pos-log_sum',
#     'matrix_12__20_type-163_max', 'matrix_12__20_time_max', 'matrix_cpu-90_max', 'matrix_cpu-90_type-163_sum', 'matrix_cpu-99_lastday_max', 'matrix_cpu-90_sum', 'matrix_gpu-226_sum', 'matrix_gpu-226_time_sum', 'matrix_12__20_time_mean',
#     'matrix_12__20_type-163_mean', 'matrix_gpu-700_pos-log_mean', 'matrix_123_type0.590.5_20_lastday_max', 'matrix_gpu-700_time_mean', 'matrix_12__20_type-163_sum', 'matrix_12__20_pos-log_max', 'matrix_123_type136_20_lastday_sum',
#     'matrix_cpu-90_time_mean', 'matrix_gpu-226_max', 'matrix_123_type136_20_type-163_max', 'matrix_gpu-226_type-163_max', 'matrix_gpu-226_lastday_mean', 'matrix_gpu-226_type-163_sum', 'matrix_cpu-99_time_sum', 'matrix_12__20_lastday_mean',
#     'matrix_gpu-700_type-163_mean','matrix_123_type136_20_type-163_sum'
# ][:50]

In [22]:
# Experiment configuration, passed as `config` to kfold() and train().
class Config:
    seed = 100  # passed to seed_everything() at the start of train()
    version = VERSION
    
    folds_file = "../input/folds_4.csv"  # forwarded to the data loader and the training function
    k = 4  # number of folds looped over in kfold()
    mode = ""  # when set to "test", train() returns before computing the AUC and saving the model

    features = FEATURES
#     features = [ft for ft in features if ft not in TO_REMOVE]

    cat_features = []  # not read by the code in this notebook

    target = TARGET
    pos_ratio = POS_RATIO  # forwarded to the training data loader

    use_gt_sessions = True  # filter out sessions with no gt
    use_gt_pos = False  # add candidates from gt
    gt_regex = GT_REGEX  # files loaded in kfold() when use_gt_pos is True
    
    model = "xgb"  # key into TRAIN_FCTS / PREDICT_FCTS and PARAMS

    params = PARAMS[model]

    use_es = True  # forwarded to the training function
    num_boost_round = 10000  # forwarded to the training function

### Main

#### Optimize

#### Train

In [23]:
DEBUG = True
DEBUG_MORE = False

In [24]:
%%time

# Debug runs do not log anything; real runs get a fresh log folder with the config and a logger.
if DEBUG:
    log_folder = None
else:
    log_folder = prepare_log_folder(LOG_PATH)
    print(f'Logging results to {log_folder}')
    save_config(Config, log_folder + 'config')
    create_logger(directory=log_folder, name="logs.txt")

df_val, ft_imp, df_test = kfold(
    REGEX,
    TEST_REGEX,
    Config,
    log_folder=log_folder,
    debug=DEBUG_MORE,
)


-------------   Fold 1 / 4  -------------



100%|██████████| 82/82 [07:03<00:00,  5.16s/it]



-------------   Training XGB Model   -------------

    -> 992.87K training candidates
    -> 127.05M validation candidates

[0]	val-auc:0.94848
[100]	val-auc:0.95733
[200]	val-auc:0.95868
[300]	val-auc:0.95962
[400]	val-auc:0.96022
[500]	val-auc:0.96063
[600]	val-auc:0.96098
[700]	val-auc:0.96124
[800]	val-auc:0.96144
[900]	val-auc:0.96154
[1000]	val-auc:0.96165
[1100]	val-auc:0.96172
[1200]	val-auc:0.96174
[1300]	val-auc:0.96174
[1400]	val-auc:0.96178
[1500]	val-auc:0.96182
[1600]	val-auc:0.96184
[1700]	val-auc:0.96190
[1800]	val-auc:0.96193
[1900]	val-auc:0.96190
[1992]	val-auc:0.96185

[Infering]


100%|██████████| 82/82 [07:14<00:00,  5.30s/it]



 -> AUC : 0.9668

Train / val sess inter 0
CPU times: user 10min 21s, sys: 8min 25s, total: 18min 47s
Wall time: 21min 1s


In [25]:
# plot_importances(ft_imp)

### Process

In [26]:
# Rank the candidates of each session by decreasing score and gather them into one list per session.
preds = (
    df_val[['session', 'candidates', 'pred']]
    .copy()
    .sort_values(['session', 'pred'], ascending=[True, False])
    .groupby('session')
    .agg(list)
    .reset_index()
    .to_pandas()
)

# Keep the 20 best candidates of each session.
preds['candidates'] = preds['candidates'].apply(lambda cands: cands[:20])

In [27]:
# Fill sessions that have fewer than 20 candidates with the most frequent items of the target type.

dfs = load_sessions("../output/val_parquet/*")

# Value of the "type" column for each target; any other target uses type 0.
target_type = {"gt_carts": 1, "gt_orders": 2}.get(Config.target, 0)
top = dfs.loc[dfs["type"] == target_type, "aid"].value_counts().index.values[:20].tolist()


def pad_candidates(cands, fillers, k=20):
    """
    Pads a candidate list up to k items with fillers, skipping items already in the list.

    Args:
        cands (list-like): Ranked candidates of one session.
        fillers (list): Items to pad with, most frequent first.
        k (int, optional): Target length. Defaults to 20.

    Returns:
        list: The candidates followed by as many new fillers as needed to reach k items.
    """
    cands = list(cands)
    seen = set(cands)
    # Re-adding an item that is already predicted would waste a slot.
    new_items = [aid for aid in fillers if aid not in seen]
    return cands + new_items[: max(k - len(cands), 0)]


preds['candidates'] = preds['candidates'].apply(lambda x: pad_candidates(x, top))

del dfs
numba.cuda.current_context().deallocations.clear()
gc.collect()

35

In [28]:
folds = pd.read_csv(f"../input/folds_4.csv")
len(folds[folds['fold'] == 0]), len(preds)

(450314, 450314)

In [29]:
# Recall of the top-20 predictions against the validation labels, for the trained target only.
gt = pd.read_parquet("../output/val_labels.parquet")

recalls = []
print()
for col in CLASSES:
    gt_col = f"gt_{col}"
    if gt_col != Config.target:
        continue

    # Attach the ground truth of this class once; re-running the cell does not merge again.
    if gt_col not in preds.columns:
        labels = gt[gt["type"] == col].drop("type", axis=1)
        preds = preds.merge(labels, how="left").rename(columns={"ground_truth": gt_col})

    n_preds, n_gts, n_found = get_coverage(preds["candidates"].values, preds[gt_col].values)
    recall = n_found / n_gts

    print(f"- {col}\t-  Found {numerize(n_found)} GTs\t-  Recall : {recall:.4f}")
    recalls.append(recall)


- orders	-  Found 51.91K GTs	-  Recall : 0.6655


- orders	-  Found 51.98K GTs	-  Recall : 0.6664  MORE CANDIDS

- orders	-  Found 207.74K GTs	-  Recall : 0.6632
- carts	-  Found 242.41K GTs	-  Recall : 0.4208
- clicks	-  Found 927.04K GTs	-  Recall : 0.5281

CHRIS :
- orders - CV 0.666 - LB 0.678
- carts - CV 0.437 - LB 0.450
- clicks - CV 0.554 - LB 0.560

In [None]:
# cv = np.average([0.5270, 0.4203, 0.6577], weights=WEIGHTS)
# # cv = np.average([0.5059, 0.4139, 0.6540], weights=WEIGHTS)
# print(f"-> CV : {cv:.4f}")

### Test

In [None]:
# Rank the candidates of each test session by decreasing score and gather them into one list per session.
preds = (
    df_test[['session', 'candidates', 'pred']]
    .copy()
    .sort_values(['session', 'pred'], ascending=[True, False])
    .groupby('session')
    .agg(list)
    .reset_index()
    .to_pandas()
)

# Keep the 20 best candidates of each session.
preds['candidates'] = preds['candidates'].apply(lambda cands: cands[:20])

In [None]:
# Fill sessions that have fewer than 20 candidates with the most frequent items of the target type.
# This should be useless in the future

dfs = load_sessions("../output/test_parquet/*")

# Value of the "type" column for each target; any other target uses type 0.
target_type = {"gt_carts": 1, "gt_orders": 2}.get(Config.target, 0)
top = dfs.loc[dfs["type"] == target_type, "aid"].value_counts().index.values[:20].tolist()


def pad_candidates(cands, fillers, k=20):
    """
    Pads a candidate list up to k items with fillers, skipping items already in the list.

    Args:
        cands (list-like): Ranked candidates of one session.
        fillers (list): Items to pad with, most frequent first.
        k (int, optional): Target length. Defaults to 20.

    Returns:
        list: The candidates followed by as many new fillers as needed to reach k items.
    """
    cands = list(cands)
    seen = set(cands)
    # Re-adding an item that is already predicted would waste a slot.
    new_items = [aid for aid in fillers if aid not in seen]
    return cands + new_items[: max(k - len(cands), 0)]


preds['candidates'] = preds['candidates'].apply(lambda x: pad_candidates(x, top))

del dfs
numba.cuda.current_context().deallocations.clear()
gc.collect()

In [None]:
log_folder_2 = LOG_PATH + f"{VERSION}.0/"
os.makedirs(log_folder_2, exist_ok=True)
save_config(Config, log_folder_2 + 'config')

In [None]:
# Write the submission for the current target (skipped in debug runs, which have no log folder).
if not DEBUG:
    sub = preds[['session', 'candidates']].copy()
    assert len(sub) == 1671803  # expected number of test sessions

    # Labels are space-separated item ids, the key is "<session>_<type>" (TARGET without "gt_").
    sub['candidates'] = sub['candidates'].parallel_apply(lambda aids: " ".join(str(aid) for aid in aids))
    sub['session'] = sub['session'].astype(str) + "_" + TARGET[3:]
    sub.columns = ["session_type", "labels"]

    # One copy next to the run logs ...
    sub_path = log_folder + f'sub_{TARGET}.csv'
    sub.to_csv(sub_path, index=False)
    print(f"\n-> Saved sub to {sub_path}")

    # ... and one in the per-version folder.
    sub_path_2 = log_folder_2 + f'sub_{TARGET}.csv'
    sub.to_csv(sub_path_2, index=False)
    print(f"-> Saved sub to {sub_path_2}\n")

    display(sub.head())

In [None]:
# if all([os.path.exists(log_folder_2 + f'sub_gt_{c}.csv') for c in CLASSES]):
#     sub_final = cudf.concat([
#         cudf.read_csv(log_folder_2 + f'sub_gt_{c}.csv') for c in CLASSES
#     ], ignore_index=True)

#     assert len(sub_final) == 5015409
#     sub_final.to_csv(log_folder_2 + f"submission.csv", index=False)

#     print(f"\n-> Saved final sub to {log_folder_2 + f'submission.csv'}\n")

#     display(sub_final.sample(5))

Done