**About :** Trains XGBoost models.

**TODO**:
- Find a better negative sampling technique.

In [None]:
cd ../src

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import os
# Restrict this process to GPU index 2. This is set in its own cell, ahead of the
# cell that imports cudf / xgboost, so the variable is in place before those
# libraries are loaded.
os.environ["CUDA_VISIBLE_DEVICES"] = "2"

In [None]:
import os
import gc
import cudf
import json
import glob
import numba
import xgboost
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from tqdm import tqdm
from data.fe import load_sessions

warnings.simplefilter(action="ignore", category=FutureWarning)
pd.options.display.max_columns = 500
pd.options.display.max_rows = 500

In [None]:
from params import *

from model_zoo import TRAIN_FCTS

from utils.metrics import get_coverage
from utils.plot import plot_importances
from utils.load import *

### Load

In [None]:
# Feature-set version. It selects which feature folders are read below:
# ../output/features/fts_train_<VERSION>/ and ../output/features/fts_val_<VERSION>/.
VERSION = "v3.5"
# VERSION = "v2.5"

#### Train data
- neg sampling could use candidates from lower versions

In [None]:
# Forwarded to the loader as `pos_ratio`. Presumably the positive/negative ratio kept
# after negative down-sampling (the commented-out sampling code elsewhere in this
# notebook uses n_neg = n_pos / POS_RATIO) - TODO confirm against the loader.
POS_RATIO = 0.1
# Label column the model is trained to predict.
TARGET = "gt_orders"   # "gt_clicks", "gt_carts", "gt_orders"

In [None]:
# Read the training feature files for the selected version; sampling options are
# forwarded to the loader, which reads the files in 5 chunks.
train_glob = f"../output/features/fts_train_{VERSION}/*"
df_train = load_parquets_cudf_chunks(
    train_glob, pos_ratio=POS_RATIO, target=TARGET, n_chunks=5
)

In [None]:
# df_train = cudf.concat([  # not working ??
#     load_parquets_cudf_chunks(
#         f"../output/features/fts_train_v3.5/*",
#         pos_ratio=0.1,
#         target=TARGET,
#         n_chunks=5,
#     ),
#     load_parquets_cudf_chunks(
#         f"../output/features/fts_train_v4.5/*",
#         pos_ratio=0.,
#         target=TARGET,
#         n_chunks=5,
#     ),
# ], ignore_index=True)

In [None]:
# c = df_train.sort_values(['session', 'candidates']).head().copy()
# ref = df_train.sort_values(['session', 'candidates']).head().copy()

In [None]:
# diff = c[[col for col in ref.columns if col in c.columns]] - ref[[col for col in ref.columns if col in c.columns]]
# for i, c in cudf.DataFrame(diff.max()).to_pandas().iterrows():
#     if c.values:
#         print(i, c.values)

In [None]:
# cols = ['clicks_popularity', 'carts_popularity', 'orders_popularity', 'clicks_popularity_w', 'carts_popularity_w', 'orders_popularity_w', 'clicks_popularity_old', 'carts_popularity_old', 'orders_popularity_old']
# cudf.DataFrame(df_train[[c for c in cols if c in df_train.columns]].mean())

In [None]:
# df_val_c = load_parquets_cudf(f"../output/features/fts_val_c_{VERSION}/*")

# if POS_RATIO:
#     n_neg = int(df_val_c[TARGET].sum() / POS_RATIO)
#     pos = df_val_c.index[df_val_c[TARGET] == 1]
# #     neg = df_val_c[[TARGET]][df_val_c[TARGET] == 0].sample(n_neg).index
# #     df_val_c = df_val_c.iloc[cudf.concat([pos, neg])]
#     df_val_c = df_val_c.iloc[pos]

In [None]:
# df_train = cudf.concat([df_train, df_val_c], ignore_index=True)

# del df_val_c
# numba.cuda.current_context().deallocations.clear()
# gc.collect()

In [None]:
# df_train = df_train.to_pandas()

#### Val data

In [None]:
# Pattern matching the validation feature files. Despite the name it is used as a
# glob pattern (see glob.glob below), not as a regular expression.
val_regex = f"../output/features/fts_val_{VERSION}/*"

In [None]:
# Show the validation files matched by the pattern (quick check that it is not empty).
glob.glob(val_regex)

In [None]:
# %%time
# df_val = pd.read_csv(f'../output/fts_train_{VERSION}.csv', nrows=10_000_000)

### Features

In [None]:
# for c in df_train.columns[5:]:
#     plt.figure(figsize=(10, 4))
#     sns.kdeplot(df_train.head(10000)[c].values, label="train")
#     sns.kdeplot(df_val.head(10000)[c].values, label="val")
#     plt.legend()
#     plt.title(c)
#     plt.show()
#     break

In [None]:
# TO_REMOVE = [
#     "clicks_popularity_lin", "carts_popularity_lin", "orders_popularity_lin", "views_popularity_lin",
#     "clicks_popularity_lin_w", "carts_popularity_lin_w", "orders_popularity_lin_w", "views_popularity_lin_w",
#     "clicks_popularity_lin_old", "carts_popularity_lin_old", "orders_popularity_lin_old", "views_popularity_lin_old",
#     "clicks_popularity", "carts_popularity", "orders_popularity", "views_popularity"
# ]

In [None]:
# corr = df_train[df_train.columns[5:]].corr().to_pandas()

# corr.values[np.triu_indices_from(corr.values)] = 0
# corr = corr * (1 - np.eye(len(corr)))

In [None]:
# TH = 0.99

# cols = list(corr.columns[corr.max() > TH])

# for i in cols:
#     for j in cols:
#         if i not in TO_REMOVE and j not in TO_REMOVE:
#             if corr.loc[i, j] > TH:
#                 print(f'{i} - {j} : {corr.loc[i, j] :.4f}')

### Train

In [None]:
# import optuna
import cuml
from sklearn.metrics import roc_auc_score
from numerize.numerize import numerize
from utils.torch import seed_everything


def train(df_train, val_regex, config, log_folder=None, optimize=False, fold=0):
    """
    Trains a boosted-tree model on the training candidates and scores it on validation.

    The actual fitting is delegated to TRAIN_FCTS[config.model], which receives the
    training frame and the validation glob pattern and returns the validation frame
    (with a "pred" column) together with the fitted model.

    Args:
        df_train: Training candidates with the feature and target columns.
        val_regex (str): Glob pattern of the validation parquet files.
        config: Object exposing seed, model, features, target, params, n_candidates_es.
        log_folder (str or None, optional): Folder to save the model, feature
            importances and validation predictions to. Nothing is saved if None.
        optimize (bool, optional): Hyper-parameter search switch. Not implemented.
        fold (int, optional): Identifier used in the saved model file name. Defaults to 0.

    Returns:
        Validation dataframe with predictions.
        pandas DataFrame of feature importances, or None if it could not be built.
        Fitted model.

    Raises:
        NotImplementedError: If optimize is True.
    """
    seed_everything(config.seed)

    txt = f"{'Optimizing' if optimize else 'Training'} {config.model.upper()} Model"
    print(f"\n-------------   {txt}   -------------\n")

    if optimize:
        # The previous stub referenced names that do not exist in this notebook
        # (optuna, objective_xgb, features, target) and failed with a NameError.
        raise NotImplementedError("Hyper-parameter optimization is not implemented.")

    # Only one column is read per file: we just need the number of rows.
    val_candids = sum(
        len(cudf.read_parquet(f, columns=[config.target])) for f in glob.glob(val_regex)
    )
    print(f"    -> {numerize(len(df_train))} training candidates")
    print(f"    -> {numerize(val_candids)} validation candidates\n")

    train_fct = TRAIN_FCTS[config.model]
    df_val, model = train_fct(
        df_train,
        val_regex,
        features=config.features,
        target=config.target,
        params=config.params,
        n_candidates_es=config.n_candidates_es,
    )

    # Score
    try:
        auc = roc_auc_score(df_val[config.target], df_val["pred"])
    except Exception:
        # Fallback for inputs scikit-learn cannot handle (presumably GPU-backed frames).
        auc = cuml.metrics.roc_auc_score(
            df_val[config.target].astype('int32'), df_val["pred"].values
        )

    print(f'\n -> AUC : {auc:.4f}\n')

    # Feature importance
    if config.model == "xgb":
        ft_imp = model.get_score()
    else:
        ft_imp = model.feature_importances_  # TODO
    try:
        ft_imp = pd.DataFrame(
            pd.Series(ft_imp, index=config.features), columns=["importance"]
        )
    except Exception as e:
        # Best effort: importances are informative only, do not fail the run for them.
        print(f"Could not build feature importances: {e}")
        ft_imp = None

    if log_folder is None:
        return df_val, ft_imp, model

    # Save stuff
    if config.model == "xgb":
        model.save_model(os.path.join(log_folder, f"{config.model}_{fold}.json"))
    elif config.model == "lgbm":
        model_path = os.path.join(log_folder, f"{config.model}_{fold}.txt")
        try:
            model.booster_.save_model(model_path)
        except Exception:
            model.save_model(model_path)
    else:   # catboost, verif
        model.save_model(os.path.join(log_folder, f"{config.model}_{fold}.txt"))

    if ft_imp is not None:
        ft_imp.to_csv(os.path.join(log_folder, "ft_imp.csv"))
    df_val.to_csv(os.path.join(log_folder, "df_val.csv"), index=False)

    return df_val, ft_imp, model

### Params

In [None]:
# Hyper-parameters per model family, keyed by the model name used in Config.model.
PARAMS = {
    "xgb":
    {
        "learning_rate": 0.01,
        'max_depth': 5,
        "subsample": 0.25,
        'colsample_bytree': 0.9,
        'reg_alpha': 0.01,
        'reg_lambda': 0.1,
#         "min_child_weight": 0.01,
#         "gamma": 0.01,
        'eval_metric':'auc',  # map
        'objective':'binary:logistic',  # 'rank:pairwise',
        'tree_method':'gpu_hist',
        'predictor':'gpu_predictor',
    },
    "catboost":
    {
        'depth': 12,
        # "reg_lambda" is an alias of "l2_leaf_reg" in CatBoost and the library rejects
        # a parameter given under two synonymous names, so only one is kept (same value).
        "l2_leaf_reg": 0.1,
        "min_data_in_leaf": 2000,
        "model_size_reg": 0.5,
        "border_count": 256,
    },
    "lgbm": {
        "learning_rate": 0.05,
        "num_leaves": 511,
        "colsample_bytree": 0.5,
        "reg_alpha": 1,
        "reg_lambda": 70,
        "min_child_samples": 2000,  # MODIF  # 2000
        "min_split_gain": 0.02,
        "min_child_weight": 0.03,
        "path_smooth": 0.2,
#             "min_data_in_bin": 32,
    }
}

In [None]:
# Experiment configuration. The class is passed as-is to train(), which reads
# seed, model, features, target, params and n_candidates_es from it.
class Config:
    seed = 100  # passed to seed_everything() at the start of train()
    version = VERSION  # feature-set version selected above

    # Feature columns given to the model (names must exist in the feature files).
    features = [
        'logspace_w', 'linspace_w', 'linspace_w_t163', 'logspace_w_t163', 'linspace_w_t191', 'logspace_w_t191',

        'matrix_123_temporal_20_mean', 'matrix_123_temporal_20_sum', 'matrix_123_temporal_20_max',
        'matrix_123_temporal_20_logspace_mean', 'matrix_123_temporal_20_logspace_sum', 'matrix_123_temporal_20_logspace_max',
        'matrix_123_temporal_20_linspace_mean', 'matrix_123_temporal_20_linspace_sum', 'matrix_123_temporal_20_linspace_max',
        'matrix_123_type136_20_mean', 'matrix_123_type136_20_sum', 'matrix_123_type136_20_max',
        'matrix_123_type136_20_logspace_mean', 'matrix_123_type136_20_logspace_sum', 'matrix_123_type136_20_logspace_max',
        'matrix_123_type136_20_linspace_mean', 'matrix_123_type136_20_linspace_sum', 'matrix_123_type136_20_linspace_max',
        'matrix_12__20_mean', 'matrix_12__20_sum', 'matrix_12__20_max',
        'matrix_12__20_logspace_mean', 'matrix_12__20_logspace_sum', 'matrix_12__20_logspace_max',
        'matrix_12__20_linspace_mean', 'matrix_12__20_linspace_sum', 'matrix_12__20_linspace_max',
        'matrix_123_type0.590.5_20_mean', 'matrix_123_type0.590.5_20_sum', 'matrix_123_type0.590.5_20_max',
        'matrix_123_type0.590.5_20_logspace_mean', 'matrix_123_type0.590.5_20_logspace_sum', 'matrix_123_type0.590.5_20_logspace_max',
        'matrix_123_type0.590.5_20_linspace_mean', 'matrix_123_type0.590.5_20_linspace_sum', 'matrix_123_type0.590.5_20_linspace_max',
        
        'clicks_popularity_w', 'carts_popularity_w', 'orders_popularity_w',
        'view_popularity_log_w', 'view_popularity_lin_w', 
    
        'clicks_popularity', 'carts_popularity', 'orders_popularity',
        'view_popularity_log', 'view_popularity_lin',
        
        'clicks_popularity_old', 'carts_popularity_old', 'orders_popularity_old',
        'view_popularity_log_old', 'view_popularity_lin_old',

        # NOTE(review): 'candidate_*_before' is used as a literal column name here,
        # not expanded as a wildcard - confirm the feature files contain it.
        'candidate_clicks_before', 'candidate_carts_before', 'candidate_orders_before', 'candidate_*_before',
        'n_views', 'n_clicks', 'n_carts', 'n_orders',
    ]

    # Not read by the code visible in this notebook.
    cat_features = []

    target = TARGET  # "gt_clicks", "gt_carts", "gt_orders"
    pos_ratio = POS_RATIO
    model = "xgb"  # key into PARAMS and TRAIN_FCTS

    params = PARAMS[model]
    # Forwarded to the training function; presumably the number of validation
    # candidates used for early stopping - TODO confirm in model_zoo.
    n_candidates_es = 10_000_000

    # Not read by the code visible in this notebook.
    use_es = True

### Main

In [None]:
# Run switches. Only DEBUG is read by the cells below: when True, no log folder is
# created and nothing is saved. OPTIMIZE and TRAIN are currently not consulted
# (the `if TRAIN:` guard in the training cell is commented out).
OPTIMIZE = False
TRAIN = True
DEBUG = True

#### Optimize

In [None]:
# Display the hyper-parameters selected for Config.model.
Config.params

#### Train

In [None]:
%%time

# The TRAIN flag is not checked here: training always runs.
# In DEBUG mode nothing is logged; otherwise a fresh log folder is prepared and the
# configuration and logger are set up before training.
if DEBUG:
    log_folder = None
else:
    log_folder = prepare_log_folder(LOG_PATH + f"lvl_{LEVEL}/")
    print(f'Logging results to {log_folder}')
    save_config(Config, log_folder + 'config')
    create_logger(directory=log_folder, name="logs.txt")

df_val, ft_imp, model = train(df_train, val_regex, Config, log_folder=log_folder)

In [None]:
# plot_importances(ft_imp)

### Eval

In [None]:
# Validation labels. The recall cell below filters this frame on its "type" column
# and merges its "ground_truth" column into the predictions.
gt = pd.read_parquet("../output/val_labels.parquet")

In [None]:
# preds = cudf.from_pandas(df_val)

# Rank candidates per session by decreasing score, collect them into one list per
# session, then move the result to pandas.
preds = (
    df_val[['session', 'candidates', 'pred']]
    .copy()
    .sort_values(['session', 'pred'], ascending=[True, False])
    .groupby('session')
    .agg(list)
    .reset_index()
    .to_pandas()
)

# Keep the 20 best-scored candidates of each session.
preds['candidates'] = preds['candidates'].apply(lambda ranked: ranked[:20])

In [None]:
dfs = load_sessions("../output/val_parquet/*")

# Event "type" code whose most frequent items are used as padding:
# 1 for the carts target, 2 for the orders target, 0 otherwise.
event_type = {"gt_carts": 1, "gt_orders": 2}.get(Config.target, 0)
top = dfs.loc[dfs["type"] == event_type, "aid"].value_counts().index.values[:20].tolist()

# Sessions with fewer than 20 candidates are completed with the most frequent items.
preds['candidates'] = preds['candidates'].apply(lambda x: list(x) + top[: 20 - len(x)])

# Release the sessions frame and the pending GPU deallocations.
del dfs
numba.cuda.current_context().deallocations.clear()
gc.collect()

In [None]:
# Recall of the top candidates, computed only for the class matching Config.target.
recalls = []
for col in CLASSES:
    gt_col = f"gt_{col}"
    if gt_col != Config.target:
        continue

    # Attach the ground truth of this class once (skipped when the cell is re-run).
    if gt_col not in preds.columns:
        labels = gt[gt["type"] == col].drop("type", axis=1)
        preds = preds.merge(labels, how="left").rename(columns={"ground_truth": gt_col})

    n_preds, n_gts, n_found = get_coverage(preds["candidates"].values, preds[gt_col].values)
    recall = n_found / n_gts

    print(f"- {col}\t-  Found {numerize(n_found)} GTs\t-  Recall : {recall :.4f}")
    recalls.append(recall)

- orders	-  Found 205.96K GTs	-  Recall : 0.6578
- carts	-  Found 238.4K GTs	-  Recall : 0.4203
- clicks	-  Found 888.07K GTs	-  Recall : 0.5059

In [None]:
# Weighted average of per-class recalls. The values are typed in by hand from separate
# runs (one run per target), they are not taken from `recalls`.
# NOTE(review): the order looks like [clicks, carts, orders] and is assumed to match
# WEIGHTS - confirm. The first value (0.5273) differs from the clicks recall shown in
# the output above (0.5059).
cv = np.average([0.5273, 0.4203, 0.6578], weights=WEIGHTS)
# cv = np.average([0.5059, 0.4139, 0.6540], weights=WEIGHTS)
print(f"-> CV : {cv:.4f}")

**To beat :** CV = 0.5643
- clicks recall = 0.5260
- carts recall = 0.4094
- orders recall = 0.6482

Done