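**About:** Trains gradient-boosted tree models on candidate features (XGBoost by default; LightGBM and CatBoost parameters are also defined).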

**TODO**:
- Find a better negative sampling technique.

In [None]:
cd ../src

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import os
# Expose only GPU 0 to this process; set before the GPU libraries are imported in the next cell.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [None]:
import os
import gc
import cudf
import json
import glob
import numba
import xgboost
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from tqdm import tqdm
from pandarallel import pandarallel

# Silence FutureWarning messages for the whole notebook.
warnings.simplefilter(action="ignore", category=FutureWarning)
# Let pandas display up to 500 columns / rows when showing frames.
pd.options.display.max_columns = 500
pd.options.display.max_rows = 500
# Start pandarallel with 32 workers (enables `parallel_apply`, used when writing the submission).
pandarallel.initialize(nb_workers=32, progress_bar=False)

In [None]:
from params import *

from utils.metrics import get_coverage
from utils.plot import plot_importances
from utils.load import *
from utils.logger import save_config, prepare_log_folder, create_logger

### Load

In [None]:
# Feature-set version; interpolated into the feature folder globs and the secondary log folder name.
VERSION = "v3.5"
# VERSION = "v2.5"

#### Train data
- Negative sampling could use candidates from lower versions.

In [None]:
# Forwarded as `pos_ratio` to the chunked parquet loader.
# NOTE(review): presumably the positive/negative ratio targeted when down-sampling negatives - confirm in utils.load.
POS_RATIO = 0.1
# Label column the model is trained on; also selects which submission file is written.
TARGET = "gt_clicks"   # "gt_clicks", "gt_carts", "gt_orders"

In [None]:
# df_train[TARGET].mean(), df_val[TARGET].mean()

In [None]:
# def load_sessions(regex):
#     dfs = []
#     for idx, chunk_file in enumerate(glob.glob(regex)):
#         df = cudf.read_parquet(chunk_file, columns=["session"])
#         dfs.append(df.drop_duplicates(keep="first"))

#     return cudf.concat(dfs).reset_index(drop=True)

# sessions = load_sessions( f"../output/features/fts_val_{VERSION}/*")

# from sklearn.model_selection import KFold
# K = 4

# kf = KFold(n_splits=K, shuffle=True, random_state=42)
# splits = kf.split(sessions)

# sessions['fold'] = -1
# for i, (_, val_idx) in enumerate(splits):
#     sessions.loc[val_idx, "fold"] = i

# sessions.to_csv(f"../input/folds_{K}.csv", index=False)

In [None]:
# if MODE == "val":

#     val_regex = f"../output/features/fts_val_{VERSION}/*"
# else:  # Test
#     df_train = load_parquets_cudf_chunks(
#         f"../output/features/fts_val_{VERSION}/*",
#         pos_ratio=POS_RATIO,
#         target=TARGET,
#         n_chunks=5,
#     )
#     val_regex = f"../output/features/fts_test_{VERSION}/*"

In [None]:
# df_train, df_val = load_parquets_cudf_chunks_folds(
#     f"../output/features/fts_val_{VERSION}/*",
#     "../input/folds_4.csv",
#     fold=0,
#     pos_ratio=POS_RATIO,
#     target=TARGET,
#     n_chunks=5,
# )[0]

In [None]:
# Glob over the validation feature files; used by kfold() for both training folds and validation.
REGEX = f"../output/features/fts_val_{VERSION}/*"

In [None]:
# Glob over the test feature files; each fold's model predicts on these inside kfold().
TEST_REGEX = f"../output/features/fts_test_{VERSION}/*"

In [None]:
# df_train = cudf.concat([  # not working ??
#     load_parquets_cudf_chunks(
#         f"../output/features/fts_train_v3.5/*",
#         pos_ratio=0.1,
#         target=TARGET,
#         n_chunks=5,
#     ),
#     load_parquets_cudf_chunks(
#         f"../output/features/fts_train_v4.5/*",
#         pos_ratio=0.,
#         target=TARGET,
#         n_chunks=5,
#     ),
# ], ignore_index=True)

In [None]:
# df_val_c = load_parquets_cudf(f"../output/features/fts_val_c_{VERSION}/*")

# if POS_RATIO:
#     n_neg = int(df_val_c[TARGET].sum() / POS_RATIO)
#     pos = df_val_c.index[df_val_c[TARGET] == 1]
# #     neg = df_val_c[[TARGET]][df_val_c[TARGET] == 0].sample(n_neg).index
# #     df_val_c = df_val_c.iloc[cudf.concat([pos, neg])]
#     df_val_c = df_val_c.iloc[pos]

In [None]:
# df_train = cudf.concat([df_train, df_val_c], ignore_index=True)

# del df_val_c
# numba.cuda.current_context().deallocations.clear()
# gc.collect()

In [None]:
# df_train = df_train.to_pandas()

### Train

In [None]:
# import optuna
import cuml
from sklearn.metrics import roc_auc_score
from numerize.numerize import numerize
from utils.torch import seed_everything

from model_zoo import TRAIN_FCTS, PREDICT_FCTS

def train(df_train, val_regex, config, log_folder=None, optimize=False, fold=0):
    """
    Trains (or tunes) one gradient-boosting model for a single fold.

    Args:
        df_train: Training candidates, forwarded as-is to the selected train function.
        val_regex (str): Glob pattern of the validation feature parquet files.
        config: Configuration object. Reads `seed`, `model`, `features`, `target`,
            `params`, `use_es`, `num_boost_round`, `folds_file` and `mode`.
        log_folder (str or None, optional): Folder prefix where the model is saved.
            Nothing is saved when None. Defaults to None.
        optimize (bool, optional): Run an Optuna study instead of training. Defaults to False.
        fold (int, optional): Fold index, forwarded to the train function and used
            in the saved model file name. Defaults to 0.

    Returns:
        tuple: `(df_val, ft_imp, model)` where `df_val` is the frame returned by the
        train function, `ft_imp` is a one-column "importance" DataFrame indexed by
        feature name (or None if it could not be built) and `model` is the fitted model.
        When `optimize` is True, the study's best parameters (dict) are returned instead.
    """
    seed_everything(config.seed)

    txt = f"{'Optimizing' if optimize else 'Training'} {config.model.upper()} Model"
    print(f"\n-------------   {txt}   -------------\n")

    if optimize:  # TODO
        # Imported lazily: the notebook-level `import optuna` is commented out.
        import optuna

        # NOTE(review): `objective_xgb` is not defined in this notebook; it has to be
        # provided by one of the star imports for this branch to run - confirm.
        study = optuna.create_study(direction="minimize")
        study.optimize(
            lambda trial: objective_xgb(
                trial, df_train, val_regex, config.features, config.target
            ),
            n_trials=50,
        )
        print(study.best_params)
        return study.best_params

    # Only one column is read per file: the goal is just to count validation rows.
    val_candids = sum(
        len(cudf.read_parquet(f, columns=['gt_orders'])) for f in glob.glob(val_regex)
    )
    print(f"    -> {numerize(len(df_train))} training candidates")
    print(f"    -> {numerize(val_candids)} validation candidates\n")

    train_fct = TRAIN_FCTS[config.model]
    df_val, model = train_fct(
        df_train,
        val_regex,
        features=config.features,
        target=config.target,
        params=config.params,
        use_es=config.use_es,
        num_boost_round=config.num_boost_round,
        folds_file=config.folds_file,
        fold=fold,
    )

    # Feature importance
    if config.model == "xgb":
        ft_imp = model.get_score()
    else:
        ft_imp = model.feature_importances_  # TODO
    try:
        ft_imp = pd.Series(ft_imp, index=config.features, name="importance").to_frame()
    except Exception as err:
        # Importances are best-effort: report the problem instead of hiding it.
        print(f"    -> Could not build feature importances: {err}")
        ft_imp = None

    if config.mode == "test":
        return df_val, ft_imp, model

    # Score
    try:
        auc = roc_auc_score(df_val[config.target], df_val["pred"])
    except Exception:
        # scikit-learn could not score this frame: fall back to the cuML implementation.
        auc = cuml.metrics.roc_auc_score(
            df_val[config.target].astype('int32'), df_val["pred"].values
        )

    print(f'\n -> AUC : {auc:.4f}\n')

    if log_folder is None:
        return df_val, ft_imp, model

    # Save model
    if config.model == "xgb":
        model.save_model(log_folder + f"{config.model}_{fold}.json")
    elif config.model == "lgbm":
        try:
            model.booster_.save_model(log_folder + f"{config.model}_{fold}.txt")
        except Exception:
            # The model does not expose a usable `booster_`: save the object itself.
            model.save_model(log_folder + f"{config.model}_{fold}.txt")
    else:   # catboost, verif
        model.save_model(log_folder + f"{config.model}_{fold}.txt")

    return df_val, ft_imp, model

In [None]:
def kfold(regex, test_regex, config, log_folder):
    """
    Runs the k-fold loop: trains one model per fold, predicts on the test files
    with each of them, then aggregates the results.

    Args:
        regex (str): Glob pattern of the feature files used for training and validation.
        test_regex (str): Glob pattern of the test feature files.
        config: Configuration object. Reads `k`, `folds_file`, `pos_ratio`, `target`,
            `model` and `features` here, plus whatever `train` reads.
        log_folder (str or None): Folder prefix for saved outputs. Nothing is saved when None.

    Returns:
        tuple: `(dfs_val, ft_imps, dfs_test)` - the concatenated validation frames
        sorted by session and candidate, the feature importances averaged over folds
        (None if no fold produced any), and the test predictions averaged per
        (session, candidate) pair over the folds.
    """
    # Loop-invariant lookup; also fails fast on an unknown model name.
    predict_fct = PREDICT_FCTS[config.model]

    dfs_val, ft_imps, dfs_test = [], [], []
    for fold in range(config.k):
        print(f"\n-------------   Fold {fold + 1} / {config.k}  -------------\n")

        df_train = load_parquets_cudf_chunks_folds(
            regex,
            config.folds_file,
            fold=fold,
            pos_ratio=config.pos_ratio,
            target=config.target,
            n_chunks=5,
            train_only=True,
        )

        df_val, ft_imp, model = train(df_train, regex, config, log_folder=log_folder, fold=fold)
        dfs_val.append(df_val)
        if ft_imp is not None:  # train() returns None when importances could not be built
            ft_imps.append(ft_imp)

        dfs_test.append(predict_fct(model, test_regex, config.features))

        if log_folder is not None:
            df_val[['session', 'candidates', 'pred']].to_parquet(log_folder + f"df_val_{fold}.parquet")

    dfs_test = cudf.concat(dfs_test).groupby(['session', 'candidates']).mean().reset_index()
    dfs_val = cudf.concat(dfs_val).sort_values(['session', 'candidates'], ignore_index=True)
    # Average the importance of each feature over the folds that produced one.
    ft_imps = pd.concat(ft_imps).reset_index().groupby('index').mean() if ft_imps else None

    if log_folder is not None:
        if ft_imps is not None:
            # Save the fold-averaged importances, not the last fold's only.
            ft_imps.to_csv(log_folder + "ft_imp.csv")
        dfs_test[['session', 'candidates', 'pred']].to_parquet(log_folder + "df_test.parquet")

    return dfs_val, ft_imps, dfs_test

### Params

In [None]:
# Hyper-parameters per model type. `Config.params` picks the entry matching
# `Config.model`, and train() forwards it as `params` to the train function.
PARAMS = {
    # XGBoost: binary classification scored with AUC, trained on GPU.
    # NOTE(review): 'gpu_hist' / 'gpu_predictor' are deprecated in recent XGBoost
    # releases - confirm against the installed version.
    "xgb":
    {
        "learning_rate": 0.01,
        'max_depth': 5,
        "subsample": 0.25,
        'colsample_bytree': 0.9,
        'reg_alpha': 0.01,
        'reg_lambda': 0.1,
#         "min_child_weight": 0.01,
#         "gamma": 0.01,
        'eval_metric':'auc',  # map
        'objective':'binary:logistic',  # 'rank:pairwise',
        'tree_method':'gpu_hist',
        'predictor':'gpu_predictor',
    },
    # CatBoost.
    # NOTE(review): CatBoost documents `reg_lambda` as an alias of `l2_leaf_reg`;
    # passing both may be rejected - confirm and keep only one.
    "catboost":
        {
        'depth': 12,
        "l2_leaf_reg": 0.1,
        "min_data_in_leaf": 2000,
        'reg_lambda': 0.1,
        "model_size_reg": 0.5,
        "border_count": 256,
        },
    # LightGBM.
    "lgbm": {
        "learning_rate": 0.05,
        "num_leaves": 511,
        "colsample_bytree": 0.5,
        "reg_alpha": 1,
        "reg_lambda": 70,
        "min_child_samples": 2000,  # MODIF  # 2000
        "min_split_gain": 0.02,
        "min_child_weight": 0.03,
        "path_smooth": 0.2,
#             "min_data_in_bin": 32,
    }
}

In [None]:
# Experiment configuration, used as a plain namespace of class attributes:
# passed uninstantiated to save_config() and kfold().
class Config:
    seed = 100  # given to seed_everything() at the start of each train() call
    version = VERSION
    
    folds_file = "../input/folds_4.csv"  # forwarded to the fold-aware loader and the train function
    k = 4  # number of folds iterated by kfold()
    mode = ""  # train() returns before scoring / saving when this equals "test"

    # Feature columns given to the train and predict functions.
    features = [
        'logspace_w', 'linspace_w', 'linspace_w_t163', 'logspace_w_t163', 'linspace_w_t191', 'logspace_w_t191',

        'matrix_123_temporal_20_mean', 'matrix_123_temporal_20_sum', 'matrix_123_temporal_20_max',
        'matrix_123_temporal_20_logspace_mean', 'matrix_123_temporal_20_logspace_sum', 'matrix_123_temporal_20_logspace_max',
        'matrix_123_temporal_20_linspace_mean', 'matrix_123_temporal_20_linspace_sum', 'matrix_123_temporal_20_linspace_max',
        'matrix_123_type136_20_mean', 'matrix_123_type136_20_sum', 'matrix_123_type136_20_max',
        'matrix_123_type136_20_logspace_mean', 'matrix_123_type136_20_logspace_sum', 'matrix_123_type136_20_logspace_max',
        'matrix_123_type136_20_linspace_mean', 'matrix_123_type136_20_linspace_sum', 'matrix_123_type136_20_linspace_max',
        'matrix_12__20_mean', 'matrix_12__20_sum', 'matrix_12__20_max',
        'matrix_12__20_logspace_mean', 'matrix_12__20_logspace_sum', 'matrix_12__20_logspace_max',
        'matrix_12__20_linspace_mean', 'matrix_12__20_linspace_sum', 'matrix_12__20_linspace_max',
        'matrix_123_type0.590.5_20_mean', 'matrix_123_type0.590.5_20_sum', 'matrix_123_type0.590.5_20_max',
        'matrix_123_type0.590.5_20_logspace_mean', 'matrix_123_type0.590.5_20_logspace_sum', 'matrix_123_type0.590.5_20_logspace_max',
        'matrix_123_type0.590.5_20_linspace_mean', 'matrix_123_type0.590.5_20_linspace_sum', 'matrix_123_type0.590.5_20_linspace_max',
        
        'clicks_popularity_w', 'carts_popularity_w', 'orders_popularity_w',
        'view_popularity_log_w', 'view_popularity_lin_w', 
    
        'clicks_popularity', 'carts_popularity', 'orders_popularity',
        'view_popularity_log', 'view_popularity_lin',
        
        'clicks_popularity_old', 'carts_popularity_old', 'orders_popularity_old',
        'view_popularity_log_old', 'view_popularity_lin_old',

        # NOTE(review): 'candidate_*_before' is a literal column name containing '*',
        # not a glob - confirm it exists in the feature files.
        'candidate_clicks_before', 'candidate_carts_before', 'candidate_orders_before', 'candidate_*_before',
        'n_views', 'n_clicks', 'n_carts', 'n_orders',
    ]

    cat_features = []  # not read anywhere in this notebook

    target = TARGET
    pos_ratio = POS_RATIO
    model = "xgb"  # key into PARAMS, TRAIN_FCTS and PREDICT_FCTS

    # Resolved once, when the class body runs: changing `model` afterwards does not update it.
    params = PARAMS[model]

    use_es = True  # forwarded to the train function as `use_es`
    num_boost_round = 10000  # forwarded to the train function as `num_boost_round`

### Main

#### Optimize

In [None]:
# Display the hyper-parameters selected for the configured model.
Config.params

#### Train

In [None]:
# When True: no log folder is created, the test glob is narrowed, and no submission is written.
DEBUG = False

In [None]:
%%time

# Real runs get a fresh log folder with the saved config and a logger.
# Debug runs write nothing and only predict on a small subset of the test files.
log_folder = None
if DEBUG:
    # Follow VERSION (like REGEX / TEST_REGEX) instead of hard-coding "v3.5", so the
    # debug subset always comes from the feature set being trained on.
    TEST_REGEX = f'../output/features/fts_test_{VERSION}/1_005*'
else:
    log_folder = prepare_log_folder(LOG_PATH)
    print(f'Logging results to {log_folder}')
    save_config(Config, log_folder + 'config')
    create_logger(directory=log_folder, name="logs.txt")

df_val, ft_imp, df_test = kfold(REGEX, TEST_REGEX, Config, log_folder=log_folder)

In [None]:
# plot_importances(ft_imp)

### Process
- Note: some sessions are missing!

In [None]:
# Order every validation session's candidates by decreasing score, gather them
# into one list per session, then move the result to pandas.
preds = (
    df_val[['session', 'candidates', 'pred']]
    .copy()
    .sort_values(by=['session', 'pred'], ascending=[True, False])
    .groupby('session')
    .agg(list)
    .reset_index()
    .to_pandas()
)

# Only the 20 best-scored candidates of each session are kept.
preds['candidates'] = preds['candidates'].map(lambda ranked: ranked[:20])

In [None]:
# Pad sessions that have fewer than 20 candidates with the most frequent aids of
# the event type matching the target. This should become unnecessary in the future.

dfs = load_sessions("../output/val_parquet/*")

# Event type code per target: carts -> 1, orders -> 2, anything else (clicks) -> 0.
target_type = {"gt_carts": 1, "gt_orders": 2}.get(Config.target, 0)
top = dfs.loc[dfs["type"] == target_type, "aid"].value_counts().index.values[:20].tolist()


def _pad_with_top(cands):
    """Return `cands` completed with the most frequent aids it does not already contain."""
    cands = list(cands)
    seen = set(cands)
    # Skip aids already predicted: a duplicated aid would waste one of the 20 slots.
    fillers = [aid for aid in top if aid not in seen]
    return cands + fillers[: max(0, 20 - len(cands))]


preds['candidates'] = preds['candidates'].apply(_pad_with_top)

# Release the GPU memory held by the sessions frame.
del dfs
numba.cuda.current_context().deallocations.clear()
gc.collect()

In [None]:
# Recall of the top-20 predictions against the validation labels, computed only
# for the class matching the configured target.
gt = pd.read_parquet("../output/val_labels.parquet")

recalls = []
print()
for col in CLASSES:
    gt_col = f"gt_{col}"
    if gt_col != Config.target:
        continue

    # Attach the ground truth of this class once, under the name `gt_<class>`.
    if gt_col not in preds.columns:
        labels = gt.loc[gt["type"] == col].drop(columns="type")
        preds = preds.merge(labels, how="left").rename(columns={"ground_truth": gt_col})

    n_preds, n_gts, n_found = get_coverage(
        preds["candidates"].values, preds[gt_col].values
    )
    recall = n_found / n_gts

    print(f"- {col}\t-  Found {numerize(n_found)} GTs\t-  Recall : {recall:.4f}")
    recalls.append(recall)

- orders	-  Found 206.1K GTs	-  Recall : 0.6580
- carts	-  Found 242.41K GTs	-  Recall : 0.4208
- clicks	-  Found 927.04K GTs	-  Recall : 0.5281

In [None]:
# cv = np.average([0.5270, 0.4203, 0.6577], weights=WEIGHTS)
# # cv = np.average([0.5059, 0.4139, 0.6540], weights=WEIGHTS)
# print(f"-> CV : {cv:.4f}")

### Test

In [None]:
# Order every test session's candidates by decreasing averaged score, gather them
# into one list per session, then move the result to pandas.
preds = (
    df_test[['session', 'candidates', 'pred']]
    .copy()
    .sort_values(by=['session', 'pred'], ascending=[True, False])
    .groupby('session')
    .agg(list)
    .reset_index()
    .to_pandas()
)

# Only the 20 best-scored candidates of each session are kept.
preds['candidates'] = preds['candidates'].map(lambda ranked: ranked[:20])

In [None]:
# Pad sessions that have fewer than 20 candidates with the most frequent aids of
# the event type matching the target. This should become unnecessary in the future.

dfs = load_sessions("../output/test_parquet/*")

# Event type code per target: carts -> 1, orders -> 2, anything else (clicks) -> 0.
target_type = {"gt_carts": 1, "gt_orders": 2}.get(Config.target, 0)
top = dfs.loc[dfs["type"] == target_type, "aid"].value_counts().index.values[:20].tolist()


def _pad_with_top(cands):
    """Return `cands` completed with the most frequent aids it does not already contain."""
    cands = list(cands)
    seen = set(cands)
    # Skip aids already predicted: a duplicated aid would waste one of the 20 slots.
    fillers = [aid for aid in top if aid not in seen]
    return cands + fillers[: max(0, 20 - len(cands))]


preds['candidates'] = preds['candidates'].apply(_pad_with_top)

# Release the GPU memory held by the sessions frame.
del dfs
numba.cuda.current_context().deallocations.clear()
gc.collect()

In [None]:
# Secondary, version-named folder that receives a copy of the config and of each
# per-target submission file.
log_folder_2 = f"{LOG_PATH}{VERSION}.1/"
os.makedirs(log_folder_2, exist_ok=True)
save_config(Config, f"{log_folder_2}config")

In [None]:
# Write the per-target submission file into both log folders (skipped in debug runs).
if not DEBUG:
    sub = preds[['session', 'candidates']].copy()
    # Sanity check: one row per expected test session.
    assert len(sub) == 1671803, f"Expected 1671803 sessions, got {len(sub)}"

    # Submission format: "<session>_<type>" and a space-separated list of aids.
    sub['candidates'] = sub['candidates'].parallel_apply(lambda x: " ".join(map(str, x)))
    sub['session'] = sub['session'].astype(str) + "_" + TARGET[3:]  # "gt_clicks" -> "clicks"
    sub.columns = ["session_type", "labels"]

    sub_path = log_folder + f'sub_{TARGET}.csv'
    sub.to_csv(sub_path, index=False)
    print(f"\n-> Saved sub to {sub_path}")

    # Report the path actually written: the copy goes to log_folder_2.
    sub_path_2 = log_folder_2 + f'sub_{TARGET}.csv'
    sub.to_csv(sub_path_2, index=False)
    print(f"-> Saved sub to {sub_path_2}\n")

    display(sub.head())

In [None]:
# if all([os.path.exists(log_folder_2 + f'sub_gt_{c}.csv') for c in CLASSES]):
#     sub_final = cudf.concat([
#         cudf.read_csv(log_folder_2 + f'sub_gt_{c}.csv') for c in CLASSES
#     ], ignore_index=True)

#     assert len(sub_final) == 5015409
#     sub_final.to_csv(log_folder_2 + f"submission.csv", index=False)

#     print(f"\n-> Saved final sub to {log_folder_2 + f'submission.csv'}\n")

#     display(sub_final.sample(5))

Done