**About:** Trains MLP models on the candidate features.

**TODO**:
- Tweak the positive proportion, learning rate, batch size and model size
- Change the architecture
- Retrain on the full data (fullfit) for test inference, or train on the other folds?

In [1]:
cd ../src

/workspace/kaggle_otto_rs/src


In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import os
# Expose only GPU 0 to CUDA. This runs in its own cell, ahead of the cell that imports
# the GPU libraries (cudf, numba).
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [4]:
import os
import gc
import cudf
import json
import glob
import numba
import xgboost
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from tqdm import tqdm
from pandarallel import pandarallel
from numerize.numerize import numerize

# Hide FutureWarning messages for the whole session.
warnings.simplefilter("ignore", FutureWarning)

# Let pandas display up to 500 columns / rows before truncating.
pd.set_option("display.max_columns", 500)
pd.set_option("display.max_rows", 500)

# Start pandarallel with 32 workers and no progress bar.
pandarallel.initialize(progress_bar=False, nb_workers=32)

INFO: Pandarallel will run on 32 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [5]:
from params import *

from data.dataset import FeaturesDataset
from model_zoo.mlp import define_model
from training.mlp import train

from utils.load import *
from utils.metrics import get_coverage
from utils.logger import save_config, prepare_log_folder, create_logger

### Load

In [6]:
# Version tag of the feature files: it is interpolated into the feature folder paths
# (fts_val_{VERSION}_split, fts_test_{VERSION}) and stored in Config.version.
VERSION = "c-orders-v4.7"

In [7]:
# Target to train on. Config expands "gt_*" into the three individual targets
# ("gt_clicks", "gt_carts", "gt_orders"); any other value is used as a single target.
TARGET = "gt_*"   # "gt_clicks", "gt_carts", "gt_orders", "gt_*"

In [8]:
# Stored in Config.pos_ratio. Presumably follows the `pos_ratio` convention of
# prepare_train_val_data (-1: keep only positive candidates) - TODO confirm how train() uses it.
POS_RATIO = -1

In [9]:
def _build_features():
    """
    Assembles the list of feature column names, one family of columns at a time.

    Returns:
        list of str: The 322 feature names, in the order they are listed for the model.
    """
    event_types = ["clicks", "carts", "orders"]
    aggregations = ["mean", "sum", "max"]

    # <event>_popularity_w_<weighting> columns.
    names = [
        f"{event}_popularity_w_{weighting}"
        for event in event_types
        for weighting in ["pos-log", "type-163", "lastday", "recsys"]
    ]

    # Same family with a "_w" suffix (no "lastday" variant here).
    names += [
        f"{event}_popularity_w_{weighting}_w"
        for event in event_types
        for weighting in ["pos-log", "type-163", "recsys"]
    ]

    # Standalone w_<weighting> columns.
    names += [f"w_{weighting}" for weighting in ["pos-log", "type-163", "lastday", "time", "recsys"]]

    # <matrix>[_<weighting>]_<aggregation> columns: 18 per matrix.
    matrices = [
        "matrix_123_temporal_20",
        "matrix_123_type136_20",
        "matrix_12__20",
        "matrix_123_type0.590.5_20",
        "matrix_cpu-90",
        "matrix_cpu-95",
        "matrix_cpu-99",
        "matrix_gpu-116",
        "matrix_gpu-115",
        "matrix_gpu-93",
        "matrix_gpu-217",
        "matrix_gpu-226",
        "matrix_gpu-232",
        "matrix_gpu-239",
        "matrix_gpu-700",
        "matrix_gpu-701",
    ]
    matrix_weightings = ["pos-log", "type-163", "lastday", "time", "recsys"]
    for matrix in matrices:
        # The unweighted columns come first, then one group per weighting.
        prefixes = [matrix] + [f"{matrix}_{weighting}" for weighting in matrix_weightings]
        for prefix in prefixes:
            names += [f"{prefix}_{aggregation}" for aggregation in aggregations]

    # candidate_<event>_before and n_<event> columns.
    names += [
        "candidate_clicks_before",
        "candidate_carts_before",
        "candidate_orders_before",
        "candidate_*_before",
        "n_views",
        "n_clicks",
        "n_carts",
        "n_orders",
    ]
    return names


FEATURES = _build_features()  # REMOVE CORRELATED


In [10]:
# Name-based exclusion rules, evaluated in this order over the whole feature list.
_EXCLUSION_RULES = [
    lambda name: "type-191" in name,
    lambda name: "matrix_gpu-220" in name,
    lambda name: "matrix_gpu-235" in name,
    lambda name: "lasthour" in name,
    lambda name: "lastmin" in name,
    lambda name: name.startswith("popularity"),
    lambda name: "_old" in name,
    lambda name: "popularity_w_time" in name,
    lambda name: "popularity_w_lastday_w" in name,
]

# Every feature matched by a rule (a feature matched by several rules appears several times).
TO_REMOVE = [name for rule in _EXCLUSION_RULES for name in FEATURES if rule(name)]

# Keep the remaining features, preserving their original order.
FEATURES = [name for name in FEATURES if name not in TO_REMOVE]

#### Data prep

In [11]:
def prepare_train_val_data(regex, folds_file, pos_ratio=0, target="", train_only=False, use_gt=False, columns=None, save_folder=""):
    """
    Splits feature parquet files into train / val parquet files according to a fold assignment.

    Each file matched by `regex` is merged with the fold assignment on `session`.
    Rows of the current fold go to `{save_folder}{fold}/val/`, the other rows go to
    `{save_folder}{fold}/train/`, optionally after subsampling. Output files keep the
    name of their input file.

    NOTE: the fold loop stops after its first iteration, so only fold 0 is written.

    Args:
        regex (str): Glob pattern of the input parquet files.
        folds_file (str): Csv file with (at least) the `session` and `fold` columns.
        pos_ratio (float, optional): Subsampling of the train rows, only used if `target` is set.
            > 0 keeps all positives and samples `n_positives / pos_ratio` negatives,
            -1 keeps only positives, any other value keeps everything. Defaults to 0.
        target (str, optional): Column used to tell positives from negatives.
            "gt_*" is the max of the three `gt_` columns. Empty disables subsampling. Defaults to "".
        train_only (bool, optional): Whether to skip writing the val files. Defaults to False.
        use_gt (bool, optional): Whether to only keep train sessions that have a label of type
            `target[3:]` in "../output/val_labels.parquet". Defaults to False.
        columns (list of str or None, optional): Columns to read from the input files. Defaults to None.
        save_folder (str, optional): Output folder prefix; it is concatenated as-is. Defaults to "".
    """
    files = sorted(glob.glob(regex))
    folds = cudf.read_csv(folds_file)
    n_folds = int(folds['fold'].max()) + 1

    # The labels do not depend on the file nor on the fold: load them once.
    kept_sessions = None
    if files and target and use_gt:
        gt = cudf.read_parquet("../output/val_labels.parquet")
        kept_sessions = gt[gt['type'] == target[3:]].drop('ground_truth', axis=1)
        del gt

    for file in tqdm(files):
        df = cudf.read_parquet(file, columns=columns)
        df = df.merge(folds, on="session", how="left")
        file_name = os.path.basename(file)

        for fold in range(n_folds):
            train_folder = save_folder + f"{fold}/train/"
            val_folder = save_folder + f"{fold}/val/"
            # makedirs also creates the parent fold folder.
            os.makedirs(train_folder, exist_ok=True)
            os.makedirs(val_folder, exist_ok=True)

            if not train_only:
                df_val = df[df['fold'] == fold].reset_index(drop=True)
                df_val.to_parquet(val_folder + file_name)
                del df_val

            df_train = df[df['fold'] != fold].reset_index(drop=True)

            if target:  # Subsample
                df_train['gt_*'] = df_train[['gt_carts', "gt_clicks", "gt_orders"]].max(axis=1)

                if kept_sessions is not None:
                    # NOTE(review): dropna looks at every column, so rows with a null feature
                    # are dropped as well, not only sessions without a label - confirm intent.
                    df_train = (
                        df_train.merge(kept_sessions, on="session", how="left")
                        .dropna(axis=0)
                        .drop('type', axis=1)
                        .reset_index(drop=True)
                    )

                pos = df_train.index[df_train[target] == 1]

                if pos_ratio > 0:
                    try:
                        n_neg = int(df_train[target].sum() / pos_ratio)
                        neg = df_train[[target]][df_train[target] == 0].sample(n_neg).index
                        df_train = df_train.iloc[cudf.concat([pos, neg])]
                    except Exception as exc:
                        # Best effort: keep every row when sampling is not possible
                        # (e.g. fewer negatives than requested), but do not hide it.
                        warnings.warn(
                            f"Negative subsampling failed for {file_name} (fold {fold}), "
                            f"keeping all rows: {exc!r}"
                        )
                elif pos_ratio == -1:  # only positives
                    df_train = df_train.iloc[pos]

                df_train = df_train.drop('gt_*', axis=1)

            df_train.to_parquet(train_folder + file_name)

            del df_train
            numba.cuda.current_context().deallocations.clear()
            gc.collect()

            # Only the first fold is processed.
            break

        del df
        numba.cuda.current_context().deallocations.clear()
        gc.collect()

In [12]:
# prepare_train_val_data(
#     f"../output/features/fts_val_{VERSION}/*",
#     "../input/folds_4.csv",
#     pos_ratio=-1,
#     target=TARGET,
#     use_gt=False,
#     train_only=True,
#     columns=['session','candidates','gt_clicks','gt_carts','gt_orders'] + FEATURES,
#     save_folder= f"../output/features/fts_val_{VERSION}_split/"
# )

In [13]:
FOLD = 0  # "*"

# glob.glob returns paths in arbitrary order: sort them so that the file order, and
# any slice of it such as val_files[:10], is the same from one run to the next.
val_regex = f"../output/features/fts_val_{VERSION}_split/{FOLD}/val/*"
val_files = sorted(glob.glob(val_regex))

train_regex = f"../output/features/fts_val_{VERSION}_split/{FOLD}/train/*"
train_files = sorted(glob.glob(train_regex))

In [14]:
# Sorted because glob.glob returns paths in arbitrary order; this keeps the order of
# the test files reproducible.
test_regex = f"../output/features/fts_test_{VERSION}/*"
test_files = sorted(glob.glob(test_regex))

In [15]:
def _count_rows(paths):
    """Returns the total number of rows of the given parquet files (reads only `gt_orders`)."""
    return sum(len(cudf.read_parquet(path, columns=['gt_orders'])) for path in paths)


n_val = _count_rows(val_files[:10])  # Only the first 10 validation files are counted
n_train = _count_rows(train_files)

print(n_train, n_val)

1062858 3981185


#### Val data

In [16]:
# from merlin.io import Dataset
# from merlin.loader.torch import Loader

In [17]:
# dataset = Dataset(train_files, engine="parquet")
# loader = Loader(
#     dataset,
#     batch_size=2**16,
#     shuffle=False,
# #     pin_memory=True,
# #     worker_init_fn=worker_init_fn,
# #     persistent_workers=True,
# #     num_workers=NUM_WORKERS,
# )

In [18]:
# %%time
# batch = next(iter(loader))[0]

In [19]:
# import torch

In [20]:
# y = cudf.read_parquet(path, columns=['session', "candidates"] + [TARGET])

In [21]:
# y

In [22]:
# %%time
# x = torch.cat([batch[k] for k in batch.keys()], 1)

In [23]:
# dataset = FeaturesDataset(df_train, ["gt_carts"], df_train.columns[5:])

In [24]:
# %%time
# df_val = pd.read_csv(f'../output/fts_train_{VERSION}.csv', nrows=10_000_000)

In [25]:
# model = define_model("res")
# x = torch.rand(5, 50)
# model(x)

### Params

In [26]:
# Experiment configuration: passed to train() and dumped to the log folder with save_config().
# Documented with comments rather than a docstring so that the class attributes are unchanged.
class Config:
    seed = 100
    version = VERSION
    pos_ratio = POS_RATIO

    features = FEATURES

    # "gt_*" means one output per event type; otherwise a single target is used.
    target = [TARGET] if TARGET != "gt_*" else ["gt_clicks", "gt_carts", "gt_orders"]

    # Model
    model = "mlp"
    nb_ft = len(features)  # One input per feature
    d = 768
    p = 0.1
    num_layers = 3
    num_classes = len(target)  # One output per target

    # Training
    loss_config = {
        "name": "bce",
        "smoothing": 0.,
        "activation": "sigmoid",
    }

    data_config = {
        "target": target,
        "features": features,
        "batch_size": 2 ** 17,
        "val_bs": 2 ** 16,
    }

    optimizer_config = {
        "name": "Adam",
        "lr": 1e-3,
        "warmup_prop": 0.,
        "betas": (0.9, 0.999),
    }

    epochs = 50  # 70

    use_fp16 = True

    verbose = 1
    verbose_eval = 200

### Main

In [27]:
# When True, the training cell sets up no log folder, config dump or logger,
# and train() receives log_folder=None.
DEBUG = False
# Forwarded to train() as its `debug` argument.
DEBUG_MORE = False
# Placeholder, overwritten with the value returned by train().
df_val = None

#### Train

In [28]:
# Set up logging (skipped in debug mode), then launch the training.
if DEBUG:
    log_folder = None
else:
    log_folder = prepare_log_folder(LOG_PATH)
    print(f'Logging results to {log_folder}')
    save_config(Config, log_folder + 'config')
    create_logger(directory=log_folder, name="logs.txt")

df_val = train(
    train_files,
    val_files,
    test_files,
    Config,
    log_folder=log_folder,
    debug=DEBUG_MORE,
)

Logging results to ../logs/2023-01-16/4/

-------------   Training MLP Model   -------------

    -> 1.06M training candidates
    -> 3.98M validation candidates subset
    -> 639.62K trainable parameters

Epoch 50/50 (step 0451)	lr=0.0e+00	 t=2166s 	 loss=0.351	 val_loss=0.351	 auc=0.8108  (0.599, 0.679, 0.912)

    -> Inferring 31.77M val candidates

100%|##########| 485/485 [04:13<00:00,  1.91it/s]

-> gt_clicks - AUC : 0.5975
-> gt_carts - AUC : 0.6821
-> gt_orders - AUC : 0.9128

- clicks 	-  Found 134.8K GTs	-  Recall : 0.3072
- carts 	-  Found 34.3K GTs	-  Recall : 0.2386
- orders 	-  Found 44.95K GTs	-  Recall : 0.5763

-> CV : 0.4481

    -> Inferring 119.78M test candidates per chunk

100%|##########| 193/193 [00:43<00:00,  4.40it/s]
100%|##########| 213/213 [00:50<00:00,  4.26it/s]
100%|##########| 228/228 [00:55<00:00,  4.12it/s]
100%|##########| 141/141 [00:31<00:00,  4.43it/s]
100%|##########| 219/219 [00:51<00:00,  4.24it/s]
100%|##########| 188/188 [00:45<00:00,  4.10it

- clicks 	-  Found 239.1K GTs	-  Recall : 0.5450
- carts 	-  Found 62.01K GTs	-  Recall : 0.4313
- orders 	-  Found 51.63K GTs	-  Recall : 0.6620


XGB : 0.554 / 0.439 / 0.666

### Process

In [29]:
# EXP_FOLDERS = [
#     "../logs/2023-01-08/13/",
#     "../logs/2023-01-08/16/",
#     "../logs/2023-01-08/17/",
# ]

# if df_val is None:
#     for exp_folder in tqdm(EXP_FOLDERS):
#         df_val_ = cudf.read_parquet(exp_folder + "results.parquet")
#         df_val_ = df_val_.sort_values(['session', 'candidates'])
#         if df_val is None:
#             df_val = df_val_.copy()
#         else:
#     #         assert df_val
#             for c in df_val.columns[2:]:
#                 df_val[c] += df_val_[c]

#     for c in df_val.columns[2:]:
#         df_val[c] /= len(EXP_FOLDERS)

#     df_val[['session', 'candidates']] = df_val[['session', 'candidates']].astype("int32")

Done