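**About :** Trains MLP models on precomputed candidate features, blends the predictions saved by several experiments and evaluates recall. XGBoost scores are only quoted as a baseline.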

**TODO**:
- Merlin loader

In [None]:
# IPython `cd` magic: switch the working directory to ../src.
# presumably needed so the project imports further down (params, data, utils, ...) resolve — TODO confirm
cd ../src

In [None]:
# Enable IPython's autoreload extension; mode 2 reloads all modules before each cell is executed.
%load_ext autoreload
%autoreload 2

In [None]:
import os
# Expose only GPU "2" to this process. Set here, ahead of the cudf import in the next cell.
os.environ["CUDA_VISIBLE_DEVICES"] = "2"

In [None]:
import os
import gc
import cudf
import json
import glob
import numba
import xgboost
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from tqdm import tqdm
from pandarallel import pandarallel
from numerize.numerize import numerize

# Hide FutureWarning messages in cell outputs.
warnings.simplefilter("ignore", FutureWarning)

# Let displayed frames show up to 500 columns / rows before being truncated.
pd.set_option("display.max_columns", 500)
pd.set_option("display.max_rows", 500)

# Set up pandarallel (provides `parallel_apply`) with 32 workers and no progress bar.
pandarallel.initialize(progress_bar=False, nb_workers=32)

In [None]:
from params import *

from data.dataset import FeaturesDataset
from model_zoo.mlp import define_model
from training.mlp import train

from utils.load import *
from utils.metrics import get_coverage
from utils.logger import save_config, prepare_log_folder, create_logger

### Load

In [None]:
# Version tag of the feature folders read below (fts_train_{VERSION}, fts_val_{VERSION}, fts_test_{VERSION}).
VERSION = "v3.5"
# VERSION = "v2.5"

In [None]:
# "val": train on the fts_train features and use fts_val as the second split.
# Any other value: train on the fts_val features and use fts_test as the second split.
MODE = "val"

#### Train data
- neg sampling could use candidates from lower versions

In [None]:
# Forwarded as `pos_ratio` to load_parquets_cudf_chunks.
# presumably the share of positives kept when sampling negatives — confirm in utils.load
POS_RATIO = 0.1
# Target column(s); "gt_*" expands to all three gt_ columns in Config.target.
TARGET = "gt_*"   # "gt_clicks", "gt_carts", "gt_orders", "gt_*"

In [None]:
# Cached, down-sampled training frame. It is only rebuilt when the parquet file is missing.
# NOTE(review): with TARGET = "gt_*" the file name contains a literal `*`;
# confirm that downstream readers do not glob-expand it.
path = f'../output/tmp/df_{MODE}_{POS_RATIO}_{TARGET}.parquet'

if not os.path.exists(path):
    # "val" mode trains on the train features; any other mode (test) trains on the val features.
    train_split = "train" if MODE == "val" else "val"

    df_train = load_parquets_cudf_chunks(
        f"../output/features/fts_{train_split}_{VERSION}/*",
        pos_ratio=POS_RATIO,
        target=TARGET,
        n_chunks=5,
    )

    # Make sure the cache folder exists, otherwise to_parquet fails on a fresh checkout.
    os.makedirs(os.path.dirname(path), exist_ok=True)

    print(f'-> Saving to {path}')
    df_train.to_pandas().to_parquet(path)

In [None]:
# Glob pattern of the second split: the val features in "val" mode, the test features otherwise.
val_split = "val" if MODE == "val" else "test"
val_regex = f"../output/features/fts_{val_split}_{VERSION}/*"

In [None]:
# Every file matching the val/test glob pattern (order is whatever glob returns).
val_files = glob.glob(val_regex)
# Training reads the single cached parquet file.
train_files = [path]

#### Val data

In [None]:
from merlin.io import Dataset
from merlin.loader.torch import Loader

In [None]:
# Merlin dataset over the cached training parquet, read in fixed order in batches of 2**16 rows.
# The torch-DataLoader style options (pin_memory, worker_init_fn, persistent_workers,
# num_workers) are left disabled.
dataset = Dataset(train_files, engine="parquet")
loader = Loader(dataset, batch_size=2 ** 16, shuffle=False)

In [None]:
# %%time
# batch = next(iter(loader))[0]

In [None]:
# import torch

In [None]:
# y = cudf.read_parquet(path, columns=['session', "candidates"] + [TARGET])

In [None]:
# y

In [None]:
# %%time
# x = torch.cat([batch[k] for k in batch.keys()], 1)

In [None]:
# dataset = FeaturesDataset(df_train, ["gt_carts"], df_train.columns[5:])

In [None]:
# %%time
# df_val = pd.read_csv(f'../output/fts_train_{VERSION}.csv', nrows=10_000_000)

### Features

In [None]:
# model = define_model("res")
# x = torch.rand(5, 50)
# model(x)

### Params

In [None]:
class Config:
    """
    Parameters of the MLP experiment.

    The class is used as a plain namespace: it is passed as-is to `train` and
    `save_config` (see the training cell), and `Config.target` is read by the
    processing and evaluation cells. `version`, `mode` and `pos_ratio` mirror
    the notebook-level VERSION, MODE and POS_RATIO values.
    """
    seed = 100
    version = VERSION
    mode = MODE
    pos_ratio = POS_RATIO

    # Input columns fed to the model; `nb_ft` below is derived from this list.
    features = [
        'logspace_w', 'linspace_w', 'linspace_w_t163', 'logspace_w_t163', 'linspace_w_t191', 'logspace_w_t191',

        'matrix_123_temporal_20_mean', 'matrix_123_temporal_20_sum', 'matrix_123_temporal_20_max',
        'matrix_123_temporal_20_logspace_mean', 'matrix_123_temporal_20_logspace_sum', 'matrix_123_temporal_20_logspace_max',
        'matrix_123_temporal_20_linspace_mean', 'matrix_123_temporal_20_linspace_sum', 'matrix_123_temporal_20_linspace_max',
        'matrix_123_type136_20_mean', 'matrix_123_type136_20_sum', 'matrix_123_type136_20_max',
        'matrix_123_type136_20_logspace_mean', 'matrix_123_type136_20_logspace_sum', 'matrix_123_type136_20_logspace_max',
        'matrix_123_type136_20_linspace_mean', 'matrix_123_type136_20_linspace_sum', 'matrix_123_type136_20_linspace_max',
        'matrix_12__20_mean', 'matrix_12__20_sum', 'matrix_12__20_max',
        'matrix_12__20_logspace_mean', 'matrix_12__20_logspace_sum', 'matrix_12__20_logspace_max',
        'matrix_12__20_linspace_mean', 'matrix_12__20_linspace_sum', 'matrix_12__20_linspace_max',
        'matrix_123_type0.590.5_20_mean', 'matrix_123_type0.590.5_20_sum', 'matrix_123_type0.590.5_20_max',
        'matrix_123_type0.590.5_20_logspace_mean', 'matrix_123_type0.590.5_20_logspace_sum', 'matrix_123_type0.590.5_20_logspace_max',
        'matrix_123_type0.590.5_20_linspace_mean', 'matrix_123_type0.590.5_20_linspace_sum', 'matrix_123_type0.590.5_20_linspace_max',

        'clicks_popularity_w', 'carts_popularity_w', 'orders_popularity_w',
        'view_popularity_log_w', 'view_popularity_lin_w',

        'clicks_popularity', 'carts_popularity', 'orders_popularity',
        'view_popularity_log', 'view_popularity_lin',

        'clicks_popularity_old', 'carts_popularity_old', 'orders_popularity_old',
        'view_popularity_log_old', 'view_popularity_lin_old',

        'candidate_clicks_before', 'candidate_carts_before', 'candidate_orders_before', 'candidate_*_before',
        'n_views', 'n_clicks', 'n_carts', 'n_orders',
    ]

    # "gt_*" expands to the three targets, any other TARGET is used alone.
    target = [TARGET] if TARGET != "gt_*" else ["gt_clicks", "gt_carts", "gt_orders"]

    # Model
    model = "mlp"
    nb_ft = len(features)  # One input per feature column
    d = 768
    p = 0.1
    num_layers = 3
    num_classes = len(target)  # One output per target column

    # Training
    loss_config = {
        "name": "bce",
        "smoothing": 0.,
        "activation": "sigmoid",
    }

    data_config = {
        "target": target,
        "features": features,
        "batch_size": 2 ** 17,
        "val_bs": 2 ** 17,
    }

    optimizer_config = {
        "name": "Adam",
        "lr": 1e-3,
        "warmup_prop": 0.,
        "betas": (0.9, 0.999),
    }

    epochs = 50  # 70

    use_fp16 = True

    verbose = 1
    verbose_eval = 1000

### Main

In [None]:
# When False, the (currently commented-out) training cell prepares a log folder and saves the config.
DEBUG = False
# Placeholder for the predictions frame; it is rebuilt from saved experiments in the Process section.
df_val = None

#### Train

In [None]:
# log_folder = None
# if not DEBUG:
#     log_folder = prepare_log_folder(LOG_PATH)
#     print(f'Logging results to {log_folder}')
#     save_config(Config, log_folder + 'config')
#     create_logger(directory=log_folder, name="logs.txt")

# df_val = train(train_files, val_files, Config, log_folder=log_folder)

- XGB ES AUC carts : 0.96893
- Best CV :  0.5696
- CV to beat : 0.5735

### Process

In [None]:
# Experiment folders whose `results.parquet` predictions are averaged in the next cell.
EXP_FOLDERS = [
    "../logs/2023-01-08/13/",
    "../logs/2023-01-08/16/",
    "../logs/2023-01-08/17/",
]

In [None]:
# Average the predictions saved by every experiment in EXP_FOLDERS.
# Each results file is sorted by (session, candidates) so that rows line up across experiments.
df_val = None
key_cols = ['session', 'candidates']

for exp_folder in tqdm(EXP_FOLDERS):
    df_val_ = cudf.read_parquet(exp_folder + "results.parquet")
    # ignore_index=True is required: column arithmetic aligns rows on the index, so
    # keeping each file's original index would silently undo the sort.
    df_val_ = df_val_.sort_values(key_cols, ignore_index=True)

    if df_val is None:
        df_val = df_val_.copy()
        continue

    # All experiments must score exactly the same (session, candidate) pairs.
    assert len(df_val) == len(df_val_), f"Row count mismatch in {exp_folder}"
    for k in key_cols:
        assert bool((df_val[k] == df_val_[k]).all()), f"Column {k} mismatch in {exp_folder}"

    for c in df_val.columns:
        if c not in key_cols:
            df_val[c] += df_val_[c]

if df_val is None:
    raise ValueError("EXP_FOLDERS is empty, there is nothing to blend.")

# Turn the sums into means. The key columns are left untouched.
for c in df_val.columns:
    if c not in key_cols:
        df_val[c] /= len(EXP_FOLDERS)

df_val[key_cols] = df_val[key_cols].astype("int64")

In [None]:
dfs = load_sessions(f"../output/{MODE}_parquet/*")

# One row per session, ordered by session id. Per-class columns are attached in the loop below.
preds = df_val[['session']].drop_duplicates(keep="first")
preds = preds.sort_values('session', ignore_index=True).to_pandas()

for type_id, name in enumerate(CLASSES):
    if f"gt_{name}" not in Config.target:
        continue  # Class is not among the configured targets.

    score_col = f'pred_{name}'

    # Order candidates by decreasing score inside each session, then gather them as lists.
    preds_c = df_val.sort_values(['session', score_col], ascending=[True, False])
    preds_c = preds_c[['session', 'candidates', score_col]]
    preds_c = preds_c.groupby('session').agg(list).reset_index().to_pandas()

    # 20 most frequent aids among the session events whose type equals this class index.
    popular = dfs.loc[dfs["type"] == type_id, "aid"].value_counts().index.values[:20].tolist()

    # Keep the 20 best candidates and pad shorter lists with the popular aids.
    # Fill less than 20 candidates. This should be useless in the future
    preds_c['candidates'] = preds_c['candidates'].apply(
        lambda cands: list(cands[:20]) + popular[:20 - len(cands[:20])]
    )

    # Both frames are ordered by session, so the columns are copied positionally.
    preds_c = preds_c.sort_values('session')
    preds[f"candidates_{name}"] = preds_c["candidates"].values
    preds[score_col] = preds_c[score_col].values

In [None]:
# Drop the session events and the last per-class frame, which are not used by the cells below.
del dfs, preds_c
# presumably flushes numba's queue of pending GPU deallocations — TODO confirm
numba.cuda.current_context().deallocations.clear()
gc.collect()

### Eval

In [None]:
# Recall of the top-20 lists per class, combined into the weighted CV score.
# Skipped in test mode.
if MODE != "test":
    gt = pd.read_parquet("../output/val_labels.parquet")

    # Recalls and their weights are collected together so that the average still works
    # when only a subset of the classes is in Config.target. np.average normalises by
    # the sum of the weights actually used. WEIGHTS is paired with CLASSES by position.
    recalls, weights = [], []
    for col, weight in zip(CLASSES, WEIGHTS):
        gt_col = f"gt_{col}"
        if gt_col not in Config.target:
            continue

        # Attach the ground truth of this class once; re-running the cell must not merge again.
        if gt_col not in preds.columns:
            preds = preds.merge(gt[gt["type"] == col].drop("type", axis=1), how="left").rename(
                columns={"ground_truth": gt_col}
            )

        n_preds, n_gts, n_found = get_coverage(
            preds[f"candidates_{col}"].values, preds[gt_col].values
        )

        print(
            f"- {col} \t-  Found {numerize(n_found)} GTs\t-  Recall : {n_found / n_gts :.4f}"
        )
        recalls.append(n_found / n_gts)
        weights.append(weight)

    cv = np.average(recalls, weights=weights)
    print(f"\n-> CV : {cv:.4f}")

- clicks 	-  Found 924.24K GTs	-  Recall : 0.5265
- carts 	-  Found 241.15K GTs	-  Recall : 0.4186
- orders 	-  Found 205.97K GTs	-  Recall : 0.6575

-> CV : 0.5728

### Save
TODO 

In [None]:
# Write one submission file per predicted class (test mode only).
# NOTE(review): `log_folder` is only assigned in the commented-out training cell above;
# it has to be defined before this cell can run.
if MODE == "test":
    for c in CLASSES:
        if f"gt_{c}" not in Config.target:
            continue

        # `preds` stores the top-20 lists per class under `candidates_{c}`.
        sub = preds[['session', f'candidates_{c}']].copy()
        assert len(sub) == 1671803

        # Rows are "<session>_<class>" with the candidates joined by spaces.
        sub['labels'] = sub[f'candidates_{c}'].parallel_apply(lambda x: " ".join(map(str, x)))
        sub['session_type'] = sub['session'].astype(str) + "_" + c
        sub = sub[["session_type", "labels"]]

        # Same naming as the files gathered by the final-submission cell.
        sub_path = log_folder + f'sub_gt_{c}.csv'
        sub.to_csv(sub_path, index=False)
        print(f"-> Saved sub to {sub_path}\n")

        display(sub.head())

In [None]:
# Concatenate the per-class submissions into the final file once all of them exist (test mode only).
# NOTE(review): `cv` is only assigned by the evaluation cell, which runs when MODE != "test".
if MODE == "test":
    class_sub_paths = [log_folder + f'sub_gt_{c}.csv' for c in CLASSES]

    if all(os.path.exists(p) for p in class_sub_paths):
        sub_final = cudf.concat(
            [cudf.read_csv(p) for p in class_sub_paths],
            ignore_index=True,
        )
        assert len(sub_final) == 5015409

        # The file name carries the CV score.
        final_path = log_folder + f"submission_{cv:.4f}.csv"
        sub_final.to_csv(final_path, index=False)

        print(f"-> Saved final sub to {final_path}\n")

        display(sub_final.sample(5))

Done