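**About :** Loads the saved validation and test predictions of trained experiments, scores validation recall and writes the submission file. No model is trained in this notebook.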

**TODO**:
- Find a better negative sampling technique?

In [1]:
cd ../src

/workspace/kaggle_otto_rs/src


In [2]:
# Load the autoreload extension and use mode 2, so edits to the imported project
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
import os
# Set CUDA_VISIBLE_DEVICES to "0" before the GPU libraries (cudf, numba, xgboost)
# are imported in the next cell.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [26]:
import os
import gc
import cudf
import json
import glob
import numba
import xgboost
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from tqdm import tqdm
from numerize.numerize import numerize
from pandarallel import pandarallel

# Silence FutureWarning messages for the whole notebook.
warnings.simplefilter(action="ignore", category=FutureWarning)

# Let pandas display up to 500 columns and 500 rows before truncating.
pd.set_option("display.max_columns", 500)
pd.set_option("display.max_rows", 500)

# Start pandarallel with 32 workers and no progress bar (used by parallel_apply below).
pandarallel.initialize(nb_workers=32, progress_bar=False)

INFO: Pandarallel will run on 32 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [9]:
from params import *

from utils.metrics import get_coverage
from utils.plot import plot_importances
from utils.load import *
from utils.logger import Config

### Load

In [6]:
# VERSION names the output folder created under LOG_PATH in the submission section.
VERSION = "c-orders-v4.7"
# GT_VERSION is not referenced by any other cell visible in this notebook.
GT_VERSION = "gt.7"

#### Train data
- Negative sampling could use candidates from lower versions.

In [7]:
# POS_RATIO is not referenced by any other cell visible in this notebook.
POS_RATIO = 0.2
# TARGET is used for the submission file name and for the "<session>_<type>" suffix
# (via TARGET[3:]) when the submission is written.
TARGET = "gt_orders"   # "gt_clicks", "gt_carts", "gt_orders"

In [15]:
# Experiment folders whose saved predictions are loaded below. Each folder is read for
# config.json and, per fold, df_val_{fold}.parquet and df_test_{fold}.parquet.
# Paths are concatenated as plain strings, so keep the trailing slash.
EXP_FOLDERS = [
    "../logs/2023-01-14/6/"
]

In [18]:
# Load the validation and test predictions of every experiment in EXP_FOLDERS.
# After the loop, `config` is the config of the LAST experiment; later cells read
# `config.target` from it.
df_val, df_test = None, None
for exp_folder in tqdm(EXP_FOLDERS):
    # `with` closes the config file as soon as it has been parsed.
    with open(exp_folder + "config.json", "r") as config_file:
        config = Config(json.load(config_file))
    print(f'-> Exp {exp_folder}', end="\t")
    print("Target :", config.target, "\n")
    dfs_val, dfs_test = [], []

    for fold in range(config.k):
        try:
            dfs_val.append(cudf.read_parquet(exp_folder + f"df_val_{fold}.parquet"))
            dfs_test.append(cudf.read_parquet(exp_folder + f"df_test_{fold}.parquet"))
        except Exception as err:
            # Best effort: a fold that cannot be read is skipped, but the reason is
            # shown and Ctrl-C is no longer swallowed (the original used a bare except).
            print(f'Fold {fold} missing ! ({type(err).__name__}: {err})')

    # Validation predictions of all folds are stacked as-is.
    dfs_val = cudf.concat(dfs_val, ignore_index=True)
    # Test predictions of all folds are averaged per (session, candidates) pair.
    dfs_test = (
        cudf.concat(dfs_test, ignore_index=True)
        .groupby(['session', 'candidates'])
        .mean()
        .reset_index()
    )

    # TODO (from the author): blending several experiments probably needs an index.
    # NOTE(review): `.add` looks like an element-wise sum of every column, which would
    # include `session` and `candidates` - confirm before listing more than one folder.
    if df_val is None:
        df_val = dfs_val
    else:
        df_val = df_val.add(dfs_val)  # TODO, probably need to set some index

    if df_test is None:
        df_test = dfs_test
    else:
        # Fixed: the original added `dfs_val` (validation rows) into the test frame.
        df_test = df_test.add(dfs_test)  # TODO, probably need to set some index

  0%|          | 0/1 [00:00<?, ?it/s]

-> Exp ../logs/2023-01-14/6/	Target : gt_orders 



100%|██████████| 1/1 [00:06<00:00,  6.23s/it]


### Process

In [19]:
# Rank the validation candidates of each session by decreasing score, collect them
# into one list per session, then move the result from GPU (cudf) to pandas.
preds = (
    df_val[['session', 'candidates', 'pred']]
    .copy()
    .sort_values(['session', 'pred'], ascending=[True, False])
    [['session', 'candidates', 'pred']]
    .groupby('session')
    .agg(list)
    .reset_index()
    .to_pandas()
)

# Keep at most the 20 best-scored candidates per session.
preds['candidates'] = preds['candidates'].apply(lambda ranked: ranked[:20])

In [21]:
# Pad sessions that have fewer than 20 candidates with the most frequent aids.

dfs = load_sessions(f"../output/val_parquet/*")

# Event type code matching the experiment target: carts -> 1, orders -> 2,
# any other target falls back to 0.
event_type = {"gt_carts": 1, "gt_orders": 2}.get(config.target, 0)
top = dfs.loc[dfs["type"] == event_type, "aid"].value_counts().index.values[:20].tolist()

# Append just enough of the most frequent aids to reach 20 entries per session.
preds['candidates'] = preds['candidates'].apply(
    lambda cands: list(cands) + top[: 20 - len(cands)]
)

# Drop the sessions frame; presumably the numba call releases pending GPU deallocations.
del dfs
numba.cuda.current_context().deallocations.clear()
gc.collect()

35

In [23]:
# Load the fold assignment file; `folds` is only used for the size comparison below.
folds = pd.read_csv(f"../input/folds_4.csv")
# Compare the number of rows in the folds file with the number of predicted sessions.
# NOTE(review): the recorded output shows 3 fewer sessions in `preds` than rows in `folds`.
len(folds), len(preds)

(1801254, 1801251)

In [27]:
# Validation labels; filtered below on their "type" column, one class at a time.
gt = pd.read_parquet("../output/val_labels.parquet")

recalls = []
print()
for col in CLASSES:
    gt_col = "gt_" + col

    # Only the class targeted by the loaded experiment is scored.
    if gt_col != config.target:
        continue

    # Attach the ground truth of this class once; later runs of the cell reuse the column.
    if gt_col not in preds.columns:
        class_labels = gt[gt["type"] == col].drop("type", axis=1)
        preds = preds.merge(class_labels, how="left")
        preds = preds.rename(columns={"ground_truth": gt_col})

    n_preds, n_gts, n_found = get_coverage(
        preds["candidates"].values, preds[gt_col].values
    )

    print(
        f"- {col}\t-  Found {numerize(n_found)} GTs\t-  Recall : {n_found / n_gts :.4f}"
    )
    recalls.append(n_found / n_gts)


- orders	-  Found 208.51K GTs	-  Recall : 0.6657


- orders	-  Found 51.98K GTs	-  Recall : 0.6664  MORE CANDIDS

- orders	-  Found 207.74K GTs	-  Recall : 0.6632
- carts	-  Found 242.41K GTs	-  Recall : 0.4208
- clicks	-  Found 927.04K GTs	-  Recall : 0.5281

CHRIS :
- orders - CV 0.666 - LB 0.678
- carts - CV 0.437 - LB 0.450
- clicks - CV 0.554 - LB 0.560

In [None]:
# cv = np.average([0.5270, 0.4203, 0.6577], weights=WEIGHTS)
# # cv = np.average([0.5059, 0.4139, 0.6540], weights=WEIGHTS)
# print(f"-> CV : {cv:.4f}")

### Test

In [None]:
# Rank the test candidates of each session by decreasing score, collect them into
# one list per session, then move the result from GPU (cudf) to pandas.
preds = (
    df_test[['session', 'candidates', 'pred']]
    .copy()
    .sort_values(['session', 'pred'], ascending=[True, False])
    [['session', 'candidates', 'pred']]
    .groupby('session')
    .agg(list)
    .reset_index()
    .to_pandas()
)

# Keep at most the 20 best-scored candidates per session.
preds['candidates'] = preds['candidates'].apply(lambda ranked: ranked[:20])

In [None]:
# Pad sessions that have fewer than 20 candidates with the most frequent aids.
# This should be useless in the future.

dfs = load_sessions(f"../output/test_parquet/*")

# Read the target from `config`, the instance built from the experiment's config.json,
# exactly as the validation cell does. The original read `Config.target`, i.e. the
# attribute of the imported class, not of the loaded experiment.
# Event type code per target: carts -> 1, orders -> 2, anything else -> 0.
if config.target == "gt_carts":
    top = dfs.loc[dfs["type"] == 1, "aid"].value_counts().index.values[:20].tolist()
elif config.target == "gt_orders":
    top = dfs.loc[dfs["type"] == 2, "aid"].value_counts().index.values[:20].tolist()
else:
    top = dfs.loc[dfs["type"] == 0, "aid"].value_counts().index.values[:20].tolist()

# Append just enough of the most frequent aids to reach 20 entries per session.
preds['candidates'] = preds['candidates'].apply(lambda x: list(x) + top[: 20 - len(x)])

# Drop the sessions frame; presumably the numba call releases pending GPU deallocations.
del dfs
numba.cuda.current_context().deallocations.clear()
gc.collect()

In [None]:
# Output folder for this blend: LOG_PATH followed by "<VERSION>.0/"; created if missing.
log_folder_2 = LOG_PATH + f"{VERSION}.0/"
os.makedirs(log_folder_2, exist_ok=True)
# NOTE(review): this passes the `Config` class, not the `config` instance loaded from
# config.json - confirm that this is what save_config expects.
save_config(Config, log_folder_2 + 'config')

In [None]:
if not DEBUG:
    # The file name and the "<session>_<type>" suffix come from TARGET, while the
    # predictions come from the experiment described by `config`. Refuse to write a
    # submission whose labels would not match the model that produced the scores.
    assert TARGET == config.target, (
        f"TARGET ({TARGET}) does not match the loaded experiment target ({config.target})"
    )

    sub = preds[['session', 'candidates']].copy()
    # Expected number of rows in the submission for one target.
    assert len(sub) == 1671803

    # Submission format: "<session>_<type>" and space-separated candidate ids.
    sub['candidates'] = sub['candidates'].parallel_apply(lambda x: " ".join(map(str, x)))
    sub['session'] =  sub['session'].astype(str) + "_" + TARGET[3:]
    sub.columns = ["session_type", "labels"]

    sub_name = f'sub_{TARGET}.csv'

    # NOTE(review): `log_folder` is not defined in any cell visible in this notebook -
    # confirm where it comes from before running this cell on a fresh kernel.
    sub.to_csv(log_folder + sub_name, index=False)
    print(f"\n-> Saved sub to {log_folder + sub_name}")

    sub.to_csv(log_folder_2 + sub_name, index=False)
    print(f"-> Saved sub to {log_folder_2 + sub_name}\n")

    display(sub.head())

In [None]:
# if all([os.path.exists(log_folder_2 + f'sub_gt_{c}.csv') for c in CLASSES]):
#     sub_final = cudf.concat([
#         cudf.read_csv(log_folder_2 + f'sub_gt_{c}.csv') for c in CLASSES
#     ], ignore_index=True)

#     assert len(sub_final) == 5015409
#     sub_final.to_csv(log_folder_2 + f"submission.csv", index=False)

#     print(f"\n-> Saved final sub to {log_folder_2 + f'submission.csv'}\n")

#     display(sub_final.sample(5))

Done