**About:** Loads the saved predictions of trained XGBoost models, evaluates validation recall, and builds the submission files.

**TODO**:
- Find a better negative sampling technique?

In [None]:
cd ../src

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import os
# Expose only GPU 0 to CUDA; this is set before cudf / xgboost are imported in the next cell.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [None]:
import os
import gc
import cudf
import json
import glob
import numba
import xgboost
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from tqdm import tqdm
from numerize.numerize import numerize
from pandarallel import pandarallel

# Silence FutureWarning messages.
warnings.simplefilter(action="ignore", category=FutureWarning)
# Let pandas display up to 500 columns and 500 rows.
pd.options.display.max_columns = 500
pd.options.display.max_rows = 500
# Start pandarallel with 32 workers and no progress bar (used by `parallel_apply` further down).
pandarallel.initialize(nb_workers=32, progress_bar=False)

In [None]:
from params import *

from utils.metrics import get_coverage
from utils.plot import plot_importances
from utils.load import *
from utils.logger import *

### Load

#### Train data
- Negative sampling could use candidates from lower versions.

In [None]:
# Experiment folder(s) whose saved predictions are loaded for the orders target.
# The number after each path is presumably that experiment's validation recall — TODO confirm.
# NOTE(review): the CARTS cell right below assigns EXP_FOLDERS again, so run only the
# cell for the target being processed.
EXP_FOLDERS = [  # ORDERS
#     "../logs/2023-01-14/6/",  #   0.6657
    "../logs/2023-01-14/9/",    #  0.6668
]

In [None]:
# Experiment folder(s) whose saved predictions are loaded for the carts target.
# The number after each path is presumably that experiment's validation recall — TODO confirm.
# NOTE(review): this assignment replaces the ORDERS list defined in the cell above when
# both cells are run.
EXP_FOLDERS = [  # CARTS
#     "../logs/2023-01-14/7/",  #   0.4368
    "../logs/2023-01-14/8/",    #  0.4382
]

In [None]:
# Load the per-fold validation predictions of every experiment in EXP_FOLDERS and
# accumulate them into `df_val`: frames are aligned on (session, candidates) and
# their remaining columns are summed, missing pairs counting as 0.
# `config` and `TARGET` keep the values of the last experiment processed.
df_val = None
for exp_folder in EXP_FOLDERS:
    # Context manager so the config file handle is closed right after parsing.
    with open(exp_folder + "config.json", "r") as config_file:
        config = Config(json.load(config_file))
    print(f' -> Exp {exp_folder} \t Target {config.target}\n')
    TARGET = config.target

    # One parquet file per fold; a missing fold raises instead of being skipped.
    dfs_val = cudf.concat(
        [
            cudf.read_parquet(exp_folder + f"df_val_{fold}.parquet")
            for fold in range(config.k)
        ],
        ignore_index=True,
    )

    print(f'- Retrieved {len(dfs_val)} val candidates.')

    # Fail early, with the same exception type a later lookup would raise,
    # when an experiment has no prediction column.
    if 'pred' not in dfs_val.columns:
        raise KeyError('pred')

    if df_val is None:
        df_val = dfs_val
    else:
        df_val = df_val.set_index(['session', 'candidates']).add(
            dfs_val.set_index(['session', 'candidates']), fill_value=0
        ).reset_index()

    # Drop the per-experiment frame and flush pending GPU deallocations.
    del dfs_val
    numba.cuda.current_context().deallocations.clear()
    gc.collect()

In [None]:
# Disabled cell: loads the per-fold test predictions of each experiment into `df_test`
# (folds are averaged per (session, candidates), then experiments are summed).
# NOTE(review): the "Test" section further down reads `df_test`, so this cell has to be
# re-enabled and run before it; as written, `df_test` is never defined.
# df_test = None
# for exp_folder in EXP_FOLDERS:
#     config = Config(json.load(open(exp_folder + "config.json", "r")))
#     print(f' -> Exp {exp_folder} \t Target {config.target}\n')
#     TARGET = config.target
#     dfs_test = []

#     for fold in range(config.k):
#         try:
#             dfs_test.append(cudf.read_parquet(exp_folder + f"df_test_{fold}.parquet"))
#         except:
#             print(f'Fold {fold} missing !')
        
#     dfs_test = cudf.concat(dfs_test, ignore_index=True).groupby(['session', 'candidates']).mean().reset_index()

#     print(f'- Retrieved {len(dfs_test)} test candidates.\n')

#     if df_test is None:
#         df_test = dfs_test
#     else:
#         df_test = df_test.set_index(['session', 'candidates']).add(
#             dfs_test.set_index(['session', 'candidates']), fill_value=0
#         ).reset_index()
        
#     del dfs_test
#     numba.cuda.current_context().deallocations.clear()
#     gc.collect()

### Process

In [None]:
# Build one row per session from the validation predictions: rows are sorted by
# session and by decreasing `pred` before grouping, the grouped columns are
# collected into lists, and the result is moved to pandas.
preds = (
    df_val[['session', 'candidates', 'pred']]
    .sort_values(['session', 'pred'], ascending=[True, False])
    .groupby('session')
    .agg(list)
    .reset_index()
    .to_pandas()
)

# Keep at most the first 20 candidates of each session.
preds['candidates'] = preds['candidates'].apply(lambda cands: cands[:20])

In [None]:
# Pad sessions that have fewer than 20 candidates with the most frequent aids
# of the event type that matches the target.

dfs = load_sessions("../output/val_parquet/*")

# Event type id whose aids are counted: 1 for carts, 2 for orders, 0 otherwise.
target_type = {"gt_carts": 1, "gt_orders": 2}.get(config.target, 0)
top = dfs.loc[dfs["type"] == target_type, "aid"].value_counts().index.values[:20].tolist()


def _pad_with_top(cands):
    """Return `cands` as a list, padded up to 20 items with aids from `top`.

    Aids already present in `cands` are skipped so that padding never adds a
    duplicate. `top` holds 20 aids, which is always enough to fill the gap.
    """
    cands = list(cands)
    already_there = set(cands)
    extra = [aid for aid in top if aid not in already_there]
    return cands + extra[: max(20 - len(cands), 0)]


preds['candidates'] = preds['candidates'].apply(_pad_with_top)

# Release the session frame and flush pending GPU deallocations.
del dfs
numba.cuda.current_context().deallocations.clear()
gc.collect()

In [None]:
# Load the fold assignment file and show its row count next to the number of
# sessions in `preds`.
folds = pd.read_csv("../input/folds_4.csv")
(len(folds), len(preds))

In [None]:
# Measure recall on the validation labels for the class that matches config.target.
gt = pd.read_parquet("../output/val_labels.parquet")

recalls = []
print()
for col in CLASSES:
    gt_col = f"gt_{col}"

    # Only the class of the current target is evaluated.
    if gt_col != config.target:
        continue

    # Attach the ground truth of this class once; skipped when the cell is re-run.
    if gt_col not in preds.columns:
        class_labels = gt[gt["type"] == col].drop("type", axis=1)
        preds = preds.merge(class_labels, how="left").rename(
            columns={"ground_truth": gt_col}
        )

    n_preds, n_gts, n_found = get_coverage(
        preds["candidates"].values, preds[gt_col].values
    )
    recall = n_found / n_gts

    print(
        f"- {col}\t-  Found {numerize(n_found)} GTs\t-  Recall : {recall:.4f}"
    )
    recalls.append(recall)

In [None]:
# Weighted average (weights from WEIGHTS) of three hard-coded scores; two of
# them match the values noted next to the CARTS and ORDERS experiment folders.
scores = [0.554, 0.4382, 0.6668]
# scores = [0.5059, 0.4139, 0.6540]
cv = np.average(scores, weights=WEIGHTS)
print(f"-> CV : {cv:.4f}")

In [None]:
# cv = np.average([0.554, 0.4368, 0.6657], weights=WEIGHTS)
# # cv = np.average([0.5059, 0.4139, 0.6540], weights=WEIGHTS)
# print(f"-> CV : {cv:.4f}")

### Test

In [None]:
# Build one row per session from the test predictions: rows are sorted by
# session and by decreasing `pred` before grouping, the grouped columns are
# collected into lists, and the result is moved to pandas.
# NOTE(review): `df_test` is only created by the test-loading cell in the Load
# section, which is currently commented out; run that cell first.
preds = (
    df_test[['session', 'candidates', 'pred']]
    .sort_values(['session', 'pred'], ascending=[True, False])
    .groupby('session')
    .agg(list)
    .reset_index()
    .to_pandas()
)

# Keep at most the first 20 candidates of each session.
preds['candidates'] = preds['candidates'].apply(lambda cands: cands[:20])

In [None]:
# Pad sessions that have fewer than 20 candidates with the most frequent aids
# of the event type that matches the target. This should be useless in the future.

dfs = load_sessions("../output/test_parquet/*")

# Event type id whose aids are counted: 1 for carts, 2 for orders, 0 otherwise.
target_type = {"gt_carts": 1, "gt_orders": 2}.get(config.target, 0)
top = dfs.loc[dfs["type"] == target_type, "aid"].value_counts().index.values[:20].tolist()


def _pad_with_top(cands):
    """Return `cands` as a list, padded up to 20 items with aids from `top`.

    Aids already present in `cands` are skipped so that padding never adds a
    duplicate. `top` holds 20 aids, which is always enough to fill the gap.
    """
    cands = list(cands)
    already_there = set(cands)
    extra = [aid for aid in top if aid not in already_there]
    return cands + extra[: max(20 - len(cands), 0)]


preds['candidates'] = preds['candidates'].apply(_pad_with_top)

# Release the session frame and flush pending GPU deallocations.
del dfs
numba.cuda.current_context().deallocations.clear()
gc.collect()

### Save

In [None]:
# Version tags; combined as "<FT_VERSION>.<MODEL_VERSION>/" to name the output folder.
# NOTE(review): FT_VERSION mentions "orders" — confirm it matches the target loaded
# from EXP_FOLDERS before saving.
FT_VERSION = "c-orders-v4.7"
MODEL_VERSION = "1"

In [None]:
# Output folder named after the feature and model versions; created if needed,
# then the current config is saved into it.
version_tag = f"{FT_VERSION}.{MODEL_VERSION}"
log_folder_2 = LOG_PATH + version_tag + "/"

os.makedirs(log_folder_2, exist_ok=True)
save_config(config, log_folder_2 + 'config')

In [None]:
# Write the submission file for the current target: one row per session with a
# "<session>_<target without the gt_ prefix>" key and space-separated candidates.
sub = preds[['session', 'candidates']].copy()
assert len(sub) == 1671803

sub_path = log_folder_2 + f'sub_{TARGET}.csv'
type_suffix = TARGET[3:]  # drop the first three characters ("gt_")

sub['session'] = sub['session'].astype(str) + "_" + type_suffix
sub['candidates'] = sub['candidates'].parallel_apply(
    lambda aids: " ".join(str(aid) for aid in aids)
)
sub = sub.rename(columns={"session": "session_type", "candidates": "labels"})

sub.to_csv(sub_path, index=False)
print(f"-> Saved sub to {sub_path}\n")

display(sub.head())

In [None]:
# Once a per-target submission exists for every class, stack them into the
# final submission file.
sub_paths = [log_folder_2 + f'sub_gt_{c}.csv' for c in CLASSES]

if all(os.path.exists(path) for path in sub_paths):
    sub_final = cudf.concat(
        [cudf.read_csv(path) for path in sub_paths], ignore_index=True
    )

    assert len(sub_final) == 5015409

    final_path = log_folder_2 + "submission.csv"
    sub_final.to_csv(final_path, index=False)

    print(f"\n-> Saved final sub to {final_path}\n")

    display(sub_final.sample(5))

In [None]:
# kaggle competitions submit -c otto-recommender-system -f submission.csv -m "Message"

Done