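**About :** Blends the predictions saved by trained XGBoost experiments, evaluates the recall of the top-20 candidates on the validation set, and writes the submission files.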

**TODO**:
- Try a better negative sampling technique?

In [2]:
cd ../src

/workspace/kaggle_otto_rs/src


In [3]:
%load_ext autoreload
%autoreload 2

In [4]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [5]:
import os
import gc
import cudf
import json
import glob
import numba
import xgboost
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from tqdm import tqdm
from numerize.numerize import numerize
from pandarallel import pandarallel

warnings.simplefilter(action="ignore", category=FutureWarning)
pd.options.display.max_columns = 500
pd.options.display.max_rows = 500
pandarallel.initialize(nb_workers=32, progress_bar=False)

INFO: Pandarallel will run on 32 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [21]:
from params import *

from utils.metrics import get_coverage
from utils.plot import plot_importances
from utils.load import *
from utils.logger import *

### Load

#### Train data
- Negative sampling could use the candidates from lower (earlier) versions.

In [37]:
# Experiment folders whose predictions are blended for the ORDERS target.
# The trailing number on each line appears to be the validation recall of that experiment.
# NOTE: the next cell reassigns EXP_FOLDERS (CARTS list). When the notebook is run top to
# bottom that later assignment wins, so only run the cell matching the target you want.
EXP_FOLDERS = [  # ORDERS
#     "../logs/2023-01-14/6/",  #   0.6657
    "../logs/2023-01-14/9/",    #  0.6668
]

In [38]:
# Experiment folders whose predictions are blended for the CARTS target.
# The trailing number on each line appears to be the validation recall of that experiment.
# NOTE: this assignment overrides the ORDERS list given to EXP_FOLDERS in the previous cell.
EXP_FOLDERS = [  # CARTS
#     "../logs/2023-01-14/7/",  #   0.4368
    "../logs/2023-01-14/8/",    #  0.4382
]

In [39]:
# Blend the validation and test predictions of every experiment listed in EXP_FOLDERS.
# df_val / df_test accumulate the sum of the experiments' frames, aligned on
# (session, candidates). `config` and `TARGET` keep the values of the last experiment read.
df_val, df_test = None, None
for exp_folder in EXP_FOLDERS:
    # Context manager so that the config file handle is closed right after parsing.
    with open(exp_folder + "config.json", "r") as config_file:
        config = Config(json.load(config_file))
    print(f' -> Exp {exp_folder} \t Target {config.target}\n')
    TARGET = config.target
    dfs_val, dfs_test = [], []

    for fold in range(config.k):
        val_path = exp_folder + f"df_val_{fold}.parquet"
        test_path = exp_folder + f"df_test_{fold}.parquet"

        # Only a missing file is tolerated. A fold is used only when BOTH of its files exist,
        # which keeps the val and test lists in sync; any other read error (corrupt file,
        # out of memory, ...) is raised instead of being reported as a missing fold.
        if not (os.path.exists(val_path) and os.path.exists(test_path)):
            print(f'Fold {fold} missing !')
            continue

        dfs_val.append(cudf.read_parquet(val_path))
        dfs_test.append(cudf.read_parquet(test_path))

    if not dfs_val:
        raise FileNotFoundError(f"No fold predictions found in {exp_folder}")

    dfs_val = cudf.concat(dfs_val, ignore_index=True)
    # Rows sharing the same (session, candidates) pair across the fold files are averaged.
    dfs_test = cudf.concat(dfs_test, ignore_index=True).groupby(['session', 'candidates']).mean().reset_index()

    print(f'- Retrieved {len(dfs_val)} val candidates.')
    print(f'- Retrieved {len(dfs_test)} test candidates.\n')

    # The ranking cells below rely on this column.
    assert 'pred' in dfs_val.columns, f"'pred' column missing in {exp_folder}"

    if df_val is None:
        df_val = dfs_val
    else:
        # Sum with the running blend; pairs present on one side only count as 0 on the other.
        df_val = df_val.set_index(['session', 'candidates']).add(
            dfs_val.set_index(['session', 'candidates']), fill_value=0
        ).reset_index()

    if df_test is None:
        df_test = dfs_test
    else:
        df_test = df_test.set_index(['session', 'candidates']).add(
            dfs_test.set_index(['session', 'candidates']), fill_value=0
        ).reset_index()

    # Drop the per-experiment references before reading the next experiment.
    del dfs_val, dfs_test
    numba.cuda.current_context().deallocations.clear()
    gc.collect()

 -> Exp ../logs/2023-01-14/8/ 	 Target gt_carts

- Retrieved 127052488 val candidates.
- Retrieved 119776598 test candidates.



### Process

In [40]:
# Order the candidates of each session by decreasing blended score (on the cudf frame),
# gather them into one list per session, then move the result to pandas.
preds = (
    df_val[['session', 'candidates', 'pred']]
    .copy()
    .sort_values(['session', 'pred'], ascending=[True, False])
    .groupby('session')
    .agg(list)
    .reset_index()
    .to_pandas()
)

# Keep at most the first 20 candidates of every session.
preds['candidates'] = preds['candidates'].apply(lambda ranked: ranked[:20])

In [41]:
# Fill less than 20 candidates.

dfs = load_sessions("../output/val_parquet/*")

# Value of the "type" column used to rank the popular items of each target;
# any other target falls back to type 0, like the original else branch.
TYPE_PER_TARGET = {"gt_carts": 1, "gt_orders": 2}
top = dfs.loc[dfs["type"] == TYPE_PER_TARGET.get(config.target, 0), "aid"].value_counts().index.values[:20].tolist()


def pad_candidates(cands, n=20):
    """Pad `cands` up to `n` items with the ids of `top`, in order.

    Ids that are already in `cands` are skipped, so padding never adds a duplicate
    of an item the session already ranks. Lists with `n` items or more are returned unchanged.
    """
    cands = list(cands)
    if len(cands) >= n:
        return cands
    seen = set(cands)
    fillers = [aid for aid in top if aid not in seen]
    return cands + fillers[: n - len(cands)]


preds['candidates'] = preds['candidates'].apply(pad_candidates)

del dfs
numba.cuda.current_context().deallocations.clear()
gc.collect()

35

In [42]:
# Compare the number of rows of the folds file with the number of sessions that received a ranking.
# NOTE(review): a smaller second number presumably means some sessions have no candidate at all — confirm.
folds = pd.read_csv("../input/folds_4.csv")
(len(folds), len(preds))

(1801254, 1801251)

In [43]:
# Recall of the ranked candidates against the validation labels, for the blended target only.
gt = pd.read_parquet("../output/val_labels.parquet")

recalls = []
print()
for col in CLASSES:
    target_col = f"gt_{col}"

    # Only the class matching the target of the loaded experiments is evaluated.
    if target_col != config.target:
        continue

    # Attach the ground truth once; the guard keeps the cell re-runnable.
    if target_col not in preds.columns:
        labels = gt[gt["type"] == col].drop("type", axis=1)
        preds = preds.merge(labels, how="left").rename(
            columns={"ground_truth": target_col}
        )

    n_preds, n_gts, n_found = get_coverage(
        preds["candidates"].values, preds[target_col].values
    )
    recall = n_found / n_gts

    print(
        f"- {col}\t-  Found {numerize(n_found)} GTs\t-  Recall : {recall:.4f}"
    )
    recalls.append(recall)


- carts	-  Found 252.4K GTs	-  Recall : 0.4382


In [46]:
# Weighted average of per-class recalls typed in by hand from previous runs.
# presumably ordered like WEIGHTS (clicks, carts, orders) — TODO confirm
class_recalls = [0.554, 0.4382, 0.6668]
# class_recalls = [0.5059, 0.4139, 0.6540]
cv = np.average(class_recalls, weights=WEIGHTS)
print(f"-> CV : {cv:.4f}")

-> CV : 0.5869


In [48]:
# cv = np.average([0.554, 0.4368, 0.6657], weights=WEIGHTS)
# # cv = np.average([0.5059, 0.4139, 0.6540], weights=WEIGHTS)
# print(f"-> CV : {cv:.4f}")

### Test

In [49]:
# Order the test candidates of each session by decreasing blended score (on the cudf frame),
# gather them into one list per session, then move the result to pandas.
preds = (
    df_test[['session', 'candidates', 'pred']]
    .copy()
    .sort_values(['session', 'pred'], ascending=[True, False])
    .groupby('session')
    .agg(list)
    .reset_index()
    .to_pandas()
)

# Keep at most the first 20 candidates of every session.
preds['candidates'] = preds['candidates'].apply(lambda ranked: ranked[:20])

In [50]:
# Fill less than 20 candidates. This should be useless in the future

dfs = load_sessions("../output/test_parquet/*")

# Value of the "type" column used to rank the popular items of each target;
# any other target falls back to type 0, like the original else branch.
TYPE_PER_TARGET = {"gt_carts": 1, "gt_orders": 2}
top = dfs.loc[dfs["type"] == TYPE_PER_TARGET.get(config.target, 0), "aid"].value_counts().index.values[:20].tolist()


def pad_candidates(cands, n=20):
    """Pad `cands` up to `n` items with the ids of `top`, in order.

    Ids that are already in `cands` are skipped, so padding never adds a duplicate
    of an item the session already ranks. Lists with `n` items or more are returned unchanged.
    """
    cands = list(cands)
    if len(cands) >= n:
        return cands
    seen = set(cands)
    fillers = [aid for aid in top if aid not in seen]
    return cands + fillers[: n - len(cands)]


preds['candidates'] = preds['candidates'].apply(pad_candidates)

del dfs
numba.cuda.current_context().deallocations.clear()
gc.collect()

35

### Save

In [51]:
# Version tags of this blend; the save cell below concatenates them as
# "<FT_VERSION>.<MODEL_VERSION>/" under LOG_PATH to name the output folder.
FT_VERSION = "c-orders-v4.7"
MODEL_VERSION = "1"

In [52]:
# Output folder of this blend: created if needed, and the config of the last
# loaded experiment is saved next to the submissions.
log_folder_2 = f"{LOG_PATH}{FT_VERSION}.{MODEL_VERSION}/"

os.makedirs(log_folder_2, exist_ok=True)
save_config(config, f"{log_folder_2}config")

In [53]:
# Write the submission of the current target: one "<session>_<type>" row per session,
# with the candidates joined by spaces.
sub_path = log_folder_2 + f'sub_{TARGET}.csv'

sub = preds[['session', 'candidates']].copy()
assert len(sub) == 1671803  # hard-coded expected number of rows in the test submission

sub['candidates'] = sub['candidates'].parallel_apply(lambda aids: " ".join(map(str, aids)))
# TARGET[3:] drops the first three characters of the target name (e.g. "gt_carts" -> "carts").
sub['session'] = sub['session'].astype(str) + "_" + TARGET[3:]
sub.columns = ["session_type", "labels"]

sub.to_csv(sub_path, index=False)
print(f"-> Saved sub to {sub_path}\n")

display(sub.head())

-> Saved sub to ../logs/c-orders-v4.7.1/sub_gt_carts.csv



Unnamed: 0,session_type,labels
0,12899779_carts,59625 731692 1253524 1790770 737445 1340695 94...
1,12899780_carts,1142000 582732 736515 77422 1419849 618310 150...
2,12899781_carts,918667 199008 1200570 1248748 141736 811084 75...
3,12899782_carts,479970 595994 127404 1033148 834354 987399 889...
4,12899783_carts,1817895 1811433 300127 58861 1729553 1218 2552...


In [54]:
# Once a per-target submission exists for every class, stack them into the final file.
class_sub_paths = [log_folder_2 + f'sub_gt_{c}.csv' for c in CLASSES]
final_path = log_folder_2 + "submission.csv"

if all(os.path.exists(path) for path in class_sub_paths):
    sub_final = cudf.concat(
        [cudf.read_csv(path) for path in class_sub_paths], ignore_index=True
    )

    assert len(sub_final) == 5015409  # 3 x 1671803, the per-target row count asserted above
    sub_final.to_csv(final_path, index=False)

    print(f"\n-> Saved final sub to {final_path}\n")

    display(sub_final.sample(5))


-> Saved final sub to ../logs/c-orders-v4.7.1/submission.csv



Unnamed: 0,session_type,labels
3496178,13052351_orders,147875 1264792 890322 1709370 1603308 153333 1...
3971247,13527420_orders,30503 1143 203733 446495 151116 977797 1174808...
3950659,13506832_orders,546864 1421960 1718006 538242 695040 994623 18...
324387,13224166_clicks,485256 485256 152547 33343 1551213 33343 17650...
3221279,14449255_carts,544645 1406365 1809109 1680961 1325223 840655 ...


In [55]:
# kaggle competitions submit -c otto-recommender-system -f submission.csv -m "Message"

Done