**About :** Blends the predictions of trained XGBoost models and builds the submission.

**TODO**:
- better neg sampling technique ??

In [1]:
cd ../src

/workspace/kaggle_otto_rs/src


In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import os

# Restrict this kernel to GPU 1. Done in its own cell so the variable is already
# set when the GPU libraries are imported in the next cell.
os.environ.update({"CUDA_VISIBLE_DEVICES": "1"})

In [4]:
import os
import gc
import cudf
import json
import glob
import numba
import xgboost
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from tqdm import tqdm
from numerize.numerize import numerize
from pandarallel import pandarallel

# Notebook-wide settings: hide FutureWarning noise, let pandas display up to
# 500 rows / columns, and start the pandarallel pool used by `parallel_apply`.
warnings.simplefilter("ignore", FutureWarning)
pd.set_option("display.max_columns", 500)
pd.set_option("display.max_rows", 500)
pandarallel.initialize(progress_bar=False, nb_workers=32)

INFO: Pandarallel will run on 32 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [5]:
from params import *

from utils.metrics import get_coverage, evaluate
from utils.plot import plot_importances
from utils.load import *
from utils.logger import *

from inference.xgb import xgb_inference

### Inference

In [6]:
# Settings for the (currently commented-out) `xgb_inference` call below:
# experiment log folder, feature-set version and target column.
EXP_FOLDER = "../logs/2023-01-20/9/"
VERSION = "cv3-tv5.10"
TARGET = "gt_orders"  # later reassigned from each experiment's config in the blend loops

In [7]:
# Glob patterns for the feature files of VERSION.
# Validation features sit in a target-suffixed folder for every target except "gt_clicks".
REGEX = (
    f"../output/features/fts_val_{VERSION}/*"
    if TARGET == "gt_clicks"
    else f"../output/features/fts_val_{VERSION}_{TARGET}/*"
)

TEST_REGEX = f"../output/features/fts_test_{VERSION}/*"

In [8]:
# xgb_inference(REGEX, TEST_REGEX, EXP_FOLDER, debug=False)

In [18]:
# ft_imp = pd.read_csv("../logs/2023-01-20/1/ft_imp.csv")
# list(ft_imp.sort_values('importance')["index"])[:100]
# plot_importances(ft_imp.set_index("index"))

### Load

#### Train data
- neg sampling could use candidates from lower versions

In [9]:
# Preset: single-model carts experiments (trailing numbers are presumably CV recall).
# NOTE(review): EXP_FOLDERS is reassigned by the following cells, so on a
# top-to-bottom run this preset is overwritten before it is used.
EXP_FOLDERS = [  # CARTS - 0.4392
#     "../logs/2023-01-17/5/",   # 0.4389
#     "../logs/2023-01-17/7/",   # 0.4390
    "../logs/2023-01-17/8/",   # 0.4391  (0.4405 fold 0)
]

In [10]:
# Preset: single-model orders experiments (trailing numbers are presumably CV recall).
# NOTE(review): overwritten by the following EXP_FOLDERS cells on a top-to-bottom run.
EXP_FOLDERS = [  # ORDERS - 0.6673
    "../logs/2023-01-17/4/",    # 0.6674    (0.6672 fold 0)
#     "../logs/2023-01-17/6/",    # 0.6672
#     "../logs/2023-01-17/9/",    # 0.6670
#     "../logs/2023-01-17/11/",     # 0.6669  -  Rank
]  # Rank avg blend ?

In [11]:
# Preset: one group of four orders experiments. A nested list is one blend group:
# the loading loops iterate over every folder inside it.
# NOTE(review): overwritten by the following EXP_FOLDERS cells on a top-to-bottom run.
EXP_FOLDERS = [  # ORDERS - 0.6680
    ["../logs/2023-01-20/1/", "../logs/2023-01-20/9/", "../logs/2023-01-20/13/", "../logs/2023-01-20/12/"],
]

In [None]:
# Preset: one group of four carts experiments (a nested list is one blend group).
# NOTE(review): overwritten by the following EXP_FOLDERS cells on a top-to-bottom run.
EXP_FOLDERS = [  # Carts - 0.4404
    ["../logs/2023-01-20/17/", "../logs/2023-01-20/29/", "../logs/2023-01-20/28/", "../logs/2023-01-20/27/"],
]

In [None]:
# Preset: one group of four clicks experiments (a nested list is one blend group).
# NOTE(review): overwritten by the following EXP_FOLDERS cell on a top-to-bottom run.
EXP_FOLDERS = [  # Clicks - 0.5593
    ["../logs/2023-01-20/26/", "../logs/2023-01-20/23/", "../logs/2023-01-20/30/", "../logs/2023-01-20/32/"],
]

In [84]:
# Active preset on a top-to-bottom run: three orders experiment groups, each
# paired positionally with a blend weight in WEIGHTS by the blend loop below.
EXP_FOLDERS = [  # ORDERS - 0.6680
    ["../logs/2023-01-20/1/"], #"../logs/2023-01-20/9/", "../logs/2023-01-20/13/", "../logs/2023-01-20/12/"],
    ["../logs/2023-01-21/41/"],
    ["../logs/2023-01-21/45/"],
]

# One blend weight per entry of EXP_FOLDERS.
# NOTE(review): the `np.average(..., weights=WEIGHTS)` cells further down also read
# WEIGHTS; presumably they expect metric weights coming from `params` - confirm.
WEIGHTS = [1, 0.2, 0.2]

In [85]:
# Weighted rank blend of the validation predictions.
#
# For every experiment group in EXP_FOLDERS (paired positionally with WEIGHTS):
#   1. load each available `df_val_{fold}.parquet` of each folder in the group,
#   2. replace `pred` by its per-session rank and scale it by the group weight,
#   3. add the group to the running blend `df_val_`, aligned on (session, candidates).
# `config` and `TARGET` are left set to the values of the last loaded experiment.

if len(WEIGHTS) < len(EXP_FOLDERS):
    # zip() below would otherwise silently drop the trailing experiment groups.
    raise ValueError(
        f"Got {len(WEIGHTS)} weights for {len(EXP_FOLDERS)} experiment groups."
    )

df_val_ = None
for exp_folders, w in zip(EXP_FOLDERS, WEIGHTS):

    # A bare folder string is treated as a one-folder group.
    if not isinstance(exp_folders, list):
        exp_folders = [exp_folders]

    dfs_val = []
    for exp_folder in exp_folders:
        # Context manager so the config file handle is closed right away.
        with open(exp_folder + "config.json", "r") as config_file:
            config = Config(json.load(config_file))
        print(f'\t === Exp {exp_folder}\t Target {config.target} ===\n')
        TARGET = config.target

        for fold in range(config.k):
            # Only a missing fold file is tolerated; errors raised while ranking
            # or evaluating are no longer swallowed.
            try:
                df_val = cudf.read_parquet(exp_folder + f"df_val_{fold}.parquet")
            except FileNotFoundError:
                continue

            # Per-session ranks put every model on the same scale before weighting.
            df_val['pred'] = df_val.groupby('session')['pred'].rank()
            df_val['pred'] *= w
            dfs_val.append(df_val)
            print(f'-> Retrieved fold {fold}', end="")

            evaluate(df_val, TARGET)

    if not dfs_val:
        raise FileNotFoundError(f"No df_val_*.parquet file found in {exp_folders}")

    dfs_val = cudf.concat(dfs_val, ignore_index=True)

    if df_val_ is None:
        df_val_ = dfs_val.copy()
    else:
        # Candidates missing from one side contribute 0 to the sum.
        df_val_ = df_val_.set_index(['session', 'candidates']).add(
            dfs_val.set_index(['session', 'candidates']), fill_value=0
        ).reset_index()

    # Release GPU memory held by the per-group frame before the next group.
    del dfs_val
    numba.cuda.current_context().deallocations.clear()
    gc.collect()

	 === Exp ../logs/2023-01-20/1/	 Target gt_orders ===

-> Retrieved fold 0
-> gt_orders  -  Recall : 0.6676

	 === Exp ../logs/2023-01-21/41/	 Target gt_orders ===

-> Retrieved fold 0
-> gt_orders  -  Recall : 0.6666

	 === Exp ../logs/2023-01-21/45/	 Target gt_orders ===

-> Retrieved fold 0
-> gt_orders  -  Recall : 0.6668



In [86]:
# Score the weighted rank blend held in `df_val_` for the last loaded TARGET.
evaluate(df_val_, TARGET)


-> gt_orders  -  Recall : 0.6677



0.6677094284468833

In [77]:
# NOTE(review): same call as the previous cell. Its execution count (77) is lower
# than the blend cell's (85), so the output shown below presumably comes from an
# earlier blend configuration - consider removing this cell.
evaluate(df_val_, TARGET)


-> gt_orders  -  Recall : 0.6676



0.667606861714403

In [None]:
# Overall CV from the (clicks, carts, orders) recalls, using the competition
# metric weights. They are spelled out because WEIGHTS is reassigned to the
# blend weights ([1, 0.2, 0.2]) earlier in this notebook, which would skew this average.
metric_weights = [0.1, 0.3, 0.6]
cv = np.average([0.5593, 0.4404, 0.6680], weights=metric_weights)  # LB 0.599  - High
print(f"-> CV : {cv:.4f}")

In [None]:
# Overall CV from the (clicks, carts, orders) recalls, using the competition
# metric weights. They are spelled out because WEIGHTS is reassigned to the
# blend weights ([1, 0.2, 0.2]) earlier in this notebook, which would skew this average.
metric_weights = [0.1, 0.3, 0.6]
cv = np.average([0.554, 0.4404, 0.6680], weights=metric_weights)  # LB 0.599  - High
print(f"-> CV : {cv:.4f}")

In [None]:
# Overall CV from the (clicks, carts, orders) recalls, using the competition
# metric weights. They are spelled out because WEIGHTS is reassigned to the
# blend weights ([1, 0.2, 0.2]) earlier in this notebook, which would skew this average.
metric_weights = [0.1, 0.3, 0.6]
cv = np.average([0.554, 0.4392, 0.6673], weights=metric_weights)  # LB 0.599  - Low 
print(f"-> CV : {cv:.4f}")

In [None]:
# Overall CV from the (clicks, carts, orders) recalls, using the competition
# metric weights. They are spelled out because WEIGHTS is reassigned to the
# blend weights ([1, 0.2, 0.2]) earlier in this notebook, which would skew this average.
metric_weights = [0.1, 0.3, 0.6]
cv = np.average([0.554, 0.4382, 0.6668], weights=metric_weights)  # LB 0.598  - Mid
print(f"-> CV : {cv:.4f}")

### Test

In [None]:
# Blend of the test predictions, built the same way as the validation blend so
# that the submitted ranking matches the one scored in CV.
#
# For every experiment group in EXP_FOLDERS (paired positionally with WEIGHTS):
#   1. load each available `df_test_{fold}.parquet` of each folder in the group,
#   2. average the columns per (session, candidates) over all loaded files,
#   3. replace `pred` by its per-session rank scaled by the group weight,
#   4. add the group to the running blend `df_test`, aligned on (session, candidates).
# With a single group the final per-session ordering equals that of the plain mean.
# `config` and `TARGET` are left set to the values of the last loaded experiment.

if len(WEIGHTS) < len(EXP_FOLDERS):
    # zip() below would otherwise silently drop the trailing experiment groups.
    raise ValueError(
        f"Got {len(WEIGHTS)} weights for {len(EXP_FOLDERS)} experiment groups."
    )

df_test = None
for exp_folders, w in zip(EXP_FOLDERS, WEIGHTS):
    dfs_test = []

    # A bare folder string is treated as a one-folder group.
    if not isinstance(exp_folders, list):
        exp_folders = [exp_folders]

    for exp_folder in exp_folders:
        # Context manager so the config file handle is closed right away.
        with open(exp_folder + "config.json", "r") as config_file:
            config = Config(json.load(config_file))
        print(f'\t === Exp {exp_folder}\t Target {config.target} ===\n')
        TARGET = config.target

        for fold in range(config.k):
            try:
                dfs_test.append(cudf.read_parquet(exp_folder + f"df_test_{fold}.parquet"))
                print(f'-> Retrieved fold {fold}\n')
            except FileNotFoundError:
                # Folds without a saved prediction file are skipped.
                pass

    if not dfs_test:
        raise FileNotFoundError(f"No df_test_*.parquet file found in {exp_folders}")

    dfs_test = cudf.concat(dfs_test, ignore_index=True).groupby(['session', 'candidates']).mean().reset_index()

    # Mirror the validation blend: per-session rank scaled by the group weight.
    dfs_test['pred'] = dfs_test.groupby('session')['pred'].rank() * w

    print(f'- Retrieved {len(dfs_test)} test candidates.\n')

    if df_test is None:
        df_test = dfs_test
    else:
        # Candidates missing from one side contribute 0 to the sum.
        df_test = df_test.set_index(['session', 'candidates']).add(
            dfs_test.set_index(['session', 'candidates']), fill_value=0
        ).reset_index()

    # Drop the per-group name and release GPU memory before the next group.
    del dfs_test
    numba.cuda.current_context().deallocations.clear()
    gc.collect()

In [None]:
# Order the blended test candidates of every session by decreasing score,
# collect them into one list per session, then move to pandas and keep the
# first 20 candidates of each session.
preds = (
    df_test[['session', 'candidates', 'pred']]
    .copy()
    .sort_values(['session', 'pred'], ascending=[True, False])
    .groupby('session')
    .agg(list)
    .reset_index()
    .to_pandas()
)

preds['candidates'] = preds['candidates'].apply(lambda cands: cands[:20])

In [None]:
# Fill less than 20 candidates. This should be useless in the future
#
# Sessions with fewer than 20 predictions are padded with the most frequent test
# aids of the event type matching the target of the last loaded experiment.
dfs = load_sessions("../output/test_parquet/*")

# Event-type code per target: 1 for carts, 2 for orders, 0 otherwise (presumably clicks).
event_type = {"gt_carts": 1, "gt_orders": 2}.get(config.target, 0)
top = dfs.loc[dfs["type"] == event_type, "aid"].value_counts().index.values[:20].tolist()


def _pad_with_top(cands, fillers=top, size=20):
    """Return `cands` as a list padded to `size` items with unseen `fillers`.

    Fillers already present in `cands` are skipped so that no aid is predicted
    twice for the same session; lists of `size` items or more are returned as is.
    """
    cands = list(cands)
    seen = set(cands)
    unseen = [aid for aid in fillers if aid not in seen]
    return cands + unseen[: max(size - len(cands), 0)]


preds['candidates'] = preds['candidates'].apply(_pad_with_top)

# Release the memory held by the session frame.
del dfs
numba.cuda.current_context().deallocations.clear()
gc.collect()

### Save

In [None]:
# Feature-set and model version tags; together they name the output folder
# `<FT_VERSION>.<MODEL_VERSION>/` created under LOG_PATH in the next cell.
FT_VERSION = "cv3-tv5.10"
MODEL_VERSION = "2"

In [None]:
# Output folder of this submission: "<FT_VERSION>.<MODEL_VERSION>/" under LOG_PATH.
version_tag = f"{FT_VERSION}.{MODEL_VERSION}/"
log_folder_2 = LOG_PATH + version_tag
os.makedirs(log_folder_2, exist_ok=True)

# Keep the config of the last loaded experiment next to the submission files.
save_config(config, log_folder_2 + 'config')

In [None]:
# Build and save the submission rows for the current TARGET:
# one "<session>_<type>" key and one space-separated list of aids per session.
sub = preds[['session', 'candidates']].copy()
assert len(sub) == 1671803

sub = pd.DataFrame({
    # TARGET[3:] drops the leading "gt_" to get the event type name.
    "session_type": sub['session'].astype(str) + "_" + TARGET[3:],
    "labels": sub['candidates'].parallel_apply(lambda aids: " ".join(map(str, aids))),
})

sub.to_csv(log_folder_2 + f'sub_{TARGET}.csv', index=False)
print(f"-> Saved sub to {log_folder_2 + f'sub_{TARGET}.csv'}\n")

display(sub.head())

In [None]:
# Once a per-target submission exists for every entry of CLASSES, stack them
# into the final submission file.
sub_files = [log_folder_2 + f'sub_gt_{c}.csv' for c in CLASSES]

if all(os.path.exists(path) for path in sub_files):
    sub_final = cudf.concat(
        [cudf.read_csv(path) for path in sub_files],
        ignore_index=True,
    )

    assert len(sub_final) == 5015409
    sub_final.to_csv(log_folder_2 + f"submission.csv", index=False)

    print(f"\n-> Saved final sub to {log_folder_2 + f'submission.csv'}\n")

    display(sub_final.sample(5))

In [None]:
# kaggle competitions submit -c otto-recommender-system -f submission.csv -m "Message"

Done