**About :** Blend & evaluate models.

In [None]:
cd ../src

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import os
import gc
import cudf
import json
import glob
import numba
import xgboost
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from tqdm import tqdm
from numerize.numerize import numerize
from pandarallel import pandarallel

# Silence FutureWarning messages and raise the DataFrame display limits.
warnings.simplefilter(action="ignore", category=FutureWarning)
pd.options.display.max_columns = 500
pd.options.display.max_rows = 500
# Initialise pandarallel (32 workers, no progress bar); `parallel_apply` is
# used when the submission is formatted.
pandarallel.initialize(nb_workers=32, progress_bar=False)

In [None]:
from params import *

from utils.metrics import get_coverage, evaluate
from utils.plot import plot_importances
from utils.load import *
from utils.logger import *

from inference.boosting import inference

### Inference
- Not needed if done during training.

In [None]:
# Experiment folder handed to `inference` in the (commented-out) call below.
EXP_FOLDER = "your exp"

# Feature version tag interpolated into the two feature paths below.
VERSION = "cv7+-tv5.12"

# Glob patterns for the validation and test feature files.
REGEX = f"../output/features/fts_val_{VERSION}/*"
TEST_REGEX = f"../output/features/fts_test_{VERSION}/*"

In [None]:
# inference(REGEX, TEST_REGEX, EXP_FOLDER, debug=False, save=True)

#### Experiments
 - Use your own experiment folders here!

In [None]:
# One entry per model to blend; each entry lists the experiment folders that
# hold that model's folds. The loading cells concatenate each path directly
# with file names ("config.json", "df_val_{fold}.parquet", ...), so every path
# must end with a path separator.
EXP_FOLDERS = [  # Clicks - 0.5593
    ["Fold 0", "Fold 1", "Fold 2 & 3"],  # Model 1
    ["Fold 0 & 1", "Fold 2 & 3"],  # Model 2
]
# Blend weight of each model, paired with EXP_FOLDERS by position (via zip).
WEIGHTS = [1, 1]

#### Train data (validation predictions)

In [None]:
# Blend validation predictions. For each model: gather the per-fold validation
# frames from its experiment folders, scale `pred` by the model weight, score
# the model on its own, then accumulate it into the running blend `df_val_`.
df_val_ = None  # running sum of the weighted per-model frames
dfs_val_list = []  # one frame per model, sorted by (session, candidates)
for exp_folders, w in zip(EXP_FOLDERS, WEIGHTS):
    # A model may be given as a single folder rather than a list of folders.
    if not isinstance(exp_folders, list):
        exp_folders = [exp_folders]

    dfs_val = []
    for exp_folder in exp_folders:
        # Context manager so the config file handle is closed right away.
        with open(exp_folder + "config.json", "r") as f:
            config = Config(json.load(f))
        print(f'\t === Exp {exp_folder}\t Target {config.target} ===\n')
        TARGET = config.target
        try:
            print(f"{config.version} - Pos ratio {config.pos_ratio} - Extra_prop {config.extra_prop if config.use_extra else None}\n")
        except (AttributeError, KeyError):
            # Config without the extra-data fields: report Extra_prop as None.
            print(f"{config.version} - Pos ratio {config.pos_ratio} - Extra_prop None\n")

        for fold in range(config.k):
            # A folder may hold only some of the folds; skip the missing ones.
            try:
                df_val = cudf.read_parquet(exp_folder + f"df_val_{fold}.parquet")
            except FileNotFoundError:
                continue

            df_val['pred'] *= w
            dfs_val.append(df_val)

            # Per-fold scores are only printed when a single model is loaded.
            if len(EXP_FOLDERS) == 1:
                print(f'-> Retrieved fold {fold}', end="")
                evaluate(df_val, TARGET)

    # Fail with a clear message instead of an opaque concat error.
    if not dfs_val:
        raise FileNotFoundError(
            f"No df_val_<fold>.parquet file found in {exp_folders}"
        )

    dfs_val = cudf.concat(dfs_val, ignore_index=True)
    dfs_val_list.append(dfs_val.sort_values(['session', 'candidates'], ignore_index=True))

    print('\n ===> CV :')
    cv = evaluate(dfs_val, TARGET)

    if df_val_ is None:
        df_val_ = dfs_val.copy()
    else:
        # Align on (session, candidates); a pair missing from one side counts as 0.
        # NOTE(review): `.add` sums every non-key column the frames share, not
        # just `pred` - confirm this is intended for any label columns.
        df_val_ = df_val_.set_index(['session', 'candidates']).add(
            dfs_val.set_index(['session', 'candidates']), fill_value=0
        ).reset_index()

    # Drop the per-model frame and clear pending GPU deallocations before the next model.
    del dfs_val
    numba.cuda.current_context().deallocations.clear()
    gc.collect()

In [None]:
# Score the blended validation predictions accumulated in `df_val_`.
score = evaluate(df_val_, TARGET, verbose=0)
print(f' ===> CV : {score:.7f}')

### Test

In [None]:
# Blend test predictions. For each model: sum the per-fold test predictions
# found in its experiment folders, average them over the folds actually
# retrieved, apply the model weight, then add the result to `df_test`.
df_test = None  # running sum of the weighted per-model fold averages
for exp_folders, w in zip(EXP_FOLDERS, WEIGHTS):
    dfs_test = None  # running sum of this model's per-fold frames
    n_folds = 0  # number of fold files actually loaded for this model

    # A model may be given as a single folder rather than a list of folders.
    if not isinstance(exp_folders, list):
        exp_folders = [exp_folders]

    for exp_folder in exp_folders:
        # Context manager so the config file handle is closed right away.
        with open(exp_folder + "config.json", "r") as f:
            config = Config(json.load(f))
        print(f'\t === Exp {exp_folder}\t Target {config.target} ===\n')
        TARGET = config.target

        for fold in range(config.k):
            # A folder may hold only some of the folds; skip the missing ones.
            try:
                df_test_ = cudf.read_parquet(exp_folder + f"df_test_{fold}.parquet")
            except FileNotFoundError:
                continue

            print(f'-> Retrieved fold {fold}\n')
            n_folds += 1

            if dfs_test is None:
                dfs_test = df_test_
            else:
                # Align on (session, candidates); a missing pair counts as 0.
                dfs_test = dfs_test.set_index(['session', 'candidates']).add(
                    df_test_.set_index(['session', 'candidates']), fill_value=0
                ).reset_index()

            del df_test_
            numba.cuda.current_context().deallocations.clear()
            gc.collect()

    # Fail with a clear message instead of a TypeError on len(None).
    if dfs_test is None:
        raise FileNotFoundError(
            f"No df_test_<fold>.parquet file found in {exp_folders}"
        )

    # Weighted fold average: divide by the folds actually loaded rather than a
    # hard-coded fold count, so the blend stays correct for any number of folds.
    dfs_test['pred'] *= w / n_folds

    print(f'- Retrieved {len(dfs_test)} test candidates.\n')

    if df_test is None:
        df_test = dfs_test
    else:
        df_test = df_test.set_index(['session', 'candidates']).add(
            dfs_test.set_index(['session', 'candidates']), fill_value=0
        ).reset_index()

    # Drop the per-model name and clear pending GPU deallocations before the next model.
    del dfs_test
    numba.cuda.current_context().deallocations.clear()
    gc.collect()

In [None]:
# Rank each session's candidates by descending blended score, collect them
# into one list per session, and move the result to pandas.
preds = (
    df_test[['session', 'candidates', 'pred']]
    .copy()
    .sort_values(['session', 'pred'], ascending=[True, False])
    [['session', 'candidates', 'pred']]
    .groupby('session')
    .agg(list)
    .reset_index()
    .to_pandas()
)

# Keep at most the 20 best-ranked candidates per session.
preds['candidates'] = preds['candidates'].apply(lambda ranked: ranked[:20])

In [None]:
# Pad every session's candidate list up to 20 items with the most frequent
# test aids of the event type that matches the target.
dfs = load_sessions("../output/test_parquet/*")

# Value of dfs["type"] used for each target; any other target uses type 0.
type_by_target = {"gt_carts": 1, "gt_orders": 2}
event_type = type_by_target.get(config.target, 0)
top = dfs.loc[dfs["type"] == event_type, "aid"].value_counts().index.values[:20].tolist()


def _pad_with_top(cands, fillers, k=20):
    """Return `cands` as a list, extended towards `k` items with unseen `fillers`.

    Fillers already present in `cands` are skipped so that padding never
    introduces a duplicate; the order of `cands` and of `fillers` is kept.
    """
    cands = list(cands)
    seen = set(cands)
    unseen = [aid for aid in fillers if aid not in seen]
    return cands + unseen[: max(k - len(cands), 0)]


preds['candidates'] = preds['candidates'].apply(lambda x: _pad_with_top(x, top))

# Drop the sessions frame and clear pending GPU deallocations.
del dfs
numba.cuda.current_context().deallocations.clear()
gc.collect()

### Save

In [None]:
# Version tags that name the output folder: LOG_PATH + "<FT_VERSION>.<MODEL_VERSION>/".
# NOTE(review): VERSION in the inference cell is "cv7+-tv5.12" while FT_VERSION
# is "cv7+-tv5.11" - confirm the mismatch is intended.
FT_VERSION = "cv7+-tv5.11"
MODEL_VERSION = "1"

In [None]:
# Output folder for this blend, named after the feature and model versions.
log_folder_2 = LOG_PATH + f"{FT_VERSION}.{MODEL_VERSION}/"

# Create the folder if needed and save the last loaded config alongside the outputs.
os.makedirs(log_folder_2, exist_ok=True)
save_config(config, log_folder_2 + 'config')

In [None]:
# Build the submission frame for the current target and write it to disk.
sub = preds[['session', 'candidates']].copy()
assert len(sub) == 1671803  # sanity check on the number of rows

sub_path = log_folder_2 + f'sub_{TARGET}.csv'

# Key is "<session>_<suffix>", where the suffix is TARGET without its first three characters.
sub['session'] = sub['session'].astype(str) + "_" + TARGET[3:]
# Labels are the candidates joined with single spaces.
sub['candidates'] = sub['candidates'].parallel_apply(
    lambda aids: " ".join(str(aid) for aid in aids)
)
sub.columns = ["session_type", "labels"]

sub.to_csv(sub_path, index=False)
print(f"-> Saved sub to {sub_path}\n")

display(sub.head())

In [None]:
# Once a per-target submission exists for every class, stack them into the
# final submission file.
sub_paths = [log_folder_2 + f'sub_gt_{c}.csv' for c in CLASSES]
final_path = log_folder_2 + "submission.csv"

if all(os.path.exists(path) for path in sub_paths):
    sub_final = cudf.concat(
        [cudf.read_csv(path) for path in sub_paths], ignore_index=True
    )

    assert len(sub_final) == 5015409  # sanity check on the total number of rows
    sub_final.to_csv(final_path, index=False)

    print(f"\n-> Saved final sub to {final_path}\n")

    display(sub_final.sample(5))

In [None]:
# kaggle competitions submit -c otto-recommender-system -f submission.csv -m "Message"

Done !