**About :** Blend & evaluate models.

In [1]:
cd ../src

/workspace/kaggle_otto_rs/src


In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

In [4]:
import os
import gc
import cudf
import json
import glob
import numba
import xgboost
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from tqdm import tqdm
from numerize.numerize import numerize
from pandarallel import pandarallel

warnings.simplefilter(action="ignore", category=FutureWarning)
pd.options.display.max_columns = 500
pd.options.display.max_rows = 500
pandarallel.initialize(nb_workers=32, progress_bar=False)

INFO: Pandarallel will run on 32 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [5]:
from params import *

from utils.metrics import get_coverage, evaluate
from utils.plot import plot_importances
from utils.load import *
from utils.logger import *

from inference.boosting import inference

### Inf

In [6]:
# EXP_FOLDER = "../logs/2023-01-25/15/"
EXP_FOLDER = "../logs/2023-01-28/19/"
#     ["../logs/2023-01-28/18/", "../logs/2023-01-28/17/"]   # CARTS Pos ratio 0.2  - CV xx
#         ["../logs/2023-01-28/19/", "../logs/2023-01-28/20/"]   # CARTS Pos ratio 0.2  - CV xx

# VERSION = "cv7+-tv5.12_2"  # 
VERSION = "cv7+-tv5.12_cb"  #  0.669248  0.671884  0.668683  0.66967


REGEX = f"../output/features/fts_val_{VERSION}/*"
TEST_REGEX = None  # f"../output/features/fts_test_{VERSION}/*"

In [7]:
# inference(REGEX, TEST_REGEX, EXP_FOLDER, debug=False, save=True)  # CV 0.6696  //  0.6692 - 0.6716 - 0.6684 - 0.6693

In [8]:
# ft_imp = pd.read_csv("../logs/2023-01-25/13/ft_imp.csv")
# ft_imp = ft_imp[ft_imp['index'].apply(lambda x: "embed" in x or "w2v" in x)]

# list(ft_imp.sort_values('importance')["index"])[:200]
# plot_importances(ft_imp.set_index("index"))

In [9]:
# REGEX = "../output/val_parquet/*"

# sessions = load_sessions(REGEX)
# sessions_g = sessions[["session", "aid"]].groupby('session').agg("count").reset_index()
# sessions_len_1 = sessions_g[sessions_g['aid'] == 1]

# sessions_len_more = sessions_g[sessions_g['aid'] > 1]
# # sessions_g = sessions_g.merge(sessions_g2, on="session", suffixes=('', '_'), how="left")
# # sessions_len_1 = sessions_g.dropna(0)

### Load

#### Train data
- neg sampling could use candidates from lower versions

In [10]:
EXP_FOLDERS = [  # Clicks - 0.5593
    ["../logs/2023-01-20/26/", "../logs/2023-01-20/23/", "../logs/2023-01-20/30/", "../logs/2023-01-20/32/"],
]

WEIGHTS = [1] * len(EXP_FOLDERS)

In [11]:
EXP_FOLDERS = [
#     "../logs/2023-01-24/29/",  # CARTS - Pos ratio 0.2 - Extra_prop 0                CV 0.4420
#     "../logs/2023-01-24/28/",  # ORDER - Pos ratio 0.2 - Extra_prop 0.5              CV 0.6694
#     "../logs/2023-01-24/27/",  # ORDER - Pos ratio 0.5 - Extra_prop 1                CV 0.6696
#     "../logs/2023-01-24/22/",  # ORDER - Pos ratio 0.2 - Extra_prop 0                CV 0.6691
#     "../logs/2023-01-24/17/",  # ORDER - Pos ratio 0.2 - Extra_prop 0.5  rm low imp  CV 0.6694
#     ["../logs/2023-01-25/0/", "../logs/2023-01-25/4/",  "../logs/2023-01-25/6/", "../logs/2023-01-25/7/"],  # CARTS Pos ratio 0.5 - Extra_prop 1.0    CV 0.4424
#     ["../logs/2023-01-26/3/", "../logs/2023-01-26/2/"],                                                     # CARTS Pos ratio 0.5 - Extra_prop 2.0    CV 0.4426
    
#     "../logs/2023-01-25/11/",  # ORDER - Pos ratio 0.5 - Extra_prop 2.0               CV 0.6697
    "../logs/2023-01-25/15/",  # ORDER - Pos ratio 0.25 - Extra_prop *                CV 0.6699
    ## cv7+-tv5.12
#     ["../logs/2023-01-28/7/", "../logs/2023-01-28/30/"],  # ORDER Pos ratio 0.5  - CV 0.6703
#     ["../logs/2023-01-28/22/", "../logs/2023-01-28/29/", "../logs/2023-01-28/26/"],  # ORDER Pos ratio 0.25  - CV 0.6703
    ["../logs/2023-01-28/6/", "../logs/2023-01-28/27/"],   # ORDER Pos ratio 0.2  - CV 0.6704
#     ["../logs/2023-01-28/18/", "../logs/2023-01-28/17/"],   # CARTS Pos ratio 0.2  - CV 0.4436
#     ["../logs/2023-01-28/19/", "../logs/2023-01-28/20/"],   # CARTS Pos ratio 0.5  - CV 0.4436
#     ["../logs/2023-01-28/15/", "../logs/2023-01-28/16/"],  # Clicks  Pos ratio 0.2   - CV 0.5621
#     ["../logs/2023-01-20/26/", "../logs/2023-01-20/23/", "../logs/2023-01-20/30/", "../logs/2023-01-20/32/"],  # 0.5593
]

WEIGHTS = [1] * len(EXP_FOLDERS)

In [102]:
EXP_FOLDERS = [  # 0.4438
    "../logs/2023-01-24/29/",  # CARTS - Pos ratio 0.2 - Extra_prop 0                CV 0.4420
    ["../logs/2023-01-25/0/", "../logs/2023-01-25/4/",  "../logs/2023-01-25/6/", "../logs/2023-01-25/7/"],  # CARTS Pos ratio 0.5 - Extra_prop 1.0    CV 0.4424
    ["../logs/2023-01-26/3/", "../logs/2023-01-26/2/"],                                                     # CARTS Pos ratio 0.5 - Extra_prop 2.0    CV 0.4426
    ["../logs/2023-01-28/18/", "../logs/2023-01-28/17/"],   # CARTS Pos ratio 0.2  - CV 0.4436
    ["../logs/2023-01-28/19/", "../logs/2023-01-28/20/"],   # CARTS Pos ratio 0.5  - CV 0.4436
]

WEIGHTS = [1, 1, 2, 10, 10]  # Carts 0.44376

In [14]:
EXP_FOLDERS = [  # 0.5621
    ["../logs/2023-01-28/15/", "../logs/2023-01-28/16/"],  # Clicks  Pos ratio 0.2   - CV 0.5621
#     ["../logs/2023-01-20/26/", "../logs/2023-01-20/23/", "../logs/2023-01-20/30/", "../logs/2023-01-20/32/"], # 0.5593
]

WEIGHTS = [1]

In [134]:
EXP_FOLDERS = [  # 0.6706
    "../logs/2023-01-25/15/",  # ORDER - Pos ratio 0.25 - Extra_prop *                CV 0.6699
    ## cv7+-tv5.12
    ["../logs/2023-01-28/7/", "../logs/2023-01-28/30/"],  # ORDER Pos ratio 0.5  - CV 0.6703
    ["../logs/2023-01-28/22/", "../logs/2023-01-28/29/", "../logs/2023-01-28/26/"],  # ORDER Pos ratio 0.25  - CV 0.6703
    ["../logs/2023-01-28/6/", "../logs/2023-01-28/27/"],   # ORDER Pos ratio 0.2  - CV 0.6704
]
WEIGHTS = [2, 1, 2, 2]

In [144]:
df_val_ = None
dfs_val_list = []
for exp_folders, w in zip(EXP_FOLDERS, WEIGHTS):
    
    if not isinstance(exp_folders, list):
        exp_folders = [exp_folders]

    dfs_val = []
    for exp_folder in exp_folders:
        config = Config(json.load(open(exp_folder + "config.json", "r")))
        print(f'\t === Exp {exp_folder}\t Target {config.target} ===\n')
        TARGET = config.target
        try:
            print(f"{config.version} - Pos ratio {config.pos_ratio} - Extra_prop {config.extra_prop if config.use_extra else None}\n")
        except:
            print(f"{config.version} - Pos ratio {config.pos_ratio} - Extra_prop None\n")

        for fold in range(config.k):
#             if fold == 3:
#                 continue
            try:
                df_val = cudf.read_parquet(exp_folder + f"df_val_{fold}.parquet")
#                 df_val = df_val.merge(sessions_len_1, on="session", how="left").dropna(0)
#                 df_val = df_val.merge(sessions_len_more, on="session", how="left").dropna(0)

#                 df_val['pred'] = df_val.groupby('session')['pred'].rank()
#                 df_val['pred'] = (df_val['pred'] - df_val['pred'].min()) / (df_val['pred'].max() - df_val['pred'].min())
                df_val['pred'] *= w

                dfs_val.append(df_val)
                if len(EXP_FOLDERS) == 1:
                    print(f'-> Retrieved fold {fold}', end="")
                    evaluate(df_val, TARGET)
            except FileNotFoundError:
                pass

#             break

    dfs_val = cudf.concat(dfs_val, ignore_index=True)
    dfs_val_list.append(dfs_val.sort_values(['session', 'candidates'], ignore_index=True))

#     print('\n ===> CV :')
#     cv = evaluate(dfs_val, TARGET)    
#     print(cv)

    if df_val_ is None: 
        df_val_ = dfs_val.copy()
    else:
        df_val_ = df_val_.set_index(['session', 'candidates']).add(
            dfs_val.set_index(['session', 'candidates']), fill_value=0
        ).reset_index()
        
    

    del dfs_val
    numba.cuda.current_context().deallocations.clear()
    gc.collect()

	 === Exp ../logs/2023-01-25/15/	 Target gt_orders ===

cv7-tv5.11 - Pos ratio 0.25 - Extra_prop 0.0

	 === Exp ../logs/2023-01-28/7/	 Target gt_orders ===

cv7+-tv5.12_cb - Pos ratio 0.5 - Extra_prop 0.0

	 === Exp ../logs/2023-01-28/30/	 Target gt_orders ===

cv7+-tv5.12_cb - Pos ratio 0.5 - Extra_prop 0.0

	 === Exp ../logs/2023-01-28/22/	 Target gt_orders ===

cv7+-tv5.12_cb - Pos ratio 0.25 - Extra_prop 0.0

	 === Exp ../logs/2023-01-28/29/	 Target gt_orders ===

cv7+-tv5.12_cb - Pos ratio 0.25 - Extra_prop 0.0

	 === Exp ../logs/2023-01-28/26/	 Target gt_orders ===

cv7+-tv5.12_cb - Pos ratio 0.25 - Extra_prop 0.0

	 === Exp ../logs/2023-01-28/6/	 Target gt_orders ===

cv7+-tv5.12_cb - Pos ratio 0.2 - Extra_prop 0.0

	 === Exp ../logs/2023-01-28/27/	 Target gt_orders ===

cv7+-tv5.12_cb - Pos ratio 0.2 - Extra_prop 0.0



In [145]:
score = evaluate(df_val_, TARGET, verbose=0)
print(f' ===> CV : {score:.7f}')

 ===> CV : 0.6706136


### Subs

In [60]:
from params import WEIGHTS
import numpy as np

In [82]:
cv = np.average([0.5621, 0.4438, 0.6706], weights=WEIGHTS)  # LB 0.6029
print(f"-> CV : {cv:.4f}")

-> CV : 0.5917


In [80]:
cv = np.average([0.5593, 0.4428, 0.6699], weights=WEIGHTS)  # LB 0.6020
print(f"-> CV : {cv:.4f}")

-> CV : 0.5907


In [63]:
cv = np.average([0.5593, 0.4424, 0.6694], weights=WEIGHTS)  # LB 0.6016
print(f"-> CV : {cv:.4f}")

-> CV : 0.5903


In [64]:
cv = np.average([0.5593, 0.4420, 0.6691], weights=WEIGHTS)  # LB 0.6013
print(f"-> CV : {cv:.4f}")

-> CV : 0.5900


In [65]:
cv = np.average([0.5593, 0.4404, 0.6680], weights=WEIGHTS)  # LB 0.6002
print(f"-> CV : {cv:.4f}")

-> CV : 0.5888


In [66]:
cv = np.average([0.554, 0.4404, 0.6680], weights=WEIGHTS)  # LB 0.5997
print(f"-> CV : {cv:.4f}")  

-> CV : 0.5883


In [67]:
cv = np.average([0.554, 0.4392, 0.6673], weights=WEIGHTS)  # LB 0.5990
print(f"-> CV : {cv:.4f}")

-> CV : 0.5875


In [68]:
cv = np.average([0.554, 0.4382, 0.6668], weights=WEIGHTS)  # LB 0.5984
print(f"-> CV : {cv:.4f}")

-> CV : 0.5869


### Test

In [157]:
df_test = None
for exp_folders, w  in zip(EXP_FOLDERS, WEIGHTS):
    dfs_test = None

    if not isinstance(exp_folders, list):
        exp_folders = [exp_folders]
    
    for exp_folder in exp_folders:
        config = Config(json.load(open(exp_folder + "config.json", "r")))
        print(f'\t === Exp {exp_folder}\t Target {config.target} ===\n')
        TARGET = config.target

        for fold in range(config.k):
            try:
                df_test_ = cudf.read_parquet(exp_folder + f"df_test_{fold}.parquet")
                df_test_['pred'] *= (w / 4)
                print(f'-> Retrieved fold {fold}\n')
                
                if dfs_test is None:
                    dfs_test = df_test_
                else:
                    dfs_test = dfs_test.set_index(['session', 'candidates']).add(
                        df_test_.set_index(['session', 'candidates']), fill_value=0
                    ).reset_index()
                
                del df_test_
                numba.cuda.current_context().deallocations.clear()
                gc.collect()

            except FileNotFoundError:
                pass

    print(f'- Retrieved {len(dfs_test)} test candidates.\n')

    if df_test is None:
        df_test = dfs_test
    else:
        df_test = df_test.set_index(['session', 'candidates']).add(
            dfs_test.set_index(['session', 'candidates']), fill_value=0
        ).reset_index()

    del dfs_test
    numba.cuda.current_context().deallocations.clear()
    gc.collect()

	 === Exp ../logs/2023-01-25/15/	 Target gt_orders ===

-> Retrieved fold 0

-> Retrieved fold 1

-> Retrieved fold 2

-> Retrieved fold 3

- Retrieved 120194976 test candidates.

	 === Exp ../logs/2023-01-28/7/	 Target gt_orders ===

-> Retrieved fold 0

-> Retrieved fold 1

	 === Exp ../logs/2023-01-28/30/	 Target gt_orders ===

-> Retrieved fold 2

-> Retrieved fold 3

- Retrieved 139471776 test candidates.

	 === Exp ../logs/2023-01-28/22/	 Target gt_orders ===

-> Retrieved fold 0

	 === Exp ../logs/2023-01-28/29/	 Target gt_orders ===

-> Retrieved fold 1

	 === Exp ../logs/2023-01-28/26/	 Target gt_orders ===

-> Retrieved fold 2

-> Retrieved fold 3

- Retrieved 139471776 test candidates.

	 === Exp ../logs/2023-01-28/6/	 Target gt_orders ===

-> Retrieved fold 0

	 === Exp ../logs/2023-01-28/27/	 Target gt_orders ===

-> Retrieved fold 2

-> Retrieved fold 3

- Retrieved 139471776 test candidates.



In [70]:
preds = df_test[['session', 'candidates', 'pred']].copy()

preds = preds.sort_values(['session', 'pred'], ascending=[True, False])
preds = preds[['session', 'candidates', 'pred']].groupby('session').agg(list).reset_index()

preds = preds.to_pandas()
preds['candidates'] = preds['candidates'].apply(lambda x: x[:20])

In [71]:
dfs = load_sessions(f"../output/test_parquet/*")

if config.target == "gt_carts":
    top = dfs.loc[dfs["type"] == 1, "aid"].value_counts().index.values[:20].tolist()
elif config.target == "gt_orders":
    top = dfs.loc[dfs["type"] == 2, "aid"].value_counts().index.values[:20].tolist()
else:
    top = dfs.loc[dfs["type"] == 0, "aid"].value_counts().index.values[:20].tolist()

preds['candidates'] = preds['candidates'].apply(lambda x: list(x) + top[: 20 - len(x)])

del dfs
numba.cuda.current_context().deallocations.clear()
gc.collect()

35

### Save

In [72]:
FT_VERSION = "cv7+-tv5.11"
MODEL_VERSION = "1"

In [73]:
log_folder_2 = LOG_PATH + f"{FT_VERSION}.{MODEL_VERSION}/"

os.makedirs(log_folder_2, exist_ok=True)
save_config(config, log_folder_2 + 'config')

In [74]:
sub = preds[['session', 'candidates']].copy()
assert len(sub) == 1671803

sub['candidates'] = sub['candidates'].parallel_apply(lambda x: " ".join(map(str, x)))
sub['session'] =  sub['session'].astype(str) + "_" + TARGET[3:]
sub.columns = ["session_type", "labels"]

sub.to_csv(log_folder_2 + f'sub_{TARGET}.csv', index=False)
print(f"-> Saved sub to {log_folder_2 + f'sub_{TARGET}.csv'}\n")

display(sub.head())

-> Saved sub to ../logs/cv7+-tv5.11.1/sub_gt_orders.csv



Unnamed: 0,session_type,labels
0,12899779_orders,59625 731692 1253524 941596 1790770 737445 448...
1,12899780_orders,1142000 582732 736515 973453 487136 1758603 12...
2,12899781_orders,199008 918667 141736 759436 754412 950718 8110...
3,12899782_orders,1711180 127404 987399 1344773 740494 1494780 7...
4,12899783_orders,1817895 1811433 1729553 573058 58861 408787 14...


In [77]:
if all([os.path.exists(log_folder_2 + f'sub_gt_{c}.csv') for c in CLASSES]):
    sub_final = cudf.concat([
        cudf.read_csv(log_folder_2 + f'sub_gt_{c}.csv') for c in CLASSES
    ], ignore_index=True)

    assert len(sub_final) == 5015409
    sub_final.to_csv(log_folder_2 + f"submission.csv", index=False)

    print(f"\n-> Saved final sub to {log_folder_2 + f'submission.csv'}\n")

    display(sub_final.sample(5))


-> Saved final sub to ../logs/cv7+-tv5.11.1/submission.csv



Unnamed: 0,session_type,labels
537132,13436911_clicks,1587410 484515 1829472 1810431 1597139 825269 ...
2032970,13260946_carts,1544612 1492725 727301 1244717 1797902 1531805...
2966281,14194257_carts,1243313 890996 1230601 779056 1377352 979687 6...
427884,13327663_clicks,825591 500073 876456 1778843 822497 835958 852...
4397794,13953967_orders,961113 1838158 1543589 216668 1001469 234245 1...


In [None]:
# kaggle competitions submit -c otto-recommender-system -f submission.csv -m "Message"

Done !