**About:** Trains MLP (neural network) models on candidate features. The code below builds and fits an MLP, not an XGBoost model.

**TODO**:
- Find a better negative sampling technique?

In [1]:
cd ../src

/workspace/kaggle_otto_rs/src


In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import os
# Expose only GPU "2" to this process. This cell runs before the cell that imports cudf / xgboost.
# NOTE(review): the device index is machine-specific — adjust it for your setup.
os.environ["CUDA_VISIBLE_DEVICES"] = "2"

In [4]:
import os
import gc
import cudf
import json
import glob
import numba
import xgboost
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from tqdm import tqdm
from pandarallel import pandarallel

# Silence FutureWarning messages.
warnings.simplefilter(action="ignore", category=FutureWarning)
# Allow wide / long frames to be displayed without truncation (up to 500 columns and rows).
pd.options.display.max_columns = 500
pd.options.display.max_rows = 500
# Start pandarallel with 32 workers; this enables the `parallel_apply` used in the save cell.
pandarallel.initialize(nb_workers=32, progress_bar=False)

INFO: Pandarallel will run on 32 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [5]:
from params import *

from data.dataset import FeaturesDataset
from model_zoo.mlp import define_model

from utils.load import *
from utils.metrics import get_coverage
from utils.logger import save_config

### Load

In [6]:
# Feature-set version: selects the ../output/features/fts_{split}_{VERSION} folders loaded below.
VERSION = "v3.5"
# VERSION = "v2.5"

In [7]:
# "val": train on the fts_train features and predict on fts_val.
# Any other value (test): train on the fts_val features and predict on fts_test.
MODE = "val"

#### Train data
- Negative sampling could use candidates from lower (earlier) versions.

In [8]:
# Forwarded as `pos_ratio` to load_parquets_cudf_chunks when sampling the training candidates.
POS_RATIO = 0.5
# Target used for sampling. "gt_*" makes Config.target expand to all three gt_ columns.
TARGET = "gt_*"   # "gt_clicks", "gt_carts", "gt_orders", "gt_*"

In [9]:
# Resolve which feature folders play the "train" and "predict" roles for the current mode,
# then sample the training candidates with a single loader call.
if MODE == "val":
    train_regex = f"../output/features/fts_train_{VERSION}/*"
    val_regex = f"../output/features/fts_val_{VERSION}/*"
else:  # Test
    train_regex = f"../output/features/fts_val_{VERSION}/*"
    val_regex = f"../output/features/fts_test_{VERSION}/*"

df_train = load_parquets_cudf_chunks(
    train_regex,
    pos_ratio=POS_RATIO,
    target=TARGET,
    n_chunks=5,
)

100%|██████████| 5/5 [00:26<00:00,  5.23s/it]


In [10]:
# Convert the sampled training frame to pandas.
# Guarded so that re-running this cell after the conversion is a no-op instead of an AttributeError.
if hasattr(df_train, "to_pandas"):
    df_train = df_train.to_pandas()

#### Val data

In [11]:
# Builds a FeaturesDataset on the training frame with "gt_carts" as the only target.
# NOTE(review): `dataset` is not read by any later cell visible here, and this sits under the
# "Val data" heading although it wraps df_train. `columns[5:]` assumes the first five columns
# are not features — confirm, or remove this cell.
dataset = FeaturesDataset(df_train, ["gt_carts"], df_train.columns[5:])

In [12]:
# %%time
# df_val = pd.read_csv(f'../output/fts_train_{VERSION}.csv', nrows=10_000_000)

### Features

In [13]:
# model = define_model("res")
# x = torch.rand(5, 50)
# model(x)

### Train

In [14]:
import cuml
import cudf
from numerize.numerize import numerize
from sklearn.metrics import roc_auc_score

from training.train import fit
from inference.predict import predict
from model_zoo.mlp import define_model
from data.dataset import FeaturesDataset
from utils.load import load_parquets_cudf
from utils.torch import seed_everything, count_parameters


def train(df_train, val_regex, config, log_folder=None):
    """
    Trains a model on candidate features, then scores every candidate file matched by val_regex.

    Args:
        df_train (pandas DataFrame): Training candidates with the config.features and config.target columns.
        val_regex (str): Glob pattern of the parquet files to predict on.
        config (Config): Parameters. Reads seed, mode, model, features, target, the model
            hyper-parameters, the data / loss / optimizer configs, epochs, verbose_eval and use_fp16.
        log_folder (str or None, optional): Folder to log results to. Nothing is saved yet (TODO).
            Defaults to None.

    Returns:
        cudf DataFrame: Scored candidates sorted by (session, candidates). Holds the session,
            candidates and gt_* columns found in the files, plus one pred_* column per target.
    """
    seed_everything(config.seed)

    print(f"\n-------------   Training {config.model.upper()} Model   -------------\n")

    train_dataset = FeaturesDataset(df_train, config.target, config.features)

    # Bound up-front so that test mode, which builds no validation subset, does not hit unbound names below.
    # NOTE(review): fit() has to accept val_dataset=None for test mode to run — confirm.
    df_val, val_dataset = None, None
    if config.mode != "test":
        # Only a subset is used for validation during training (max_n=1 — presumably a single file).
        df_val = load_parquets_cudf(val_regex, max_n=1).to_pandas()
        val_dataset = FeaturesDataset(df_val, config.target, config.features)

    model = define_model(
        name=config.model,
        nb_ft=config.nb_ft,
        d=config.d,
        p=config.p,
        num_layers=config.num_layers,
        num_classes=config.num_classes
    ).cuda()
    model.zero_grad()

    print(f"    -> {numerize(len(df_train))} training candidates")
    if df_val is not None:
        print(f"    -> {numerize(len(df_val))} validation candidates subset")
    print(f"    -> {numerize(count_parameters(model))} trainable parameters\n")

    # The value returned by fit() is not needed: all candidates are re-scored file by file below.
    fit(
        model,
        train_dataset,
        val_dataset,
        config.data_config,
        config.loss_config,
        config.optimizer_config,
        epochs=config.epochs,
        verbose_eval=config.verbose_eval,
        use_fp16=config.use_fp16,
        run=None,
    )

    # Resolve the pattern once so that the count and the inference loop see the same files.
    val_files = glob.glob(val_regex)

    val_candids = sum(len(cudf.read_parquet(f, columns=['gt_orders'])) for f in val_files)
    print(f"\n    -> Inferring {numerize(val_candids)} candidates\n")

    cols = ['session', 'candidates', 'gt_clicks', 'gt_carts', 'gt_orders', 'pred_clicks', 'pred_carts', 'pred_orders']

    dfs = []
    for path in tqdm(val_files):
        dfg = cudf.read_parquet(path)
        dataset = FeaturesDataset(dfg.to_pandas(), None, config.features)
        preds = predict(model, dataset, config.loss_config, batch_size=config.data_config["val_bs"])

        # "gt_clicks" -> "pred_clicks"; column i of the predictions matches target i.
        for i, tgt in enumerate(config.target):
            dfg["pred" + tgt[2:]] = preds[:, i]

        # Keep only the identifier, label and prediction columns to limit memory use.
        dfs.append(dfg[[c for c in cols if c in dfg.columns]])

    results = cudf.concat(dfs, ignore_index=True).sort_values(['session', 'candidates'])

    if config.mode == "test":
        return results

    # Score
    print()
    for tgt in config.target:
        auc = cuml.metrics.roc_auc_score(results[tgt].astype('int32'), results["pred" + tgt[2:]].values)
        print(f'-> {tgt} - AUC : {auc:.4f}')

    # TODO: save results to log_folder when it is provided
    #     results.to_csv(log_folder + "df_val.csv", index=False)

    return results

### Params

In [15]:
# Experiment parameters. Passed to train() as `config` and dumped with save_config() on logged runs.
class Config:
    # Seed passed to seed_everything() at the start of train()
    seed = 100
    version = VERSION
    # train() skips validation and scoring when mode == "test"
    mode = MODE

    # Feature columns handed to FeaturesDataset; nb_ft below is derived from this list
    features = [
        'logspace_w', 'linspace_w', 'linspace_w_t163', 'logspace_w_t163', 'linspace_w_t191', 'logspace_w_t191',

        'matrix_123_temporal_20_mean', 'matrix_123_temporal_20_sum', 'matrix_123_temporal_20_max',
        'matrix_123_temporal_20_logspace_mean', 'matrix_123_temporal_20_logspace_sum', 'matrix_123_temporal_20_logspace_max',
        'matrix_123_temporal_20_linspace_mean', 'matrix_123_temporal_20_linspace_sum', 'matrix_123_temporal_20_linspace_max',
        'matrix_123_type136_20_mean', 'matrix_123_type136_20_sum', 'matrix_123_type136_20_max',
        'matrix_123_type136_20_logspace_mean', 'matrix_123_type136_20_logspace_sum', 'matrix_123_type136_20_logspace_max',
        'matrix_123_type136_20_linspace_mean', 'matrix_123_type136_20_linspace_sum', 'matrix_123_type136_20_linspace_max',
        'matrix_12__20_mean', 'matrix_12__20_sum', 'matrix_12__20_max',
        'matrix_12__20_logspace_mean', 'matrix_12__20_logspace_sum', 'matrix_12__20_logspace_max',
        'matrix_12__20_linspace_mean', 'matrix_12__20_linspace_sum', 'matrix_12__20_linspace_max',
        'matrix_123_type0.590.5_20_mean', 'matrix_123_type0.590.5_20_sum', 'matrix_123_type0.590.5_20_max',
        'matrix_123_type0.590.5_20_logspace_mean', 'matrix_123_type0.590.5_20_logspace_sum', 'matrix_123_type0.590.5_20_logspace_max',
        'matrix_123_type0.590.5_20_linspace_mean', 'matrix_123_type0.590.5_20_linspace_sum', 'matrix_123_type0.590.5_20_linspace_max',
        
        'clicks_popularity_w', 'carts_popularity_w', 'orders_popularity_w',
        'view_popularity_log_w', 'view_popularity_lin_w', 
    
        'clicks_popularity', 'carts_popularity', 'orders_popularity',
        'view_popularity_log', 'view_popularity_lin',
        
        'clicks_popularity_old', 'carts_popularity_old', 'orders_popularity_old',
        'view_popularity_log_old', 'view_popularity_lin_old',

        'candidate_clicks_before', 'candidate_carts_before', 'candidate_orders_before', 'candidate_*_before',
        'n_views', 'n_clicks', 'n_carts', 'n_orders',
    ]

    # A single-element list for a specific TARGET; all three gt_ columns when TARGET is "gt_*"
    target = [TARGET] if TARGET != "gt_*" else ["gt_clicks", "gt_carts", "gt_orders"]
    pos_ratio = POS_RATIO
    
    # Model
    # The values below are forwarded to define_model() in train()
    model = "mlp"
    nb_ft = len(features)
    d = 512
    p = 0.1
    num_layers = 3
    # One model output per target
    num_classes = len(target)

    # Training    
    # Forwarded to fit() and predict()
    loss_config = {
        "name": "bce",
        "smoothing": 0.,
        "activation": "sigmoid",
    }

    # Forwarded to fit(); "val_bs" is also the batch size used by predict() in train()
    data_config = {
        "batch_size": 2 ** 17,
        "val_bs": 2 ** 17,
        "use_balanced_sampler": False,  # TODO
        "use_weighted_sampler": False,  # TODO
        "sampler_weights": [1, 3],  # TODO
    }

    # Forwarded to fit()
    optimizer_config = {
        "name": "Adam",
        "lr": 3e-3,
        "warmup_prop": 0.,
        "betas": (0.9, 0.999),
    }

    epochs = 50  # 70

    use_fp16 = True

    # NOTE(review): `verbose` is not read by train() as shown in this notebook — confirm it is still needed
    verbose = 1
    verbose_eval = 500

### Main

In [16]:
OPTIMIZE = False  # NOTE(review): not read by any cell visible here — confirm it is still needed
TRAIN = True  # NOTE(review): not read by any cell visible here — confirm it is still needed
DEBUG = True  # When True, the training cell creates no log folder and saves no config

#### Train

In [None]:
# Debug runs log nothing; otherwise create a log folder, dump the config there and log to a file.
if DEBUG:
    log_folder = None
else:
    log_folder = prepare_log_folder(LOG_PATH)
    print(f'Logging results to {log_folder}')
    save_config(Config, log_folder + 'config')
    create_logger(directory=log_folder, name="logs.txt")

# Despite its name, df_val holds the scored candidates returned by train(), in val and test mode alike.
df_val = train(df_train, val_regex, Config, log_folder=log_folder)


-------------   Training MLP Model   -------------

    -> 4.88M training candidates
    -> 9.82M validation candidates subset
    -> 208.52K trainable parameters



XGB ES AUC : 0.96893
### Process

In [26]:
N_PREDS = 20  # Number of candidates kept per session and per class

dfs = load_sessions(f"../output/{MODE}_parquet/*")

# One row per session, sorted by session; the per-class columns are added to it positionally below.
preds = df_val[['session']].drop_duplicates(keep="first").sort_values('session', ignore_index=True).to_pandas()

for idx, c in enumerate(CLASSES):
    if "gt_" + c not in Config.target:
        continue

    # Rank the candidates of each session by decreasing score, then gather them into one list per session.
    preds_c = df_val.sort_values(['session', f'pred_{c}'], ascending=[True, False])
    preds_c = preds_c[['session', 'candidates', f'pred_{c}']].groupby('session').agg(list).reset_index()

    preds_c = preds_c.to_pandas()
    preds_c['candidates'] = preds_c['candidates'].apply(lambda x: list(x[:N_PREDS]))

    # Fill less than N_PREDS candidates with the most frequent aids of this type, skipping aids that are
    # already in the list so that no slot is wasted on a duplicate. This should be useless in the future.
    # NOTE(review): assumes the "type" column of dfs is encoded as the position of the class in CLASSES — confirm.
    top = dfs.loc[dfs["type"] == idx, "aid"].value_counts().index.values[:N_PREDS].tolist()
    preds_c['candidates'] = preds_c['candidates'].apply(
        lambda x: x + [aid for aid in top if aid not in x][:N_PREDS - len(x)]
    )

    preds_c = preds_c.sort_values('session')

    # The columns are copied by position, so both frames must list the same sessions in the same order.
    assert np.array_equal(preds_c['session'].values, preds['session'].values), "Sessions are misaligned"
    preds[f"candidates_{c}"] = preds_c["candidates"].values
    preds[f'pred_{c}'] = preds_c[f'pred_{c}'].values

In [28]:
# Drop the session data and the last per-class frame, which are no longer needed.
del dfs, preds_c
# Clear numba's pending CUDA deallocations — NOTE(review): presumably to release GPU memory right away; confirm.
numba.cuda.current_context().deallocations.clear()
gc.collect()

35

### Eval
- 0.6576 5 leaves
- 0.6577 4 leaves
- 0.6575 5 leaves mcw0.01
- 0.6576 5 leaves mcw0.0001
- 0.6576 6 leaves mcw0.0001
- 0.6576 6 leaves mcw0.001
- 0.657. 6 leaves mcw0.01

In [29]:
if MODE != "test":
    gt = pd.read_parquet("../output/val_labels.parquet")

    # Weights are collected together with the recalls so that both lists always have the same length,
    # including when only a subset of the classes was trained.
    # NOTE(review): assumes WEIGHTS is a sequence ordered like CLASSES — confirm in params.
    recalls, weights = [], []
    for col, weight in zip(CLASSES, WEIGHTS):
        if "gt_" + col not in Config.target:
            continue

        # Attach the ground truth of this class once; the check keeps the cell safe to re-run.
        if f"gt_{col}" not in preds.columns:
            preds = preds.merge(gt[gt["type"] == col].drop("type", axis=1), how="left").rename(
                columns={"ground_truth": f"gt_{col}"}
            )

        n_preds, n_gts, n_found = get_coverage(
            preds[f"candidates_{col}"].values, preds[f"gt_{col}"].values
        )

        print(
            f"- {col} \t-  Found {numerize(n_found)} GTs\t-  Recall : {n_found / n_gts :.4f}"
        )
        recalls.append(n_found / n_gts)
        weights.append(weight)

    if recalls:
        # With a subset of the classes, this is the weighted mean over the evaluated classes only.
        cv = np.average(recalls, weights=weights)
        print(f"\n-> CV : {cv:.4f}")

- clicks	-  Found 32.62K GTs	-  Recall : 0.3348
- carts	-  Found 5.53K GTs	-  Recall : 0.1938
- orders	-  Found 3.22K GTs	-  Recall : 0.2094


- orders	-  Found 205.66K GTs	-  Recall : 0.6566

In [30]:
# NOTE(review): this overwrites the `cv` computed by the evaluation cell with hand-entered recalls
# (presumably clicks, carts, orders from earlier runs — confirm). The final submission file name uses `cv`.
cv = np.average([0.5270, 0.4203, 0.6577], weights=WEIGHTS)
# cv = np.average([0.5059, 0.4139, 0.6540], weights=WEIGHTS)
print(f"-> CV : {cv:.4f}")

-> CV : 0.5734


### Save
TODO 

In [None]:
# Display the root folder under which the submission folder is created by the next cell.
LOG_PATH

In [None]:
if MODE == "test":
    # NOTE(review): presumably the number of test sessions (the final cell expects three times as many rows).
    N_TEST_SESSIONS = 1671803

    log_folder = LOG_PATH + f"{VERSION}.0/"
    os.makedirs(log_folder, exist_ok=True)
    save_config(Config, log_folder + 'config')

    # Write one file per trained target. This also covers TARGET = "gt_*", where Config.target
    # holds the three gt_ columns and a single suffix cannot be derived from TARGET itself.
    for tgt in Config.target:
        cls = tgt[3:]  # "gt_clicks" -> "clicks"

        # The process cell stores the ranked candidates of each class in `candidates_{class}`.
        sub = preds[['session', f'candidates_{cls}']].copy()
        sub.columns = ["session_type", "labels"]
        assert len(sub) == N_TEST_SESSIONS

        sub['labels'] = sub['labels'].parallel_apply(lambda x: " ".join(map(str, x)))
        sub['session_type'] = sub['session_type'].astype(str) + "_" + cls

        sub.to_csv(log_folder + f'sub_{tgt}.csv', index=False)
        print(f"-> Saved sub to {log_folder + f'sub_{tgt}.csv'}\n")

        display(sub.head())

In [None]:
# len(cudf.read_csv("../input/sample_submission.csv"))

In [None]:
# Once a per-class submission exists for every class, stack them into the final submission file.
# `log_folder` is only evaluated in test mode, where the previous cell defines it.
if MODE == "test" and all(os.path.exists(log_folder + f'sub_gt_{c}.csv') for c in CLASSES):
    per_class_subs = [cudf.read_csv(log_folder + f'sub_gt_{c}.csv') for c in CLASSES]
    sub_final = cudf.concat(per_class_subs, ignore_index=True)

    assert len(sub_final) == 5015409

    # The file name embeds the CV score held in `cv` at this point.
    final_path = log_folder + f"submission_{cv:.4f}.csv"
    sub_final.to_csv(final_path, index=False)

    print(f"-> Saved final sub to {final_path}\n")

    display(sub_final.sample(5))

Done