In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
cd ../src

/tmp/kaggle/kaggle_otto_rs/src


In [3]:
import os
import ast
import json
import glob
import torch
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import nvtabular as nvt
import matplotlib.pyplot as plt

from tqdm import tqdm
from datetime import datetime
from collections import Counter

os.environ['CUDA_VISIBLE_DEVICES'] = "0"
warnings.simplefilter(action='ignore', category=FutureWarning)

In [4]:
from merlin.io import Dataset
from merlin.loader.torch import Loader 

from nvtabular.ops import *
from transformers4rec.torch.ranking_metric import RecallAt

In [5]:
# from data.dataset import OttoDataset
# from data.preparation import prepare_data
# from training.main import k_fold
# from models import OttoTransformer

from utils.metrics import *
from utils.logger import prepare_log_folder, save_config, create_logger

from params import *

In [6]:
# df = pd.read_parquet("../input/parquets/train_0.parquet")
# df.head(1)

## Data
Done :
- Use Merlin Dataloader with big batches
- Use len sampler and create small batches efficiently

TODO:
- Crop randomly
- Ignore length-1 cases
- Leftover batches mix sessions from different length buckets, so cropping to the shortest session may leave very short sequences

In [48]:
# Single shard, handy for quick experiments on one file.
path = "../input/parquets/train_0.parquet"

# All training shards. glob returns matches in arbitrary (filesystem-dependent) order,
# so sort them to make the unshuffled loader iterate the shards reproducibly.
# Note the sort is lexicographic ("train_10" comes before "train_2").
paths = sorted(glob.glob("../input/parquets/train_*.parquet"))

In [49]:
# Single-shard variant for quick tests: Dataset(path)
train_ds = Dataset(paths)

# Read big chunks (2**14 rows) in file order; smaller batches are carved out of
# each chunk further down in the notebook.
train_dl_merlin = Loader(train_ds, batch_size=1 << 14, shuffle=False)

In [50]:
# %%time
# batch = next(iter(train_dl_merlin))

In [51]:
# %%time
# processed_batch = process_batch(batch[0])

In [52]:
def process_batch(batch):
    """
    Splits the ragged columns of a loader batch into one tensor per session.

    Every non-tensor value that unpacks as a pair is treated as a ragged column
    ``(values, offsets)``; everything else is copied to the output unchanged.
    The ragged value tensors are stacked row-wise (in the iteration order of
    ``batch``) and cut along the sequence axis at ``offsets[1:]``.

    Row layout after stacking (assumed from the indexing below - TODO confirm
    against the parquet schema): 0 = aid, 1 = ts, 2 = type, 3 = labels_clicks,
    then alternating labels_carts / labels_orders rows.

    Args:
        batch (dict): Column name -> tensor, or (values, offsets) pair.
            Must contain a non-ragged "session" tensor.

    Returns:
        dict: Pass-through columns plus per-session lists "aid", "ts", "type",
            "labels_clicks", "labels_carts", "labels_orders", the list of
            sequence lengths "len", and "session" converted to a Python list.

    Raises:
        ValueError: If the batch contains no ragged column.
    """
    processed_batch = {}

    to_split = []
    offsets = None
    for k in batch.keys():
        value = batch[k]

        # A dense tensor is never a (values, offsets) pair. Without this guard a
        # tensor with exactly two rows would be silently unpacked as one.
        if torch.is_tensor(value):
            processed_batch[k] = value
            continue

        try:
            values, col_offsets = value
        except (TypeError, ValueError):
            # Not a pair: keep the column as is.
            processed_batch[k] = value
            continue

        to_split.append(values)
        # All ragged columns are assumed to share the same offsets; the last one is used.
        offsets = col_offsets

    if not to_split:
        raise ValueError("process_batch expects at least one ragged (values, offsets) column.")

    stacked = torch.stack(to_split, 0)
    # tensor_split needs its split indices as a 1-D int64 CPU tensor.
    chunks = stacked.tensor_split(offsets[1:].cpu().long().view(-1), dim=1)

    processed_batch["aid"] = [chunk[0] for chunk in chunks]
    processed_batch["ts"] = [chunk[1] for chunk in chunks]
    processed_batch["type"] = [chunk[2] for chunk in chunks]

    processed_batch["labels_clicks"] = [chunk[3] for chunk in chunks]
    processed_batch["labels_carts"] = [chunk[4::2] for chunk in chunks]
    processed_batch["labels_orders"] = [chunk[5::2] for chunk in chunks]

    processed_batch['len'] = [aid.size(0) for aid in processed_batch['aid']]

    processed_batch['session'] = processed_batch['session'].cpu().numpy().tolist()

    return processed_batch

In [53]:
def len_sampler(lens, batch_size=32, drop_last=False, exclude_too_short=True):
    """
    Groups sample indices into batches of similar sequence length.

    Samples are assigned to logarithmic buckets (``int(5 * log(len))``). As soon
    as a bucket holds ``batch_size`` indices it is emitted as a batch. Indices
    left in partially filled buckets are then flattened, longest bucket first,
    and chunked into additional batches that may mix lengths.

    Args:
        lens (iterable of int): Sequence length of each sample.
        batch_size (int, optional): Number of indices per batch. Defaults to 32.
        drop_last (bool, optional): Whether to drop the final incomplete batch. Defaults to False.
        exclude_too_short (bool, optional): Whether to skip samples of length <= 1. Defaults to True.

    Returns:
        list of lists of int: Batches of indices into ``lens``.
    """
    batches = []
    buckets = {}  # bucket id -> indices waiting for the bucket to fill up

    for idx, len_ in enumerate(lens):
        if exclude_too_short and len_ <= 1:
            continue

        # log is undefined below 1 (int(-inf) raises), so empty sequences share
        # bucket 0 with length-1 sequences when they are not excluded.
        bucket_id = int(5 * np.log(len_)) if len_ >= 1 else 0

        bucket = buckets.setdefault(bucket_id, [])
        bucket.append(idx)

        if len(bucket) == batch_size:
            batches.append(bucket)
            buckets[bucket_id] = []

    # Flatten what is left by increasing bucket id, then reverse so the longest
    # sequences come first.
    leftover = [idx for bucket_id in sorted(buckets) for idx in buckets[bucket_id]][::-1]

    batch = []
    for idx in leftover:
        batch.append(idx)
        if len(batch) == batch_size:
            batches.append(batch)
            batch = []

    if batch and not drop_last:
        batches.append(batch)

    return batches

In [54]:
%%time

# Statistics collected over all small batches. Both lists must be (re)initialised
# here so the cell runs on a fresh kernel.
crop_idxs = []
min_lens = []

for merlin_batch in tqdm(train_dl_merlin):
    # Only the first element of the loader output is passed on to process_batch.
    processed_batch = process_batch(merlin_batch[0])

    batches = len_sampler(
        processed_batch['len'],
        batch_size=32,
        drop_last=False,
        exclude_too_short=True
    )

    for batch_idxs in batches:
        lens = [processed_batch['len'][idx] for idx in batch_idxs]

        # Crop every sequence of the batch to the same random length. The upper
        # bound of randint is exclusive, so 1 <= crop_idx < min_len; this needs
        # min_len >= 2, which exclude_too_short=True guarantees.
        min_len = np.min(lens)
        crop_idx = np.random.randint(1, min_len)

        # Inputs: the first crop_idx events of each session.
        x = {
            k: torch.stack([processed_batch[k][idx][:crop_idx] for idx in batch_idxs])
            for k in ['aid', 'ts', 'type']
        }
        # Targets: label entries at position crop_idx - 1, i.e. aligned with the last kept event.
        y = {
            k: torch.stack([processed_batch[k][idx][..., crop_idx - 1] for idx in batch_idxs])
            for k in ['labels_clicks', 'labels_carts', 'labels_orders']
        }

        min_lens.append(min_len)
        crop_idxs.append(crop_idx)

#         break
#     break

100%|██████████| 170/170 [14:19<00:00,  5.05s/it]

CPU times: user 19min 32s, sys: 1min 25s, total: 20min 58s
Wall time: 14min 21s





### Model TODO

In [None]:
# model = NERTransformer("microsoft/deberta-v3-base", num_classes=3)
# NOTE(review): `from models import OttoTransformer` is commented out in the import cell,
# so this line presumably fails on a fresh kernel unless one of the star imports provides
# the class - confirm. N_IDS is presumably supplied by `from params import *`.
model = OttoTransformer("roberta-base", num_classes=3, n_ids=N_IDS)

In [None]:
# Fake a batch of 16 by repeating one sample along a new leading axis.
# NOTE(review): `data` is not defined anywhere in the visible notebook - confirm where it comes from.
x = torch.cat([data['ids'].unsqueeze(0)] * 16, 0)
types = torch.cat([data['token_type_ids'].unsqueeze(0).cuda()] * 16, 0)

In [None]:
# Move the model and both inputs to the GPU.
model, x, types = model.cuda(), x.cuda(), types.cuda()

In [None]:
# Smoke test: one forward pass on the repeated-sample batch.
pred = model(x, types)

In [None]:
# Display the shape of the model output.
pred.size()

# Training

In [None]:
# Backbones that have tuned hyper-parameters; both tables below are keyed by these names.
_TUNED_BACKBONES = (
    "microsoft/deberta-v3-base",
    "microsoft/deberta-v3-large",
)

# Training batch size per backbone.
BATCH_SIZES = {backbone: 32 for backbone in _TUNED_BACKBONES}

# Learning rate per backbone.
LRS = {backbone: 3e-5 for backbone in _TUNED_BACKBONES}

In [None]:
class Config:
    """
    Experiment parameters, passed as a class to `k_fold` and `save_config`.

    Several attributes are computed while the class body executes and therefore
    depend on the order of the statements: `drop_p` reads `no_dropout`, and the
    config dicts read `name` (and through it the BATCH_SIZES / LRS tables).
    """
    # General
    seed = 2222
    device = "cuda"
    
    # Splits
    k = 4
    random_state = 2222
    selected_folds = [0, 1, 2, 3]
    # NOTE(review): absolute path, only valid on the machine the notebook was written on.
    folds_file = "/workspace/folds_kgd_4.csv"

    # Architecture
    # `name` must be a key of both BATCH_SIZES and LRS, otherwise the dicts below raise KeyError.
    name = "microsoft/deberta-v3-base"

    pretrained_weights = None 

    no_dropout = False
    use_conv = False
    use_lstm = False
    nb_layers = 1
    nb_ft = 128
    conv_kernel = 5
    # Dropout probability is forced to 0 when no_dropout is set.
    drop_p = 0 if no_dropout else 0.1
    multi_sample_dropout = False

    num_classes = 3
    n_ids = N_IDS

    # Texts
    # NOTE(review): section name and lengths look inherited from a text project - confirm they
    # still apply to the session data used in this notebook.
    max_len_train = 410
    max_len = 410

#     extra_data_path = OUT_PATH + "pl_case5/"
    extra_data_path = None  # OUT_PATH + "pl_6/df_pl.csv"

    # Training
    loss_config = {
        "name": "bce",  # ce, bce
        "smoothing": 0,  # 0.01
        "activation": "sigmoid",  # "sigmoid", "softmax"
    }

    data_config = {
        "batch_size": BATCH_SIZES[name],
        # Validation uses twice the training batch size.
        "val_bs": BATCH_SIZES[name] * 2,
        "use_len_sampler": True,
        # Padding id is 1 for roberta checkpoints and 0 for everything else.
        "pad_token": 1 if "roberta" in name else 0,
    }

    optimizer_config = {
        "name": "AdamW",
        "lr": 5e-5,
        # Separate learning rate looked up per backbone in LRS.
        "lr_transfo": LRS[name],
        "lr_decay": 0.99,
        "warmup_prop": 0.1,
        "weight_decay": 1,
        "betas": (0.5, 0.99),
        "max_grad_norm": 1.,
        # AWP
        "use_awp": False,
        "awp_start_step": 1000,
        "awp_lr": 1,
        # Smaller epsilon for backbones whose name contains "xlarge".
        "awp_eps": 5e-5 if "xlarge" in name else 1e-3,
        "awp_period": 3,
        # SWA
        "use_swa": False,
        "swa_start": 9400,
        "swa_freq": 500,
    }

    gradient_checkpointing = False
    acc_steps = 1
    epochs = 1

    use_fp16 = True

    verbose = 1
    verbose_eval = 1000

In [None]:
# With DEBUG on, the training cell skips creating a log folder, saving the config and file logging.
DEBUG = True
# Stays None in debug mode; otherwise replaced by the folder returned by prepare_log_folder.
log_folder = None

In [None]:
# Outside of debug mode, create a log folder and store the config and the logs in it.
if not DEBUG:
    log_folder = prepare_log_folder(LOG_PATH)
    print(f"Logging results to {log_folder}")
    # Plain string concatenation: assumes log_folder ends with a path separator - TODO confirm.
    save_config(Config, log_folder + "config.json")
    create_logger(directory=log_folder, name="logs.txt")

# NOTE(review): the `k_fold` import and the cell loading `df` are commented out earlier in the
# notebook, and `df_test` is not defined in any visible cell - this call presumably fails on a
# fresh kernel; confirm before running.
pred_val, pred_test = k_fold(
    Config,
    df,
    df_test=df_test,
    log_folder=log_folder
)

Done