In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
cd ../src

/tmp/kaggle/kaggle_otto_rs/src


In [3]:
import os
import ast
import json
import glob
import torch
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import nvtabular as nvt
import matplotlib.pyplot as plt

from tqdm import tqdm
from datetime import datetime
from collections import Counter

# Set CUDA_VISIBLE_DEVICES to "0" for this process.
# NOTE(review): this runs after `torch` and `nvtabular` were imported above; the
# variable is presumably only honoured if CUDA has not been initialised yet - confirm.
os.environ['CUDA_VISIBLE_DEVICES'] = "0"
# Set TOKENIZERS_PARALLELISM to "false" (presumably read by the HF tokenizers library - verify).
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Ignore every FutureWarning raised from here on.
warnings.simplefilter(action='ignore', category=FutureWarning)

In [4]:
# from utils.tokenizers import update_tokenizers
# package_path = "/opt/conda/lib/python3.8/site-packages/transformers"
# input_dir = "../input/deberta_fast_tokenizer"

# update_tokenizers(package_path, input_dir)

In [5]:
from data.dataset import OttoDataset
from data.preparation import prepare_data
# from training.main import k_fold
from models import OttoTransformer

from utils.metrics import *
from utils.logger import prepare_log_folder, save_config, create_logger

from params import *

In [6]:
from nvtabular.ops import *
from merlin.schema.tags import Tags
from merlin_standard_lib import Schema
from transformers4rec import torch as tr
from transformers4rec.torch.ranking_metric import RecallAt
from nvtabular.loader.torch import TorchAsyncItr, DLDataLoader

from trainer import Trainer
from transformers4rec.config.trainer import T4RecTrainingArguments
from transformers4rec.torch.utils.data_utils import NVTabularDataLoader

In [7]:
# Read the train_0 parquet file into a pandas DataFrame.
df = pd.read_parquet("../input/parquets/train_0.parquet")
# Keep a full copy of the frame under `df_ref`.
# NOTE(review): `df_ref` is rebound to a path string in the "Data" section below,
# so this copy looks unused - confirm before removing it.
df_ref = df.copy()

In [None]:
import math

def convert_size(size_bytes):
    """
    Format a byte count as a human-readable string using 1024-based units.

    Args:
        size_bytes (int or float): Non-negative number of bytes.

    Returns:
        str: "0B" for zero, otherwise "<value> <unit>" where the value is rounded
            to two decimals, e.g. "1.5 KB". Values of 1024**9 bytes and above are
            expressed in the largest unit ("YB").

    Raises:
        ValueError: If size_bytes is negative.
    """
    if size_bytes == 0:
        return "0B"
    if size_bytes < 0:
        raise ValueError(f"size_bytes must be non-negative, got {size_bytes}")

    size_name = ("B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB")

    # Pick the unit with exact comparisons instead of a floating-point log:
    # this cannot be off by one at powers of 1024, keeps sub-byte values in "B"
    # (a negative index would silently select "YB") and never runs past the
    # last unit.
    i = 0
    while i < len(size_name) - 1 and size_bytes >= 1024 ** (i + 1):
        i += 1

    p = math.pow(1024, i)
    s = round(size_bytes / p, 2)
    return "%s %s" % (s, size_name[i])

## Data

In [70]:
# Path of the parquet file handed to the Merlin `Dataset` in the next cell.
# NOTE: this rebinds `df_ref`, which held a DataFrame copy earlier in the notebook.
df_ref = "../input/parquets/train_0.parquet"

In [71]:
from merlin.loader.torch import Loader 
from merlin.io import Dataset

# Wrap the parquet file referenced by `df_ref` in a Merlin dataset, then iterate
# over it in its stored order, 32 rows at a time.
train_ds = Dataset(df_ref)
train_dl_merlin = Loader(train_ds, batch_size=32, shuffle=False)

In [72]:
%%time
# Pull the first batch out of the loader (the cell is timed by `%%time` above).
batch = next(iter(train_dl_merlin))

CPU times: user 45.7 s, sys: 2.65 s, total: 48.4 s
Wall time: 49.2 s


In [73]:
def process_batch(batch):
    """
    Split the ragged columns of a loader batch into one tensor per row.

    Columns delivered as a ``(values, offsets)`` pair are cut into per-row
    tensors; every other column is passed through untouched. A ``"len"`` entry
    holding the row lengths is added from the first ragged column encountered.

    Args:
        batch (dict): Maps a column name to either a tensor or a
            ``(values, offsets)`` pair. Assumes ``offsets`` holds the start
            position of every row inside ``values`` (first entry 0, no trailing
            end offset) - TODO confirm against the loader version in use.

    Returns:
        dict: For ragged columns a list of 1D tensors (one per row), for the
            other columns the original value, plus ``"len"`` (list of int) when
            at least one ragged column is present.
    """
    processed_batch = {}

    for k, col in batch.items():
        # Decide on the column type explicitly instead of relying on a bare
        # `except` around tuple unpacking: that also swallowed unrelated errors
        # and mistook a plain tensor with exactly two rows for a pair.
        is_ragged = isinstance(col, (tuple, list)) and len(col) == 2
        if not is_ragged:
            processed_batch[k] = col
            continue

        values, offsets = col

        # The first offset is the start of row 0, so only the remaining ones are
        # split points. `flatten` (not `squeeze`) keeps the result 1D even with
        # a single split point: a 0-d tensor would be read by `tensor_split` as
        # a number of sections instead of an index.
        split_points = offsets[1:].cpu().long().flatten()
        rows = list(values.tensor_split(split_points))
        processed_batch[k] = rows

        if "len" not in processed_batch:
            processed_batch["len"] = [row.size(0) for row in rows]

    return processed_batch

In [74]:
%%time
# Process the element at index 0 of the loader batch (the column mapping).
processed_batch = process_batch(batch[0])

CPU times: user 6.78 ms, sys: 353 µs, total: 7.13 ms
Wall time: 6.3 ms


In [75]:
%%time

# Flatten the whole loader into one per-row store: processed_batches[k][i] is
# the entry of column k for the i-th row yielded by the loader.
processed_batches = {}

for batch in tqdm(train_dl_merlin):
    processed_batch = process_batch(batch[0])

    for k, col in processed_batch.items():
        # `process_batch` returns lists for ragged columns and "len", but passes
        # other values through unchanged. Using `+=` on those summed tensors
        # element-wise (in place) instead of concatenating them, so turn them
        # into a list of rows first.
        rows = col if isinstance(col, list) else list(col)
        processed_batches.setdefault(k, []).extend(rows)

100%|██████████| 8125/8125 [02:25<00:00, 55.80it/s] 

CPU times: user 2min 12s, sys: 13 s, total: 2min 25s
Wall time: 2min 25s





In [128]:
# x = np.arange(1, 500)
# plt.plot(x, 5 * np.log(x))
# plt.grid(True)
# plt.plot(x, x / 3)

In [133]:
def len_sampler(lens, batch_size=32, drop_last=False):
    """
    Group sample indices into batches of similar sequence length.

    Every index is assigned to a bucket keyed by ``int(5 * log(length))``. As
    soon as a bucket holds ``batch_size`` indices it is emitted as a batch.
    Indices left in partially filled buckets are then concatenated in
    increasing bucket order and chunked into batches of ``batch_size``.

    Args:
        lens (iterable of int): Sequence length of every sample, in sample order.
        batch_size (int, optional): Number of indices per batch. Defaults to 32.
        drop_last (bool, optional): Whether to drop the final incomplete batch.
            Defaults to False.

    Returns:
        list of list of int: Batches of sample indices.
    """
    batches = []
    # One independent list per bucket, created on demand. A dict avoids both the
    # aliasing of `[[]] * n` (n references to the same list) and a hard cap on
    # the number of buckets.
    buckets = {}

    for idx, len_ in enumerate(lens):
        # log(0) is -inf and cannot be converted to int: treat empty sequences
        # like length-1 ones so they land in bucket 0.
        bucket_id = int(5 * np.log(max(len_, 1)))
        bucket = buckets.setdefault(bucket_id, [])
        bucket.append(idx)

        if len(bucket) == batch_size:
            batches.append(bucket)
            buckets[bucket_id] = []

    # Flush what is left, walking the buckets from shortest to longest so that
    # neighbouring leftovers still have comparable lengths.
    leftover = [idx for bucket_id in sorted(buckets) for idx in buckets[bucket_id]]

    batch = []
    for idx in leftover:
        batch.append(idx)
        if len(batch) == batch_size:
            batches.append(batch)
            batch = []

    if len(batch) > 0 and not drop_last:
        batches.append(batch)

    return batches

In [134]:
# Build length-bucketed batches of row indices: batch size 32, last partial batch kept.
batches = len_sampler(processed_batches['len'], 32, False)

In [137]:
# List the columns accumulated from the loader.
processed_batches.keys()

dict_keys(['aid', 'len', 'ts', 'type', 'labels_clicks', 'session', 'labels_carts_0', 'labels_orders_0', 'labels_carts_1', 'labels_orders_1', 'labels_carts_2', 'labels_orders_2', 'labels_carts_3', 'labels_orders_3', 'labels_carts_4', 'labels_orders_4', 'labels_carts_5', 'labels_orders_5', 'labels_carts_6', 'labels_orders_6', 'labels_carts_7', 'labels_orders_7', 'labels_carts_8', 'labels_orders_8', 'labels_carts_9', 'labels_orders_9', 'labels_carts_10', 'labels_orders_10', 'labels_carts_11', 'labels_orders_11', 'labels_carts_12', 'labels_orders_12', 'labels_carts_13', 'labels_orders_13', 'labels_carts_14', 'labels_orders_14', 'labels_carts_15', 'labels_orders_15', 'labels_carts_16', 'labels_orders_16', 'labels_carts_17', 'labels_orders_17', 'labels_carts_18', 'labels_orders_18', 'labels_carts_19', 'labels_orders_19'])

In [139]:
# Assemble model inputs/targets for the FIRST length-bucketed batch only (the
# loop stops at the `break` below).
# NOTE: `lens`, `min_len`, `x` and `y` are notebook globals; `x` and `y` are
# inspected in the following cells.
min_lens = []
for batch in tqdm(batches):
    # Lengths of the sequences whose indices are in this batch.
    lens = [processed_batches['len'][idx] for idx in batch]

    # Every sequence is truncated to the shortest one so they can be stacked.
    min_len = np.min(lens)
    min_lens.append(min_len)

    # Debug helper (disabled): stack every column to find the ones that fail.
#     for k in processed_batches:
#         print(k)
#         x = torch.stack([processed_batches[k][idx][:min_len] for idx in batch])

    # Inputs: the first `min_len` entries of the 'aid', 'ts' and 'type' columns.
    x = {k: torch.stack([processed_batches[k][idx][:min_len] for idx in batch]) for k in ['aid', 'ts', 'type']}
    # Targets: entry at position `min_len - 1` of every remaining column
    # (everything except 'session', 'aid', 'len', 'ts', 'type').
    y = {k: torch.stack([processed_batches[k][idx][min_len - 1] for idx in batch]) for k in processed_batches if k not in ['session', 'aid', 'len', 'ts', 'type']}

    break

  0%|          | 0/8125 [00:00<?, ?it/s]


In [141]:
# Inspect the stacked 'aid' input built for the first batch.
x['aid']

tensor([[ 818697, 1296642, 1407078,  ...,  776384, 1415436,  806996],
        [ 329936, 1506982,  671392,  ...,  624925,  204728,  324620],
        [ 927101, 1727393,  282587,  ...,  101986,  176666,  176666],
        ...,
        [1517226,  200827,  200827,  ..., 1403545,  697270,  489181],
        [ 266237,  266237, 1390154,  ...,  323049, 1278224,  220468],
        [1072406,  372017,  372017,  ..., 1708940, 1070747,  568214]],
       device='cuda:0')

In [142]:
# Inspect the target tensors built for the first batch.
y

{'labels_clicks': tensor([      0, 1099388,  176666,  156247,   31508,  631899, 1363224, 1272510,
         1046491, 1111934, 1478294, 1553122, 1273977, 1460858,  804700,  858693,
         1572293, 1576766, 1447712,  796748,  463079, 1308580, 1787512,  385078,
         1257092,  225619,  226254,    5769, 1364448,  697270,  172465, 1847053],
        device='cuda:0'),
 'labels_carts_0': tensor([1699802., 1357411., 1329897.,       0.,       0.,       0.,       0.,
         1272510.,  835497.,       0.,  231386.,       0.,       0.,       0.,
               0.,       0.,       0.,       0.,       0.,       0.,       0.,
               0.,       0.,       0.,       0.,       0.,       0.,       0.,
               0.,       0.,       0., 1708940.], device='cuda:0',
        dtype=torch.float64),
 'labels_orders_0': tensor([ 806996.,       0.,       0.,       0.,       0.,       0.,       0.,
               0.,  835497.,       0.,       0.,       0.,       0., 1810824.,
               0.,      

In [57]:
# import sys
# for k in processed_batches.keys():
#     sz = convert_size(sys.getsizeof(processed_batches[k]))
#     print(k, sz)

In [105]:
%%time

# Iterate over the whole loader and process every batch without keeping the
# results; only the timing / progress-bar rate is of interest here.
for batch in tqdm(train_dl_merlin):
    processed_batch = process_batch(batch[0])

 41%|████      | 3311/8125 [03:41<05:21, 14.98it/s] 

KeyboardInterrupt



## Old

In [None]:
# Build the train and test frames from DATA_PATH.
# NOTE: this rebinds `df`, which held the train_0 parquet frame earlier in the notebook.
df, df_test = prepare_data(DATA_PATH)

In [None]:
# Validation subset: rows whose 'fold' column equals 0, re-indexed from 0.
df_val = df[df['fold'] == 0].reset_index(drop=True)

In [None]:
# Build a dataset over the first 10000 rows.
# NOTE(review): the first assignment (test rows) is immediately overwritten by
# the second (validation rows), so only the validation dataset is kept. This
# looks like a manual toggle - comment out the line that is not needed.
dataset = OttoDataset(df_test.head(10000), max_len=410, max_trunc=100, train=False, test=True, pad=False)
dataset = OttoDataset(df_val.head(10000), max_len=410, max_trunc=100, train=False, test=False, pad=False)

In [None]:
# Collect the size of the 'ids' tensor for dataset items 0..9999.
# NOTE: `data` keeps the last fetched item after the loop and is reused by the
# cells of the "Model" section below.
# NOTE(review): the hard-coded 10000 assumes the dataset holds at least that many items.
lens = []
for idx in tqdm(range(10000)):
    data = dataset[idx]
    lens.append(data['ids'].size(0))
#     break

# Only plot when more than 100 lengths were collected (i.e. the debugging
# `break` above is disabled).
if len(lens) > 100:
    plt.figure(figsize=(15, 5))
    # Lengths above 70 are clipped to 70, so they all fall in the last bar.
    sns.countplot(x=np.clip(lens, 0, 70))
    plt.show()

In [None]:
# dataset = OttoTrainDataset(df_test, max_len=410, train=False)
# lens = []

# for idx in tqdm(range(10000)):
#     data = dataset[idx]
#     lens.append(data.shape[0])
    
# plt.figure(figsize=(15, 5))
# sns.countplot(x=np.clip(lens, 0, 70))
# plt.show()

In [None]:
# `copy` is not imported anywhere in this notebook; it was presumably leaking in
# through one of the star imports. Import it here so the cell does not depend on that.
import copy

# Score the first 100 targets against a copy of themselves with `recall` at k=20.
# Independent deep copies are passed for both arguments (presumably because
# `recall` may modify its inputs - verify).
# NOTE: this rebinds `y`, which held the target dict built in the "Data" section.
y = [dataset.targets[k] for k in sorted(dataset.targets.keys())]
recall(copy.deepcopy(y[:100]), copy.deepcopy(y[:100]), k=20)

### Model

In [None]:
# Previous experiment, kept for reference:
# model = NERTransformer("microsoft/deberta-v3-base", num_classes=3)
# Instantiate the transformer with the "roberta-base" backbone name, 3 classes and N_IDS ids.
model = OttoTransformer("roberta-base", num_classes=3, n_ids=N_IDS)

In [None]:
# Build a dummy batch from `data` (the last item fetched in the length loop
# above): add a leading dimension, then repeat the sample 16 times along it.
# NOTE: `types` is moved to the GPU here while `x` is not; both are moved in the next cell.
x = data['ids'].unsqueeze(0)
types = data['token_type_ids'].unsqueeze(0).cuda()

x = torch.cat([x] * 16, 0)
types = torch.cat([types] * 16, 0)

In [None]:
# Move the model and both input tensors to the GPU.
model = model.cuda()
x = x.cuda()
types = types.cuda()

In [None]:
# Forward pass on the dummy batch.
pred = model(x, types)

In [None]:
# Check the shape of the model output.
pred.size()

# Training

In [None]:
# Per-backbone training batch size, keyed by model name.
BATCH_SIZES = dict([
    ("microsoft/deberta-v3-base", 32),
    ("microsoft/deberta-v3-large", 32),
])

# Per-backbone learning rate, keyed by model name.
LRS = dict([
    ("microsoft/deberta-v3-base", 3e-5),
    ("microsoft/deberta-v3-large", 3e-5),
])

In [None]:
class Config:
    """
    Experiment configuration, stored as plain class attributes.

    The class is passed as-is to `save_config` and `k_fold` in the cells below.
    `name` must be a key of both `BATCH_SIZES` and `LRS`: they are indexed
    while the class body is evaluated, so an unknown name raises a KeyError.
    """
    # General
    seed = 2222
    device = "cuda"
    
    # Splits
    k = 4
    random_state = 2222
    selected_folds = [0, 1, 2, 3]
    # NOTE(review): absolute path, specific to the machine this was run on.
    folds_file = "/workspace/folds_kgd_4.csv"

    # Architecture
    name = "microsoft/deberta-v3-base"

    pretrained_weights = None 

    no_dropout = False
    use_conv = False
    use_lstm = False
    nb_layers = 1
    nb_ft = 128
    conv_kernel = 5
    # 0 when `no_dropout` is set, 0.1 otherwise (presumably a dropout probability).
    drop_p = 0 if no_dropout else 0.1
    multi_sample_dropout = False

    num_classes = 3
    n_ids = N_IDS

    # Texts
    max_len_train = 410
    max_len = 410

#     extra_data_path = OUT_PATH + "pl_case5/"
    extra_data_path = None  # OUT_PATH + "pl_6/df_pl.csv"

    # Training    
    loss_config = {
        "name": "bce",  # ce, bce
        "smoothing": 0,  # 0.01
        "activation": "sigmoid",  # "sigmoid", "softmax"
    }

    data_config = {
        # Training batch size is looked up per model name; validation uses twice as much.
        "batch_size": BATCH_SIZES[name],
        "val_bs": BATCH_SIZES[name] * 2,
        "use_len_sampler": True,
        # Padding id: 1 when the model name contains "roberta", 0 otherwise.
        "pad_token": 1 if "roberta" in name else 0,
    }

    optimizer_config = {
        "name": "AdamW",
        "lr": 5e-5,
        # Looked up per model name in LRS (presumably the backbone learning rate - verify).
        "lr_transfo": LRS[name],
        "lr_decay": 0.99,
        "warmup_prop": 0.1,
        "weight_decay": 1,
        "betas": (0.5, 0.99),
        "max_grad_norm": 1.,
        # AWP
        "use_awp": False,
        "awp_start_step": 1000,
        "awp_lr": 1,
        # Smaller value when the model name contains "xlarge".
        "awp_eps": 5e-5 if "xlarge" in name else 1e-3,
        "awp_period": 3,
        # SWA
        "use_swa": False,
        "swa_start": 9400,
        "swa_freq": 500,
    }

    gradient_checkpointing = False
    acc_steps = 1
    epochs = 1

    use_fp16 = True

    verbose = 1
    verbose_eval = 1000

In [None]:
# With DEBUG enabled the next cell creates no log folder and saves no config.
DEBUG = True
log_folder = None

In [None]:
# Outside of debug mode: create a log folder under LOG_PATH, save the
# configuration to it and set up a logger writing there.
if not DEBUG:
    log_folder = prepare_log_folder(LOG_PATH)
    print(f"Logging results to {log_folder}")
    save_config(Config, log_folder + "config.json")
    create_logger(directory=log_folder, name="logs.txt")

# Run the k-fold training with the configuration above.
# NOTE(review): the `from training.main import k_fold` import is commented out
# in the import cell at the top of the notebook, so `k_fold` is undefined on a
# fresh kernel - confirm where it should come from.
pred_val, pred_test = k_fold(
    Config,
    df,
    df_test=df_test,
    log_folder=log_folder
)

Done