In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
cd ../src

/tmp/kaggle/kaggle_otto_rs/src


In [3]:
import os
import ast
import copy
import json
import glob
import warnings
from datetime import datetime
from collections import Counter

import torch
import numpy as np
import pandas as pd
import seaborn as sns
import nvtabular as nvt
import matplotlib.pyplot as plt
from tqdm import tqdm

# Expose only GPU 0 to this process.
# NOTE(review): this is set after `torch` and `nvtabular` have been imported; it
# only takes effect if CUDA was not initialised by those imports — confirm, or
# move it above the imports.
os.environ['CUDA_VISIBLE_DEVICES'] = "0"
# Disable the tokenizers library's own parallelism.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Hide FutureWarning messages for the rest of the session.
warnings.simplefilter(action='ignore', category=FutureWarning)

In [4]:
# from utils.tokenizers import update_tokenizers
# package_path = "/opt/conda/lib/python3.8/site-packages/transformers"
# input_dir = "../input/deberta_fast_tokenizer"

# update_tokenizers(package_path, input_dir)

In [5]:
from data.dataset import OttoDataset
from data.preparation import prepare_data
# from training.main import k_fold
from models import OttoTransformer

from utils.metrics import *
from utils.logger import prepare_log_folder, save_config, create_logger

from params import *

In [6]:
from nvtabular.ops import *
from merlin.schema.tags import Tags
from merlin_standard_lib import Schema
from transformers4rec import torch as tr
from transformers4rec.torch.ranking_metric import RecallAt
from nvtabular.loader.torch import TorchAsyncItr, DLDataLoader

from trainer import Trainer
from transformers4rec.config.trainer import T4RecTrainingArguments

## Data

In [129]:
# Schema file read with `from_proto_text`, i.e. expected in protobuf text format.
SCHEMA_PATH = "../output/schema.pb"
schema = Schema().from_proto_text(SCHEMA_PATH)

In [432]:
# Schema-driven loader over the `dfd` frame.
# NOTE(review): `NVTabularDataLoader` has no explicit import in this notebook,
# and `dfd` is only defined further down (DataLoader section) — this cell relies
# on leftover kernel state / out-of-order execution.
# The positional `2` is presumably the batch size — TODO confirm against the
# `from_schema` signature.
loader = NVTabularDataLoader.from_schema(
        schema,
        nvt.Dataset(dfd),
        2,
        max_sequence_length=500,
)

In [425]:
# Grab the first batch only; `batch` stays bound after the loop so the next
# cell can display it. (The former no-op `batch` expression and the unreachable
# `continue` after `break` were removed.)
for batch in tqdm(loader):
    break

  0%|          | 0/50 [00:02<?, ?it/s]


In [428]:
batch

{'aid': tensor([[ 261360,  668472, 1621134,  776937,   62096,   62096, 1394029,  989212,
          1724520, 1010775, 1307712,  769984,  577040,  577040,  577040, 1405119,
          1405119, 1405119, 1305759, 1621134, 1410370,  510056,  696516, 1207307,
          1037138, 1299190, 1299190, 1299190, 1299190, 1207307, 1003650,  695858,
           696516, 1527102, 1527102, 1527102, 1099895, 1099895,  700402,   11830,
            11830,   11830, 1157882, 1790438, 1157882,  409620, 1621134, 1305759,
          1621134, 1007755,  169432, 1345697,  724596,   38538, 1624652,  170293,
          1391353,  581641, 1391353,  280731, 1267100, 1763549,  606959,  606959,
          1269277,  606959,  724596,  606959, 1430078,  688858, 1534690, 1116621,
          1534690, 1378733, 1378733,  985021,  985021,  307097,  307097,  542780,
            38261, 1572144, 1578676,  257912,  193535, 1656618,  193535, 1336044,
           571762, 1661189,  660231,  660231,  803044, 1684543, 1390364, 1207307,
         

### DataLoader (NVTabular `TorchAsyncItr` + `DLDataLoader`)

In [501]:
# Load one training parquet file and keep an untouched copy, so the working
# frame `df` can be rebuilt later without re-reading from disk.
df = pd.read_parquet("../input/parquets/train_0.parquet")
df_ref = df.copy()

In [503]:
# Number of cart / order label slots kept per position: it is the default
# `max_len` of `crop_pad` and the number of `labels_carts_{i}` /
# `labels_orders_{i}` columns created below.
N = 2

In [514]:
# Work on the first 100 rows only (copied, so `df_ref` is left untouched).
df = df_ref.head(100).copy()

In [515]:
# Zero-fill the NaN entries inside every row's click-label array.
df['labels_clicks'] = df['labels_clicks'].apply(
    lambda clicks: np.where(np.isnan(clicks), 0, clicks)
)
# df['labels_carts'] = df['labels_carts'].apply(lambda x_: [np.where(np.isnan(x), 0, x) for x in x_])
# df['labels_orders'] = df['labels_orders'].apply(lambda x_: [np.where(np.isnan(x), 0, x) for x in x_])

In [516]:
# List of list -> list
def crop_pad(x, max_len=None):
    """
    Crops or right-pads a sequence with zeros so that it has exactly `max_len` items.

    Unlike the previous version, a list is returned in both the crop and the pad
    case (cropping used to return a slice of the input, e.g. a tuple or an array).

    Args:
        x (sequence): Items to crop / pad. Must support `len`, slicing and `list()`.
        max_len (int or None, optional): Target length. Defaults to None, in which
            case the notebook-level `N` is read at call time.

    Returns:
        list: The first `max_len` items of `x`, followed by as many 0s as needed.
    """
    if max_len is None:
        # Resolved lazily so that re-running the cell defining N is enough to
        # change the default; no need to re-define crop_pad.
        max_len = N

    # Slicing never overshoots, so one path handles both cropping and padding.
    head = list(x[:max_len])
    return head + [0] * (max_len - len(head))

# Bring every per-position label list to length N with `crop_pad`, then stack
# them into one float array per row.
for label_col in ('labels_carts', 'labels_orders'):
    df[label_col] = df[label_col].apply(
        lambda rows: np.array([crop_pad(row) for row in rows]).astype(float)
    )

# Expose the i-th label slot of every position as its own column
# (second axis of the stacked arrays — assumes they are 2-D, i.e. no empty rows).
for i in range(N):
    carts_col, orders_col = f'labels_carts_{i}', f'labels_orders_{i}'
    df[carts_col] = df['labels_carts'].apply(lambda arr: arr[:, i])
    df[orders_col] = df['labels_orders'].apply(lambda arr: arr[:, i])

In [517]:
# Columns handed to the data loader, grouped by role.
CONTINUOUS_COLUMNS = ['ts']
CATEGORICAL_COLUMNS = ['target', "aid"]

# One click-label column, then the N cart slots, then the N order slots.
LABEL_COLUMNS = ["labels_clicks"]
LABEL_COLUMNS += [f'labels_carts_{i}' for i in range(N)]
LABEL_COLUMNS += [f'labels_orders_{i}' for i in range(N)]

COLS = [*CONTINUOUS_COLUMNS, *CATEGORICAL_COLUMNS, *LABEL_COLUMNS]
dfd = df[COLS]  #["../input/parquets/train_0.parquet"]

In [518]:
# Wrap the selected columns in an NVTabular dataset for the loaders below.
dataset = nvt.Dataset(dfd)  #["../input/parquets/train_0.parquet"]

# Batch size passed to both `TorchAsyncItr` and `DLDataLoader` below.
BATCH_SIZE = 2

In [519]:
# Batch iterator over `dataset`. Every column in COLS is declared as a sparse
# (list) column capped at 500 entries, with `sparse_as_dense=True` requested.
# NOTE(review): the label columns are passed through `cats` together with the
# categorical features — confirm this is intended for the float label arrays.
# NOTE(review): 500 is the same value as `max_sequence_length` in the Data
# section; consider a shared constant.
train_dataset = TorchAsyncItr(
    dataset,
    cats=CATEGORICAL_COLUMNS + LABEL_COLUMNS,
    conts=CONTINUOUS_COLUMNS,
    sparse_names=COLS,
    sparse_max={c : 500 for c in COLS},
    sparse_as_dense=True,
    batch_size=BATCH_SIZE
)

In [520]:
# DataLoader wrapper around the iterator, run in the main process
# (`num_workers=0`) without pinned memory.
# `collate_fn` keeps only `x[0][0]` of what it receives — presumably the features
# dict of an already-batched item; TODO confirm against the iterator's output.
# NOTE(review): `batch_size` is given here as well as to `TorchAsyncItr`.
train_loader = DLDataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    collate_fn=lambda x: x[0][0],
    pin_memory=False,
    num_workers=0
)

In [521]:
# Grab the first batch only; `batch` stays bound after the loop so the next
# cell can display it. (The former no-op `batch` expression and the unreachable
# `continue` after `break` were removed.)
for batch in tqdm(train_loader):
    break

  0%|          | 0/50 [00:09<?, ?it/s]


In [522]:
batch

{'target': tensor([[1, 1, 1, 1, 1, 1, 2, 2, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      

In [513]:
df.head(1)

Unnamed: 0,session,aid,ts,type,labels_clicks,labels_carts,labels_orders,target,labels_carts_0,labels_orders_0,labels_carts_1,labels_orders_1
0,0,"[1517085, 1563459, 1309446, 16246, 1781822, 11...","[1659304800025, 1659304904511, 1659367439426, ...","[clicks, clicks, clicks, clicks, clicks, click...","[1563459.0, 1309446.0, 16246.0, 1781822.0, 115...","[[1649869.0, 461689.0], [1649869.0, 461689.0],...","[[461689.0, 305831.0], [461689.0, 305831.0], [...","[1, 1, 1, 1, 1, 1, 2, 2, 3, 3, 1, 1, 1, 1, 1, ...","[1649869.0, 1649869.0, 1649869.0, 1649869.0, 1...","[461689.0, 461689.0, 461689.0, 461689.0, 46168...","[461689.0, 461689.0, 461689.0, 461689.0, 46168...","[305831.0, 305831.0, 305831.0, 305831.0, 30583..."


## Old (legacy `OttoDataset` pipeline)

In [None]:
# NOTE(review): rebinds `df`, which the DataLoader section above uses for the
# parquet sample — the two sections cannot share a kernel state safely.
# `DATA_PATH` is not defined in this notebook — presumably from `params`.
df, df_test = prepare_data(DATA_PATH)

In [None]:
# Rows assigned to fold 0, re-indexed from 0.
df_val = df[df['fold'] == 0].reset_index(drop=True)

In [None]:
# NOTE(review): the first assignment is overwritten immediately, so only the
# validation dataset built on the second line is kept — comment out whichever
# variant is not needed.
dataset = OttoDataset(df_test.head(10000), max_len=410, max_trunc=100, train=False, test=True, pad=False)
dataset = OttoDataset(df_val.head(10000), max_len=410, max_trunc=100, train=False, test=False, pad=False)

In [None]:
# Collect the sequence length (`ids.size(0)`) of the first 10000 dataset items.
# NOTE(review): `data` stays bound to the last item after the loop and is reused
# in the Model section (`data['ids']`), so this loop must not become a
# comprehension. The hard-coded 10000 assumes the dataset holds at least that
# many items.
lens = []
for idx in tqdm(range(10000)):
    data = dataset[idx]
    lens.append(data['ids'].size(0))
#     break

# Only plot when more than 100 lengths were collected (e.g. not when the
# debugging `break` above is enabled).
if len(lens) > 100:
    plt.figure(figsize=(15, 5))
    # Lengths are clipped to 70, so everything longer lands in the last bar.
    sns.countplot(x=np.clip(lens, 0, 70))
    plt.show()

In [None]:
# dataset = OttoTrainDataset(df_test, max_len=410, train=False)
# lens = []

# for idx in tqdm(range(10000)):
#     data = dataset[idx]
#     lens.append(data.shape[0])
    
# plt.figure(figsize=(15, 5))
# sns.countplot(x=np.clip(lens, 0, 70))
# plt.show()

In [None]:
# Sanity check of the metric: score the first 100 targets against themselves.
# `copy` was never imported explicitly in this notebook; it is now imported in
# the top imports cell.
# Deep copies are passed — presumably because `recall` may modify its inputs;
# TODO confirm.
y = [dataset.targets[k] for k in sorted(dataset.targets.keys())]
recall(copy.deepcopy(y[:100]), copy.deepcopy(y[:100]), k=20)

### Model

In [None]:
# model = NERTransformer("microsoft/deberta-v3-base", num_classes=3)
# NOTE(review): this uses "roberta-base" while `Config.name` in the Training
# section is "microsoft/deberta-v3-base" — confirm which one is intended.
# `N_IDS` is not defined in this notebook — presumably from `params`.
model = OttoTransformer("roberta-base", num_classes=3, n_ids=N_IDS)

In [None]:
# Build a dummy batch by repeating the last inspected sample 16 times along a
# new leading dimension.
# Both tensors are now left on their current device here (previously only
# `types` was moved to the GPU at this point); the next cell moves the model
# and both inputs to the GPU together.
x = data['ids'].unsqueeze(0)
types = data['token_type_ids'].unsqueeze(0)

x = torch.cat([x] * 16, 0)
types = torch.cat([types] * 16, 0)

In [None]:
# Move the model and both inputs to the GPU.
model = model.cuda()
x = x.cuda()
types = types.cuda()

In [None]:
# Forward pass on the dummy batch.
# NOTE(review): not wrapped in `torch.no_grad()`, so autograd state is kept.
pred = model(x, types)

In [None]:
pred.size()

# Training

In [None]:
# Per-model training hyper-parameters, looked up with the model name
# (`Config.name`). Both DeBERTa-v3 variants currently share the same values.
BATCH_SIZES = dict.fromkeys(
    ("microsoft/deberta-v3-base", "microsoft/deberta-v3-large"),
    32,
)

LRS = dict.fromkeys(
    ("microsoft/deberta-v3-base", "microsoft/deberta-v3-large"),
    3e-5,
)

In [None]:
class Config:
    # Settings for the `k_fold` run launched at the end of the notebook.
    # Everything is a plain class attribute; the class itself (not an instance)
    # is passed to `save_config` and `k_fold`.
    # NOTE(review): `name` below is a DeBERTa checkpoint while the Model section
    # instantiates "roberta-base" — confirm which one is intended.

    # General
    seed = 2222
    device = "cuda"
    
    # Splits
    k = 4
    random_state = 2222
    selected_folds = [0, 1, 2, 3]
    # NOTE(review): absolute, machine-specific path.
    folds_file = "/workspace/folds_kgd_4.csv"

    # Architecture
    # Must be a key of BATCH_SIZES and LRS (both are indexed with it below).
    name = "microsoft/deberta-v3-base"

    pretrained_weights = None 

    no_dropout = False
    use_conv = False
    use_lstm = False
    nb_layers = 1
    nb_ft = 128
    conv_kernel = 5
    # Derived from no_dropout: 0 when it is set, 0.1 otherwise.
    drop_p = 0 if no_dropout else 0.1
    multi_sample_dropout = False

    num_classes = 3
    # N_IDS is not defined in this notebook — presumably from `params`.
    n_ids = N_IDS

    # Texts
    max_len_train = 410
    max_len = 410

#     extra_data_path = OUT_PATH + "pl_case5/"
    extra_data_path = None  # OUT_PATH + "pl_6/df_pl.csv"

    # Training
    loss_config = {
        "name": "bce",  # ce, bce
        "smoothing": 0,  # 0.01
        "activation": "sigmoid",  # "sigmoid", "softmax"
    }

    data_config = {
        "batch_size": BATCH_SIZES[name],
        # Validation uses twice the training batch size.
        "val_bs": BATCH_SIZES[name] * 2,
        "use_len_sampler": True,
        # 1 when the model name contains "roberta", 0 otherwise.
        "pad_token": 1 if "roberta" in name else 0,
    }

    optimizer_config = {
        "name": "AdamW",
        "lr": 5e-5,
        # Per-model value taken from LRS.
        "lr_transfo": LRS[name],
        "lr_decay": 0.99,
        "warmup_prop": 0.1,
        "weight_decay": 1,
        "betas": (0.5, 0.99),
        "max_grad_norm": 1.,
        # AWP
        "use_awp": False,
        "awp_start_step": 1000,
        "awp_lr": 1,
        # Smaller value when the model name contains "xlarge".
        "awp_eps": 5e-5 if "xlarge" in name else 1e-3,
        "awp_period": 3,
        # SWA
        "use_swa": False,
        "swa_start": 9400,
        "swa_freq": 500,
    }

    gradient_checkpointing = False
    acc_steps = 1
    epochs = 1

    use_fp16 = True

    verbose = 1
    verbose_eval = 1000

In [None]:
# With DEBUG set, the training cell below skips creating the log folder,
# saving the config and creating the logger, and `log_folder` stays None.
DEBUG = True
log_folder = None

In [None]:
# Outside of debug mode: create the log folder, save the config and start
# the file logger.
# `LOG_PATH` is not defined in this notebook — presumably from `params`.
if not DEBUG:
    log_folder = prepare_log_folder(LOG_PATH)
    print(f"Logging results to {log_folder}")
    # Plain string concatenation: `log_folder` is expected to end with a path
    # separator — TODO confirm against `prepare_log_folder`.
    save_config(Config, log_folder + "config.json")
    create_logger(directory=log_folder, name="logs.txt")

# NOTE(review): `from training.main import k_fold` is commented out in the
# imports cell, so this call raises NameError on a fresh kernel unless that
# import is restored (or another import provides `k_fold`).
# NOTE(review): `df` / `df_test` must be the frames returned by `prepare_data`;
# the DataLoader section rebinds `df` to a parquet sample.
pred_val, pred_test = k_fold(
    Config,
    df,
    df_test=df_test,
    log_folder=log_folder
)

Done