In [1]:
# Enable IPython's autoreload extension in mode 2, so edited project modules
# are re-imported automatically instead of requiring a kernel restart.
%load_ext autoreload
%autoreload 2

In [2]:
# Move into the sibling `src` directory (bare `cd` relies on IPython automagic).
# Presumably needed so project-local packages such as `utils` and `params` import — confirm.
cd ../src

/tmp/kaggle/kaggle_otto_rs/src


In [3]:
import os
import ast
import json
import glob
import torch
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from tqdm import tqdm
from datetime import datetime
from collections import Counter

# Process-wide settings: expose only CUDA device 0, set TOKENIZERS_PARALLELISM
# to "false", and ignore FutureWarning messages.
os.environ.update({
    "CUDA_VISIBLE_DEVICES": "0",
    "TOKENIZERS_PARALLELISM": "false",
})
warnings.simplefilter("ignore", FutureWarning)

In [4]:
# pip install transformers4rec[pytorch,nvtabular]

In [8]:
import nvtabular as nvt

In [28]:
# from data.dataset import OttoDataset
# from data.preparation import prepare_data
# # from training.main import k_fold
# from models import OttoTransformer

from utils.metrics import *
from utils.logger import prepare_log_folder, save_config, create_logger

from params import *

In [19]:
import torch
from nvtabular.loader.torch import TorchAsyncItr, DLDataLoader

## Data

In [127]:
def _encode_types(types):
    """Map each class name in `types` to its 1-based position in CLASSES."""
    return [CLASSES.index(name) + 1 for name in types]


# Add a `target` column derived from `type` to every training parquet file.
# Each file is overwritten in place.
for path in tqdm(glob.glob("../output/train_*.parquet")):
    df = pd.read_parquet(path)
    df['target'] = df['type'].apply(_encode_types)
    df.to_parquet(path, index=False)

In [61]:
# Column roles handed to the NVTabular loader.
CATEGORICAL_COLUMNS = ['aid']
CONTINUOUS_COLUMNS = ['ts']
LABEL_COLUMNS = ['target']

# Number of rows requested per batch.
BATCH_SIZE = 1

# Only the first training shard is loaded here.
dataset = nvt.Dataset(
    ["../output/train_0.parquet"],
    engine="parquet",
)

In [62]:
# Wrap the NVTabular dataset in a TorchAsyncItr batch iterator, declaring which
# columns are categorical features, continuous features and labels.
train_dataset = TorchAsyncItr(
    dataset,
    batch_size=BATCH_SIZE,
    cats=CATEGORICAL_COLUMNS,
    conts=CONTINUOUS_COLUMNS,
    labels=LABEL_COLUMNS,
)

In [63]:
# Loader over `train_dataset`, run in the main process (num_workers=0) without
# pinned memory. batch_size=None presumably passes the iterator's own batches
# through unchanged — confirm against the NVTabular loader docs.
train_loader = DLDataLoader(
    train_dataset,
    batch_size=None,
    num_workers=0,
    pin_memory=False,
)

In [65]:
# Pull a single batch from the loader for inspection; the loop is exited
# immediately, leaving the first item bound to `batch`.
for batch in tqdm(train_loader):
    break

  0%|          | 0/200000 [01:39<?, ?it/s]


In [66]:
# Display the batch captured from `train_loader`.
# NOTE(review): this prints every tensor in full — consider showing shapes only.
batch

[{'aid': [tensor([1517085, 1563459, 1309446,   16246, 1781822, 1152674, 1649869,  461689,
            305831,  461689,  362233, 1649869, 1649869,  984597, 1649869,  803544,
           1110941, 1190046, 1760685,  631008,  461689, 1190046, 1650637,  313546,
           1650637,  979517,  351157, 1062149, 1157384, 1841388, 1469630,  305831,
           1110548, 1110548,  305831, 1650114, 1604396, 1009750, 1800933,  495779,
            394655,  495779,  789245,  789245,  366890,  361317, 1700164, 1755597,
            789245,  784978, 1171505,  784978, 1700164,  784978, 1521766, 1725503,
            528847, 1816325,  984597, 1072782,  173702, 1072782, 1407538, 1629651,
           1768568, 1318324, 1840418, 1813509, 1813509,  667924, 1226444,  709550,
            709417, 1225559, 1048044, 1052813, 1225559,  240346, 1582117, 1707783,
           1624436, 1157411,  358305, 1202970,  832192, 1498443,  723931, 1436439,
           1693461, 1206554, 1110741,  346352, 1802050,  154930,  964169,  96416

### Schema

In [123]:
from transformers4rec import torch as tr
from merlin_standard_lib import Schema
from transformers4rec.torch.ranking_metric import RecallAt

In [124]:
# Path of the schema file parsed below with `from_proto_text`.
SCHEMA_PATH = "../output/schema.pb"

# Load the schema and display it as the cell output.
schema = Schema().from_proto_text(SCHEMA_PATH)
schema

[{'name': 'session_id', 'value_count': {'min': '2', 'max': '20'}, 'type': 'INT', 'int_domain': {'name': 'session_id', 'max': '1855610', 'is_categorical': True}, 'annotation': {'tag': ['item_id', 'list', 'categorical', 'item']}}, {'name': 'target', 'value_count': {'min': '2', 'max': '20'}, 'type': 'INT', 'int_domain': {'name': 'target', 'max': '2', 'is_categorical': True}, 'annotation': {'tag': ['list', 'categorical', 'item']}}]

In [125]:
# Input block built from the schema: sequences of up to 20 steps, projected to
# 100 output dimensions, with "mlm" masking.
inputs = tr.TabularSequenceFeatures.from_schema(
    schema,
    max_sequence_length=20,
    d_output=100,
    masking="mlm",
)

In [126]:
# Display the module structure of the input block.
inputs

TabularSequenceFeatures(
  (to_merge): ModuleDict(
    (categorical_module): SequenceEmbeddingFeatures(
      (filter_features): FilterFeatures()
      (embedding_tables): ModuleDict(
        (session_id): Embedding(1855611, 64, padding_idx=0)
        (target): Embedding(3, 64, padding_idx=0)
      )
    )
  )
  (_aggregation): ConcatFeatures()
  (projection_module): SequentialBlock(
    (0): DenseBlock(
      (0): Linear(in_features=128, out_features=100, bias=True)
      (1): ReLU(inplace=True)
    )
  )
  (_masking): MaskedLanguageModeling()
)

### Model

In [117]:
# XLNet configuration built through the Transformers4Rec helper.
transformer_config = tr.XLNetConfig.build(
    d_model=64,
    n_head=4,
    n_layer=2,
    total_seq_length=20,
)

# Model body: input features -> MLP with a single 64-unit layer -> transformer
# block that reuses the masking module of the inputs.
body = tr.SequentialBlock(
    inputs,
    tr.MLPBlock([64]),
    tr.TransformerBlock(transformer_config, masking=inputs.masking),
)

# Evaluation metric: recall at a cut-off of 20.
metrics = [RecallAt(top_ks=[20], labels_onehot=False)]

In [118]:
# Show the output shape reported by the input block.
inputs.output_size()

torch.Size([-1, 20, 100])

In [120]:
# Next-item prediction task with weight tying, scored with `metrics`.
next_item_task = tr.NextItemPredictionTask(
    weight_tying=True,
    hf_format=True,
    metrics=metrics,
)

# Prediction head: the task placed on top of the model body.
head = tr.Head(body, next_item_task, inputs=inputs)

In [121]:
# Wrap the prediction head into the end-to-end model instance.
model = tr.Model(head)

### Training

In [None]:
# Per-checkpoint training batch size and transformer learning rate, keyed by
# model name. Both listed checkpoints currently share the same values.
_MODEL_NAMES = (
    "microsoft/deberta-v3-base",
    "microsoft/deberta-v3-large",
)

BATCH_SIZES = dict.fromkeys(_MODEL_NAMES, 32)
LRS = dict.fromkeys(_MODEL_NAMES, 3e-5)

In [None]:
class Config:
    """
    Run configuration, stored as plain class attributes.

    The class itself (not an instance) is handed to `save_config` and `k_fold`
    further down in the notebook. Values that depend on `name` (batch sizes,
    transformer learning rate, pad token, AWP epsilon) are computed once, when
    the class body is executed.

    NOTE(review): `name` and the text-length fields refer to DeBERTa
    checkpoints, while the cells above build an XLNet-based Transformers4Rec
    model — confirm which of these fields the training code actually reads.
    """
    # General
    seed = 2222
    device = "cuda"
    
    # Splits
    k = 4
    random_state = 2222
    selected_folds = [0, 1, 2, 3]
    folds_file = "/workspace/folds_kgd_4.csv"  # NOTE(review): absolute, machine-specific path

    # Architecture
    name = "microsoft/deberta-v3-base"  # must be a key of BATCH_SIZES and LRS

    pretrained_weights = None 

    no_dropout = False
    use_conv = False
    use_lstm = False
    nb_layers = 1
    nb_ft = 128
    conv_kernel = 5
    drop_p = 0 if no_dropout else 0.1  # dropout probability is forced to 0 when no_dropout is set
    multi_sample_dropout = False

    num_classes = 3
    n_ids = N_IDS  # not defined in this notebook; presumably comes from `from params import *`

    # Texts
    max_len_train = 410
    max_len = 410

#     extra_data_path = OUT_PATH + "pl_case5/"
    extra_data_path = None  # OUT_PATH + "pl_6/df_pl.csv"

    # Training    
    loss_config = {
        "name": "bce",  # ce, bce
        "smoothing": 0,  # 0.01
        "activation": "sigmoid",  # "sigmoid", "softmax"
    }

    data_config = {
        "batch_size": BATCH_SIZES[name],
        "val_bs": BATCH_SIZES[name] * 2,  # validation uses twice the training batch size
        "use_len_sampler": True,
        "pad_token": 1 if "roberta" in name else 0,  # 1 only for names containing "roberta"
    }

    optimizer_config = {
        "name": "AdamW",
        "lr": 5e-5,
        "lr_transfo": LRS[name],  # looked up per model name
        "lr_decay": 0.99,
        "warmup_prop": 0.1,
        "weight_decay": 1,
        "betas": (0.5, 0.99),
        "max_grad_norm": 1.,
        # AWP
        "use_awp": False,
        "awp_start_step": 1000,
        "awp_lr": 1,
        "awp_eps": 5e-5 if "xlarge" in name else 1e-3,  # smaller epsilon for names containing "xlarge"
        "awp_period": 3,
        # SWA
        "use_swa": False,
        "swa_start": 9400,
        "swa_freq": 500,
    }

    gradient_checkpointing = False
    acc_steps = 1
    epochs = 1

    use_fp16 = True

    verbose = 1
    verbose_eval = 1000

In [None]:
# Debug mode is on; the log folder stays unset unless the non-debug branch of
# the training cell assigns one.
DEBUG, log_folder = True, None

In [None]:
# Outside debug mode: obtain a log folder from `prepare_log_folder`, then save
# the config and create a logger inside it. In debug mode `log_folder` keeps
# its previous value.
if not DEBUG:
    log_folder = prepare_log_folder(LOG_PATH)
    print(f"Logging results to {log_folder}")
    save_config(Config, log_folder + "config.json")
    create_logger(directory=log_folder, name="logs.txt")

# NOTE(review): `k_fold` and `df_test` are not defined in any visible cell (the
# `k_fold` import is commented out), and the only visible assignment to `df` is
# the last file read by the parquet-rewrite loop — confirm before running this
# on a fresh kernel.
pred_val, pred_test = k_fold(
    Config,
    df,
    df_test=df_test,
    log_folder=log_folder
)

Done