In [1]:
# Load the autoreload extension and switch it to mode 2, so that edits to
# imported modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# Move into the project's src folder; the local imports and the relative
# "../input" / "../output" paths used below are resolved from there.
# The explicit %cd magic is used instead of the bare `cd` automagic, which only
# works in single-line cells and while no variable named `cd` exists.
# NOTE: the path is relative, so re-running this cell moves up one more level.
%cd ../src

/tmp/kaggle/kaggle_otto_rs/src


In [3]:
import os
import warnings

# Environment variables must be in place before the GPU / tokenizer libraries
# are imported: CUDA_VISIBLE_DEVICES is only honoured if it is set before CUDA
# is initialised, and GPU-backed packages may initialise CUDA at import time.
os.environ['CUDA_VISIBLE_DEVICES'] = "0"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

import ast
import json
import glob
import torch
import numpy as np
import pandas as pd
import seaborn as sns
import nvtabular as nvt
import matplotlib.pyplot as plt

from tqdm import tqdm
from datetime import datetime
from collections import Counter

# Silence FutureWarning messages for the remainder of the notebook.
warnings.simplefilter(action='ignore', category=FutureWarning)

In [4]:
from nvtabular.ops import *
from merlin.schema.tags import Tags
from merlin_standard_lib import Schema
from transformers4rec import torch as tr
from transformers4rec.torch.ranking_metric import RecallAt
from nvtabular.loader.torch import TorchAsyncItr, DLDataLoader

from trainer import Trainer
from transformers4rec.config.trainer import T4RecTrainingArguments

In [5]:
# from data.dataset import OttoDataset
# from data.preparation import prepare_data
# # from training.main import k_fold
# from models import OttoTransformer

from utils.metrics import *
from utils.logger import prepare_log_folder, save_config, create_logger

from params import *

## Data

In [6]:
# # for path in tqdm(glob.glob("../output/train_*.parquet")):
# for path in tqdm(glob.glob("../output/val.parquet")):
#     df = pd.read_parquet(path)
#     print(df['aid'].apply(len).max())
# #     df['target'] = df['type'].apply(lambda x: [CLASSES.index(c) + 1 for c in x])
# #     df.to_parquet(path, index=False)

In [7]:
# Single training shard, opened as an NVTabular dataset for a quick loader check.
dataset = nvt.Dataset(
    ["../input/parquets/train_0.parquet"],
    engine="parquet",
)

# Column roles passed to the data loader below.
CONTINUOUS_COLUMNS = ["ts"]
CATEGORICAL_COLUMNS = ["aid"]
LABEL_COLUMNS = ["target"]

# Batch size used by the loader built in the next cells.
BATCH_SIZE = 1

In [8]:
# Wrap `dataset` in NVTabular's torch iterator, declaring which columns are
# categorical, continuous and labels, and how many rows go into each batch.
train_dataset = TorchAsyncItr(
    dataset,
    batch_size=BATCH_SIZE,
    cats=CATEGORICAL_COLUMNS,
    conts=CONTINUOUS_COLUMNS,
    labels=LABEL_COLUMNS,
)

In [9]:
# Loader around the iterator above. batch_size=None is passed here —
# presumably because the iterator already yields full batches; confirm against
# the NVTabular loader documentation.
train_loader = DLDataLoader(
    train_dataset,
    num_workers=0,
    pin_memory=False,
    batch_size=None,
    # collate_fn=collate_fn,
)

In [10]:
# for batch in tqdm(train_loader):
#     batch
#     break
#     continue

### NVT

In [11]:
# tgt = ['target'] >> nvt.ops.AddMetadata(tags=[Tags.CATEGORICAL])
# aid = ['aid'] >> nvt.ops.AddMetadata(tags=[Tags.CATEGORICAL])
# ts = ['ts'] >> nvt.ops.AddMetadata(tags=[Tags.CONTINUOUS])


# # Truncate
# aid_truncated = aid >> nvt.ops.ListSlice(0, 20) >> nvt.ops.Rename(postfix = '_trim') >> TagAsItemID()
# tgt_truncated = tgt >> nvt.ops.ListSlice(0, 20) >> nvt.ops.Rename(postfix = '_trim')
# ts_truncated = ts >> nvt.ops.ListSlice(0, 20) >> nvt.ops.Rename(postfix = '_trim')

# # Select
# selected_features = (
#     aid_truncated +
#     tgt_truncated +
#     ts_truncated
# )

# workflow = nvt.Workflow(selected_features)

In [12]:
# dataset = nvt.Dataset(df, cpu=False)
# workflow.fit(dataset)
# sessions_ds = workflow.transform(dataset)
# sessions_gdf = sessions_ds.to_ddf().compute()

In [13]:
# dataset = nvt.Dataset(glob.glob("../output/train_*.parquet")[:1], engine="parquet", cpu=False)

In [14]:
# new_dataset = workflow.fit_transform(dataset)

# new_dataset.to_parquet("../output/worflow")

### Retrieve Schema

In [15]:
# Schema file that is parsed in the next cell.
# Alternative location, kept for reference:
# SCHEMA_PATH = "../output/worflow/schema.pbtxt"
SCHEMA_PATH = '../output/schema.pb'

In [16]:
# Parse the schema file at SCHEMA_PATH into a Schema object. The bare `schema`
# expression on the last line makes the notebook display its contents.
schema = Schema().from_proto_text(SCHEMA_PATH)
schema

[{'name': 'aid', 'value_count': {'min': '2', 'max': '512'}, 'type': 'INT', 'int_domain': {'name': 'aid', 'max': '1855610', 'is_categorical': True}, 'annotation': {'tag': ['item_id', 'list', 'categorical', 'item']}}, {'name': 'target', 'value_count': {'min': '2', 'max': '512'}, 'type': 'INT', 'int_domain': {'name': 'target', 'min': '1', 'max': '3', 'is_categorical': True}, 'annotation': {'tag': ['list', 'categorical', 'item']}}]

## Inputs

In [17]:
# Build the sequential input block from the schema: sequences capped at
# 500 positions, features projected to 100 dimensions, and masked language
# modeling ("mlm") as the masking scheme.
inputs = tr.TabularSequenceFeatures.from_schema(
    schema,
    masking="mlm",
    d_output=100,
    max_sequence_length=500,
    # continuous_projection=64,
)

In [18]:
# Display the module tree of the input block built above.
inputs

TabularSequenceFeatures(
  (to_merge): ModuleDict(
    (categorical_module): SequenceEmbeddingFeatures(
      (filter_features): FilterFeatures()
      (embedding_tables): ModuleDict(
        (aid): Embedding(1855611, 64, padding_idx=0)
        (target): Embedding(4, 64, padding_idx=0)
      )
    )
  )
  (_aggregation): ConcatFeatures()
  (projection_module): SequentialBlock(
    (0): DenseBlock(
      (0): Linear(in_features=128, out_features=100, bias=True)
      (1): ReLU(inplace=True)
    )
  )
  (_masking): MaskedLanguageModeling()
)

### Model

In [19]:
# XLNet configuration: 64-dim hidden size, 4 attention heads, 2 layers,
# sequences of up to 500 positions.
transformer_config = tr.XLNetConfig.build(
    d_model=64,
    n_head=4,
    n_layer=2,
    total_seq_length=500,
)

# Model body, applied in order: input features (with masking), an MLP layer of
# width 64, then the transformer block sharing the inputs' masking module.
body = tr.SequentialBlock(
    inputs,
    tr.MLPBlock([64]),
    tr.TransformerBlock(
        transformer_config,
        masking=inputs.masking,
    ),
)

# Evaluation metric: Recall@20 computed on one-hot labels.
metrics = [RecallAt(top_ks=[20], labels_onehot=True)]

In [20]:
# Prediction head: next-item prediction on top of `body`, with weight tying,
# HF-style outputs and the metrics defined above.
head = tr.Head(
    body,
    tr.NextItemPredictionTask(
        metrics=metrics,
        hf_format=True,
        weight_tying=True,
    ),
    inputs=inputs,
)

In [21]:
# Wrap the prediction head into the end-to-end model object that is handed
# to the Trainer further down.
model = tr.Model(head)

### Training

In [22]:
# Switch for the cells below: when True a new log folder is prepared and
# training is run; when False a previously saved checkpoint is loaded instead.
TRAIN = True

In [23]:
# Fallback output directory, used unchanged when training is disabled.
log_folder = "/workspace/logs/"

if TRAIN:
    # Get a folder for this run from the project helper, announce it, and
    # set up the logger for that folder.
    log_folder = prepare_log_folder(LOG_PATH)
    print("Logging results to {}\n".format(log_folder))
    create_logger(log_folder)

Logging results to /workspace/logs/2022-11-09/3/

['../input/parquets/train_0.parquet', '../input/parquets/train_1.parquet', '../input/parquets/train_10.parquet', '../input/parquets/train_11.parquet', '../input/parquets/train_12.parquet', '../input/parquets/train_13.parquet', '../input/parquets/train_14.parquet', '../input/parquets/train_15.parquet', '../input/parquets/train_16.parquet', '../input/parquets/train_17.parquet', '../input/parquets/train_18.parquet', '../input/parquets/train_19.parquet', '../input/parquets/train_2.parquet', '../input/parquets/train_20.parquet', '../input/parquets/train_21.parquet', '../input/parquets/train_22.parquet', '../input/parquets/train_23.parquet', '../input/parquets/train_24.parquet', '../input/parquets/train_25.parquet', '../input/parquets/train_26.parquet', '../input/parquets/train_27.parquet', '../input/parquets/train_28.parquet', '../input/parquets/train_29.parquet', '../input/parquets/train_3.parquet', '../input/parquets/train_30.parquet', '..

***** Running training *****
  Num examples = 11098496
  Num Epochs = 1
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 86707
Saving model checkpoint to /workspace/logs/2022-11-09/3/checkpoint-10000
Trainer.model is not a `PreTrainedModel`, only saving its state dict.


In [24]:
# Training hyperparameters, grouped by concern.
train_args = T4RecTrainingArguments(
    # Output, logging and checkpointing
    output_dir=log_folder,
    report_to=[],
    logging_steps=1000,
    save_steps=10000,
    # Data loading
    data_loader_engine="nvtabular",
    dataloader_drop_last=True,
    max_sequence_length=500,
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
    # Optimisation schedule
    num_train_epochs=1,
    gradient_accumulation_steps=1,
    learning_rate=0.0005,
    lr_scheduler_type="cosine",
    learning_rate_num_cosine_cycles_by_epoch=1,
    # Device
    no_cuda=False,
)

# Project Trainer bound to the model, its arguments and the feature schema.
trainer = Trainer(
    model=model,
    args=train_args,
    schema=schema,
    compute_metrics=True,
)

# Reset the learning-rate scheduler before training starts.
trainer.reset_lr_scheduler()

In [25]:
# Collect the parquet files used for training and evaluation. The sort is
# lexicographic, so train_10 comes before train_2.
train_paths = sorted(glob.glob("../input/parquets/train_*.parquet"))
eval_paths = sorted(glob.glob("../input/parquets/val.parquet"))

# train_paths = ['../input/parquets/train_0.parquet']
# eval_paths = ['../input/parquets/train_0.parquet']

# glob returns an empty list instead of raising when nothing matches (wrong
# working directory, data not generated yet). Fail here with a clear message
# rather than later inside the trainer.
if not train_paths:
    raise FileNotFoundError(
        "No training files match '../input/parquets/train_*.parquet' "
        f"(current directory: {os.getcwd()})"
    )
if not eval_paths:
    raise FileNotFoundError(
        "Validation file '../input/parquets/val.parquet' not found "
        f"(current directory: {os.getcwd()})"
    )

# Tell the trainer where its training and evaluation data live.
trainer.train_dataset_or_path = train_paths
trainer.eval_dataset_or_path = eval_paths

print(train_paths)
print(eval_paths)

In [None]:
if not TRAIN:
    # Skip training and restore model + trainer state from an earlier run.
    # Other checkpoint kept for reference:
    #     trainer.load_model_trainer_states_from_checkpoint('/workspace/logs/2022-11-08/6/checkpoint-86707')
    trainer.load_model_trainer_states_from_checkpoint('/workspace/logs/2022-11-09/0/checkpoint-15620')
else:
    # Train, then save the final model (including its class) and a checkpoint.
    trainer.train()
    trainer._save_model_and_checkpoint(save_model_class=True)

Step,Training Loss
1000,12.2572
2000,10.658
3000,10.0675
4000,9.6723
5000,8.9277
6000,8.3497
7000,8.0527
8000,7.8098
9000,7.6415
10000,7.2982


In [None]:
# Run evaluation on the validation paths. NOTE: despite its name,
# `train_metrics` holds the metrics computed on `eval_paths`, with keys
# prefixed by 'eval'.
train_metrics = trainer.evaluate(eval_dataset=eval_paths, metric_key_prefix='eval')

In [None]:
# Print one " name = value" line per metric, in alphabetical order of name.
for key in sorted(train_metrics):
    print(f" {key!s} = {train_metrics[key]!s}")

Done