In [3]:
# Enable IPython's autoreload extension. Mode 2 re-imports all loaded modules
# before each cell runs, so edits to imported modules take effect without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [4]:
# Change the kernel's working directory to the sibling src/ folder.
# NOTE(review): presumably this is what lets the local imports (utils, params,
# trainer) and the relative "../output" paths below resolve - confirm.
cd ../src

/tmp/kaggle/kaggle_otto_rs/src


In [None]:
import os
import ast
import json
import glob
import torch
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from tqdm import tqdm
from datetime import datetime
from collections import Counter

os.environ['CUDA_VISIBLE_DEVICES'] = "0"
os.environ["TOKENIZERS_PARALLELISM"] = "false"
warnings.simplefilter(action='ignore', category=FutureWarning)

In [6]:
# pip install transformers4rec[pytorch,nvtabular]

In [7]:
import nvtabular as nvt

In [8]:
# from data.dataset import OttoDataset
# from data.preparation import prepare_data
# # from training.main import k_fold
# from models import OttoTransformer

from utils.metrics import *
from utils.logger import prepare_log_folder, save_config, create_logger

from params import *

In [9]:
import torch
from nvtabular.loader.torch import TorchAsyncItr, DLDataLoader

from transformers4rec import torch as tr
from merlin_standard_lib import Schema
from transformers4rec.torch.ranking_metric import RecallAt

import nvtabular as nvt
from nvtabular.ops import *
from merlin.schema.tags import Tags

## Data

In [10]:
# # for path in tqdm(glob.glob("../output/train_*.parquet")):
# for path in tqdm(glob.glob("../output/val.parquet")):
#     df = pd.read_parquet(path)
#     print(df['aid'].apply(len).max())
# #     df['target'] = df['type'].apply(lambda x: [CLASSES.index(c) + 1 for c in x])
# #     df.to_parquet(path, index=False)

In [11]:
# Column roles passed to the NVTabular loader in the next cell.
CATEGORICAL_COLUMNS = ["aid"]
CONTINUOUS_COLUMNS = ["ts"]
LABEL_COLUMNS = ["target"]

# Batch size handed to the NVTabular iterator.
BATCH_SIZE = 1

# Wrap the first training shard as an NVTabular dataset (parquet engine).
dataset = nvt.Dataset(["../output/train_0.parquet"], engine="parquet")

In [12]:
# NVTabular iterator over `dataset`, with each column assigned its role
# (categorical / continuous / label) and batched at BATCH_SIZE.
train_dataset = TorchAsyncItr(
    dataset,
    batch_size=BATCH_SIZE,
    cats=CATEGORICAL_COLUMNS,
    conts=CONTINUOUS_COLUMNS,
    labels=LABEL_COLUMNS,
)

In [13]:
# Loader wrapper around the NVTabular iterator.
# batch_size=None: the iterator was already given BATCH_SIZE above
# (NOTE(review): presumably this disables re-batching here, as in
# torch.utils.data.DataLoader - confirm for DLDataLoader).
train_loader = DLDataLoader(
    train_dataset,
    batch_size=None,
    num_workers=0,
    pin_memory=False,
    # collate_fn=collate_fn,
)

In [14]:
# for batch in tqdm(train_loader):
#     batch
#     break
#     continue

### NVT

In [15]:
# tgt = ['target'] >> nvt.ops.AddMetadata(tags=[Tags.CATEGORICAL])
# aid = ['aid'] >> nvt.ops.AddMetadata(tags=[Tags.CATEGORICAL])
# ts = ['ts'] >> nvt.ops.AddMetadata(tags=[Tags.CONTINUOUS])


# # Truncate
# aid_truncated = aid >> nvt.ops.ListSlice(0, 20) >> nvt.ops.Rename(postfix = '_trim') >> TagAsItemID()
# tgt_truncated = tgt >> nvt.ops.ListSlice(0, 20) >> nvt.ops.Rename(postfix = '_trim')
# ts_truncated = ts >> nvt.ops.ListSlice(0, 20) >> nvt.ops.Rename(postfix = '_trim')

# # Select
# selected_features = (
#     aid_truncated +
#     tgt_truncated +
#     ts_truncated
# )

# workflow = nvt.Workflow(selected_features)

In [16]:
# dataset = nvt.Dataset(df, cpu=False)
# workflow.fit(dataset)
# sessions_ds = workflow.transform(dataset)
# sessions_gdf = sessions_ds.to_ddf().compute()

In [17]:
# dataset = nvt.Dataset(glob.glob("../output/train_*.parquet")[:1], engine="parquet", cpu=False)

In [18]:
# new_dataset = workflow.fit_transform(dataset)

# new_dataset.to_parquet("../output/worflow")

### Retrieve Schema

In [19]:
# Path of the schema file describing the input columns. It is loaded with
# Schema().from_proto_text, i.e. as protobuf *text*, despite the .pb suffix.
SCHEMA_PATH = "../output/schema.pb"
# Alternative location (presumably the schema exported next to the NVT
# workflow output - verify before enabling):
# SCHEMA_PATH = "../output/worflow/schema.pbtxt"

In [20]:
# Parse the protobuf-text schema from SCHEMA_PATH, then echo it as the cell
# output so the column definitions (names, value counts, tags) can be inspected.
schema = Schema().from_proto_text(SCHEMA_PATH)
schema

[{'name': 'aid', 'value_count': {'min': '2', 'max': '512'}, 'type': 'INT', 'int_domain': {'name': 'aid', 'max': '1855610', 'is_categorical': True}, 'annotation': {'tag': ['item_id', 'list', 'categorical', 'item']}}, {'name': 'target', 'value_count': {'min': '2', 'max': '512'}, 'type': 'INT', 'int_domain': {'name': 'target', 'min': '1', 'max': '3', 'is_categorical': True}, 'annotation': {'tag': ['list', 'categorical', 'item']}}]

## Inputs

In [21]:
# Sequence input module derived from the schema columns.
#   masking="mlm"            -> masked-language-modelling style masking
#   d_output=100             -> size of the projection applied to the features
#   max_sequence_length=500  -> sequence length the module is configured for
inputs = tr.TabularSequenceFeatures.from_schema(
    schema,
    masking="mlm",
    d_output=100,
    max_sequence_length=500,
    # continuous_projection=64,
)

In [22]:
# Echo the input module to inspect its structure (embedding tables,
# aggregation, projection and masking).
inputs

TabularSequenceFeatures(
  (to_merge): ModuleDict(
    (categorical_module): SequenceEmbeddingFeatures(
      (filter_features): FilterFeatures()
      (embedding_tables): ModuleDict(
        (aid): Embedding(1855611, 64, padding_idx=0)
        (target): Embedding(4, 64, padding_idx=0)
      )
    )
  )
  (_aggregation): ConcatFeatures()
  (projection_module): SequentialBlock(
    (0): DenseBlock(
      (0): Linear(in_features=128, out_features=100, bias=True)
      (1): ReLU(inplace=True)
    )
  )
  (_masking): MaskedLanguageModeling()
)

### Model

In [23]:
# XLNet configuration: hidden size 64, 4 attention heads, 2 layers and a
# total sequence length of 500.
transformer_config = tr.XLNetConfig.build(
    d_model=64,
    n_head=4,
    n_layer=2,
    total_seq_length=500,
)

# Model body, applied in order:
#   1. the sequence input module (features + masking),
#   2. an MLP bringing the features to 64 dims (the transformer's d_model),
#   3. the transformer block, sharing the masking scheme of the inputs.
body = tr.SequentialBlock(
    inputs,
    tr.MLPBlock([64]),
    tr.TransformerBlock(transformer_config, masking=inputs.masking),
)

# Evaluation metric: recall with a top-k cut-off of 20.
metrics = [RecallAt(top_ks=[20], labels_onehot=True)]

In [24]:
# Next-item prediction task, evaluated with the metrics defined above.
next_item_task = tr.NextItemPredictionTask(
    weight_tying=True,
    hf_format=True,
    metrics=metrics,
)

# Head: attaches the prediction task to the model body.
head = tr.Head(body, next_item_task, inputs=inputs)

In [25]:
# Wrap the head into the end-to-end model object that is handed to the Trainer.
model = tr.Model(head)

### Training

In [26]:
from trainer import Trainer
from transformers4rec.config.trainer import T4RecTrainingArguments

In [27]:
# Run switch: True trains the model and saves a checkpoint; False restores
# trainer/model state from an existing checkpoint instead of training.
TRAIN = True

In [28]:
# A run folder is only created when training; otherwise no folder is used.
if TRAIN:
    log_folder = prepare_log_folder(LOG_PATH)
    print(f'Logging results to {log_folder}\n')
else:
    log_folder = None

Logging results to /workspace/logs/2022-11-08/6/



In [29]:
# Training hyperparameters.
# NOTE(review): log_folder is None when TRAIN is False - confirm that
# T4RecTrainingArguments accepts output_dir=None in that case.
train_args = T4RecTrainingArguments(
    # Output / reporting
    output_dir=log_folder,
    report_to=[],
    logging_steps=500,
    # Data loading
    data_loader_engine='nvtabular',
    dataloader_drop_last=True,
    max_sequence_length=500,
    per_device_train_batch_size=128,
    per_device_eval_batch_size=32,
    # Optimisation schedule
    num_train_epochs=1,
    gradient_accumulation_steps=1,
    learning_rate=0.0005,
    lr_scheduler_type='cosine',
    learning_rate_num_cosine_cycles_by_epoch=1,
    # Device
    no_cuda=False,
)

# Trainer tying together the model, the arguments and the input schema.
trainer = Trainer(
    model=model,
    args=train_args,
    schema=schema,
    compute_metrics=True,
)
trainer.reset_lr_scheduler()

In [30]:
# Collect the training shards and the validation file. Sorting is
# lexicographic, so e.g. train_10 comes before train_2.
train_paths = glob.glob("../output/train_*.parquet")
train_paths.sort()
eval_paths = glob.glob("../output/val.parquet")
eval_paths.sort()

# Point the trainer at the files to read for training and evaluation.
trainer.train_dataset_or_path, trainer.eval_dataset_or_path = train_paths, eval_paths

# Show which files were picked up (one list per line).
print(train_paths, eval_paths, sep="\n")

['../output/train_0.parquet', '../output/train_1.parquet', '../output/train_10.parquet', '../output/train_11.parquet', '../output/train_12.parquet', '../output/train_13.parquet', '../output/train_14.parquet', '../output/train_15.parquet', '../output/train_16.parquet', '../output/train_17.parquet', '../output/train_18.parquet', '../output/train_19.parquet', '../output/train_2.parquet', '../output/train_20.parquet', '../output/train_21.parquet', '../output/train_22.parquet', '../output/train_23.parquet', '../output/train_24.parquet', '../output/train_25.parquet', '../output/train_26.parquet', '../output/train_27.parquet', '../output/train_28.parquet', '../output/train_29.parquet', '../output/train_3.parquet', '../output/train_30.parquet', '../output/train_31.parquet', '../output/train_32.parquet', '../output/train_33.parquet', '../output/train_34.parquet', '../output/train_35.parquet', '../output/train_36.parquet', '../output/train_37.parquet', '../output/train_38.parquet', '../output/tr

In [None]:
if not TRAIN:
    # Skip training: restore model and trainer state from a saved checkpoint.
    trainer.load_model_trainer_states_from_checkpoint('/workspace/logs/2022-11-08/3/checkpoint-1562')
else:
    # Train, then persist the model (including its class) and a checkpoint.
    trainer.train()
    trainer._save_model_and_checkpoint(save_model_class=True)

***** Running training *****
  Num examples = 11098496
  Num Epochs = 1
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 86707


Step,Training Loss
500,12.9998
1000,11.4678
1500,10.8296
2000,10.4814
2500,10.2823
3000,10.0426
3500,9.8476
4000,9.3917
4500,8.9539
5000,8.6408


Saving model checkpoint to /workspace/logs/2022-11-08/6/checkpoint-500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /workspace/logs/2022-11-08/6/checkpoint-1000
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /workspace/logs/2022-11-08/6/checkpoint-2000
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /workspace/logs/2022-11-08/6/checkpoint-2500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /workspace/logs/2022-11-08/6/checkpoint-3000
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /workspace/logs/2022-11-08/6/checkpoint-3500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /workspace/logs/2022-11-08/6/checkpoint-4000
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving 

In [None]:
# Evaluate on the validation file(s); reported metric names get the 'eval'
# prefix. (The variable keeps the name `train_metrics`, which the next cell reads.)
train_metrics = trainer.evaluate(
    eval_dataset=eval_paths,
    metric_key_prefix='eval',
)

In [38]:
# Print every metric as " name = value", ordered alphabetically by name.
for key, value in sorted(train_metrics.items()):
    print(f" {key!s} = {value!s}")

 eval_/loss = 14.359477043151855
 eval_/next-item/recall_at_20 = 0.009424999356269836
 eval_runtime = 868.6284
 eval_samples_per_second = 230.248
 eval_steps_per_second = 7.195


Done