In [1]:
import os
import numpy as np
import pandas as pd
import nvtabular as nvt
from transformers4rec import torch as tr
from transformers4rec.torch.ranking_metric import NDCGAt, MeanReciprocalRankAt, AvgPrecisionAt

# Restrict this process to the first GPU.
# NOTE(review): this assignment runs after the framework imports above; it is
# assumed CUDA has not been initialised yet at that point — confirm.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Input data location and training output directory. Both can be overridden
# through environment variables of the same name; otherwise the relative
# defaults below are used.
DATA_DIR = os.environ.get("DATA_DIR", "./yoochoose_transformed")
TRAIN_DIR = os.environ.get("TRAIN_DIR", "./train_data")

2023-09-20 18:47:50.908357: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  warn(f"Triton dtype mappings did not load successfully due to an error: {exc.msg}")


In [2]:
# Open the first processed partition as an NVTabular dataset; its schema is
# what the rest of the notebook builds on.
processed_part_path = os.path.join(DATA_DIR, "processed_nvt", "part_0.parquet")
train = nvt.Dataset(processed_part_path)

# Keep only the item-id sequence column in the schema.
schema = train.schema.select_by_name(['item_id-list'])

# Bare expression so the notebook renders the selected schema.
schema

Unnamed: 0,name,tags,dtype,is_list,is_ragged,properties.cat_path,properties.embedding_sizes.dimension,properties.embedding_sizes.cardinality,properties.freq_threshold,properties.max_size,properties.num_buckets,properties.domain.min,properties.domain.max,properties.domain.name,properties.value_count.min,properties.value_count.max
0,item_id-list,"(Tags.ITEM, Tags.CATEGORICAL, Tags.LIST, Tags.ID)","DType(name='int64', element_type=<ElementType....",True,True,../yoochoose_transformed/categories/unique.ite...,512.0,52742.0,0.0,0.0,,0,52741,item_id-list,0,20


In [3]:
# Sequence length passed to the input block / transformer, and the hidden size
# shared by the input block output and the transformer.
max_sequence_length = 20
d_model = 128

# Input block: turns the tabular sequence features described by `schema` into
# model inputs and prepares the masked inputs using the "clm" masking scheme.
input_module = tr.TabularSequenceFeatures.from_schema(
    schema,
    max_sequence_length=max_sequence_length,
    embedding_dim_default=128,
    d_output=d_model,
    masking="clm",
)

# Ranking metrics, each configured with a single cut-off of 100 and
# labels_onehot=True.
metrics = [
    metric_cls(top_ks=[100], labels_onehot=True)
    for metric_cls in (NDCGAt, MeanReciprocalRankAt, AvgPrecisionAt)
]

# Next-item prediction head with weight tying enabled.
prediction_task = tr.NextItemPredictionTask(weight_tying=True, metrics=metrics)

# XLNet transformer configuration: 2 layers, 8 heads, hidden size d_model.
transformer_config = tr.XLNetConfig.build(
    d_model=d_model,
    n_head=8,
    n_layer=2,
    total_seq_length=max_sequence_length,
)

# Assemble the full PyTorch model from the input block and the prediction task.
model = transformer_config.to_torch_model(input_module, prediction_task)

In [4]:
# Training configuration for the Transformers4Rec trainer: one epoch with
# batches of 1024 rows per device for both training and evaluation.
training_args = tr.trainer.T4RecTrainingArguments(
    output_dir=TRAIN_DIR,
    # Data loading
    data_loader_engine='nvtabular',
    dataloader_drop_last=False,
    max_sequence_length=max_sequence_length,
    per_device_train_batch_size=1024,
    per_device_eval_batch_size=1024,
    # Optimisation
    num_train_epochs=1,
    learning_rate=5e-4,
    fp16=True,
    # Logging: a log entry every 500 steps, report_to left empty.
    logging_steps=500,
    report_to=[],
)

In [5]:
# Locations of the training and validation splits under DATA_DIR.
train_parquet_path = os.path.join(DATA_DIR, "split", "train.parquet")
valid_parquet_path = os.path.join(DATA_DIR, "split", "valid.parquet")

# Bind the model, training arguments, schema and both datasets into a single
# trainer; compute_metrics=True is passed so metrics are requested.
trainer = tr.Trainer(
    model=model,
    args=training_args,
    schema=schema,
    train_dataset_or_path=train_parquet_path,
    eval_dataset_or_path=valid_parquet_path,
    compute_metrics=True,
)

In [6]:
# Run training with the settings held in `training_args`. The call is left as
# the cell's last expression so the returned training summary is displayed.
trainer.train()

Step,Training Loss
500,7.4093
1000,4.6456
1500,3.8605
2000,3.4006
2500,3.2698
3000,3.1386
3500,3.2612
4000,3.1726
4500,2.9983
5000,3.1545


TrainOutput(global_step=5939, training_loss=3.7613273809445507, metrics={'train_runtime': 235.3818, 'train_samples_per_second': 25836.897, 'train_steps_per_second': 25.231, 'total_flos': 0.0, 'train_loss': 3.7613273809445507})

In [7]:
# Evaluate the trained model and keep the returned metrics.
# Presumably this uses the `eval_dataset_or_path` given to the Trainer
# (the validation split) — confirm.
evaluation = trainer.evaluate()
# Bare expression so the notebook displays the metrics.
evaluation

{'eval_/next-item/ndcg_at_100': 0.9645484685897827,
 'eval_/next-item/mean_reciprocal_rank_at_100': 0.9590969681739807,
 'eval_/next-item/avg_precision_at_100': 0.9590969681739807,
 'eval_/loss': 0.3528675436973572,
 'eval_runtime': 30.3942,
 'eval_samples_per_second': 22235.806,
 'eval_steps_per_second': 21.715}

In [8]:
# Load the test split as an NVTabular dataset and run the trained model on it.
test_parquet_path = os.path.join(DATA_DIR, "split", "test.parquet")
test_data = nvt.Dataset(test_parquet_path)

# `predictions.predictions[0]` is consumed further down when the MRR is
# computed by hand.
predictions = trainer.predict(test_data)

In [9]:
# Read the same test split into pandas to get at the ground-truth column.
test_parquet_path = os.path.join(DATA_DIR, "split", "test.parquet")
test_df = pd.read_parquet(test_parquet_path)

# One target value per test row.
targets = test_df['target']

In [10]:
# Mean reciprocal rank of each test row's target within its predicted list,
# computed by hand from the raw predictions.
# Assumes predictions.predictions[0] is a 2-D (n_rows, k) array of item ids
# ordered best-first, and that `targets` holds one scalar id per row —
# TODO confirm against the trainer's predict output format.
# NOTE(review): the value printed here is far lower than the
# mean_reciprocal_rank_at_100 reported by trainer.evaluate() in this notebook;
# the two numbers come from different splits/procedures — investigate.
ranked_item_ids = np.asarray(predictions.predictions[0])
target_ids = np.asarray(targets)

# Fail loudly when rows and targets are not aligned one-to-one. Pairing them
# positionally without this check would silently skip the surplus rows and
# leave their reciprocal rank at 0, distorting the mean.
if ranked_item_ids.ndim != 2 or ranked_item_ids.shape[0] != target_ids.shape[0]:
    raise ValueError(
        "predictions and targets are not aligned: "
        f"predictions shape {ranked_item_ids.shape}, targets shape {target_ids.shape}"
    )

# hits[i, j] is True where the j-th ranked item of row i equals row i's target.
hits = ranked_item_ids == target_ids[:, None]
found = hits.any(axis=1)

# Rows whose target is absent from the list keep a reciprocal rank of 0.
# For the others, argmax on a boolean row returns the first True position,
# i.e. the 0-based rank of the target.
MRR = np.zeros(target_ids.shape, dtype=float)
MRR[found] = 1.0 / (hits[found].argmax(axis=1) + 1)

print("Real MRR:", np.mean(MRR))

Real MRR: 0.20237413828082323
