In [1]:
!pip install recbole
!pip install kmeans-pytorch

Collecting recbole
  Downloading recbole-1.2.0-py3-none-any.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
Collecting colorlog==4.7.2 (from recbole)
  Downloading colorlog-4.7.2-py2.py3-none-any.whl (10 kB)
Collecting colorama==0.4.4 (from recbole)
  Downloading colorama-0.4.4-py2.py3-none-any.whl (16 kB)
Collecting thop>=0.1.1.post2207130030 (from recbole)
  Downloading thop-0.1.1.post2209072238-py3-none-any.whl (15 kB)
Collecting texttable>=0.9.0 (from recbole)
  Downloading texttable-1.7.0-py2.py3-none-any.whl (10 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.10.0->recbole)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.10.0->recbole)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.10

In [2]:
import logging
from logging import getLogger
import recbole
from recbole.config import Config
from recbole.data import create_dataset, data_preparation
from recbole.model.sequential_recommender import GRU4Rec
from recbole.trainer import Trainer
from recbole.utils import init_seed, init_logger, get_model, get_trainer

In [39]:
parameter_dict = {
    # seq_separator: ","
    'data_path':'',
    'USER_ID_FIELD': 'user_id',
    'ITEM_ID_FIELD': 'item_id',
    'RATING_FIELD': 'rating',
    'TIME_FIELD': 'timestamp',

    'load_col': {'inter': ['user_id', 'item_id', 'rating', 'timestamp']},
    'device' : 'GPU',

    # model config
    'embedding_size': '64',
    'hidden_size': '128',
    'num_layers': '1',
    'dropout_prob': '0.3',
    'loss_type': 'CE',


    # 'eval_setting': TO_LS, full,
    'train_neg_sample_args': None,
    'group_by_user': True,
    'metrics': ["Recall", "MRR", "NDCG", "Hit", "Precision"],
    'topk': 10,
    'metric_decimal_place': 5,

    'learning_rate': 0.0001,
    'epochs': 50,
    'train_batch_size': '512',
    'eval_batch_size': '512',
    'valid_metric': 'MRR@10',
}


In [40]:
config = Config(model='GRU4Rec', dataset='Dianping_local', config_dict = parameter_dict)

init_seed(config['seed'], config['reproducibility'])

# logger initialization
init_logger(config)
logger = getLogger()
# Create handlers
c_handler = logging.StreamHandler()
c_handler.setLevel(logging.INFO)
logger.addHandler(c_handler)

# write config info into log
logger.info(config)


command line args [-f /root/.local/share/jupyter/runtime/kernel-f2e77bc4-e093-49ce-9063-37c6de9e3958.json] will not be used in RecBole
command line args [-f /root/.local/share/jupyter/runtime/kernel-f2e77bc4-e093-49ce-9063-37c6de9e3958.json] will not be used in RecBole
command line args [-f /root/.local/share/jupyter/runtime/kernel-f2e77bc4-e093-49ce-9063-37c6de9e3958.json] will not be used in RecBole


In [41]:
dataset = create_dataset(config)
print(dataset)
logger.info(dataset)

Dianping_local
The number of users: 121334
Average actions of users: 8.241772642232533
The number of items: 10870
Average actions of items: 92.00469224399669
The number of inters: 999999
The sparsity of the dataset: 99.92417934943967%
Remain Fields: ['user_id', 'item_id', 'rating', 'timestamp']


In [42]:
train_data, valid_data, test_data = data_preparation(config, dataset)

In [43]:
for i in train_data:
    print(i)
    break

The batch_size of interaction: 512
    user_id, torch.Size([512]), cpu, torch.int64
    item_id, torch.Size([512]), cpu, torch.int64
    rating, torch.Size([512]), cpu, torch.float32
    timestamp, torch.Size([512]), cpu, torch.float32
    item_length, torch.Size([512]), cpu, torch.int64
    item_id_list, torch.Size([512, 50]), cpu, torch.int64
    rating_list, torch.Size([512, 50]), cpu, torch.float32
    timestamp_list, torch.Size([512, 50]), cpu, torch.float32




In [44]:
gru4rec_model = get_model(config["model"])
gru4rec_model, config["device"]

(recbole.model.sequential_recommender.gru4rec.GRU4Rec, device(type='cuda'))

In [45]:
model = gru4rec_model(config, train_data.dataset).to(config['device'])
logger.info(model)
print(model)

GRU4Rec(
  (item_embedding): Embedding(10870, 64, padding_idx=0)
  (emb_dropout): Dropout(p=0.3, inplace=False)
  (gru_layers): GRU(64, 128, bias=False, batch_first=True)
  (dense): Linear(in_features=128, out_features=64, bias=True)
  (loss_fct): CrossEntropyLoss()
)
Trainable parameters: 777664


In [46]:
config["model"], config["MODEL_TYPE"], config["device"]

('GRU4Rec', <ModelType.SEQUENTIAL: 2>, device(type='cuda'))

In [47]:
for i in train_data:
    print(i)
    break

The batch_size of interaction: 512
    user_id, torch.Size([512]), cpu, torch.int64
    item_id, torch.Size([512]), cpu, torch.int64
    rating, torch.Size([512]), cpu, torch.float32
    timestamp, torch.Size([512]), cpu, torch.float32
    item_length, torch.Size([512]), cpu, torch.int64
    item_id_list, torch.Size([512, 50]), cpu, torch.int64
    rating_list, torch.Size([512, 50]), cpu, torch.float32
    timestamp_list, torch.Size([512, 50]), cpu, torch.float32




In [48]:
#  trainer loading and initialization
trainer = Trainer(config, model)
trainer

<recbole.trainer.trainer.Trainer at 0x7c3a66adb940>

In [49]:
# model training
best_valid_score, best_valid_result = trainer.fit(train_data, valid_data, verbose=1)

In [52]:
print(best_valid_score, best_valid_result)

0.0151 OrderedDict([('recall@10', 0.04475), ('mrr@10', 0.0151), ('ndcg@10', 0.02192), ('hit@10', 0.04475), ('precision@10', 0.00448)])


In [54]:
trainer = get_trainer(config["MODEL_TYPE"], config["model"])(config, model)

# When calculate ItemCoverage metrics, we need to run this code for set item_nums in eval_collector.
trainer.eval_collector.data_collect(train_data)

checkpoint_file = "saved/trained_model.pth"
test_result = trainer.evaluate(test_data, model_file=checkpoint_file)
print(test_result)

OrderedDict([('recall@10', 0.03895), ('mrr@10', 0.01325), ('ndcg@10', 0.01917), ('hit@10', 0.03895), ('precision@10', 0.0039)])
