## Recbole
Поэкспериментируйте с различными моделями RecBole и выберите лучшую. (4 балла)

In [1]:
!pip install recbole ray >> None

In [12]:
# Extra dependency installed separately; presumably required by RecBole at import time — TODO confirm.
# NOTE(review): this cell has execution count 12 while its neighbours are 1 and 2,
# i.e. it was run out of order; on a fresh kernel it runs before the imports.
!pip install kmeans_pytorch

Collecting kmeans_pytorch
  Downloading kmeans_pytorch-0.3-py3-none-any.whl (4.4 kB)
Installing collected packages: kmeans_pytorch
Successfully installed kmeans_pytorch-0.3


In [2]:
import ast
import json
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import pickle

import warnings
warnings.filterwarnings('ignore')

from collections import Counter
from random import randint, random
from scipy.sparse import coo_matrix, hstack
from sklearn.metrics.pairwise import euclidean_distances, cosine_distances, cosine_similarity


In [3]:
# Silence all Python warnings for the rest of the session.
# NOTE(review): this repeats the import and filter call already made in the imports cell above.
import warnings
warnings.filterwarnings('ignore')

# Загрузим данные

In [4]:
# Mount Google Drive under /content/drive so the CSV files read below are reachable.
# This only works inside Google Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# Folder on the mounted Drive that holds the raw CSV files; change this single
# constant to point the notebook at another copy of the data.
DATA_DIR = '/content/drive/MyDrive/kion_train'

# Load the interactions, users and items tables from DATA_DIR.
interactions_df = pd.read_csv(os.path.join(DATA_DIR, 'interactions.csv'))
users_df = pd.read_csv(os.path.join(DATA_DIR, 'users.csv'))
items_df = pd.read_csv(os.path.join(DATA_DIR, 'items.csv'))

In [6]:
# Parse the last-watch date once, keep it as `t_dat`, and derive a Unix
# timestamp in whole seconds (nanoseconds since epoch floor-divided by 1e9).
watch_dates = pd.to_datetime(interactions_df['last_watch_dt'], format="%Y-%m-%d")
interactions_df['t_dat'] = watch_dates
epoch_nanoseconds = interactions_df.t_dat.to_numpy().astype(np.int64)
interactions_df['timestamp'] = epoch_nanoseconds // 10 ** 9

In [7]:
# Map the raw column names to the `name:type` headers used in the exported
# interaction file; the dict order also fixes the column order of `df`.
inter_column_names = {
    'user_id': 'user_id:token',
    'item_id': 'item_id:token',
    'timestamp': 'timestamp:float',
}
df = interactions_df[list(inter_column_names)].rename(columns=inter_column_names)

In [8]:
# Create the dataset folder. Unlike a bare `mkdir`, this does not fail when
# the cell is re-run and the folder already exists.
os.makedirs('recbox_data', exist_ok=True)

In [9]:
# Write the interactions as a tab-separated file without the index column.
# The `<dataset>/<dataset>.inter` path and the `name:type` headers presumably
# follow RecBole's atomic-file convention — confirm against its documentation.
df.to_csv('recbox_data/recbox_data.inter', index=False, sep='\t')

In [10]:
import logging
from logging import getLogger
from recbole.config import Config
from recbole.data import create_dataset, data_preparation
from recbole.model.sequential_recommender import GRU4Rec, Caser
from recbole.trainer import Trainer
from recbole.utils import init_seed, init_logger
from recbole.quick_start import run_recbole

# Config

In [13]:
# RecBole overrides shared by every experiment in this notebook.
parameter_dict = {
    # Empty prefix: the `recbox_data` folder written earlier is looked up
    # relative to the working directory.
    'data_path': '',
    'USER_ID_FIELD': 'user_id',
    'ITEM_ID_FIELD': 'item_id',
    'TIME_FIELD': 'timestamp',
    # `use_gpu` is RecBole's switch for GPU usage. The previous
    # `'device': 'GPU'` entry was not an input option: RecBole computes
    # `config['device']` itself, so that key was silently ignored.
    'use_gpu': True,
    # Interval filters on the number of interactions per user / per item.
    'user_inter_num_interval': "[40,inf)",
    'item_inter_num_interval': "[40,inf)",
    'load_col': {'inter': ['user_id', 'item_id', 'timestamp']},
    # NOTE(review): `neg_sampling` looks like the RecBole 1.0.x option name;
    # newer releases use `train_neg_sample_args` — confirm against the
    # installed version, especially for the pairwise BPR model.
    'neg_sampling': None,
    'epochs': 10,
    'eval_args': {
        # NOTE(review): the middle (validation) share is 0, which matches
        # `best_valid_result: None` in the run outputs. Models are therefore
        # compared on the test split only; consider a split such as [8, 1, 1].
        'split': {'RS': [9, 0, 1]},
        'group_by': 'user',
        'order': 'TO',
        'mode': 'full'}
}
# This Config feeds the dataset-creation cells below. The `run_recbole` calls
# receive `parameter_dict` directly rather than this object.
config = Config(model='MultiVAE', dataset='recbox_data', config_dict=parameter_dict)

# init random seed
init_seed(config['seed'], config['reproducibility'])

# logger initialization
init_logger(config)
logger = getLogger()
# Create handlers: attach an INFO-level stream handler to the root logger.
# NOTE(review): re-running this cell attaches one more handler each time.
c_handler = logging.StreamHandler()
c_handler.setLevel(logging.INFO)
logger.addHandler(c_handler)

# write config info into log
# logger.info(config)



In [14]:
# Build the RecBole dataset described by `config` and log its summary.
dataset = create_dataset(config)
logger.info(dataset)

In [15]:
# dataset splitting
# Produces the train / validation / test data objects from `dataset` using `config`.
# NOTE(review): the `run_recbole` cells visible in this notebook are given only the
# dataset name and `parameter_dict`, so these three objects appear unused there.
train_data, valid_data, test_data = data_preparation(config, dataset)

In [16]:
import time

# Протестируем 3 модели: BPR, CDAE, MultiVAE

In [17]:
%%time
# Train and evaluate BPR through RecBole's quick-start entry point,
# reporting the wall-clock duration in minutes and the returned result dict.
model_list = ['BPR']

for model_name in model_list:
    print(f"running {model_name}...")
    started_at = time.time()
    result_bpr = run_recbole(
        model=model_name,
        dataset='recbox_data',
        config_dict=parameter_dict,
    )
    elapsed_minutes = (time.time() - started_at) / 60
    print(f"It took {elapsed_minutes:.2f} mins")
    print(result_bpr)

running BPR...


command line args [-f /root/.local/share/jupyter/runtime/kernel-8addc8f8-8ddc-4e0c-9542-404831ff497a.json] will not be used in RecBole
Train     0: 100%|███████████████████████| 378/378 [00:11<00:00, 32.18it/s, GPU RAM: 0.03 G/14.75 G]
Train     1: 100%|███████████████████████| 378/378 [00:12<00:00, 30.19it/s, GPU RAM: 0.03 G/14.75 G]
Train     2: 100%|███████████████████████| 378/378 [00:12<00:00, 31.01it/s, GPU RAM: 0.03 G/14.75 G]
Train     3: 100%|███████████████████████| 378/378 [00:11<00:00, 32.06it/s, GPU RAM: 0.03 G/14.75 G]
Train     4: 100%|███████████████████████| 378/378 [00:12<00:00, 29.25it/s, GPU RAM: 0.03 G/14.75 G]
Train     5: 100%|███████████████████████| 378/378 [00:12<00:00, 31.22it/s, GPU RAM: 0.03 G/14.75 G]
Train     6: 100%|███████████████████████| 378/378 [00:12<00:00, 30.20it/s, GPU RAM: 0.03 G/14.75 G]
Train     7: 100%|███████████████████████| 378/378 [00:13<00:00, 28.34it/s, GPU RAM: 0.03 G/14.75 G]
Train     8: 100%|███████████████████████| 378/378 [00:12

It took 5.12 mins
{'best_valid_score': -inf, 'valid_score_bigger': True, 'best_valid_result': None, 'test_result': OrderedDict([('recall@10', 0.0824), ('mrr@10', 0.1716), ('ndcg@10', 0.0819), ('hit@10', 0.3479), ('precision@10', 0.0457)])}
CPU times: user 4min 17s, sys: 16.6 s, total: 4min 34s
Wall time: 5min 7s


In [18]:
%%time
# Train and evaluate CDAE through RecBole's quick-start entry point,
# reporting the wall-clock duration in minutes and the returned result dict.
model_list = ['CDAE']

for model_name in model_list:
    print(f"running {model_name}...")
    started_at = time.time()
    result_cdae = run_recbole(
        model=model_name,
        dataset='recbox_data',
        config_dict=parameter_dict,
    )
    elapsed_minutes = (time.time() - started_at) / 60
    print(f"It took {elapsed_minutes:.2f} mins")
    print(result_cdae)

running CDAE...


command line args [-f /root/.local/share/jupyter/runtime/kernel-8addc8f8-8ddc-4e0c-9542-404831ff497a.json] will not be used in RecBole
Max value of user's history interaction records has reached 20.9471766848816% of the total.
Train     0: 100%|███████████████████████████| 7/7 [00:00<00:00, 11.20it/s, GPU RAM: 0.39 G/14.75 G]
Train     1: 100%|███████████████████████████| 7/7 [00:00<00:00, 10.45it/s, GPU RAM: 0.39 G/14.75 G]
Train     2: 100%|███████████████████████████| 7/7 [00:00<00:00,  7.23it/s, GPU RAM: 0.39 G/14.75 G]
Train     3: 100%|███████████████████████████| 7/7 [00:00<00:00,  8.43it/s, GPU RAM: 0.39 G/14.75 G]
Train     4: 100%|███████████████████████████| 7/7 [00:00<00:00, 10.96it/s, GPU RAM: 0.39 G/14.75 G]
Train     5: 100%|███████████████████████████| 7/7 [00:00<00:00,  8.48it/s, GPU RAM: 0.39 G/14.75 G]
Train     6: 100%|███████████████████████████| 7/7 [00:00<00:00,  9.13it/s, GPU RAM: 0.39 G/14.75 G]
Train     7: 100%|███████████████████████████| 7/7 [00:00<00:00,  

It took 4.31 mins
{'best_valid_score': -inf, 'valid_score_bigger': True, 'best_valid_result': None, 'test_result': OrderedDict([('recall@10', 0.0634), ('mrr@10', 0.1428), ('ndcg@10', 0.0657), ('hit@10', 0.2724), ('precision@10', 0.0344)])}
CPU times: user 3min 50s, sys: 11.1 s, total: 4min 1s
Wall time: 4min 18s


In [19]:
%%time
# Train and evaluate MultiVAE through RecBole's quick-start entry point,
# reporting the wall-clock duration in minutes and the returned result dict.
model_list = ['MultiVAE']

for model_name in model_list:
    print(f"running {model_name}...")
    started_at = time.time()
    result_multivae = run_recbole(
        model=model_name,
        dataset='recbox_data',
        config_dict=parameter_dict,
    )
    elapsed_minutes = (time.time() - started_at) / 60
    print(f"It took {elapsed_minutes:.2f} mins")
    print(result_multivae)

running MultiVAE...


command line args [-f /root/.local/share/jupyter/runtime/kernel-8addc8f8-8ddc-4e0c-9542-404831ff497a.json] will not be used in RecBole
Max value of user's history interaction records has reached 20.9471766848816% of the total.
Train     0: 100%|███████████████████████████| 7/7 [00:03<00:00,  2.30it/s, GPU RAM: 0.39 G/14.75 G]
Train     1: 100%|███████████████████████████| 7/7 [00:01<00:00,  6.08it/s, GPU RAM: 0.39 G/14.75 G]
Train     2: 100%|███████████████████████████| 7/7 [00:01<00:00,  4.30it/s, GPU RAM: 0.39 G/14.75 G]
Train     3: 100%|███████████████████████████| 7/7 [00:01<00:00,  4.66it/s, GPU RAM: 0.39 G/14.75 G]
Train     4: 100%|███████████████████████████| 7/7 [00:01<00:00,  3.95it/s, GPU RAM: 0.39 G/14.75 G]
Train     5: 100%|███████████████████████████| 7/7 [00:00<00:00, 10.48it/s, GPU RAM: 0.39 G/14.75 G]
Train     6: 100%|███████████████████████████| 7/7 [00:01<00:00,  6.52it/s, GPU RAM: 0.39 G/14.75 G]
Train     7: 100%|███████████████████████████| 7/7 [00:01<00:00,  

It took 5.21 mins
{'best_valid_score': -inf, 'valid_score_bigger': True, 'best_valid_result': None, 'test_result': OrderedDict([('recall@10', 0.0834), ('mrr@10', 0.1671), ('ndcg@10', 0.0816), ('hit@10', 0.3466), ('precision@10', 0.0462)])}
CPU times: user 4min 25s, sys: 14 s, total: 4min 39s
Wall time: 5min 12s


In [20]:
# Display the dict returned by run_recbole for BPR (validation fields and test metrics).
result_bpr

{'best_valid_score': -inf,
 'valid_score_bigger': True,
 'best_valid_result': None,
 'test_result': OrderedDict([('recall@10', 0.0824),
              ('mrr@10', 0.1716),
              ('ndcg@10', 0.0819),
              ('hit@10', 0.3479),
              ('precision@10', 0.0457)])}

In [21]:
# Display the dict returned by run_recbole for CDAE (validation fields and test metrics).
result_cdae

{'best_valid_score': -inf,
 'valid_score_bigger': True,
 'best_valid_result': None,
 'test_result': OrderedDict([('recall@10', 0.0634),
              ('mrr@10', 0.1428),
              ('ndcg@10', 0.0657),
              ('hit@10', 0.2724),
              ('precision@10', 0.0344)])}

In [22]:
# Display the dict returned by run_recbole for MultiVAE (validation fields and test metrics).
result_multivae

{'best_valid_score': -inf,
 'valid_score_bigger': True,
 'best_valid_result': None,
 'test_result': OrderedDict([('recall@10', 0.0834),
              ('mrr@10', 0.1671),
              ('ndcg@10', 0.0816),
              ('hit@10', 0.3466),
              ('precision@10', 0.0462)])}

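Видим, что по метрикам на тестовой выборке лучше всего показали себя модели BPR и MultiVAE (recall@10 — 0.0824 и 0.0834, ndcg@10 — 0.0819 и 0.0816 соответственно), а CDAE заметно отстаёт (recall@10 — 0.0634, ndcg@10 — 0.0657). При этом явного победителя между BPR и MultiVAE на офлайн-валидации определить сложно: различия между ними проявляются лишь в третьем знаке после запятой.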