# Домашнее задание №5. RecBole

In [1]:
# Install RecBole and two extra packages (presumably needed by some RecBole imports — TODO confirm).
# NOTE(review): no versions are pinned, so a later re-run may install different releases.
!pip -q install recbole
!pip -q install ray
!pip -q install kmeans_pytorch

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.6/62.6 MB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [19]:
import os
import ast
import time
import json
import logging
from logging import getLogger
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pickle
from collections import Counter
from random import randint, random
import torch
from tqdm import tqdm

from recbole.config import Config
from recbole.data import create_dataset, data_preparation
from recbole.model.general_recommender.recvae import RecVAE
from recbole.model.general_recommender.multivae import MultiVAE
from recbole.model.sequential_recommender import GRU4Rec, Caser
from recbole.trainer import Trainer
from recbole.utils import init_seed, init_logger
from recbole.quick_start import run_recbole

from scipy.sparse import coo_matrix, hstack
from sklearn.metrics.pairwise import euclidean_distances, cosine_distances, cosine_similarity


In [3]:
# Silence every Python warning for the rest of the session.
import warnings
warnings.filterwarnings('ignore')

In [4]:
# Mount Google Drive at /content/drive (Colab-only helper).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Загрузка данных

In [5]:
# Google Drive folder that holds the preprocessed KION CSV files.
DATA_PATH = "/content/drive/MyDrive/recsys_course/data_kion"

In [6]:
# Load the preprocessed interactions, users and items tables from DATA_PATH.
interactions_df = pd.read_csv(f'{DATA_PATH}/interactions_processed_kion.csv')
users_df = pd.read_csv(f'{DATA_PATH}/users_processed_kion.csv')
items_df = pd.read_csv(f'{DATA_PATH}/items_processed_kion.csv')

In [7]:
# Parse the last-watch date, then derive an integer timestamp from it.
watch_dates = pd.to_datetime(interactions_df['last_watch_dt'], format="%Y-%m-%d")
interactions_df['t_dat'] = watch_dates
# Integer-divide the raw int64 datetime values by 10**9
# (presumably nanoseconds -> whole seconds; confirm the datetime unit).
raw_datetime_ints = interactions_df['t_dat'].values.astype(np.int64)
interactions_df['timestamp'] = raw_datetime_ints // 10 ** 9

In [8]:
# Map each source column to a "<name>:<type>" header
# (presumably the field-type suffix RecBole expects in .inter files).
inter_column_names = {
    'user_id': 'user_id:token',
    'item_id': 'item_id:token',
    'timestamp': 'timestamp:float',
}
# Keep only the mapped columns, in mapping order, and rename them.
df = interactions_df[list(inter_column_names)].rename(columns=inter_column_names)

In [9]:
# Create the dataset folder; exist_ok keeps the cell re-runnable when the folder is already there.
os.makedirs("recbox_data", exist_ok=True)

In [10]:
# Write the interactions as a tab-separated file without the index column.
df.to_csv('recbox_data/recbox_data.inter', index=False, sep='\t')

## Обучение

In [None]:
# RecBole settings shared by every model in the comparison run.
parameter_dict = {
    'data_path': '',
    'USER_ID_FIELD': 'user_id',
    'ITEM_ID_FIELD': 'item_id',
    'TIME_FIELD': 'timestamp',
    # NOTE(review): RecBole presumably selects the device itself (use_gpu / gpu_id);
    # confirm that this key has any effect.
    'device': 'GPU',
    # Interval filters on the number of interactions per user / per item.
    'user_inter_num_interval': "[40,inf)",
    'item_inter_num_interval': "[40,inf)",
    'load_col': {'inter': ['user_id', 'item_id', 'timestamp']},
    # NOTE(review): newer RecBole releases appear to call this option
    # train_neg_sample_args; confirm the installed version honours this key.
    'neg_sampling': None,
    'epochs': 20,
    'eval_args': {
        # NOTE(review): the middle (validation) share is 0, and the recorded runs
        # report best_valid_result = None, so models are compared on test metrics only.
        'split': {'RS': [9, 0, 1]},
        'group_by': 'user',
        'order': 'TO',
        'mode': 'full'},
    'metrics': ['MAP', 'NDCG', 'Recall'],
    'seed': 1008,
}
config = Config(model='MultiVAE', dataset='recbox_data', config_dict=parameter_dict)

# Seed the random number generators from the config.
init_seed(config['seed'], config['reproducibility'])

# Logger initialization.
init_logger(config)
logger = getLogger()
# Attach the console handler only once: without this check every re-run of the
# cell adds one more StreamHandler to the root logger and log lines get repeated.
CONSOLE_HANDLER_NAME = 'recbole_console'
c_handler = next(
    (handler for handler in logger.handlers if handler.get_name() == CONSOLE_HANDLER_NAME),
    None,
)
if c_handler is None:
    c_handler = logging.StreamHandler()
    c_handler.set_name(CONSOLE_HANDLER_NAME)
    c_handler.setLevel(logging.INFO)
    logger.addHandler(c_handler)

# Uncomment to write the full config into the log.
# logger.info(config)



In [None]:
# Build the dataset described by `config` and log its summary.
dataset = create_dataset(config)
logger.info(dataset)

In [None]:
# Split the dataset into train / validation / test parts as configured in `config`.
train_data, valid_data, test_data = data_preparation(config, dataset)

In [None]:
%%time
model_list = ['MultiVAE', 'CDAE', 'MultiDAE', 'RecVAE', 'NeuMF', 'RepeatNet']

for model_name in model_list:
    print(f"running {model_name}...")
    start = time.time()
    result = run_recbole(model=model_name, dataset='recbox_data', config_dict = parameter_dict)
    t = time.time() - start
    print(f"It took {t/60:.2f} mins")
    print(result)

running MultiVAE...


command line args [-f /root/.local/share/jupyter/runtime/kernel-182ddc5c-99a9-4bc8-aee4-b590f2f4969b.json] will not be used in RecBole
Max value of user's history interaction records has reached 20.9471766848816% of the total.
Train     0: 100%|███████████████████████████| 7/7 [00:00<00:00, 10.96it/s, GPU RAM: 0.38 G/14.75 G]
Train     1: 100%|███████████████████████████| 7/7 [00:00<00:00, 19.87it/s, GPU RAM: 0.38 G/14.75 G]
Train     2: 100%|███████████████████████████| 7/7 [00:00<00:00, 15.01it/s, GPU RAM: 0.38 G/14.75 G]
Train     3: 100%|███████████████████████████| 7/7 [00:00<00:00, 14.83it/s, GPU RAM: 0.38 G/14.75 G]
Train     4: 100%|███████████████████████████| 7/7 [00:00<00:00, 14.39it/s, GPU RAM: 0.38 G/14.75 G]
Train     5: 100%|███████████████████████████| 7/7 [00:00<00:00, 13.60it/s, GPU RAM: 0.38 G/14.75 G]
Train     6: 100%|███████████████████████████| 7/7 [00:00<00:00, 14.14it/s, GPU RAM: 0.38 G/14.75 G]
Train     7: 100%|███████████████████████████| 7/7 [00:00<00:00, 1

It took 4.11 mins
{'best_valid_score': -inf, 'valid_score_bigger': True, 'best_valid_result': None, 'test_result': OrderedDict([('map@10', 0.0391), ('ndcg@10', 0.0802), ('recall@10', 0.087)])}
running CDAE...


command line args [-f /root/.local/share/jupyter/runtime/kernel-182ddc5c-99a9-4bc8-aee4-b590f2f4969b.json] will not be used in RecBole
Max value of user's history interaction records has reached 20.9471766848816% of the total.
Train     0: 100%|███████████████████████████| 7/7 [00:00<00:00, 22.96it/s, GPU RAM: 0.41 G/14.75 G]
Train     1: 100%|███████████████████████████| 7/7 [00:00<00:00, 26.99it/s, GPU RAM: 0.41 G/14.75 G]
Train     2: 100%|███████████████████████████| 7/7 [00:00<00:00, 23.74it/s, GPU RAM: 0.41 G/14.75 G]
Train     3: 100%|███████████████████████████| 7/7 [00:00<00:00, 25.10it/s, GPU RAM: 0.41 G/14.75 G]
Train     4: 100%|███████████████████████████| 7/7 [00:00<00:00, 26.39it/s, GPU RAM: 0.41 G/14.75 G]
Train     5: 100%|███████████████████████████| 7/7 [00:00<00:00, 26.27it/s, GPU RAM: 0.41 G/14.75 G]
Train     6: 100%|███████████████████████████| 7/7 [00:00<00:00, 26.05it/s, GPU RAM: 0.41 G/14.75 G]
Train     7: 100%|███████████████████████████| 7/7 [00:00<00:00, 2

It took 5.73 mins
{'best_valid_score': -inf, 'valid_score_bigger': True, 'best_valid_result': None, 'test_result': OrderedDict([('map@10', 0.0327), ('ndcg@10', 0.0646), ('recall@10', 0.0641)])}
running MultiDAE...


command line args [-f /root/.local/share/jupyter/runtime/kernel-182ddc5c-99a9-4bc8-aee4-b590f2f4969b.json] will not be used in RecBole
Max value of user's history interaction records has reached 20.9471766848816% of the total.
Train     0: 100%|███████████████████████████| 7/7 [00:00<00:00, 17.91it/s, GPU RAM: 0.41 G/14.75 G]
Train     1: 100%|███████████████████████████| 7/7 [00:00<00:00, 20.75it/s, GPU RAM: 0.41 G/14.75 G]
Train     2: 100%|███████████████████████████| 7/7 [00:00<00:00, 15.77it/s, GPU RAM: 0.41 G/14.75 G]
Train     3: 100%|███████████████████████████| 7/7 [00:00<00:00, 16.46it/s, GPU RAM: 0.41 G/14.75 G]
Train     4: 100%|███████████████████████████| 7/7 [00:00<00:00, 15.99it/s, GPU RAM: 0.41 G/14.75 G]
Train     5: 100%|███████████████████████████| 7/7 [00:00<00:00, 15.42it/s, GPU RAM: 0.41 G/14.75 G]
Train     6: 100%|███████████████████████████| 7/7 [00:00<00:00, 15.43it/s, GPU RAM: 0.41 G/14.75 G]
Train     7: 100%|███████████████████████████| 7/7 [00:00<00:00, 1

It took 5.19 mins
{'best_valid_score': -inf, 'valid_score_bigger': True, 'best_valid_result': None, 'test_result': OrderedDict([('map@10', 0.0392), ('ndcg@10', 0.0802), ('recall@10', 0.0869)])}
running RecVAE...


command line args [-f /root/.local/share/jupyter/runtime/kernel-182ddc5c-99a9-4bc8-aee4-b590f2f4969b.json] will not be used in RecBole
Max value of user's history interaction records has reached 20.9471766848816% of the total.
Train     0: 100%|███████████████████████████| 7/7 [00:00<00:00,  7.03it/s, GPU RAM: 0.53 G/14.75 G]
Train     0: 100%|███████████████████████████| 7/7 [00:00<00:00,  7.11it/s, GPU RAM: 0.53 G/14.75 G]
Train     0: 100%|███████████████████████████| 7/7 [00:00<00:00,  7.50it/s, GPU RAM: 0.53 G/14.75 G]
Train     0: 100%|███████████████████████████| 7/7 [00:00<00:00,  7.15it/s, GPU RAM: 0.53 G/14.75 G]
Train     1: 100%|███████████████████████████| 7/7 [00:00<00:00,  7.11it/s, GPU RAM: 0.53 G/14.75 G]
Train     1: 100%|███████████████████████████| 7/7 [00:00<00:00,  7.63it/s, GPU RAM: 0.53 G/14.75 G]
Train     1: 100%|███████████████████████████| 7/7 [00:00<00:00, 10.37it/s, GPU RAM: 0.53 G/14.75 G]
Train     1: 100%|███████████████████████████| 7/7 [00:00<00:00, 1

It took 8.36 mins
{'best_valid_score': -inf, 'valid_score_bigger': True, 'best_valid_result': None, 'test_result': OrderedDict([('map@10', 0.0419), ('ndcg@10', 0.0856), ('recall@10', 0.0911)])}
running NeuMF...


command line args [-f /root/.local/share/jupyter/runtime/kernel-182ddc5c-99a9-4bc8-aee4-b590f2f4969b.json] will not be used in RecBole
Train     0: 100%|███████████████████████| 755/755 [00:37<00:00, 20.21it/s, GPU RAM: 0.53 G/14.75 G]
Train     1: 100%|███████████████████████| 755/755 [00:37<00:00, 20.29it/s, GPU RAM: 0.53 G/14.75 G]
Train     2: 100%|███████████████████████| 755/755 [00:37<00:00, 20.05it/s, GPU RAM: 0.53 G/14.75 G]
Train     3: 100%|███████████████████████| 755/755 [00:38<00:00, 19.53it/s, GPU RAM: 0.53 G/14.75 G]
Train     4: 100%|███████████████████████| 755/755 [00:38<00:00, 19.80it/s, GPU RAM: 0.53 G/14.75 G]
Train     5: 100%|███████████████████████| 755/755 [00:37<00:00, 20.18it/s, GPU RAM: 0.53 G/14.75 G]
Train     6: 100%|███████████████████████| 755/755 [00:36<00:00, 20.45it/s, GPU RAM: 0.53 G/14.75 G]
Train     7: 100%|███████████████████████| 755/755 [00:37<00:00, 20.39it/s, GPU RAM: 0.53 G/14.75 G]
Train     8: 100%|███████████████████████| 755/755 [00:37

It took 19.45 mins
{'best_valid_score': -inf, 'valid_score_bigger': True, 'best_valid_result': None, 'test_result': OrderedDict([('map@10', 0.0198), ('ndcg@10', 0.0453), ('recall@10', 0.054)])}
running RepeatNet...


command line args [-f /root/.local/share/jupyter/runtime/kernel-182ddc5c-99a9-4bc8-aee4-b590f2f4969b.json] will not be used in RecBole
Train     0:   9%|██                      | 64/743 [02:37<27:55,  2.47s/it, GPU RAM: 3.60 G/14.75 G]


KeyboardInterrupt: ignored

Лучшая модель по метрикам на тестовой выборке — **RecVAE** (map@10 = 0.0419, ndcg@10 = 0.0856, recall@10 = 0.0911).

P.S. Остановила обучение и не доучила RepeatNet, так как оно шло слишком долго. Перезапускать обучение всех моделей не стала.

## Обучаем лучшую модель для сервиса с большим количеством эпох (30 вместо 20)


In [11]:
# RecBole settings for the final RecVAE training run (30 epochs).
parameter_dict = {
    "data_path": "",
    "USER_ID_FIELD": "user_id",
    "ITEM_ID_FIELD": "item_id",
    "TIME_FIELD": "timestamp",
    # NOTE(review): RecBole presumably selects the device itself (use_gpu / gpu_id);
    # confirm that this key has any effect.
    "device": "GPU",
    # Interval filters on the number of interactions per user / per item.
    "user_inter_num_interval": "[40,inf)",
    "item_inter_num_interval": "[40,inf)",
    "load_col": {"inter": ["user_id", "item_id", "timestamp"]},
    # NOTE(review): newer RecBole releases appear to call this option
    # train_neg_sample_args; confirm the installed version honours this key.
    "neg_sampling": None,
    "epochs": 30,
    "eval_args": {
        # NOTE(review): the middle (validation) share is 0, and the recorded run
        # reports best_valid_result = None, so only test metrics are available.
        "split": {"RS": [9, 0, 1]},
        "group_by": "user",
        "order": "TO",
        "mode": "full"
    },
    'metrics': ['MAP', 'NDCG', 'Recall'],
    'seed': 1008,
}
config = Config(model="RecVAE", dataset="recbox_data", config_dict=parameter_dict)

# Seed the random number generators from the config.
init_seed(config['seed'], config['reproducibility'])

# Logger initialization.
init_logger(config)
logger = getLogger()
# Attach the console handler only once: without this check every re-run of the
# cell adds one more StreamHandler to the root logger and log lines get repeated.
CONSOLE_HANDLER_NAME = 'recbole_console'
c_handler = next(
    (handler for handler in logger.handlers if handler.get_name() == CONSOLE_HANDLER_NAME),
    None,
)
if c_handler is None:
    c_handler = logging.StreamHandler()
    c_handler.set_name(CONSOLE_HANDLER_NAME)
    c_handler.setLevel(logging.INFO)
    logger.addHandler(c_handler)

# Uncomment to write the full config into the log.
# logger.info(config)



In [12]:
# Build the dataset described by the RecVAE `config` and log its summary.
dataset = create_dataset(config)
logger.info(dataset)

In [13]:
# Split the dataset into train / validation / test parts as configured in `config`.
# `test_data` is read later by recommend_to_user.
train_data, valid_data, test_data = data_preparation(config, dataset)

In [14]:
# Train and evaluate RecVAE with the 30-epoch settings; `result` receives the returned metrics dict.
# NOTE(review): run_recbole is given only names and parameter_dict, so it presumably builds its own
# config and dataset rather than reusing `config` / `dataset` from the cells above — confirm.
result = run_recbole(model='RecVAE', dataset='recbox_data', config_dict=parameter_dict)

command line args [-f /root/.local/share/jupyter/runtime/kernel-c3445ca0-cd6f-4491-b427-df793df8f565.json] will not be used in RecBole
Max value of user's history interaction records has reached 20.9471766848816% of the total.
Train     0: 100%|███████████████████████████| 7/7 [00:02<00:00,  3.32it/s, GPU RAM: 0.52 G/14.75 G]
Train     0: 100%|███████████████████████████| 7/7 [00:00<00:00,  8.82it/s, GPU RAM: 0.52 G/14.75 G]
Train     0: 100%|███████████████████████████| 7/7 [00:00<00:00,  8.58it/s, GPU RAM: 0.52 G/14.75 G]
Train     0: 100%|███████████████████████████| 7/7 [00:00<00:00,  8.89it/s, GPU RAM: 0.54 G/14.75 G]
Train     1: 100%|███████████████████████████| 7/7 [00:00<00:00, 10.06it/s, GPU RAM: 0.54 G/14.75 G]
Train     1: 100%|███████████████████████████| 7/7 [00:00<00:00,  9.05it/s, GPU RAM: 0.54 G/14.75 G]
Train     1: 100%|███████████████████████████| 7/7 [00:00<00:00, 12.58it/s, GPU RAM: 0.54 G/14.75 G]
Train     1: 100%|███████████████████████████| 7/7 [00:00<00:00, 1

In [15]:
# Display the metrics returned by the RecVAE run.
result

{'best_valid_score': -inf,
 'valid_score_bigger': True,
 'best_valid_result': None,
 'test_result': OrderedDict([('map@10', 0.0427),
              ('ndcg@10', 0.088),
              ('recall@10', 0.0952)])}

In [16]:
# Rebuild RecVAE on the current dataset and restore trained weights from a saved checkpoint.
model = RecVAE(config, dataset)

# Checkpoint file names embed the training timestamp, so a hard-coded name breaks
# on every re-run. Pick the most recently written RecVAE checkpoint instead.
CHECKPOINT_DIR = "/content/saved"
recvae_checkpoints = [
    os.path.join(CHECKPOINT_DIR, file_name)
    for file_name in os.listdir(CHECKPOINT_DIR)
    if file_name.startswith("RecVAE-") and file_name.endswith(".pth")
]
if not recvae_checkpoints:
    raise FileNotFoundError(f"No RecVAE checkpoint found in {CHECKPOINT_DIR}")
checkpoint_path = max(recvae_checkpoints, key=os.path.getmtime)

# map_location lets a checkpoint saved on one device load onto the device in `config`.
checkpoint = torch.load(checkpoint_path, map_location=config["device"])
model.load_state_dict(checkpoint["state_dict"])

Max value of user's history interaction records has reached 23.254401942926535% of the total.


<All keys matched successfully>

In [30]:
def recommend_to_user(external_user_id, dataset, model, k=10):
    """Return the ``k`` item tokens that ``model`` scores highest for one user.

    Args:
        external_user_id: Raw user token as stored in ``dataset``.
        dataset: Dataset providing the token/id mappings, the interaction rows
            and ``item_num``.
        model: Recommender exposing ``full_sort_predict``.
        k: Number of items to return (default 10).

    Returns:
        A list of item tokens ordered by descending score, or an empty list when
        the user token is unknown, equals ``"[PAD]"``, or has no interaction
        rows in ``dataset``.

    Notes:
        The target device is read from the notebook-level ``config``.
        Only column 0 of the scores is masked; items the user has already
        interacted with are not filtered out.
    """
    known_users = dataset.field2token_id[dataset.uid_field]
    if external_user_id == "[PAD]" or external_user_id not in known_users:
        return []

    device = config["device"]
    model.eval()
    model.to(device)
    with torch.no_grad():
        uid_series = dataset.token2id(dataset.uid_field, [external_user_id])
        user_rows = np.flatnonzero(
            np.isin(dataset[dataset.uid_field].numpy(), uid_series)
        )
        if user_rows.size == 0:
            # No interaction rows to score, so there is nothing to rank.
            return []
        # Only the first score row is ever used, so score just the user's first
        # interaction row rather than every row the user has.
        new_inter = dataset[user_rows[:1]].to(device)
        new_scores = model.full_sort_predict(new_inter)
        # Take the item count from the dataset argument, not the global test_data.
        new_scores = new_scores.view(-1, dataset.item_num)
        # Exclude column 0 from the ranking (presumably the padding item — confirm).
        new_scores[:, 0] = -np.inf
        top_item_ids = torch.topk(new_scores, k).indices[0].tolist()
    return dataset.id2token(dataset.iid_field, top_item_ids).tolist()

In [32]:
%%time

recos = {}
users = dataset.field2token_id[dataset.uid_field]
for user_id in tqdm(users):
    recos_for_user = recommend_to_user(user_id, dataset, model)
    if recos_for_user:
        recos.update({user_id: recos_for_user})

100%|██████████| 13355/13355 [2:05:04<00:00,  1.78it/s]


CPU times: user 2h 3min 53s, sys: 24.7 s, total: 2h 4min 18s
Wall time: 2h 5min 4s


In [36]:
# Cast user tokens and item tokens to integers.
new_recos = {
    int(user_token): [int(item_token) for item_token in item_tokens]
    for user_token, item_tokens in recos.items()
}

In [38]:
# Spot-check one user: print the integer-keyed entry and the original string-keyed entry.
sample_user_id = 176549
print(new_recos[sample_user_id])
print(recos[str(sample_user_id)])

[9728, 7571, 2956, 10761, 16166, 14470, 10440, 13865, 7102, 1785]
['9728', '7571', '2956', '10761', '16166', '14470', '10440', '13865', '7102', '1785']


In [39]:
# Save the recommendations to Google Drive.
RECOS_PATH = "/content/drive/MyDrive/recsys_course/recommendations"
# Create the target folder when it is missing so that open() does not fail.
os.makedirs(RECOS_PATH, exist_ok=True)

# Note: JSON object keys are always strings, so the integer user ids are written as strings.
with open(f"{RECOS_PATH}/rec_vae.json", "w") as f:
    json.dump(new_recos, f)