In [5]:
import ast
import json
import logging
import os
import pickle
import time
import warnings
from collections import Counter
from logging import getLogger
from pathlib import Path
from random import randint, random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from recbole.config import Config
from recbole.data import create_dataset, data_preparation
from recbole.data.interaction import Interaction
from recbole.model.general_recommender.multivae import MultiVAE
from recbole.model.sequential_recommender import Caser, GRU4Rec
from recbole.quick_start import run_recbole
from recbole.trainer import Trainer
from recbole.utils import init_logger, init_seed
from scipy.sparse import coo_matrix, hstack
from sklearn.metrics.pairwise import (
    cosine_distances,
    cosine_similarity,
    euclidean_distances,
)

# Загрузка данных

In [6]:
# Load the three raw tables (users, items, interactions) from the shared datasets folder
datasets_dir = Path("../datasets")
users_df, items_df, interactions_df = (
    pd.read_csv(datasets_dir / f"{table}.csv") for table in ("users", "items", "interactions")
)

In [7]:
# Parse the last-watch date (YYYY-MM-DD strings) into datetimes
interactions_df["t_dat"] = pd.to_datetime(interactions_df["last_watch_dt"], format="%Y-%m-%d")

# Convert to whole seconds since the Unix epoch. Dividing the timedelta by one second is
# independent of the datetime64 resolution, whereas casting the raw values to int64 and
# dividing by 10**9 is only correct when the column happens to be stored in nanoseconds.
interactions_df["timestamp"] = (interactions_df["t_dat"] - pd.Timestamp("1970-01-01")) // pd.Timedelta("1s")

In [8]:
# Keep only the three interaction columns and rename them to "<field>:<type>" headers
recbole_columns = {
    "user_id": "user_id:token",
    "item_id": "item_id:token",
    "timestamp": "timestamp:float",
}
df = interactions_df[list(recbole_columns)].rename(columns=recbole_columns)

In [9]:
# Create the dataset folder in pure Python: unlike `!mkdir`, this is cross-platform
# and does not fail when the cell is re-run and the folder already exists.
Path("recbox_data").mkdir(parents=True, exist_ok=True)

In [10]:
# Write the interactions as a tab-separated ".inter" file without the index column
inter_path = "recbox_data/recbox_data.inter"
df.to_csv(inter_path, sep="\t", index=False)

# Пайплайн обучения

## Создание и разделение датасета

In [11]:
# Shared RecBole settings, reused for every model trained in this notebook.
parameter_dict = {
    "data_path": "",
    "USER_ID_FIELD": "user_id",
    "ITEM_ID_FIELD": "item_id",
    "TIME_FIELD": "timestamp",
    # NOTE(review): the run log reports `use_gpu` / `gpu_id` rather than this key;
    # it looks like RecBole derives the device itself — confirm this entry has any effect.
    "device": "GPU",
    # Keep only users and items with at least 40 interactions
    "user_inter_num_interval": "[40,inf)",
    "item_inter_num_interval": "[40,inf)",
    "load_col": {"inter": ["user_id", "item_id", "timestamp"]},
    # NOTE(review): the training log still prints uniform negative sampling
    # (`train_neg_sample_args`), so this key seems to be ignored by the installed
    # RecBole version — confirm before relying on it.
    "neg_sampling": None,
    "epochs": 40,
    # Split ratios are 9 : 0 : 1, i.e. no validation share is requested
    "eval_args": {"split": {"RS": [9, 0, 1]}, "group_by": "user", "order": "TO", "mode": "full"},
}
config = Config(model="MultiVAE", dataset="recbox_data", config_dict=parameter_dict)

# init random seed
init_seed(config["seed"], config["reproducibility"])

# logger initialization
init_logger(config)
logger = getLogger()

# Attach a console handler only if the root logger does not have one yet.
# Adding it unconditionally printed every record twice (once per console handler)
# and stacked one more copy each time this cell was re-run.
has_console_handler = any(
    isinstance(handler, logging.StreamHandler) and not isinstance(handler, logging.FileHandler)
    for handler in logger.handlers
)
if not has_console_handler:
    c_handler = logging.StreamHandler()
    c_handler.setLevel(logging.INFO)
    logger.addHandler(c_handler)

# write config info into log
# logger.info(config)

In [12]:
# Build the dataset described by `config` and log its summary
# (user / item / interaction counts and sparsity appear in the output)
dataset = create_dataset(config)
logger.info(dataset)

13 Dec 12:57    INFO  recbox_data
The number of users: 13355
Average actions of users: 63.815710648494836
The number of items: 3294
Average actions of items: 258.78985727300335
The number of inters: 852195
The sparsity of the dataset: 98.06281322904924%
Remain Fields: ['user_id', 'item_id', 'timestamp']
recbox_data
The number of users: 13355
Average actions of users: 63.815710648494836
The number of items: 3294
Average actions of items: 258.78985727300335
The number of inters: 852195
The sparsity of the dataset: 98.06281322904924%
Remain Fields: ['user_id', 'item_id', 'timestamp']


In [13]:
# Split the dataset into train / validation / test parts
# NOTE(review): presumably the split follows config["eval_args"]; with ratios [9, 0, 1]
# the validation part is likely empty — confirm
train_data, valid_data, test_data = data_preparation(config, dataset)

13 Dec 12:57    INFO  [Training]: train_batch_size = [2048] train_neg_sample_args: [{'distribution': 'uniform', 'sample_num': 1, 'alpha': 1.0, 'dynamic': False, 'candidate_num': 0}]
[Training]: train_batch_size = [2048] train_neg_sample_args: [{'distribution': 'uniform', 'sample_num': 1, 'alpha': 1.0, 'dynamic': False, 'candidate_num': 0}]
13 Dec 12:57    INFO  [Evaluation]: eval_batch_size = [4096] eval_args: [{'split': {'RS': [9, 0, 1]}, 'order': 'TO', 'group_by': 'user', 'mode': {'valid': 'full', 'test': 'full'}}]
[Evaluation]: eval_batch_size = [4096] eval_args: [{'split': {'RS': [9, 0, 1]}, 'order': 'TO', 'group_by': 'user', 'mode': {'valid': 'full', 'test': 'full'}}]


## Эксперименты

In [14]:
%%time
model_list = ["MultiVAE", "MultiDAE", "MacridVAE", "NeuMF", "RecVAE"]

for model_name in model_list:
    print(f"running {model_name}...")
    start = time.time()
    result = run_recbole(model=model_name, dataset="recbox_data", config_dict=parameter_dict)
    t = time.time() - start
    print(f"It took {t/60:.2f} mins")
    print(result)

running MultiVAE...


13 Dec 12:57    INFO  ['C:\\Users\\Happy\\AppData\\Local\\pypoetry\\Cache\\virtualenvs\\recoservice-Q0zB1aiD-py3.10\\lib\\site-packages\\ipykernel_launcher.py', '-f', 'C:\\Users\\Happy\\AppData\\Roaming\\jupyter\\runtime\\kernel-50f90226-56ac-497b-b7da-0b7e76a69b49.json']
['C:\\Users\\Happy\\AppData\\Local\\pypoetry\\Cache\\virtualenvs\\recoservice-Q0zB1aiD-py3.10\\lib\\site-packages\\ipykernel_launcher.py', '-f', 'C:\\Users\\Happy\\AppData\\Roaming\\jupyter\\runtime\\kernel-50f90226-56ac-497b-b7da-0b7e76a69b49.json']
13 Dec 12:57    INFO  
General Hyper Parameters:
gpu_id = 0
use_gpu = True
seed = 2020
state = INFO
reproducibility = True
data_path = recbox_data
checkpoint_dir = saved
show_progress = True
save_dataset = False
dataset_save_path = None
save_dataloaders = False
dataloaders_save_path = None
log_wandb = False

Training Hyper Parameters:
epochs = 40
train_batch_size = 2048
learner = adam
learning_rate = 0.001
train_neg_sample_args = {'distribution': 'uniform', 'sample_num': 

It took 0.73 mins
{'best_valid_score': -inf, 'valid_score_bigger': True, 'best_valid_result': None, 'test_result': OrderedDict([('recall@10', 0.0894), ('mrr@10', 0.1588), ('ndcg@10', 0.0816), ('hit@10', 0.3717), ('precision@10', 0.049)])}
running MultiDAE...


13 Dec 12:58    INFO  ['C:\\Users\\Happy\\AppData\\Local\\pypoetry\\Cache\\virtualenvs\\recoservice-Q0zB1aiD-py3.10\\lib\\site-packages\\ipykernel_launcher.py', '-f', 'C:\\Users\\Happy\\AppData\\Roaming\\jupyter\\runtime\\kernel-50f90226-56ac-497b-b7da-0b7e76a69b49.json']
['C:\\Users\\Happy\\AppData\\Local\\pypoetry\\Cache\\virtualenvs\\recoservice-Q0zB1aiD-py3.10\\lib\\site-packages\\ipykernel_launcher.py', '-f', 'C:\\Users\\Happy\\AppData\\Roaming\\jupyter\\runtime\\kernel-50f90226-56ac-497b-b7da-0b7e76a69b49.json']
13 Dec 12:58    INFO  
General Hyper Parameters:
gpu_id = 0
use_gpu = True
seed = 2020
state = INFO
reproducibility = True
data_path = recbox_data
checkpoint_dir = saved
show_progress = True
save_dataset = False
dataset_save_path = None
save_dataloaders = False
dataloaders_save_path = None
log_wandb = False

Training Hyper Parameters:
epochs = 40
train_batch_size = 2048
learner = adam
learning_rate = 0.001
train_neg_sample_args = {'distribution': 'uniform', 'sample_num': 

It took 0.74 mins
{'best_valid_score': -inf, 'valid_score_bigger': True, 'best_valid_result': None, 'test_result': OrderedDict([('recall@10', 0.0893), ('mrr@10', 0.1578), ('ndcg@10', 0.0814), ('hit@10', 0.3713), ('precision@10', 0.0488)])}
running MacridVAE...


13 Dec 12:59    INFO  ['C:\\Users\\Happy\\AppData\\Local\\pypoetry\\Cache\\virtualenvs\\recoservice-Q0zB1aiD-py3.10\\lib\\site-packages\\ipykernel_launcher.py', '-f', 'C:\\Users\\Happy\\AppData\\Roaming\\jupyter\\runtime\\kernel-50f90226-56ac-497b-b7da-0b7e76a69b49.json']
['C:\\Users\\Happy\\AppData\\Local\\pypoetry\\Cache\\virtualenvs\\recoservice-Q0zB1aiD-py3.10\\lib\\site-packages\\ipykernel_launcher.py', '-f', 'C:\\Users\\Happy\\AppData\\Roaming\\jupyter\\runtime\\kernel-50f90226-56ac-497b-b7da-0b7e76a69b49.json']
13 Dec 12:59    INFO  
General Hyper Parameters:
gpu_id = 0
use_gpu = True
seed = 2020
state = INFO
reproducibility = True
data_path = recbox_data
checkpoint_dir = saved
show_progress = True
save_dataset = False
dataset_save_path = None
save_dataloaders = False
dataloaders_save_path = None
log_wandb = False

Training Hyper Parameters:
epochs = 40
train_batch_size = 2048
learner = adam
learning_rate = 0.001
train_neg_sample_args = {'distribution': 'uniform', 'sample_num': 

It took 4.30 mins
{'best_valid_score': -inf, 'valid_score_bigger': True, 'best_valid_result': None, 'test_result': OrderedDict([('recall@10', 0.0907), ('mrr@10', 0.1765), ('ndcg@10', 0.0873), ('hit@10', 0.3737), ('precision@10', 0.0501)])}
running NeuMF...


13 Dec 13:03    INFO  ['C:\\Users\\Happy\\AppData\\Local\\pypoetry\\Cache\\virtualenvs\\recoservice-Q0zB1aiD-py3.10\\lib\\site-packages\\ipykernel_launcher.py', '-f', 'C:\\Users\\Happy\\AppData\\Roaming\\jupyter\\runtime\\kernel-50f90226-56ac-497b-b7da-0b7e76a69b49.json']
['C:\\Users\\Happy\\AppData\\Local\\pypoetry\\Cache\\virtualenvs\\recoservice-Q0zB1aiD-py3.10\\lib\\site-packages\\ipykernel_launcher.py', '-f', 'C:\\Users\\Happy\\AppData\\Roaming\\jupyter\\runtime\\kernel-50f90226-56ac-497b-b7da-0b7e76a69b49.json']
13 Dec 13:03    INFO  
General Hyper Parameters:
gpu_id = 0
use_gpu = True
seed = 2020
state = INFO
reproducibility = True
data_path = recbox_data
checkpoint_dir = saved
show_progress = True
save_dataset = False
dataset_save_path = None
save_dataloaders = False
dataloaders_save_path = None
log_wandb = False

Training Hyper Parameters:
epochs = 40
train_batch_size = 2048
learner = adam
learning_rate = 0.001
train_neg_sample_args = {'distribution': 'uniform', 'sample_num': 

It took 10.72 mins
{'best_valid_score': -inf, 'valid_score_bigger': True, 'best_valid_result': None, 'test_result': OrderedDict([('recall@10', 0.0453), ('mrr@10', 0.0689), ('ndcg@10', 0.037), ('hit@10', 0.2088), ('precision@10', 0.025)])}
running RecVAE...


13 Dec 13:14    INFO  ['C:\\Users\\Happy\\AppData\\Local\\pypoetry\\Cache\\virtualenvs\\recoservice-Q0zB1aiD-py3.10\\lib\\site-packages\\ipykernel_launcher.py', '-f', 'C:\\Users\\Happy\\AppData\\Roaming\\jupyter\\runtime\\kernel-50f90226-56ac-497b-b7da-0b7e76a69b49.json']
['C:\\Users\\Happy\\AppData\\Local\\pypoetry\\Cache\\virtualenvs\\recoservice-Q0zB1aiD-py3.10\\lib\\site-packages\\ipykernel_launcher.py', '-f', 'C:\\Users\\Happy\\AppData\\Roaming\\jupyter\\runtime\\kernel-50f90226-56ac-497b-b7da-0b7e76a69b49.json']
13 Dec 13:14    INFO  
General Hyper Parameters:
gpu_id = 0
use_gpu = True
seed = 2020
state = INFO
reproducibility = True
data_path = recbox_data
checkpoint_dir = saved
show_progress = True
save_dataset = False
dataset_save_path = None
save_dataloaders = False
dataloaders_save_path = None
log_wandb = False

Training Hyper Parameters:
epochs = 40
train_batch_size = 2048
learner = adam
learning_rate = 0.001
train_neg_sample_args = {'distribution': 'uniform', 'sample_num': 

It took 3.66 mins
{'best_valid_score': -inf, 'valid_score_bigger': True, 'best_valid_result': None, 'test_result': OrderedDict([('recall@10', 0.0955), ('mrr@10', 0.1727), ('ndcg@10', 0.0882), ('hit@10', 0.3934), ('precision@10', 0.0529)])}
CPU times: total: 1h 49min 59s
Wall time: 20min 9s


# Получение предсказаний для сервиса

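По результатам экспериментов выше лучшей по большинству метрик оказалась `RecVAE` (по `mrr@10` — `MacridVAE`). `MultiVAE` лишь немного уступает им по качеству и обучается быстрее всех (0.73 мин); именно она используется ниже для получения предсказаний.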

In [15]:
# Train and evaluate MultiVAE once more with the shared settings, keeping the returned metrics
# NOTE(review): presumably this run also writes a checkpoint under `checkpoint_dir` ("saved") — confirm
result = run_recbole(model="MultiVAE", dataset="recbox_data", config_dict=parameter_dict)

13 Dec 13:17    INFO  ['C:\\Users\\Happy\\AppData\\Local\\pypoetry\\Cache\\virtualenvs\\recoservice-Q0zB1aiD-py3.10\\lib\\site-packages\\ipykernel_launcher.py', '-f', 'C:\\Users\\Happy\\AppData\\Roaming\\jupyter\\runtime\\kernel-50f90226-56ac-497b-b7da-0b7e76a69b49.json']
['C:\\Users\\Happy\\AppData\\Local\\pypoetry\\Cache\\virtualenvs\\recoservice-Q0zB1aiD-py3.10\\lib\\site-packages\\ipykernel_launcher.py', '-f', 'C:\\Users\\Happy\\AppData\\Roaming\\jupyter\\runtime\\kernel-50f90226-56ac-497b-b7da-0b7e76a69b49.json']
13 Dec 13:17    INFO  
General Hyper Parameters:
gpu_id = 0
use_gpu = True
seed = 2020
state = INFO
reproducibility = True
data_path = recbox_data
checkpoint_dir = saved
show_progress = True
save_dataset = False
dataset_save_path = None
save_dataloaders = False
dataloaders_save_path = None
log_wandb = False

Training Hyper Parameters:
epochs = 40
train_batch_size = 2048
learner = adam
learning_rate = 0.001
train_neg_sample_args = {'distribution': 'uniform', 'sample_num': 

In [16]:
# Display the dictionary returned by the MultiVAE run (validation fields and test metrics)
result

{'best_valid_score': -inf,
 'valid_score_bigger': True,
 'best_valid_result': None,
 'test_result': OrderedDict([('recall@10', 0.0894),
              ('mrr@10', 0.1588),
              ('ndcg@10', 0.0816),
              ('hit@10', 0.3717),
              ('precision@10', 0.049)])}

In [21]:
# Instantiate MultiVAE for the prepared dataset and restore trained weights from a checkpoint.
model = MultiVAE(config, dataset)

# Checkpoint file names embed the training timestamp, so a hard-coded name breaks after
# any retrain or a fresh "Restart & Run All"; use the most recently written one instead.
checkpoint_dir = Path(config["checkpoint_dir"])
multivae_checkpoints = sorted(
    checkpoint_dir.glob("MultiVAE-*.pth"), key=lambda path: path.stat().st_mtime
)
if not multivae_checkpoints:
    raise FileNotFoundError(
        f"No MultiVAE checkpoint found in '{checkpoint_dir}'; train the model first"
    )

# map_location lets a checkpoint saved on GPU be restored on a CPU-only machine
checkpoint = torch.load(multivae_checkpoints[-1], map_location=config["device"])
model.load_state_dict(checkpoint["state_dict"])

Max value of user's history interaction records has reached 23.254401942926535% of the total.


<All keys matched successfully>