In [9]:
from pprint import pprint
import os

import numpy as np
import pandas as pd

import requests
from tqdm.auto import tqdm

import zipfile as zf

from implicit.nearest_neighbours import TFIDFRecommender, BM25Recommender
from implicit.als import AlternatingLeastSquares

from rectools import Columns
from rectools.dataset import Interactions, Dataset
from rectools.metrics import MAP, NDCG, Precision, Recall, MeanInvUserFreq, Serendipity, \
    calc_metrics
from rectools.models import ImplicitItemKNNWrapperModel, RandomModel, PopularModel
from rectools.model_selection import TimeRangeSplitter

from dev_eval import calculate_metrics, INTERACTIONS, USERS, ITEMS


def headtail(df, n=5):
    """Return the first and last ``n`` rows of ``df`` as one object.

    Parameters
    ----------
    df : pandas.DataFrame or pandas.Series
        Object to preview.
    n : int, default 5
        Number of rows taken from each end.

    Returns
    -------
    Same type as ``df``
        A new object holding ``df.head(n)`` followed by ``df.tail(n)``.
        When ``df`` has at most ``2 * n`` rows the two slices would overlap,
        so a copy of the whole object is returned instead and no row is
        shown twice.
    """
    # Short input: head and tail would share rows, so return everything once.
    if len(df) <= 2 * n:
        return df.copy()
    return pd.concat([df.head(n), df.tail(n)])


# Locations of the raw dataset files, relative to the notebook directory.
DATA_DIR = "../data"
KION_DIR = os.path.join(DATA_DIR, "data_original")
# One CSV per table, all stored directly under KION_DIR.
INTERACTIONS_DATA, USERS_DATA, ITEMS_DATA = (
    os.path.join(KION_DIR, csv_name)
    for csv_name in ("interactions.csv", "users.csv", "items.csv")
)

# Параметры тестирования функций

Модели: `rectools.models.RandomModel(random_state=32)`, `rectools.models.PopularModel()` с параметрами по умолчанию

In [4]:
# Baseline models to evaluate, keyed by the name under which they are reported.
models = dict(
    random=RandomModel(random_state=32),  # fixed seed
    popular=PopularModel(),  # default parameters
)

Метрики: 2 ранжирующие (MAP, NDCG), 2 классификационные (precision, recall), 2 beyond-accuracy (novelty — `MeanInvUserFreq`, serendipity). Каждую метрику считаем по порогам k = 1, 5, 10. Метрика MAP обязательна.

In [5]:
# Every metric is evaluated at the cutoffs k = 1, 5, 10 and stored under the
# key "<label>@<k>"; entries are grouped by metric, then ordered by k.
metrics = {
    f"{label}@{k}": metric_cls(k=k)
    for label, metric_cls in (
        ("MAP", MAP),
        ("NDCG", NDCG),
        ("precision", Precision),
        ("recall", Recall),
        ("novelty", MeanInvUserFreq),
        ("serendipity", Serendipity),
    )
    for k in (1, 5, 10)
}

Сплиттер: `rectools.model_selection.TimeRangeSplitter` — 3 фолда кросс-валидации, тест каждого фолда — одна неделя; исключаем холодных юзеров, холодные айтемы и уже просмотренные айтемы.

In [6]:
cv = TimeRangeSplitter(
    # Fold layout: three cross-validation folds, each tested on one week.
    n_splits=3,
    test_size="7D",
    # Filtering: drop cold users, cold items and already-seen items.
    filter_cold_users=True,
    filter_cold_items=True,
    filter_already_seen=True,
)

Визуализация рекомендаций и историй просмотров для юзеров `[666262, 672861, 955527]`. Для айтемов обязательно отражаем названия, жанры и количество просмотров в датасете (как для айтемов из истории взаимодействий каждого юзера, так и для айтемов из его рекомендаций)

In [7]:
# Passed to calculate_metrics below; presumably the number of recommendations
# generated per user - TODO confirm against dev_eval.
K_RECOS = 10
# IDs of the users whose watch history and recommendations are to be visualised.
# NOTE(review): this rebinds USERS, which is also imported from dev_eval in the
# import cell - confirm the imported object is not needed after this point.
USERS = [666262, 672861, 955527]

In [10]:
# Evaluate every model on every fold of the splitter; the call is the last
# expression of the cell, so its result is shown as the cell output.
calculate_metrics(models, metrics, cv, K_RECOS)

  0%|          | 0/3 [00:00<?, ?it/s]

===== Model: random | Fold: 0 =====
Fit time: 0.0 sec.
Recommend time: 5.19 sec.
Metrics time: 1.85 sec.
===== Model: popular | Fold: 0 =====
Fit time: 1.19 sec.
Recommend time: 3.48 sec.
Metrics time: 1.96 sec.
===== Model: random | Fold: 1 =====
Fit time: 0.0 sec.
Recommend time: 6.16 sec.
Metrics time: 2.15 sec.
===== Model: popular | Fold: 1 =====
Fit time: 1.2 sec.
Recommend time: 4.03 sec.
Metrics time: 2.15 sec.
===== Model: random | Fold: 2 =====
Fit time: 0.0 sec.
Recommend time: 5.78 sec.
Metrics time: 2.17 sec.
===== Model: popular | Fold: 2 =====
Fit time: 1.37 sec.
Recommend time: 4.37 sec.
Metrics time: 2.45 sec.


Unnamed: 0_level_0,precision@1,recall@1,precision@5,recall@5,precision@10,recall@10,NDCG@1,NDCG@5,NDCG@10,MAP@1,MAP@5,MAP@10,novelty@1,novelty@5,novelty@10,serendipity@1,serendipity@5,serendipity@10
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
popular,0.076432,0.04272,0.052402,0.137413,0.033903,0.173492,0.076432,0.057932,0.043084,0.04272,0.078295,0.084109,2.377055,3.066979,3.71339,2e-06,3e-06,2e-06
random,0.000221,7.2e-05,0.000202,0.000365,0.000193,0.000693,0.000221,0.000208,0.0002,7.2e-05,0.000169,0.000211,15.614137,15.612989,15.613009,6e-06,7e-06,7e-06


In [45]:
# TODO: draft signature of the planned visualisation helper; it is not
# implemented yet, so this cell currently does nothing.
# NOTE(review): `List[...]` accepts a single type parameter, so
# `List[Interactions, pd.DataFrame]` has to change (e.g. to a Tuple or Union)
# before this is uncommented; `ModelBase` and `List` do not appear in the
# visible import cell - confirm they are imported first.
# def visualize(model: ModelBase,
#               dataset: List[Interactions, pd.DataFrame],
#               # Interactions, Users, Items
#               user_list: List[int],
#               item_data: pd.DataFrame) -> None:
#     pass

Unnamed: 0,user_id,item_id,datetime,weight,watched_pct
0,176549,9506,2021-05-11,4250,72.0
1,699317,1659,2021-05-29,8317,100.0
2,656683,7107,2021-05-09,10,0.0
3,864613,7638,2021-07-05,14483,100.0
4,964868,9506,2021-04-30,6725,100.0
...,...,...,...,...,...
5476246,648596,12225,2021-08-13,76,0.0
5476247,546862,9673,2021-04-13,2308,49.0
5476248,697262,15297,2021-08-20,18307,63.0
5476249,384202,16197,2021-04-19,6203,100.0
