In [None]:
!pip install rectools

In [None]:
from pprint import pprint
from typing import List, Dict, Any
import time
from IPython.display import display

import pandas as pd
import numpy as np

from tqdm.auto import tqdm

from rectools import Columns
from rectools.dataset import Interactions, Dataset
from rectools.models import RandomModel, PopularModel
from rectools.metrics import calc_metrics
from rectools.metrics.classification import Recall, MCC
from rectools.metrics.ranking import MAP, NDCG
from rectools.metrics.serendipity import Serendipity
from rectools.metrics.novelty import MeanInvUserFreq
from rectools.model_selection import TimeRangeSplitter

In [6]:
class CrossValScore():
    """Cross-validate several recommender models on the folds of a splitter.

    Attributes:
        models: mapping ``name -> model``; every model must provide
            ``fit(dataset)`` and ``recommend(users, dataset, k, filter_viewed)``.
        metrics: mapping ``name -> metric`` handed to ``calc_metrics`` as is.
        splitter: object whose ``split(interactions, collect_fold_stats=True)``
            yields ``(train_ids, test_ids, fold_info)`` triples.
        interactions: interactions wrapper; its ``df`` frame is sliced
            positionally with the ids produced by the splitter.
    """

    def __init__(self, models: Dict, metrics: Dict, splitter: TimeRangeSplitter, interactions: Interactions):
        self.models = models
        self.metrics = metrics
        self.splitter = splitter
        self.interactions = interactions

    def init(self, train_ids: List, test_ids: List):
        """Build everything one fold needs from positional row ids.

        Args:
            train_ids: positional indices of the train rows in ``interactions.df``.
            test_ids: positional indices of the test rows in ``interactions.df``.

        Returns:
            Tuple ``(dataset, df_train, df_test, test_users, catalog)`` where
            ``dataset`` is constructed from the train rows, ``df_test`` keeps only
            the user and item columns, ``test_users`` are the unique test users
            and ``catalog`` are the unique items seen in train.
        """
        df_train = self.interactions.df.iloc[train_ids]
        dataset = Dataset.construct(df_train)

        df_test = self.interactions.df.iloc[test_ids][Columns.UserItem]
        test_users = np.unique(df_test[Columns.User])

        catalog = df_train[Columns.Item].unique()
        return dataset, df_train, df_test, test_users, catalog

    @staticmethod
    def _aggregate(results: List[Dict[str, Any]]) -> pd.DataFrame:
        """Average per-fold result rows into one row per model."""
        if not results:
            # No folds or no models: return an empty table instead of failing
            # with KeyError('model') inside groupby.
            return pd.DataFrame(columns=["model", "training_time"])
        return (
            pd.DataFrame(results)
            .drop(columns="fold")
            .groupby("model")
            .mean()
            .reset_index()
        )

    def evaluate(self, k: int, n_splits: int = 3):
        """Fit and score every model on every fold, then average over folds.

        Args:
            k: number of recommendations requested per user.
            n_splits: expected number of folds; used only as the progress-bar total.

        Returns:
            DataFrame with one row per model: ``model``, the mean ``training_time``
            in seconds and the mean value of every metric. Empty if nothing was
            evaluated.
        """
        # Kept as an attribute for backward compatibility; the iterator is
        # exhausted once this method returns.
        self.fold_iterator = self.splitter.split(
            self.interactions, collect_fold_stats=True
        )
        results = []

        for train_ids, test_ids, fold_info in tqdm(
            self.fold_iterator, total=n_splits
        ):
            print(f"\n==================== Fold {fold_info['i_split']}")
            pprint(fold_info)

            dataset, df_train, df_test, test_users, catalog = self.init(
                train_ids, test_ids
            )

            for model_name, model in self.models.items():
                # perf_counter is monotonic, so the measured duration cannot be
                # distorted by system clock adjustments.
                started_at = time.perf_counter()
                model.fit(dataset)
                fit_time = time.perf_counter() - started_at

                recos = model.recommend(
                    users=test_users, dataset=dataset, k=k, filter_viewed=True
                )

                metric_values = calc_metrics(
                    self.metrics, reco=recos, interactions=df_test,
                    prev_interactions=df_train, catalog=catalog
                )

                res = {
                    "fold": fold_info["i_split"],
                    "model": model_name,
                    "training_time": fit_time
                }
                res.update(metric_values)
                results.append(res)

        return self._aggregate(results)


In [7]:
class Visualizer():
    """Show what selected users watched next to what a model recommends them.

    Attributes:
        model: fitted model providing ``recommend(users, dataset, k, filter_viewed)``.
        dataset: dataset passed through to ``model.recommend``.
        interactions: interactions wrapper; its ``df`` frame must contain
            ``user_id`` and ``item_id`` columns.
        items: item metadata with ``item_id``, ``title`` and ``genres`` columns.
    """

    def __init__(self, model: Any, dataset: Dataset, interactions: Interactions, items: pd.DataFrame):
        self.model = model
        self.dataset = dataset
        self.interactions = interactions
        self.items = items

    def visualize(self, users: List, k: int):
        """Return the view history and the top-``k`` recommendations of ``users``.

        Both frames are enriched with ``title`` and ``genres`` through a left
        join, so rows whose item is missing from ``items`` are kept (with empty
        metadata) instead of silently disappearing.

        Returns:
            Tuple ``(user_viewed, recos)`` of DataFrames.
        """
        recos = self.model.recommend(users=users, dataset=self.dataset, k=k, filter_viewed=True)
        item_info = self.items[['title', 'genres', 'item_id']]
        history = self.interactions.df

        user_viewed = history[history['user_id'].isin(users)].merge(
            item_info, on="item_id", how='left'
        )
        recos = recos.merge(item_info, on="item_id", how='left')
        return user_viewed, recos

    def visualize_with_color(self, users: List, k: int):
        """Display both tables, highlighting items present in both of them.

        Items are matched by ``item_id`` rather than by title: titles are not
        unique identifiers, so different items sharing a title must not be
        highlighted as common.

        Returns:
            Tuple ``(user_viewed_styled, recos_styled)`` of pandas Styler objects.
        """
        user_viewed, recos = self.visualize(users, k)
        # A set gives constant-time membership checks for every styled row.
        common_items = set(user_viewed['item_id']).intersection(recos['item_id'])

        def highlight_common_movies(row):
            if row['item_id'] in common_items:
                return ['background-color: blue'] * len(row)
            return [''] * len(row)

        user_viewed_styled = user_viewed.style.apply(highlight_common_movies, axis=1)
        recos_styled = recos.style.apply(highlight_common_movies, axis=1)

        users_str = ', '.join(map(str, users))

        print('*' * 20 + ' ' + f'Просмотры пользователей {users_str}' + ' ' + '*' * 20)
        display(user_viewed_styled)

        print('\n' * 3)

        print('*' * 20 + ' ' + f'Рекомендации для пользователей {users_str}' + ' ' + '*' * 20)
        display(recos_styled)

        return user_viewed_styled, recos_styled

In [None]:
!cp '/content/drive/MyDrive/RecSys/metrics-valid/kion_train.zip' ./
!unzip /content/kion_train.zip

In [9]:
# Read the three KION tables (interactions, item metadata, user metadata)
# in one pass; the files are read in the order they are listed.
interactions_df, items, users = map(
    pd.read_csv,
    (
        '/content/kion_train/interactions.csv',
        '/content/kion_train/items.csv',
        '/content/kion_train/users.csv',
    ),
)

In [10]:
# Bring the raw interactions to the column names defined by rectools `Columns`:
# drop the unused duration, turn the watched percentage into a 0..1 fraction
# and rename the user / item / datetime / weight columns.
interactions_df = (
    interactions_df
    .drop(columns='total_dur')
    .assign(watched_pct=lambda frame: frame['watched_pct'] / 100)
    .rename(columns={
        'user_id': Columns.User,
        'item_id': Columns.Item,
        'last_watch_dt': Columns.Datetime,
        'watched_pct': Columns.Weight,
    })
)

In [11]:
# Build the RecTools dataset from the full interactions frame; it is the
# dataset the visualisation model is fitted on further down the notebook.
dataset = Dataset.construct(interactions_df)
# Wrap the same frame for cross-validation: CrossValScore slices
# `interactions.df` with the row ids produced by the splitter.
interactions = Interactions(interactions_df)
# Remove the notebook-level name for the raw frame.
# NOTE(review): whether this releases memory depends on whether `Interactions`
# keeps a reference to the same frame — confirm.
del interactions_df

In [12]:
# Baseline models compared in cross-validation, keyed by the name shown in the report.
models = {}
models['Popular'] = PopularModel()
models['Random'] = RandomModel(random_state=32)

# Metrics at cut-offs 1, 5 and 10, added family by family.
metrics = {}
# Classification
metrics.update({'Recall@1': Recall(k=1), 'Recall@5': Recall(k=5), 'Recall@10': Recall(k=10)})
metrics.update({'MCC1': MCC(k=1), 'MCC5': MCC(k=5), 'MCC10': MCC(k=10)})
# Ranking
metrics.update({'MAP@1': MAP(k=1), 'MAP@5': MAP(k=5), 'MAP@10': MAP(k=10)})
metrics.update({'NDCG@1': NDCG(k=1), 'NDCG@5': NDCG(k=5), 'NDCG@10': NDCG(k=10)})
# Beyond accuracy
metrics.update({
    'Serendipity1': Serendipity(k=1),
    'Serendipity5': Serendipity(k=5),
    'Serendipity10': Serendipity(k=10),
})
metrics.update({
    'MeanInvUserFreq1': MeanInvUserFreq(k=1),
    'MeanInvUserFreq5': MeanInvUserFreq(k=5),
    'MeanInvUserFreq10': MeanInvUserFreq(k=10),
})

In [None]:
# Number of folds; each test fold spans seven days ("7D").
n_splits = 3

cv = TimeRangeSplitter(
    # fold geometry
    n_splits=n_splits,
    test_size="7D",
    # test-set filters
    filter_cold_users=True,
    filter_cold_items=True,
    filter_already_seen=True,
)

# Show the date borders of the test folds as the cell output.
cv.get_test_fold_borders(dataset.interactions)

In [14]:
# Score every model on every fold. The notebook-level `n_splits` that configured
# the splitter is reused so the progress-bar total cannot drift out of sync
# with the real number of folds.
cross_val = CrossValScore(models, metrics, cv, interactions)
cross_val.evaluate(k=10, n_splits=n_splits)

  0%|          | 0/3 [00:00<?, ?it/s]


{'end': Timestamp('2021-08-09 00:00:00', freq='7D'),
 'i_split': 0,
 'start': Timestamp('2021-08-02 00:00:00', freq='7D'),
 'test': 263681,
 'test_items': 6602,
 'test_users': 98184,
 'train': 4266013,
 'train_items': 15237,
 'train_users': 797423}

{'end': Timestamp('2021-08-16 00:00:00', freq='7D'),
 'i_split': 1,
 'start': Timestamp('2021-08-09 00:00:00', freq='7D'),
 'test': 279422,
 'test_items': 6698,
 'test_users': 103511,
 'train': 4649162,
 'train_items': 15415,
 'train_users': 850489}

{'end': Timestamp('2021-08-23 00:00:00', freq='7D'),
 'i_split': 2,
 'start': Timestamp('2021-08-16 00:00:00', freq='7D'),
 'test': 298878,
 'test_items': 6679,
 'test_users': 110076,
 'train': 5051815,
 'train_items': 15577,
 'train_users': 906071}


Unnamed: 0,model,training_time,Recall@1,MCC1,Recall@5,MCC5,Recall@10,MCC10,NDCG@1,NDCG@5,NDCG@10,MAP@1,MAP@5,MAP@10,MeanInvUserFreq1,MeanInvUserFreq5,MeanInvUserFreq10,Serendipity1,Serendipity5,Serendipity10
0,Popular,2.563768,0.04272,0.053727,0.137413,0.079391,0.173492,0.071505,0.076432,0.057932,0.043084,0.04272,0.078295,0.084109,2.377055,3.066979,3.71339,2e-06,3e-06,2e-06
1,Random,0.0001,7.2e-05,1.8e-05,0.000365,3.2e-05,0.000693,2.6e-05,0.000221,0.000208,0.0002,7.2e-05,0.000169,0.000211,15.614137,15.612989,15.613009,6e-06,7e-06,7e-06


In [None]:
# Popularity model fitted on the full dataset, used for the visual check below.
model = PopularModel()

# Users whose history and recommendations are inspected.
users = [
    666262,
    672861,
    955527,
    161176,
]

model.fit(dataset)

In [16]:
# Display the view history and the top-10 recommendations of the selected users;
# the returned objects are the styled tables that were displayed.
vz = Visualizer(
    model=model,
    dataset=dataset,
    interactions=interactions,
    items=items,
)
user_viewed, recos = vz.visualize_with_color(users=users, k=10)

******************** Просмотры пользователей 666262, 672861, 955527, 161176 ********************


Unnamed: 0,user_id,item_id,datetime,weight,title,genres
0,161176,10440,2021-07-29 00:00:00,0.0,Хрустальный,"триллеры, детективы"
1,955527,1183,2021-06-02 00:00:00,0.01,Стань легендой! Бигфут Младший,"мультфильм, фэнтези, приключения, комедии"
2,672861,6870,2021-04-27 00:00:00,0.0,Красавица и чудовище,"драмы, фэнтези, музыкальные"
3,955527,13371,2021-05-04 00:00:00,0.11,Пеле: Рождение легенды,"драмы, спорт, биография"
4,955527,4725,2021-06-02 00:00:00,0.04,Лобановский навсегда,"спорт, биография, документальное"
5,666262,7957,2021-05-12 00:00:00,0.32,Последний викинг,"боевики, историческое, приключения"
6,672861,8662,2021-05-04 00:00:00,1.0,Он – дракон,фэнтези
7,955527,1238,2021-06-02 00:00:00,0.07,Диего Марадона,"спорт, биография, документальное"
8,666262,4785,2021-05-12 00:00:00,0.28,Робин Гуд: Начало,"боевики, триллеры, приключения"
9,666262,12981,2021-05-14 00:00:00,1.0,Томирис,"боевики, драмы, историческое, военные"






******************** Рекомендации для пользователей 666262, 672861, 955527, 161176 ********************


Unnamed: 0,user_id,item_id,score,rank,title,genres
0,666262,10440,202457.0,1,Хрустальный,"триллеры, детективы"
1,672861,10440,202457.0,1,Хрустальный,"триллеры, детективы"
2,955527,10440,202457.0,1,Хрустальный,"триллеры, детективы"
3,666262,15297,193123.0,2,Клиника счастья,"драмы, мелодрамы"
4,672861,15297,193123.0,2,Клиника счастья,"драмы, мелодрамы"
5,955527,15297,193123.0,2,Клиника счастья,"драмы, мелодрамы"
6,161176,15297,193123.0,1,Клиника счастья,"драмы, мелодрамы"
7,666262,9728,132865.0,3,Гнев человеческий,"боевики, триллеры"
8,672861,9728,132865.0,3,Гнев человеческий,"боевики, триллеры"
9,955527,9728,132865.0,3,Гнев человеческий,"боевики, триллеры"
