In [1]:
!pip install rectools

Collecting rectools
  Downloading rectools-0.4.1-py3-none-any.whl.metadata (6.8 kB)
Collecting implicit<0.8.0,>=0.7.1 (from rectools)
  Downloading implicit-0.7.2-cp310-cp310-macosx_10_9_x86_64.whl.metadata (6.1 kB)
Collecting pandas<2.0.0,>=0.25.3 (from rectools)
  Downloading pandas-1.5.3-cp310-cp310-macosx_10_9_x86_64.whl (12.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.0/12.0 MB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0mm
Collecting tqdm<5.0.0,>=4.27.0 (from rectools)
  Using cached tqdm-4.66.1-py3-none-any.whl.metadata (57 kB)
Collecting typeguard<3.0.0,>=2.0.1 (from rectools)
  Downloading typeguard-2.13.3-py3-none-any.whl (17 kB)
Downloading rectools-0.4.1-py3-none-any.whl (99 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.0/99.0 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading implicit-0.7.2-cp310-cp310-macosx_10_9_x86_64.whl (813 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━

In [2]:
from pprint import pprint
from typing import List, Dict, Any
import time
from IPython.display import display

import pandas as pd
import numpy as np

from tqdm.auto import tqdm

from rectools import Columns
from rectools.dataset import Interactions, Dataset
from rectools.models import RandomModel, PopularModel
from rectools.metrics import calc_metrics
from rectools.metrics.classification import Recall, MCC
from rectools.metrics.ranking import MAP, NDCG
from rectools.metrics.serendipity import Serendipity
from rectools.metrics.novelty import MeanInvUserFreq
from rectools.model_selection import TimeRangeSplitter

In [3]:
class CrossValScore():
    """Cross-validate several recommender models and average their metrics.

    For every fold produced by the splitter each model is fitted on the train
    part, asked for top-``k`` recommendations for the test users and scored
    with ``calc_metrics``. Per-fold scores are then averaged per model.
    """

    def __init__(self, models:Dict, metrics:Dict, splitter:TimeRangeSplitter, interactions:Interactions):
        """Store the experiment configuration.

        Args:
            models: mapping ``model name -> model``; each model must expose
                ``fit(dataset)`` and ``recommend(...)``.
            metrics: mapping ``metric name -> metric`` passed to ``calc_metrics``.
            splitter: object whose ``split(interactions, collect_fold_stats=True)``
                yields ``(train_ids, test_ids, fold_info)`` triples.
            interactions: interactions wrapper; its ``.df`` frame is indexed
                positionally with the ids produced by the splitter.
        """
        self.models = models
        self.metrics = metrics
        self.splitter = splitter
        self.interactions = interactions

    def init(self, train_ids: List, test_ids:List):
        """Build everything needed to fit and score models on one fold.

        Args:
            train_ids: positional row indices of the train part.
            test_ids: positional row indices of the test part.

        Returns:
            Tuple ``(dataset, df_train, df_test, test_users, catalog)`` where
            ``dataset`` is constructed from the train rows, ``df_test`` keeps
            only the user/item columns, ``test_users`` are the unique test
            user ids and ``catalog`` are the unique item ids seen in train.
        """
        df_train = self.interactions.df.iloc[train_ids]
        dataset = Dataset.construct(df_train)

        df_test = self.interactions.df.iloc[test_ids][Columns.UserItem]
        test_users = np.unique(df_test[Columns.User])

        catalog = df_train[Columns.Item].unique()
        return dataset, df_train, df_test, test_users, catalog

    @staticmethod
    def _summarize(results: List[Dict[str, Any]], metric_names: List[str]) -> pd.DataFrame:
        """Average per-fold result rows per model and drop the fold column."""
        if not results:
            # No folds were produced: return an empty table with the expected
            # columns instead of failing on groupby over a missing 'model' key.
            return pd.DataFrame(columns=["model", "training_time", *metric_names])
        return (
            pd.DataFrame(results)
            .groupby(['model'])
            .mean()
            .reset_index()
            .drop(columns='fold')
        )

    def evaluate(self, k: int, n_splits: int = 3):
        """Run cross-validation and return mean metrics per model.

        Args:
            k: number of recommendations requested per user.
            n_splits: expected number of folds. Used only as the progress-bar
                total; the actual folds come from the splitter.

        Returns:
            DataFrame with one row per model and the columns ``model``,
            ``training_time`` and one column per metric, averaged over folds.
            Empty (but with those columns) when the splitter yields no folds.
        """
        self.fold_iterator = self.splitter.split(self.interactions, collect_fold_stats=True)
        results = []

        for train_ids, test_ids, fold_info in tqdm(self.fold_iterator, total=n_splits):
            print(f"\n==================== Fold {fold_info['i_split']}")
            pprint(fold_info)

            dataset, df_train, df_test, test_users, catalog = self.init(train_ids, test_ids)

            for model_name, model in self.models.items():
                # perf_counter is monotonic, so the measured duration cannot be
                # distorted by system clock adjustments.
                start_time = time.perf_counter()
                model.fit(dataset)
                fit_time = time.perf_counter() - start_time

                recos = model.recommend(users=test_users, dataset=dataset, k=k, filter_viewed=True)

                metric_values = calc_metrics(self.metrics, reco=recos, interactions=df_test, prev_interactions=df_train, catalog=catalog)

                res = {"fold": fold_info["i_split"], "model": model_name, "training_time": fit_time}
                res.update(metric_values)
                results.append(res)

        return self._summarize(results, list(self.metrics))

In [4]:
class Visualizer():
    """Show what selected users watched next to what a model recommends them."""

    def __init__(self, model:Any, dataset: Dataset, interactions:Interactions, items:pd.DataFrame):
        """Store the fitted model and the data used for lookups.

        Args:
            model: fitted model exposing ``recommend(users, dataset, k, filter_viewed)``.
            dataset: dataset passed to ``model.recommend``.
            interactions: interactions wrapper; its ``.df`` must contain
                ``user_id`` and ``item_id`` columns.
            items: item metadata with ``item_id``, ``title`` and ``genres`` columns.
        """
        self.model = model
        self.dataset = dataset
        self.interactions = interactions
        self.items = items

    def visualize(self, users:List, k:int):
        """Return viewing history and top-``k`` recommendations for ``users``.

        Returns:
            Tuple ``(user_viewed, recos)``; both frames are enriched with the
            ``title`` and ``genres`` columns of the items table.
        """
        recos = self.model.recommend(users=users, dataset=self.dataset, k=k, filter_viewed=True)
        item_info = self.items[['title', 'genres', 'item_id']]

        history = self.interactions.df[self.interactions.df['user_id'].isin(users)]
        # Left join: interactions with unknown items are kept (title/genres are NaN).
        user_viewed = history.merge(item_info, on="item_id", how='left')
        # Default (inner) join: recommendations without item metadata are dropped.
        recos = recos.merge(item_info, on="item_id")
        return user_viewed, recos

    @staticmethod
    def _shared_item_ids(user_viewed: pd.DataFrame, recos: pd.DataFrame) -> set:
        """Return the set of item ids present in both frames."""
        return set(user_viewed['item_id'].tolist()) & set(recos['item_id'].tolist())

    def visualize_with_color(self, users:List, k:int):
        """Display history and recommendations, highlighting shared items.

        Rows whose item occurs both in the pooled viewing history and in the
        pooled recommendations of ``users`` get a blue background.

        Returns:
            Tuple ``(user_viewed_styled, recos_styled)`` of pandas Styler objects.
        """
        user_viewed, recos = self.visualize(users, k)
        # Match on item_id rather than title: titles are not unique, so two
        # different items with the same title must not be highlighted.
        shared_ids = self._shared_item_ids(user_viewed, recos)

        def highlight_common_movies(row):
            if row['item_id'] in shared_ids:
                return ['background-color: blue'] * len(row)
            else:
                return [''] * len(row)

        user_viewed_styled = user_viewed.style.apply(highlight_common_movies, axis=1)
        recos_styled = recos.style.apply(highlight_common_movies, axis=1)

        users_str = ', '.join(map(str, users))

        print('*' * 20 + ' ' + f'Просмотры пользователей {users_str}' + ' ' + '*' * 20)
        display(user_viewed_styled)

        print('\n' * 3)

        print('*' * 20 + ' ' + f'Рекомендации для пользователей {users_str}' + ' ' + '*' * 20)
        display(recos_styled)

        return user_viewed_styled, recos_styled

In [None]:
!cp '/content/drive/MyDrive/RecSys/metrics-valid/kion_train.zip' ./
!unzip /content/kion_train.zip

In [6]:
# Load the three tables of the unpacked KION archive.
# `interactions_df` is preprocessed in the next cells; `items` supplies
# title/genres for the Visualizer.
interactions_df = pd.read_csv('/content/kion_train/interactions.csv')
items = pd.read_csv('/content/kion_train/items.csv')
# NOTE: the name `users` is rebound to a plain list of user ids further down.
users = pd.read_csv('/content/kion_train/users.csv')

In [7]:
# Bring the raw interactions to the RecTools column layout in one chain:
#   1. discard the unused `total_dur` column,
#   2. rescale `watched_pct` from percent to a fraction,
#   3. rename the id / date / weight columns to the RecTools names.
interactions_df = (
    interactions_df
    .drop(columns='total_dur')
    .assign(watched_pct=lambda frame: frame['watched_pct'] / 100)
    .rename(columns={
        'user_id': Columns.User,
        'item_id': Columns.Item,
        'last_watch_dt': Columns.Datetime,
        'watched_pct': Columns.Weight,
    })
)

In [8]:
# `dataset` is what models are fitted on and asked for recommendations with.
dataset = Dataset.construct(interactions_df)
# `interactions` wraps the same frame; CrossValScore and Visualizer read its `.df`.
interactions = Interactions(interactions_df)
# Drop the notebook-level name; NOTE(review): the objects above may still
# reference the underlying data, so this does not necessarily free memory.
del interactions_df

In [9]:
# Baseline models compared in cross-validation (random one is seeded).
models = dict(
    Popular=PopularModel(),
    Random=RandomModel(random_state=32),
)

# Every metric family is evaluated at cut-offs 1, 5 and 10.
# The key is "<prefix><k>"; prefixes reproduce the existing naming
# (some families use an "@" separator, others do not).
metrics = {
    f"{prefix}{top_k}": metric_cls(k=top_k)
    for prefix, metric_cls in (
        ('Recall@', Recall),
        ('MCC', MCC),
        ('MAP@', MAP),
        ('NDCG@', NDCG),
        ('Serendipity', Serendipity),
        ('MeanInvUserFreq', MeanInvUserFreq),
    )
    for top_k in (1, 5, 10)
}

In [None]:
# Number of cross-validation folds requested from the splitter.
n_splits = 3

# Time-based splitter; presumably each test fold covers a 7-day window
# ("7D") — confirm against the RecTools documentation.
cv = TimeRangeSplitter(
    test_size="7D",
    n_splits=n_splits,
    filter_already_seen=True,
    filter_cold_items=True,
    filter_cold_users=True,
)

# Last expression of the cell: shows the test fold borders for inspection.
cv.get_test_fold_borders(dataset.interactions)

In [None]:
# Cross-validate all models; the returned table (mean metrics per model)
# is displayed as the cell output.
cross_val = CrossValScore(models, metrics, cv, interactions)
# Reuse `n_splits` from the splitter cell so the progress-bar total always
# matches the number of folds the splitter was configured with.
cross_val.evaluate(k=10, n_splits=n_splits)

In [None]:
# User ids to inspect. NOTE: this rebinds `users`, which previously held the
# DataFrame loaded from users.csv.
users = [666262, 672861, 955527, 161176]
# Fit the popularity baseline on the full dataset for the visual check below.
model = PopularModel()
model.fit(dataset)

In [None]:
# Display viewing history and top-10 recommendations for the selected users.
vz = Visualizer(model, dataset, interactions, items)
# NOTE: visualize_with_color returns styled tables (pandas Styler objects),
# so `user_viewed` and `recos` here are not plain DataFrames.
user_viewed, recos = vz.visualize_with_color(users, 10)