In [55]:
import time
from copy import deepcopy
from typing import Callable, Any

import numpy as np
import pandas as pd
from tqdm.auto import tqdm

In [56]:
import rectools
from rectools.dataset import Interactions, Dataset
from rectools.model_selection import Splitter
from rectools.models.base import ModelBase
from rectools.models import RandomModel, PopularModel
from rectools.metrics.base import MetricAtK
from rectools.metrics import (
    Precision,
    Recall,
    MAP,
    NDCG,
    MeanInvUserFreq,
    calc_metrics,
)

# Read Data

In [57]:
# Load the interaction log and rename its columns to the names RecTools expects,
# in a single chained expression so that re-running the cell is idempotent.
interactions_df = pd.read_csv(
    "../datasets/interactions.csv",
    parse_dates=["last_watch_dt"],
).rename(
    columns={
        "last_watch_dt": rectools.Columns.Datetime,
        "total_dur": rectools.Columns.Weight,
    }
)
interactions = Interactions(interactions_df)
interactions.df.head()

FileNotFoundError: [Errno 2] No such file or directory: '../datasets/interactions.csv'

In [None]:
# Item table, read from a path relative to the notebook; the Visualizer further
# down looks up display columns (e.g. "title", "genres") in this frame.
items = pd.read_csv("../datasets/items.csv")
items.head()

# Validation

In [None]:
def add_thresholds(metrics: dict[str, Callable | tuple[Callable, dict[str, Any]]], ks: list[int]):
    metrics_with_thresholds = {}
    for name, metric in metrics.items():
        kwargs = {}
        if isinstance(metric, tuple):
            kwargs.update(**metric[1])
            metric = metric[0]
        for k in ks:
            metrics_with_thresholds.update({f"{name}@{k}": metric(k=k, **kwargs)})
    return metrics_with_thresholds

In [None]:
def calc_coverage(reco):
    """Return the share of distinct items among all recommendation rows.

    Note that the denominator is the number of rows in ``reco``, not the size
    of an item catalogue: the value is ``reco["item_id"].nunique() / len(reco)``.

    Parameters
    ----------
    reco
        Recommendations table with an ``item_id`` column.

    Returns
    -------
    float
        Ratio in ``[0, 1]``; ``0.0`` when ``reco`` has no rows (instead of
        raising ``ZeroDivisionError``).
    """
    n_rows = len(reco)
    if n_rows == 0:
        # Nothing was recommended, so nothing is covered.
        return 0.0
    return reco["item_id"].nunique() / n_rows

In [None]:
def cross_val(
    dataset: pd.DataFrame,
    models: dict[str, ModelBase],
    metrics: dict[str, MetricAtK | Callable],
    splitter: Splitter,
    k: int,
    num_splits: int = 0,
):
    """Cross-validate several models and return their fold-averaged metrics.

    For every fold produced by ``splitter`` and every model, a deep copy of the
    model is fitted on the train part, top-``k`` recommendations are produced
    for the users of the test part, and all metrics are computed.

    Parameters
    ----------
    dataset
        Interactions table; it is wrapped into ``Interactions`` for splitting.
    models
        Maps a model name to an unfitted model. The originals are never
        fitted, only deep copies.
    metrics
        Maps a metric name either to a ``MetricAtK`` instance (evaluated with
        ``calc_metrics``) or to a plain callable that receives the
        recommendations table and returns a number.
    splitter
        Object whose ``split(interactions)`` yields
        ``(train_ids, test_ids, split_info)`` triples.
    k
        Number of recommendations per user.
    num_splits
        Expected number of folds. Only used to size the progress bar.

    Returns
    -------
    pd.DataFrame
        One row per model (indexed by ``"model"``) with the mean of ``"time"``
        (fit duration in seconds) and of every metric over the folds.
    """
    rectool_metrics = {name: metric for name, metric in metrics.items() if isinstance(metric, MetricAtK)}
    custom_metrics = {name: metric for name, metric in metrics.items() if name not in rectool_metrics}
    metric_values = []
    interactions = Interactions(dataset)

    # The context manager closes the bar even if a model raises mid-run.
    # A zero total (num_splits not given) is passed as None, i.e. "unknown".
    with tqdm(total=len(models) * num_splits or None) as pbar:
        pbar.set_description("splitting")
        for train_ids, test_ids, split_info in splitter.split(interactions):
            num_fold = split_info["i_split"]
            # Positions returned by the splitter index into ``interactions.df``.
            train_df = interactions.df.iloc[train_ids]
            test_df = interactions.df.iloc[test_ids]
            train = Dataset.construct(train_df)
            test_users = test_df[rectools.Columns.User].unique()

            for model_name, orig_model in models.items():
                pbar_prefix = f"fold {num_fold}, {model_name}"
                pbar.set_description(f"{pbar_prefix} training")
                model = deepcopy(orig_model)
                # perf_counter is monotonic, unlike the wall clock.
                start = time.perf_counter()
                model.fit(train)
                end = time.perf_counter()
                pbar.set_description(f"{pbar_prefix} predicting")
                reco = model.recommend(test_users, train, k, filter_viewed=True)
                del model
                pbar.set_description(f"{pbar_prefix} evaluating")
                # ``reco`` carries external ids, so the metrics must be given
                # the raw train/test frames. ``Dataset.interactions.df`` holds
                # internal ids and must not be mixed with recommendations.
                cur_metrics = {
                    "model": model_name,
                    "time": end - start,
                    **calc_metrics(
                        rectool_metrics,
                        reco=reco,
                        interactions=test_df,
                        prev_interactions=train_df,
                    ),
                }
                for name, metric in custom_metrics.items():
                    cur_metrics[name] = metric(reco)
                metric_values.append(cur_metrics)
                pbar.update(1)
    return pd.DataFrame(metric_values).groupby("model").mean()

In [None]:
# Every ranking metric is instantiated at each cutoff; coverage is added as a
# plain callable that works directly on the recommendations table.
metrics = add_thresholds(
    metrics=dict(
        precision=Precision,
        recall=Recall,
        MAP=MAP,
        NDCG=NDCG,
        novelty=MeanInvUserFreq,
    ),
    ks=[1, 5, 10],
)
metrics["coverage"] = calc_coverage
metrics

In [None]:
# Experiment configuration.
NUM_RECOS, NUM_SPLITS, SEED = 10, 3, 32

splitter = rectools.model_selection.time_split.TimeRangeSplitter("1D", NUM_SPLITS)
models = {
    "random": RandomModel(random_state=SEED),
    "popular": PopularModel(),
}

results = cross_val(
    dataset=interactions_df,
    models=models,
    metrics=metrics,
    splitter=splitter,
    k=NUM_RECOS,
    num_splits=NUM_SPLITS,
)

In [None]:
# Per-model values of fit time and every metric, averaged over the folds.
results

# Visualization

In [None]:
from IPython.core.display import HTML


class Visualizer:
    """Show users' interaction history next to a fitted model's recommendations.

    Parameters
    ----------
    model
        Fitted model used to produce recommendations.
    dataset
        Dataset the model was fitted on; also the source of users' history.
    item_data
        Column name(s) of the item table to display. A single string is
        treated as a one-element list. Defaults to ``["title"]``.
    items
        Item table with an ``item_id`` column. When omitted it is read from
        ``items_df_path``.
    items_df_path
        CSV path used only when ``items`` is not given.
    """

    def __init__(
        self,
        model: ModelBase,
        dataset: Dataset,
        item_data: list[str] | str | None = None,
        items: pd.DataFrame | None = None,
        items_df_path: str = "../datasets/items.csv",
    ):
        if item_data is None:
            item_data = ["title"]
        elif isinstance(item_data, str):
            # Iterating a bare string would validate it character by character.
            item_data = [item_data]

        self.model = model
        self.dataset = dataset
        if items is None:
            self.items = pd.read_csv(items_df_path)
        else:
            self.items = items
        for d in item_data:
            assert (
                d in self.items.columns
            ), f'Unknown column "{d}". Can only visualize information about {", ".join(self.items.columns)}.'
        self.item_data = list(item_data)

    def display_item_data(self, interactions):
        """Attach the configured item columns to a table of (external) user/item ids.

        Returns a frame with ``user_id`` followed by the ``item_data`` columns.
        """
        # Join explicitly on the item id so an accidental same-named column in
        # both frames cannot silently become part of the join key.
        merged = pd.merge(interactions, self.items, on="item_id")
        return merged[["user_id"] + self.item_data]

    @staticmethod
    def pretty_print(df):
        """Render a DataFrame as an HTML table in the notebook output."""
        # Imported explicitly so the method does not depend on ``display``
        # being injected into the notebook's global namespace.
        from IPython.display import display

        display(HTML(df.to_html()))

    def get_reco(self, user_ids: list[int], k: int = 10):
        """Return top-``k`` recommendations (already-seen items filtered) with item data."""
        reco = self.model.recommend(np.array(user_ids), self.dataset, k, True)
        return self.display_item_data(reco)

    def get_history(self, user_ids: list[int]):
        """Return the interactions of ``user_ids`` from the dataset with item data.

        ``Dataset.interactions.df`` stores internal ids, so the requested
        (external) user ids are mapped to internal ones for filtering, and the
        selected rows are mapped back to external ids before joining with the
        item table. Users unknown to the dataset are ignored.
        """
        user_map = self.dataset.user_id_map
        item_map = self.dataset.item_id_map
        df = self.dataset.interactions.df
        internal_users = user_map.convert_to_internal(np.array(user_ids), strict=False)
        history = df[df["user_id"].isin(internal_users)]
        history = history.assign(
            user_id=user_map.convert_to_external(history["user_id"].to_numpy()),
            item_id=item_map.convert_to_external(history["item_id"].to_numpy()),
        )
        return self.display_item_data(history)

    def analyse_recos(self, user_ids: list[int]):
        """Print, for each user, their history followed by their recommendations."""
        for user_id in user_ids:
            print(f"User {user_id} watched these film:")
            self.pretty_print(self.get_history([user_id]))
            print("And got these films as recomendations")
            self.pretty_print(self.get_reco([user_id]))
            print("\n")

In [None]:
# Demo setup: a seeded random recommender fitted on the full interaction log,
# plus the users whose history and recommendations are inspected.
USER_IDS = [1091234, 787802, 948921]
dataset = Dataset.construct(interactions_df)
model = RandomModel(random_state=32)
model.fit(dataset)

In [None]:
# For each sample user, print their history and then their recommendations,
# showing the "title" and "genres" columns of the item table.
viz = Visualizer(model, dataset, item_data=["title", "genres"], items=items)
viz.analyse_recos(USER_IDS)