In [1]:
pip install numpy

Note: you may need to restart the kernel to use updated packages.


In [None]:
pip install --upgrade pip

In [9]:
pip install rectools[lightfm]

Note: you may need to restart the kernel to use updated packages.


In [2]:
import numpy as np
import pandas as pd
import warnings
from pathlib import Path
import os
import threadpoolctl

# Suppress every Python warning from here on (the filter has no category or module restriction).
warnings.filterwarnings('ignore')


# The lightfm extension is required for the LightFM section.
# You can install it with `pip install "rectools[lightfm]"`.
try:
    from lightfm import LightFM
except ModuleNotFoundError:
    # Stay best-effort (cells that do not need LightFM keep working), but bind the name
    # and say why, so a later use does not fail with an unexplained NameError.
    # print() is used because warnings are globally silenced in this notebook.
    LightFM = None
    print(
        "lightfm is not installed: cells that build a LightFM model will fail. "
        'Install it with `pip install "rectools[lightfm]"` and restart the kernel.'
    )

from rectools import Columns
from rectools.dataset import Dataset
from rectools.models import (
    ImplicitALSWrapperModel,
    ImplicitBPRWrapperModel,
    LightFMWrapperModel,
    PureSVDModel,
    ImplicitItemKNNWrapperModel,
    EASEModel
)

# Optimized ranking for vector models: keep BLAS on a single thread, both through the
# OpenBLAS environment variable and through threadpoolctl for already loaded libraries.
os.environ.update(OPENBLAS_NUM_THREADS="1")
threadpoolctl.threadpool_limits(limits=1, user_api="blas");

In [None]:
# NOTE(review): looks like a leftover check that the kernel responds — consider removing this cell.
print(1)

In [8]:
pip install rectools[visuals]

^C
Note: you may need to restart the kernel to use updated packages.


In [3]:
from pprint import pprint

import numpy as np
import pandas as pd

from tqdm.auto import tqdm

from implicit.nearest_neighbours import TFIDFRecommender, BM25Recommender
from implicit.als import AlternatingLeastSquares

from rectools import Columns
from rectools.dataset import Dataset
from rectools.metrics import Precision, Recall, MeanInvUserFreq, Serendipity, calc_metrics
from rectools.models import ImplicitItemKNNWrapperModel, RandomModel, PopularModel
from rectools.model_selection import TimeRangeSplitter, cross_validate
from rectools.visuals import MetricsApp
from rectools.models import LightFMWrapperModel

In [4]:
import pickle

# Location of the pickled dataset, relative to the notebook's working directory.
DATASET_PATH = 'cur/dataset.pkl'

# SECURITY: unpickling can execute arbitrary code — only load files from a trusted source.
with open(DATASET_PATH, 'rb') as f:
    dataset = pickle.load(f)

# The file used to be deserialized a second time via pd.read_pickle just to get this
# name. It is now an alias of `dataset`, so the file is read and unpickled only once.
# Both names refer to the same object.
aboba = dataset
print(aboba)

Dataset(user_id_map=IdMap(external_ids=array([     34,       1,       3, ..., 3036571, 3036579, 3036583],
      dtype=int64)), item_id_map=IdMap(external_ids=array([11690, 16970,     3, ..., 99052, 78096, 76603], dtype=int64)), interactions=Interactions(df=         user_id  item_id  weight                   datetime
0              0        0     7.0 2025-03-19 04:35:41.362820
1              0        1     8.0 2025-03-19 04:35:41.362820
2              1        2     7.0 2018-05-22 00:00:00.000000
3              1        3     5.6 2018-05-22 18:31:45.881993
4              1        4     7.0 2018-05-22 18:34:45.816208
...          ...      ...     ...                        ...
3046372        0        1     1.2 2025-03-19 00:35:36.589955
3046373        0        1     1.2 2025-03-19 00:35:36.589955
3046374        0        1     1.2 2025-03-19 00:35:36.589955
3046375        0        1     1.2 2025-03-19 00:35:36.589955
3046376        0        1     1.2 2025-03-19 00:35:36.589955

[3046377 r

In [5]:
# Build a derived frame whose `datetime` column is coerced to a datetime dtype
# (in case it is not one already). `.assign` returns a new DataFrame, so the frame
# owned by the dataset object is not modified in place.
interactions_df = aboba.interactions.df.assign(
    datetime=lambda df: pd.to_datetime(df['datetime'])
)

# Sort by time, newest interactions first
sorted_df = interactions_df.sort_values(by='datetime', ascending=False)

# Show the five most recent interactions
print(sorted_df.head(5))

         user_id  item_id  weight                   datetime
0              0        0     7.0 2025-03-19 04:35:41.362820
1              0        1     8.0 2025-03-19 04:35:41.362820
3046375        0        1     1.2 2025-03-19 00:35:36.589955
3046374        0        1     1.2 2025-03-19 00:35:36.589955
3046373        0        1     1.2 2025-03-19 00:35:36.589955


In [6]:
# Number of cross-validation folds
n_splits = 3

# Splitter settings: each test fold spans 14 days; the three filter flags are passed
# to TimeRangeSplitter exactly as before.
splitter_params = dict(
    test_size="14D",
    n_splits=n_splits,
    filter_cold_users=True,
    filter_cold_items=True,
    filter_already_seen=True,
)
splitter = TimeRangeSplitter(**splitter_params)

In [7]:
# Display the (start, end) timestamps of every test fold the splitter produces for these interactions
splitter.get_test_fold_borders(dataset.interactions)

[(Timestamp('2025-02-06 00:00:00'), Timestamp('2025-02-20 00:00:00')),
 (Timestamp('2025-02-20 00:00:00'), Timestamp('2025-03-06 00:00:00')),
 (Timestamp('2025-03-06 00:00:00'), Timestamp('2025-03-20 00:00:00'))]

In [9]:
# Models to evaluate. The dict key is the model's name; it was misspelled "lighfm",
# which did not match the "lightfm" name used in the model metadata for the metrics app.
models = {
    "lightfm": LightFMWrapperModel(LightFM(no_components=100, loss="bpr", random_state=60)),
}

# We will calculate several classic (precision@k and recall@k) and "beyond accuracy" metrics
metrics = {
    "prec@1": Precision(k=1),
    "prec@10": Precision(k=10),
    "recall@10": Recall(k=10),
    "novelty@10": MeanInvUserFreq(k=10),
    "serendipity@10": Serendipity(k=10),
}

# Number of recommendations requested per user during cross-validation
K_RECS = 10

In [None]:
%%time

# For each fold generate train and test part of dataset
# Then fit every model, generate recommendations and calculate metrics

cv_results = cross_validate(
    dataset=dataset,
    splitter=splitter,
    models=models,
    metrics=metrics,
    k=K_RECS,
    filter_viewed=True,
)

In [12]:
# Tabulate the "splits" entry of the cross-validation results (requires the cross_validate cell to have run)
pd.DataFrame(cv_results["splits"])

NameError: name 'cv_results' is not defined

In [13]:
# Tabulate the "metrics" entry of the cross-validation results (requires the cross_validate cell to have run)
pd.DataFrame(cv_results["metrics"])

NameError: name 'cv_results' is not defined

In [None]:
# Average every metric over the folds, one row per model (models keep their original order)
fold_metrics = pd.DataFrame(cv_results["metrics"]).drop(columns="i_split")
pivot_results = fold_metrics.groupby("model", sort=False).mean()

# Per metric column, mark the lowest value in red and the highest in green
styled_results = pivot_results.style.set_caption("Mean values of metrics")
styled_results = styled_results.highlight_min(color='lightcoral', axis=0)
styled_results = styled_results.highlight_max(color='lightgreen', axis=0)
styled_results

In [None]:
# Extra per-model attributes for the metrics app: one list entry per model.
# The model name here has to match the name the model carries in the metrics table.
metadata_example = {}
metadata_example[Columns.Model] = ["lightfm"]
metadata_example["k"] = [40]

In [None]:
# Build the metrics app from the per-fold metrics table and the (optional) model metadata
metrics_table = pd.DataFrame(cv_results["metrics"])
metadata_table = pd.DataFrame(metadata_example)
app = MetricsApp.construct(models_metrics=metrics_table, models_metadata=metadata_table)

In [None]:
pip install -U kaleido

In [None]:
# Take the app's figure, set its title (update_layout returns the same figure),
# then render it with the static "png" renderer.
fig = app.fig.update_layout(title="Metrics: prec@10 vs prec@1")
fig.show("png")