# Основы построения рекомендательных систем

## Домашнее задание №3

### Основные пункты оценки
- значение метрики на лидерборде
- ревью кода в ноутбуке
- реализация сервиса для модели

### Подробности
1. Побейте метрику на лидерборде map@10 = 0.063 для userKnn модели с семинара
2. Реализуйте эксперименты с кастомной моделю kNN с семинара. Результат - ноутбук(и)

Что можно сделать в ноутбуке:
- придумать, что делать с холодными пользователями в тесте. Сделайте рекомендации для них (обратите внимание на rectools.models.popular)
- сделать кол-во рекомендаций равным N, а не меньше N
- реализовать тюнинг гиперпараметров (например, векторного расстояния или типов kNN моделей (implicit/rectools/...)) и сделать выводы
- реализовать другие варианты ранжированивания айтемов похожих пользователей и сделать выводы
- провести эксперименты с параметрами оффлайн валидации и сделать выводы

3. Оберните модель в сервис
- предпочтительный онлайн вариант: обучаете модель в ноутбуке, сохраняете обученную модель (pickle, dill), при запуске сервиса ее поднимаете и запрашиваете рекомендации "на лету"
- или оффлайн вариант: предварительно посчитайте рекомендации для всех пользователей, сохраните и запрашивайте их

### Хороший pull request - это:
- наличие описания (в идеале что сделано - по пунктам)
- код по стандарту PEP8
- легкая читаемость и воспроизводимость кода
- комментарии и объяснения. В ipynb пользуйтесь силой маркдауна. В скриптах пишите комментарии и докстринг.
- обоснование схемы валидации
- анализ метрики качества


In [1]:
# установка библиотек
import sys
!{sys.executable} -m pip install rectools
!{sys.executable} -m pip install dill

Collecting rectools
  Downloading rectools-0.4.2-py3-none-any.whl (102 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/102.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m102.4/102.5 kB[0m [31m3.2 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m102.5/102.5 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
Collecting implicit<0.8.0,>=0.7.1 (from rectools)
  Downloading implicit-0.7.2-cp310-cp310-manylinux2014_x86_64.whl (8.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.9/8.9 MB[0m [31m68.2 MB/s[0m eta [36m0:00:00[0m
Collecting typeguard<3.0.0,>=2.0.1 (from rectools)
  Downloading typeguard-2.13.3-py3-none-any.whl (17 kB)
Installing collected packages: typeguard, implicit, rectools
Successfully installed implicit-0.7.2 rectools-0.4.2 typeguard-2.13.3
Collecting dill
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K    

In [25]:
# подключение библиотек
import requests
from tqdm.auto import tqdm
import zipfile as zf

import numpy as np
import pandas as pd
import dill

from rectools import Columns
from rectools.dataset import Dataset
from rectools.metrics import MAP, MeanInvUserFreq, NDCG, Precision, Recall, Serendipity, calc_metrics
from rectools.model_selection import TimeRangeSplitter
from implicit.nearest_neighbours import BM25Recommender, CosineRecommender, TFIDFRecommender
from models.custom_userknn import CustomUserKnn
from models.userknn import UserKnn

In [26]:
# Функция валидирования моделей методом Leave-Time-out
def validate(models: dict, metrics: dict, splitter: TimeRangeSplitter, dataset: Dataset, K_RECOS: int):
  results = []

  # создаем итератор фолдов
  fold_iterator = splitter.split(dataset.interactions, collect_fold_stats=True)

  # обучение по фолдам
  for train_ids, test_ids, fold_info in tqdm((fold_iterator), total=splitter.n_splits):
      print(f"\n==================== Fold {fold_info['i_split']} ====================")
      print(fold_info)

      # тренировочная часть
      df_train = dataset.interactions.df.iloc[train_ids]
      train_dataset = Dataset.construct(df_train)

      # тестовая часть
      df_test = dataset.interactions.df.iloc[test_ids][Columns.UserItem]
      test_users = np.unique(df_test[Columns.User])

      # каталог
      catalog = df_train[Columns.Item].unique()


      # обучение моделей
      for model_name, model in models.items():
          # обучение и получение рекомендаций
          model.fit(train_dataset)
          recos = model.recommend(
              users=df_test,
              dataset=train_dataset,
              N_recs=K_RECOS
          )
          # подсчет метрик
          metric_values = calc_metrics(
              metrics,
              reco=recos,
              interactions=df_test,
              prev_interactions=df_train,
              catalog=catalog,
          )
          # сохранение результатов
          res = {"fold": fold_info["i_split"], "model": model_name}
          res.update(metric_values)
          results.append(res)

  # форматирование в DataFrame
  pivot_results = pd.DataFrame(results)\
                    .drop(columns="fold")\
                    .groupby(["model"], sort=False)\
                    .agg("mean")
  # сохранение отчета
  pivot_results.to_csv('./report.csv')

  return pivot_results

## Чтение данных

In [6]:
url = 'https://github.com/irsafilo/KION_DATASET/raw/f69775be31fa5779907cf0a92ddedb70037fb5ae/data_original.zip'
req = requests.get(url, stream=True)

with open('kion_train.zip', "wb") as fd:
    total_size_in_bytes = int(req.headers.get('Content-Length', 0))
    progress_bar = tqdm(desc='Downloading the kion dataset...',
                        total=total_size_in_bytes,
                        unit='iB', unit_scale=True)
    for chunk in req.iter_content(chunk_size=2 ** 20):
        progress_bar.update(len(chunk))
        fd.write(chunk)

Downloading the kion dataset...:   0%|          | 0.00/78.8M [00:00<?, ?iB/s]

In [7]:
# Разархивирование данных
files = zf.ZipFile('kion_train.zip','r')
files.extractall()
files.close()

In [27]:
# Чтение в DataFrame
users = pd.read_csv('data_original/users.csv')
items = pd.read_csv('data_original/items.csv')

interactions = pd.read_csv('data_original/interactions.csv', parse_dates=["last_watch_dt"])
interactions.rename(
    columns={
        'last_watch_dt': Columns.Datetime,
        'total_dur': Columns.Weight
    },
    inplace=True)

## Валидация
**Замечание**: датасет очень большой, алгоритмы сильно ресурсно затратные. Всего 5 000 000 взаимодействий, что очень много и займет много времени для валидирования моделей. Возьмём 1 000 000 взаимодействий и проведем эксперименты.

In [39]:
interactions.shape

(5476251, 5)

In [28]:
interactions = interactions.iloc[:1000000]
dataset = Dataset.construct(interactions_df=interactions)

In [38]:
metrics = {
    "prec@1": Precision(k=1),
    "prec@5": Precision(k=5),
    "prec@10": Precision(k=10),
    "recall@1": Recall(k=1),
    "recall@5": Recall(k=5),
    "recall@10": Recall(k=10),
    "MAP@1": MAP(k=1),
    "MAP@5": MAP(k=5),
    "MAP@10": MAP(k=10),
    "NDCG@1": MAP(k=1),
    "NDCG@5": MAP(k=5),
    "NDCG@10": MAP(k=10),
    "MeanInvUserFreq@1": MeanInvUserFreq(k=1),
    "MeanInvUserFreq@5": MeanInvUserFreq(k=5),
    "MeanInvUserFreq@10": MeanInvUserFreq(k=10),
    "Serendipity@1": Serendipity(k=1),
    "Serendipity@5": Serendipity(k=5),
    "Serendipity@10": Serendipity(k=10),
}


Эксперимент №1.
- Три модели с разной мерой расстояния (Cosine, TFIDF, BM25)
- Кол-во соседей 50
- Кол-во фолдов 3
- Размер тестового фолда 7D


In [42]:
models = {
    'cosin_user_knn': CustomUserKnn(model=CosineRecommender(num_threads=2), N_users=50),
    'tfidf_user_knn': CustomUserKnn(model=TFIDFRecommender(num_threads=2), N_users=50),
    'bm25_user_knn': CustomUserKnn(model=BM25Recommender(num_threads=2), N_users=50)
    }

In [43]:
n_splits = 3
cv = TimeRangeSplitter(test_size="7D",
                        n_splits=n_splits,
                        filter_cold_users=True,
                        filter_cold_items=True,
                        filter_already_seen=True)

In [45]:
validate(models, metrics, cv, dataset, K_RECOS=10)

  0%|          | 0/3 [00:00<?, ?it/s]


{'i_split': 0, 'start': Timestamp('2021-08-02 00:00:00', freq='7D'), 'end': Timestamp('2021-08-09 00:00:00', freq='7D'), 'train': 779557, 'train_users': 360297, 'train_items': 10978, 'test': 34230, 'test_users': 23844, 'test_items': 3793}




  0%|          | 0/360297 [00:00<?, ?it/s]



  0%|          | 0/360297 [00:00<?, ?it/s]



  0%|          | 0/360297 [00:00<?, ?it/s]


{'i_split': 1, 'start': Timestamp('2021-08-09 00:00:00', freq='7D'), 'end': Timestamp('2021-08-16 00:00:00', freq='7D'), 'train': 849356, 'train_users': 387156, 'train_items': 11202, 'test': 36140, 'test_users': 25091, 'test_items': 3856}




  0%|          | 0/387156 [00:00<?, ?it/s]



  0%|          | 0/387156 [00:00<?, ?it/s]



  0%|          | 0/387156 [00:00<?, ?it/s]


{'i_split': 2, 'start': Timestamp('2021-08-16 00:00:00', freq='7D'), 'end': Timestamp('2021-08-23 00:00:00', freq='7D'), 'train': 922443, 'train_users': 414753, 'train_items': 11402, 'test': 40002, 'test_users': 27564, 'test_items': 4025}




  0%|          | 0/414753 [00:00<?, ?it/s]



  0%|          | 0/414753 [00:00<?, ?it/s]



  0%|          | 0/414753 [00:00<?, ?it/s]

Unnamed: 0_level_0,prec@1,recall@1,prec@5,recall@5,prec@10,recall@10,MAP@1,MAP@5,MAP@10,NDCG@1,NDCG@5,NDCG@10,MeanInvUserFreq@1,MeanInvUserFreq@5,MeanInvUserFreq@10,Serendipity@1,Serendipity@5,Serendipity@10
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
cosin_user_knn,0.001026,0.000813,0.000494,0.001873,0.000379,0.002858,0.000813,0.001176,0.001307,0.000813,0.001176,0.001307,11.873277,11.501205,11.40771,9e-06,1.1e-05,1.1e-05
tfidf_user_knn,0.001019,0.000819,0.00058,0.002214,0.000424,0.003141,0.000819,0.001277,0.0014,0.000819,0.001277,0.0014,11.907259,11.425128,11.304092,1e-05,1.2e-05,1.2e-05
bm25_user_knn,0.000955,0.000772,0.000383,0.001447,0.000295,0.002186,0.000772,0.001003,0.001101,0.000772,0.001003,0.001101,12.05212,11.732434,11.594274,1e-05,1.1e-05,1.2e-05


В данном эксперименте по многим метрикам выигрывает TFIDFRecommender, проигрывает только BM25Recommender'у в метрике novelty@5 и novelty@10.

Эксперимент №2.
- Три модели с разной мерой расстояния (Cosine, TFIDF, BM25)
- Кол-во соседей 10
- Кол-во фолдов 3
- Размер тестового фолда 7D


In [46]:
models = {
    'cosin_user_knn': CustomUserKnn(model=CosineRecommender(num_threads=2), N_users=10),
    'tfidf_user_knn': CustomUserKnn(model=TFIDFRecommender(num_threads=2), N_users=10),
    'bm25_user_knn': CustomUserKnn(model=BM25Recommender(num_threads=2), N_users=10)
    }

In [47]:
n_splits = 3
cv = TimeRangeSplitter(test_size="7D",
                        n_splits=n_splits,
                        filter_cold_users=True,
                        filter_cold_items=True,
                        filter_already_seen=True)

In [48]:
validate(models, metrics, cv, dataset, K_RECOS=10)

  0%|          | 0/3 [00:00<?, ?it/s]


{'i_split': 0, 'start': Timestamp('2021-08-02 00:00:00', freq='7D'), 'end': Timestamp('2021-08-09 00:00:00', freq='7D'), 'train': 779557, 'train_users': 360297, 'train_items': 10978, 'test': 34230, 'test_users': 23844, 'test_items': 3793}




  0%|          | 0/360297 [00:00<?, ?it/s]



  0%|          | 0/360297 [00:00<?, ?it/s]



  0%|          | 0/360297 [00:00<?, ?it/s]


{'i_split': 1, 'start': Timestamp('2021-08-09 00:00:00', freq='7D'), 'end': Timestamp('2021-08-16 00:00:00', freq='7D'), 'train': 849356, 'train_users': 387156, 'train_items': 11202, 'test': 36140, 'test_users': 25091, 'test_items': 3856}




  0%|          | 0/387156 [00:00<?, ?it/s]



  0%|          | 0/387156 [00:00<?, ?it/s]



  0%|          | 0/387156 [00:00<?, ?it/s]


{'i_split': 2, 'start': Timestamp('2021-08-16 00:00:00', freq='7D'), 'end': Timestamp('2021-08-23 00:00:00', freq='7D'), 'train': 922443, 'train_users': 414753, 'train_items': 11402, 'test': 40002, 'test_users': 27564, 'test_items': 4025}




  0%|          | 0/414753 [00:00<?, ?it/s]



  0%|          | 0/414753 [00:00<?, ?it/s]



  0%|          | 0/414753 [00:00<?, ?it/s]

Unnamed: 0_level_0,prec@1,recall@1,prec@5,recall@5,prec@10,recall@10,MAP@1,MAP@5,MAP@10,NDCG@1,NDCG@5,NDCG@10,MeanInvUserFreq@1,MeanInvUserFreq@5,MeanInvUserFreq@10,Serendipity@1,Serendipity@5,Serendipity@10
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
cosin_user_knn,0.000986,0.000782,0.000506,0.00191,0.00033,0.002444,0.000782,0.001178,0.001253,0.000782,0.001178,0.001253,11.750056,11.411887,11.362851,9e-06,1.1e-05,1.1e-05
tfidf_user_knn,0.001019,0.000819,0.000502,0.001857,0.000323,0.002352,0.000819,0.001174,0.001243,0.000819,0.001174,0.001243,11.902942,11.483278,11.422691,1e-05,1.1e-05,1.1e-05
bm25_user_knn,0.000968,0.000785,0.000354,0.001339,0.000228,0.001701,0.000785,0.000986,0.001036,0.000785,0.000986,0.001036,11.996135,11.70593,11.647348,1e-05,1.1e-05,1.1e-05


В данном эксперименте CosineRecommender и TFIDFRecommender показывают приблизительно одинаковые резульататы, в то время как BM25Recommender проигрывает в большинстве случаев, показывая лучше результаты только по метрике novelty.\
Видно, что по сравнению с экспериментом №1, в целом у алгоритмов уменьшилась метрика MAP@10. Это происходит потому, что не было реализовано условие N-рекомендаций для каждого user'a, то есть рекомендаций у user'a может оказаться меньше того же значения 10, что сказывается на подсчитываемой метрике.

Эксперимент №3.
- Три модели с разной мерой расстояния (Cosine, TFIDF, BM25)
- Кол-во соседей 10
- Кол-во фолдов 5
- Размер тестового фолда 3D


In [49]:
models = {
    'cosin_user_knn': CustomUserKnn(model=CosineRecommender(num_threads=2), N_users=10),
    'tfidf_user_knn': CustomUserKnn(model=TFIDFRecommender(num_threads=2), N_users=10),
    'bm25_user_knn': CustomUserKnn(model=BM25Recommender(num_threads=2), N_users=10)
    }

In [50]:
n_splits = 5
cv = TimeRangeSplitter(test_size="3D",
                        n_splits=n_splits,
                        filter_cold_users=True,
                        filter_cold_items=True,
                        filter_already_seen=True)

In [51]:
validate(models, metrics, cv, dataset, K_RECOS=10)

  0%|          | 0/5 [00:00<?, ?it/s]


{'i_split': 0, 'start': Timestamp('2021-08-08 00:00:00', freq='3D'), 'end': Timestamp('2021-08-11 00:00:00', freq='3D'), 'train': 838173, 'train_users': 382886, 'train_items': 11184, 'test': 16728, 'test_users': 13112, 'test_items': 2922}




  0%|          | 0/382886 [00:00<?, ?it/s]



  0%|          | 0/382886 [00:00<?, ?it/s]



  0%|          | 0/382886 [00:00<?, ?it/s]


{'i_split': 1, 'start': Timestamp('2021-08-11 00:00:00', freq='3D'), 'end': Timestamp('2021-08-14 00:00:00', freq='3D'), 'train': 868537, 'train_users': 394324, 'train_items': 11261, 'test': 16748, 'test_users': 12991, 'test_items': 2977}




  0%|          | 0/394324 [00:00<?, ?it/s]



  0%|          | 0/394324 [00:00<?, ?it/s]



  0%|          | 0/394324 [00:00<?, ?it/s]


{'i_split': 2, 'start': Timestamp('2021-08-14 00:00:00', freq='3D'), 'end': Timestamp('2021-08-17 00:00:00', freq='3D'), 'train': 898994, 'train_users': 405732, 'train_items': 11355, 'test': 18338, 'test_users': 14312, 'test_items': 2998}




  0%|          | 0/405732 [00:00<?, ?it/s]



  0%|          | 0/405732 [00:00<?, ?it/s]



  0%|          | 0/405732 [00:00<?, ?it/s]


{'i_split': 3, 'start': Timestamp('2021-08-17 00:00:00', freq='3D'), 'end': Timestamp('2021-08-20 00:00:00', freq='3D'), 'train': 932350, 'train_users': 418395, 'train_items': 11419, 'test': 17408, 'test_users': 13597, 'test_items': 2995}




  0%|          | 0/418395 [00:00<?, ?it/s]



  0%|          | 0/418395 [00:00<?, ?it/s]



  0%|          | 0/418395 [00:00<?, ?it/s]


{'i_split': 4, 'start': Timestamp('2021-08-20 00:00:00', freq='3D'), 'end': Timestamp('2021-08-23 00:00:00', freq='3D'), 'train': 963460, 'train_users': 429618, 'train_items': 11505, 'test': 20381, 'test_users': 15782, 'test_items': 3153}




  0%|          | 0/429618 [00:00<?, ?it/s]



  0%|          | 0/429618 [00:00<?, ?it/s]



  0%|          | 0/429618 [00:00<?, ?it/s]

Unnamed: 0_level_0,prec@1,recall@1,prec@5,recall@5,prec@10,recall@10,MAP@1,MAP@5,MAP@10,NDCG@1,NDCG@5,NDCG@10,MeanInvUserFreq@1,MeanInvUserFreq@5,MeanInvUserFreq@10,Serendipity@1,Serendipity@5,Serendipity@10
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
cosin_user_knn,0.000942,0.000843,0.000448,0.001899,0.000276,0.002301,0.000843,0.001197,0.001253,0.000843,0.001197,0.001253,11.825301,11.471577,11.414977,1.2e-05,1.2e-05,1.2e-05
tfidf_user_knn,0.000967,0.000868,0.000433,0.001871,0.000279,0.002327,0.000868,0.001207,0.00127,0.000868,0.001207,0.00127,11.995665,11.567589,11.503152,1.4e-05,1.3e-05,1.4e-05
bm25_user_knn,0.000937,0.000838,0.000326,0.001428,0.000205,0.001739,0.000838,0.001025,0.001066,0.000838,0.001025,0.001066,12.093106,11.779373,11.709355,1.4e-05,1.3e-05,1.3e-05


В данном эксперименте CosineRecommender и TFIDFRecommender показывают приблизительно одинаковые результаты, в то время как BM25Recommender проигрывает в большинстве случаев, показывая лучше результаты только по метрике novelty.\
Метрика MAP@10 не сильно отличается от результатов, полученных в эксперименте №2, что на самом деле странно, поскольку для данных kion'a выявлена семидневная периодичность в взаимодействиях, а в опыте кроссвалидация проводилась с размером 3D для тестовой выборки.

Таким образом, в результате проведенных 3х экспериментов, можно прийти к выводу, что лучшие метрики качества показывает алгоритм TFIDFRecommender. Поэтому обучим данную модель на всем датасете из 5 млн. взаимодействий.

## Лидерборд
Проверим метрики качества модели TFIDFRecommender на всем датасете

In [10]:
interactions = pd.read_csv('data_original/interactions.csv', parse_dates=["last_watch_dt"])
interactions.rename(
    columns={
        'last_watch_dt': Columns.Datetime,
        'total_dur': Columns.Weight
    },
    inplace=True)

dataset = Dataset.construct(interactions_df=interactions)

In [10]:
models = { 'tfidf_user_knn': CustomUserKnn(model=TFIDFRecommender(num_threads=2), N_users=50) }

In [9]:
n_splits = 3
cv = TimeRangeSplitter(test_size="7D",
                        n_splits=n_splits,
                        filter_cold_users=True,
                        filter_cold_items=True,
                        filter_already_seen=True)

In [14]:
validate(models, metrics, cv, dataset, K_RECOS=10)

  0%|          | 0/3 [00:00<?, ?it/s]


{'i_split': 0, 'start': Timestamp('2021-08-02 00:00:00', freq='7D'), 'end': Timestamp('2021-08-09 00:00:00', freq='7D'), 'train': 4266013, 'train_users': 797423, 'train_items': 15237, 'test': 263681, 'test_users': 98184, 'test_items': 6602}




  0%|          | 0/797423 [00:00<?, ?it/s]


{'i_split': 1, 'start': Timestamp('2021-08-09 00:00:00', freq='7D'), 'end': Timestamp('2021-08-16 00:00:00', freq='7D'), 'train': 4649162, 'train_users': 850489, 'train_items': 15415, 'test': 279422, 'test_users': 103511, 'test_items': 6698}




  0%|          | 0/850489 [00:00<?, ?it/s]


{'i_split': 2, 'start': Timestamp('2021-08-16 00:00:00', freq='7D'), 'end': Timestamp('2021-08-23 00:00:00', freq='7D'), 'train': 5051815, 'train_users': 906071, 'train_items': 15577, 'test': 298878, 'test_users': 110076, 'test_items': 6679}




  0%|          | 0/906071 [00:00<?, ?it/s]

Unnamed: 0_level_0,prec@1,recall@1,prec@5,recall@5,prec@10,recall@10,MAP@1,MAP@5,MAP@10,NDCG@1,NDCG@5,NDCG@10,MeanInvUserFreq@1,MeanInvUserFreq@5,MeanInvUserFreq@10,Serendipity@1,Serendipity@5,Serendipity@10
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
tfidf_user_knn,0.001833,0.001013,0.001561,0.003745,0.001468,0.006793,0.001013,0.00188,0.00228,0.001013,0.00188,0.00228,11.237014,10.499605,10.203509,2.1e-05,2.3e-05,2.3e-05


## Обучение и сохранение модели

In [29]:
model = CustomUserKnn(model=TFIDFRecommender(num_threads=2), N_users=50)
model.fit(dataset)



  0%|          | 0/83986 [00:00<?, ?it/s]

In [30]:
with open('model.dill', 'wb') as f:
    dill.dump(model, f)

In [31]:
with open('model.dill', 'rb') as f:
    userknn = dill.load(f)

In [35]:
userknn.predict_single(13)

[8, 16, 12, 1176]