# Основы построения рекомендательных систем

## Домашнее задание №4
### Основные пункты оценки
1. значение метрики на лидерборде
2. ревью кода в ноутбуке
3. реализация сервиса для модели

Вы можете сделать **НЕ ВСЕ пункты и все равно получить 20 баллов**. Получение > 20 баллов будет расцениваться как 20.

### Подробности
#### 1. Побейте метрику на лидерборде map@10 = 0.075 c моделью из implicit, lightfm или rectools, в том числе используя ANN **(5 баллов)**
#### 2. Реализуйте эксперименты c моделями из implicit, lightfm или rectools, в том числе используя ANN. Результат - ноутбук(и) **(максимум 12 баллов)**
Что можно сделать в ноутбуке:
- Реализовать тюнинг гиперпараметров для моделей из implicit, lightfm или rectools **(3 балла)**
  - Для перебора гиперпараметров можно использовать [`Optuna`](https://github.com/optuna/optuna), [`Hyperopt`](https://github.com/hyperopt/hyperopt)
- Воспользоваться методом приближенного поиска соседей для выдачи рекомендаций. **(3 балла)**
    - Можно использовать любые удобные: [`Annoy`](https://github.com/spotify/annoy), [`nmslib`](https://github.com/nmslib/nmslib) и.т.д
- Сделать рекомендации для холодных пользователей используя их фичи (для кого нет фичей - там другим способом) **(3 балла)**
- Провести эксперименты с параметрами оффлайн валидации и сделать выводы **(3 балла)**

#### 3. Оберните модель в сервис **(максимум 12 баллов)**
- Онлайн вариант: обучаете модель в ноутбуке, сохраняете обученную модель (pickle, dill), при запуске сервиса ее поднимаете и запрашиваете рекомендации "на лету" **(12 баллов)**
- Оффлайн вариант: предварительно посчитайте рекомендации для всех пользователей, сохраните и запрашивайте их **(6 баллов)**

### Хороший pull request - это:
- наличие описания (в идеале что сделано - по пунктам)
- код по стандарту PEP8
- легкая читаемость и воспроизводимость кода
- комментарии и объяснения. В ipynb пользуйтесь силой маркдауна. В скриптах пишите комментарии и докстринг.
- обоснование схемы валидации
- анализ метрики качества


In [57]:
# установка библиотек
import sys
!{sys.executable} -m pip install rectools[lightfm]
!{sys.executable} -m pip install dill
!{sys.executable} -m pip install optuna -q
!{sys.executable} -m pip install nmslib

Collecting nmslib
  Downloading nmslib-2.1.1.tar.gz (188 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m188.7/188.7 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting pybind11<2.6.2 (from nmslib)
  Using cached pybind11-2.6.1-py2.py3-none-any.whl (188 kB)
Building wheels for collected packages: nmslib
  Building wheel for nmslib (setup.py) ... [?25ldone
[?25h  Created wheel for nmslib: filename=nmslib-2.1.1-cp310-cp310-linux_x86_64.whl size=890848 sha256=3774c560c54812f58de142a002b181eaad6d79ffc879aa1e0c028083785fb830
  Stored in directory: /root/.cache/pip/wheels/21/1a/5d/4cc754a5b1a88405cad184b76f823897a63a8d19afcd4b9314
Successfully built nmslib
Installing collected packages: pybind11, nmslib
  Attempting uninstall: pybind11
    Found existing installation: pybind11 2.11.1
    Uninstalling pybind11-2.11.1:
      Successfully uninstalled pybind11-2.11.1
Successfully installed nmsl

In [97]:
import os
import requests
from tqdm.auto import tqdm
import zipfile as zf
import dill

import pandas as pd
import numpy as np
from rectools import Columns
from rectools.dataset import Dataset
from rectools.metrics import MAP, MeanInvUserFreq, NDCG, Precision, Recall, Serendipity, calc_metrics
from rectools.model_selection import TimeRangeSplitter
from rectools.models import ImplicitALSWrapperModel, LightFMWrapperModel
from rectools.tools import UserToItemAnnRecommender

from implicit.als import AlternatingLeastSquares
from implicit.lmf import LogisticMatrixFactorization

from lightfm import LightFM

import optuna
import nmslib

import warnings
warnings.filterwarnings('ignore')

os.environ["OPENBLAS_NUM_THREADS"] = "1"  # For implicit ALS

In [3]:
# ВАЛИДАЦИЯ

# Функция валидирования моделей методом Leave-Time-out
def validate(models: dict, metrics: dict, splitter: TimeRangeSplitter, dataset: Dataset, K_RECOS: int):
  results = []

  # создаем итератор фолдов
  fold_iterator = splitter.split(dataset.interactions, collect_fold_stats=True)

  # обучение по фолдам
  for train_ids, test_ids, fold_info in tqdm((fold_iterator), total=splitter.n_splits):
      print(f"\n==================== Fold {fold_info['i_split']} ====================")
      print(fold_info)

      # тренировочная часть
      df_train = dataset.interactions.df.iloc[train_ids]
      train_dataset = Dataset.construct(df_train)

      # тестовая часть
      df_test = dataset.interactions.df.iloc[test_ids][Columns.UserItem]
      test_users = np.unique(df_test[Columns.User])

      # каталог
      catalog = df_train[Columns.Item].unique()


      # обучение моделей
      for model_name, model in models.items():
          # обучение и получение рекомендаций
          model.fit(train_dataset)
          recos = model.recommend(
              users=test_users,
              dataset=train_dataset,
              k=K_RECOS,
              filter_viewed=True,
          )
          # подсчет метрик
          metric_values = calc_metrics(
              metrics,
              reco=recos,
              interactions=df_test,
              prev_interactions=df_train,
              catalog=catalog,
          )
          # сохранение результатов
          res = {"fold": fold_info["i_split"], "model": model_name}
          res.update(metric_values)
          results.append(res)

  # форматирование в DataFrame
  pivot_results = pd.DataFrame(results)\
                    .drop(columns="fold")\
                    .groupby(["model"], sort=False)\
                    .agg("mean")
  # сохранение отчета
  pivot_results.to_csv('./report.csv')

  return pivot_results.style.highlight_max(color='green', axis=0)

In [4]:
# ПОДБОР ГИПЕРПАРАМЕТРОВ

# Функция подсчета метрики MAP@10
def calc_MAP10(model, dataset, train, test):
    metric = {"MAP@10": MAP(k=10) }
    
    test_users = np.unique(test[Columns.User])
    catalog = train[Columns.Item].unique()
    
    model.fit(dataset)
    recos = model.recommend(
        users=test_users,
        dataset=dataset,
        k=K_RECOS,
        filter_viewed=True
    )
    
    metric_values = calc_metrics(
        metric,
        reco=recos,
        interactions=test,
        prev_interactions=train,
        catalog=catalog,
      )
    
    return metric_values['MAP@10']


# Целевые функции Optuna
def objective_als(trial, dataset, train, test):
    factors=trial.suggest_int('factors', 10, 30, 10)
    iterations=trial.suggest_int('iterations', 10, 50, 10)
    
    model = ImplicitALSWrapperModel(
            model=AlternatingLeastSquares(
                factors=factors,
                iterations=iterations,
                num_threads=NUM_THREADS,
                random_state=RANDOM_STATE
            ),
        )
    return calc_MAP10(model, dataset, train, test)


def objective_logMF(trial, dataset, train, test):
    factors=trial.suggest_int('factors', 10, 30, 10)
    iterations=trial.suggest_int('iterations', 10, 50, 20)
    
    model = ImplicitALSWrapperModel(
            model=LogisticMatrixFactorization(
                factors=factors,
                iterations=iterations,
                num_threads=NUM_THREADS,
                random_state=RANDOM_STATE
            ),
        )
    return calc_MAP10(model, dataset, train, test)

        
def objective_lightFM(trial, dataset, train, test):
    no_components=trial.suggest_int('no_components', 10, 30, 10)
    loss=trial.suggest_categorical('loss', ['logistic', 'bpr', 'warp'])
    
    model = LightFMWrapperModel(
            LightFM(
                no_components=no_components,
                loss=loss,
                random_state=RANDOM_STATE,
            ),
            epochs=NUM_EPOCHS,
            num_threads=NUM_THREADS,
        )
    return calc_MAP10(model, dataset, train, test)

In [78]:
def augment_inner_product(factors):
    normed_factors = np.linalg.norm(factors, axis=1)
    max_norm = normed_factors.max()
    
    extra_dim = np.sqrt(max_norm ** 2 - normed_factors ** 2).reshape(-1, 1)
    augmented_factors = np.append(factors, extra_dim, axis=1)
    return max_norm, augmented_factors

## Чтение данных

In [5]:
url = 'https://github.com/irsafilo/KION_DATASET/raw/f69775be31fa5779907cf0a92ddedb70037fb5ae/data_original.zip'
req = requests.get(url, stream=True)

with open('kion_train.zip', "wb") as fd:
    total_size_in_bytes = int(req.headers.get('Content-Length', 0))
    progress_bar = tqdm(desc='Downloading the kion dataset...',
                        total=total_size_in_bytes,
                        unit='iB', unit_scale=True)
    for chunk in req.iter_content(chunk_size=2 ** 20):
        progress_bar.update(len(chunk))
        fd.write(chunk)

Downloading the kion dataset...:   0%|          | 0.00/78.8M [00:00<?, ?iB/s]

In [6]:
# Разархивирование данных
files = zf.ZipFile('kion_train.zip','r')
files.extractall()
files.close()

In [7]:
# Чтение в DataFrame
users = pd.read_csv('data_original/users.csv')
items = pd.read_csv('data_original/items.csv')

interactions = pd.read_csv('data_original/interactions.csv', parse_dates=["last_watch_dt"])
interactions.rename(
    columns={
        'last_watch_dt': Columns.Datetime,
        'total_dur': Columns.Weight
    },
    inplace=True)

## Подбор гиперпараметров и валидация
Для экспериментов будет использоваться часть датасета из 1 000 000 записей о взаимодействиях с фильтрацией холодных пользователей.

In [8]:
# Тренеровочная и тестовая выборки
interactions_sm = interactions.iloc[:1000000]

max_date = interactions_sm[Columns.Datetime].max()

train = interactions_sm[interactions_sm[Columns.Datetime] < max_date - pd.Timedelta(days=7)].copy()
test = interactions_sm[interactions_sm[Columns.Datetime] >= max_date - pd.Timedelta(days=7)].copy()

print(f"train: {train.shape}")
print(f"test: {test.shape}")

train: (910259, 5)
test: (89741, 5)


In [9]:
# Отфильтруем холодных пользователей из теста
cold_users = set(test[Columns.User]) - set(train[Columns.User])
test.drop(test[test[Columns.User].isin(cold_users)].index, inplace=True)

In [10]:
# Сформируем датасет
dataset = Dataset.construct(interactions_df=train)

## 1. Подбор гиперпараметров моделей

In [11]:
RANDOM_STATE = 42
NUM_THREADS = 4
NUM_EPOCHS = 10
K_RECOS = 10

In [12]:
study_als = optuna.create_study(study_name='AlternatingLeastSquares Optuna Optimization', direction='maximize')
study_als.optimize(lambda trial: objective_als(trial, dataset, train, test), n_trials=20, n_jobs=-1)

[I 2023-12-10 12:03:14,561] A new study created in memory with name: AlternatingLeastSquares Optuna Optimization
[I 2023-12-10 12:03:57,784] Trial 2 finished with value: 0.013344147032424107 and parameters: {'factors': 20, 'iterations': 20}. Best is trial 2 with value: 0.013344147032424107.
[I 2023-12-10 12:04:11,864] Trial 1 finished with value: 0.015325705735661931 and parameters: {'factors': 10, 'iterations': 40}. Best is trial 1 with value: 0.015325705735661931.
[I 2023-12-10 12:04:14,414] Trial 0 finished with value: 0.017838438383127445 and parameters: {'factors': 20, 'iterations': 50}. Best is trial 0 with value: 0.017838438383127445.
[I 2023-12-10 12:04:17,099] Trial 3 finished with value: 0.017838438383127445 and parameters: {'factors': 20, 'iterations': 50}. Best is trial 0 with value: 0.017838438383127445.
[I 2023-12-10 12:04:38,166] Trial 4 finished with value: 0.014264689164555491 and parameters: {'factors': 30, 'iterations': 20}. Best is trial 0 with value: 0.017838438383

In [23]:
study_als.best_params

{'factors': 20, 'iterations': 50}

In [13]:
study_logMF = optuna.create_study(study_name='LogisticMatrixFactorization Optuna Optimization', direction='maximize')
study_logMF.optimize(lambda trial: objective_logMF(trial, dataset, train, test), n_trials=20, n_jobs=-1)

[I 2023-12-10 12:07:02,312] A new study created in memory with name: LogisticMatrixFactorization Optuna Optimization
[I 2023-12-10 12:07:25,020] Trial 1 finished with value: 0.0001458842225366093 and parameters: {'factors': 10, 'iterations': 10}. Best is trial 1 with value: 0.0001458842225366093.
[I 2023-12-10 12:08:06,972] Trial 3 finished with value: 0.0004168586268534732 and parameters: {'factors': 10, 'iterations': 50}. Best is trial 3 with value: 0.0004168586268534732.
[I 2023-12-10 12:08:21,944] Trial 0 finished with value: 0.00019787586060213773 and parameters: {'factors': 20, 'iterations': 30}. Best is trial 3 with value: 0.0004168586268534732.
[I 2023-12-10 12:09:14,286] Trial 6 finished with value: 0.00031265482503162076 and parameters: {'factors': 10, 'iterations': 30}. Best is trial 3 with value: 0.0004168586268534732.
[I 2023-12-10 12:09:22,715] Trial 2 finished with value: 0.00023360539932723142 and parameters: {'factors': 20, 'iterations': 50}. Best is trial 3 with value

In [24]:
study_logMF.best_params

{'factors': 10, 'iterations': 50}

In [14]:
study_lightFM = optuna.create_study(study_name='LightFM Optuna Optimization', direction='maximize')
study_lightFM.optimize(lambda trial: objective_lightFM(trial, dataset, train, test), n_trials=20, n_jobs=-1)

[I 2023-12-10 12:13:20,388] A new study created in memory with name: LightFM Optuna Optimization
[I 2023-12-10 12:14:07,893] Trial 2 finished with value: 0.04154694247261958 and parameters: {'no_components': 10, 'loss': 'warp'}. Best is trial 2 with value: 0.04154694247261958.
[I 2023-12-10 12:14:10,712] Trial 0 finished with value: 1.106482376211166e-05 and parameters: {'no_components': 10, 'loss': 'bpr'}. Best is trial 2 with value: 0.04154694247261958.
[I 2023-12-10 12:14:15,527] Trial 3 finished with value: 0.00031510194195429216 and parameters: {'no_components': 30, 'loss': 'logistic'}. Best is trial 2 with value: 0.04154694247261958.
[I 2023-12-10 12:14:19,726] Trial 1 finished with value: 1.1452539958627698e-05 and parameters: {'no_components': 30, 'loss': 'bpr'}. Best is trial 2 with value: 0.04154694247261958.
[I 2023-12-10 12:14:38,982] Trial 5 finished with value: 3.667539581989109e-05 and parameters: {'no_components': 10, 'loss': 'logistic'}. Best is trial 2 with value: 0.0

In [25]:
study_lightFM.best_params

{'no_components': 10, 'loss': 'warp'}

## 2. Валидация

In [15]:
metrics = {
    "prec@1": Precision(k=1),
    "prec@5": Precision(k=5),
    "prec@10": Precision(k=10),
    "recall@1": Recall(k=1),
    "recall@5": Recall(k=5),
    "recall@10": Recall(k=10),
    "MAP@1": MAP(k=1),
    "MAP@5": MAP(k=5),
    "MAP@10": MAP(k=10),
    "NDCG@1": MAP(k=1),
    "NDCG@5": MAP(k=5),
    "NDCG@10": MAP(k=10),
    "MeanInvUserFreq@1": MeanInvUserFreq(k=1),
    "MeanInvUserFreq@5": MeanInvUserFreq(k=5),
    "MeanInvUserFreq@10": MeanInvUserFreq(k=10),
    "Serendipity@1": Serendipity(k=1),
    "Serendipity@5": Serendipity(k=5),
    "Serendipity@10": Serendipity(k=10),
}


In [16]:
models = {'ALS': ImplicitALSWrapperModel(
                    model=AlternatingLeastSquares(**study_als.best_params)
                  ),
          'LogMF': ImplicitALSWrapperModel(
                      model=LogisticMatrixFactorization(**study_logMF.best_params)
                  ),
          'LightFM': LightFMWrapperModel(
                      model=LightFM(**study_lightFM.best_params),
                      epochs=NUM_EPOCHS,
                      num_threads=NUM_THREADS
                      )
         }

Эксперимент №1.
- Кол-во фолдов 3
- Размер тестового фолда 7D


In [17]:
n_splits = 3
cv = TimeRangeSplitter(test_size="7D",
                        n_splits=n_splits,
                        filter_cold_users=True,
                        filter_cold_items=True,
                        filter_already_seen=True)

In [18]:
validate(models, metrics, cv, dataset, K_RECOS=10)

  0%|          | 0/3 [00:00<?, ?it/s]


{'i_split': 0, 'start': Timestamp('2021-07-25 00:00:00', freq='7D'), 'end': Timestamp('2021-08-01 00:00:00', freq='7D'), 'train': 701298, 'train_users': 329509, 'train_items': 10774, 'test': 32728, 'test_users': 22701, 'test_items': 3788}

{'i_split': 1, 'start': Timestamp('2021-08-01 00:00:00', freq='7D'), 'end': Timestamp('2021-08-08 00:00:00', freq='7D'), 'train': 768221, 'train_users': 355896, 'train_items': 10931, 'test': 33489, 'test_users': 23467, 'test_items': 3769}

{'i_split': 2, 'start': Timestamp('2021-08-08 00:00:00', freq='7D'), 'end': Timestamp('2021-08-15 00:00:00', freq='7D'), 'train': 838173, 'train_users': 382886, 'train_items': 11184, 'test': 36223, 'test_users': 25023, 'test_items': 3887}


Unnamed: 0_level_0,prec@1,recall@1,prec@5,recall@5,prec@10,recall@10,MAP@1,MAP@5,MAP@10,NDCG@1,NDCG@5,NDCG@10,MeanInvUserFreq@1,MeanInvUserFreq@5,MeanInvUserFreq@10,Serendipity@1,Serendipity@5,Serendipity@10
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
ALS,0.011774,0.009488,0.007081,0.027982,0.005482,0.042821,0.009488,0.016006,0.017971,0.009488,0.016006,0.017971,6.294636,7.300418,7.808022,1.2e-05,1.6e-05,1.8e-05
LogMF,0.000198,0.000126,0.00019,0.00058,0.000202,0.001294,0.000126,0.000272,0.000366,0.000126,0.000272,0.000366,13.556276,13.624578,13.663741,8e-06,7e-06,7e-06
LightFM,0.024932,0.020885,0.019462,0.079261,0.013737,0.110036,0.020885,0.040866,0.045148,0.020885,0.040866,0.045148,3.987654,4.633913,5.395589,3e-06,4e-06,4e-06


По большенству метрик выигрывает LightFM[warp], в том числе по ключевой метрике MAP@10. LogisticMF показывает хорошие результаты для метрики novelty, а ALS - для serendipity.

Эксперимент №2.
- Кол-во фолдов 5
- Размер тестового фолда 7D

In [19]:
n_splits = 5
cv = TimeRangeSplitter(test_size="7D",
                        n_splits=n_splits,
                        filter_cold_users=True,
                        filter_cold_items=True,
                        filter_already_seen=True)

In [20]:
validate(models, metrics, cv, dataset, K_RECOS=10)

  0%|          | 0/5 [00:00<?, ?it/s]


{'i_split': 0, 'start': Timestamp('2021-07-11 00:00:00', freq='7D'), 'end': Timestamp('2021-07-18 00:00:00', freq='7D'), 'train': 583565, 'train_users': 282499, 'train_items': 10376, 'test': 27422, 'test_users': 19466, 'test_items': 3464}

{'i_split': 1, 'start': Timestamp('2021-07-18 00:00:00', freq='7D'), 'end': Timestamp('2021-07-25 00:00:00', freq='7D'), 'train': 640331, 'train_users': 305483, 'train_items': 10619, 'test': 29886, 'test_users': 20780, 'test_items': 3684}

{'i_split': 2, 'start': Timestamp('2021-07-25 00:00:00', freq='7D'), 'end': Timestamp('2021-08-01 00:00:00', freq='7D'), 'train': 701298, 'train_users': 329509, 'train_items': 10774, 'test': 32728, 'test_users': 22701, 'test_items': 3788}

{'i_split': 3, 'start': Timestamp('2021-08-01 00:00:00', freq='7D'), 'end': Timestamp('2021-08-08 00:00:00', freq='7D'), 'train': 768221, 'train_users': 355896, 'train_items': 10931, 'test': 33489, 'test_users': 23467, 'test_items': 3769}

{'i_split': 4, 'start': Timestamp('2021

Unnamed: 0_level_0,prec@1,recall@1,prec@5,recall@5,prec@10,recall@10,MAP@1,MAP@5,MAP@10,NDCG@1,NDCG@5,NDCG@10,MeanInvUserFreq@1,MeanInvUserFreq@5,MeanInvUserFreq@10,Serendipity@1,Serendipity@5,Serendipity@10
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
ALS,0.011692,0.009493,0.007748,0.031086,0.005853,0.046256,0.009493,0.016859,0.018887,0.009493,0.016859,0.018887,6.385426,7.278217,7.785989,1e-05,1.6e-05,1.7e-05
LogMF,0.000155,9.6e-05,0.000213,0.000653,0.000223,0.001477,9.6e-05,0.000282,0.000386,9.6e-05,0.000282,0.000386,13.517182,13.532958,13.556483,5e-06,7e-06,7e-06
LightFM,0.026268,0.02194,0.020714,0.084997,0.014761,0.119345,0.02194,0.043672,0.04842,0.02194,0.043672,0.04842,3.982964,4.665888,5.412196,3e-06,3e-06,4e-06


Изменение валидационных параметров не привело к новым лидерам по метрикам, все алгоритмы сохранили свои позиции. Сами же значения метрик так же не испытали значительных изменений.

Эксперимент №3
- Кол-во фолдов 5
- Размер тестового фолда 3D

In [26]:
n_splits = 5
cv = TimeRangeSplitter(test_size="3D",
                        n_splits=n_splits,
                        filter_cold_users=True,
                        filter_cold_items=True,
                        filter_already_seen=True)

In [27]:
validate(models, metrics, cv, dataset, K_RECOS=10)

  0%|          | 0/5 [00:00<?, ?it/s]


{'i_split': 0, 'start': Timestamp('2021-07-31 00:00:00', freq='3D'), 'end': Timestamp('2021-08-03 00:00:00', freq='3D'), 'train': 757473, 'train_users': 351548, 'train_items': 10905, 'test': 16405, 'test_users': 12855, 'test_items': 2894}

{'i_split': 1, 'start': Timestamp('2021-08-03 00:00:00', freq='3D'), 'end': Timestamp('2021-08-06 00:00:00', freq='3D'), 'train': 789369, 'train_users': 364141, 'train_items': 11015, 'test': 15104, 'test_users': 11911, 'test_items': 2814}

{'i_split': 2, 'start': Timestamp('2021-08-06 00:00:00', freq='3D'), 'end': Timestamp('2021-08-09 00:00:00', freq='3D'), 'train': 817604, 'train_users': 374994, 'train_items': 11123, 'test': 17042, 'test_users': 13390, 'test_items': 2927}

{'i_split': 3, 'start': Timestamp('2021-08-09 00:00:00', freq='3D'), 'end': Timestamp('2021-08-12 00:00:00', freq='3D'), 'train': 849356, 'train_users': 387156, 'train_items': 11202, 'test': 15966, 'test_users': 12594, 'test_items': 2901}

{'i_split': 4, 'start': Timestamp('2021

Unnamed: 0_level_0,prec@1,recall@1,prec@5,recall@5,prec@10,recall@10,MAP@1,MAP@5,MAP@10,NDCG@1,NDCG@5,NDCG@10,MeanInvUserFreq@1,MeanInvUserFreq@5,MeanInvUserFreq@10,Serendipity@1,Serendipity@5,Serendipity@10
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
ALS,0.009117,0.007895,0.006282,0.027033,0.004894,0.041512,0.007895,0.014595,0.016492,0.007895,0.014595,0.016492,6.377705,7.316436,7.836277,1e-05,1.5e-05,1.7e-05
LogMF,0.000214,0.000154,0.000193,0.000759,0.00019,0.001472,0.000154,0.000357,0.000449,0.000154,0.000357,0.000449,13.802431,13.800061,13.814133,6e-06,7e-06,6e-06
LightFM,0.021581,0.019451,0.017,0.074318,0.012158,0.105232,0.019451,0.038214,0.042437,0.019451,0.038214,0.042437,4.057236,4.688568,5.449004,3e-06,3e-06,5e-06


Результаты эксперимента аналогичны эксперименту №2.\
Таким образом, по результам 3-х экспериментов лучшие значения для большинства метрик и для метрики MAP@10 показал алгоритм LightFM[warp].

## Обучение модели

Используется алгоритм LightFM[warp]. \
Добавим user и item признаки в процесс обучения модели.\
Для ускорения выдачи рекомендаций используется ANN.

### User features

In [108]:
users.fillna('Unknown', inplace=True)
users = users.loc[users[Columns.User].isin(interactions[Columns.User])].copy()

user_features_frames = []
for feature in ["sex", "age", "income"]:
    feature_frame = users.reindex(columns=[Columns.User, feature])
    feature_frame.columns = ["id", "value"]
    feature_frame["feature"] = feature
    user_features_frames.append(feature_frame)
user_features = pd.concat(user_features_frames)

### Item features

In [109]:
items = items.loc[items[Columns.Item].isin(interactions[Columns.Item])].copy()

items["genre"] = items["genres"].str.lower().str.replace(", ", ",", regex=False).str.split(",")
genre_feature = items[["item_id", "genre"]].explode("genre")
genre_feature.columns = ["id", "value"]
genre_feature["feature"] = "genre"

content_feature = items.reindex(columns=[Columns.Item, "content_type"])
content_feature.columns = ["id", "value"]
content_feature["feature"] = "content_type"

item_features = pd.concat((genre_feature, content_feature))

In [110]:
# Сформируем датасет с признаками users и items
dataset = Dataset.construct(
    interactions_df=interactions,
    user_features_df=user_features,
    cat_user_features=["sex", "age", "income"],
    item_features_df=item_features,
    cat_item_features=["genre", "content_type"],
)

### Валидация

In [115]:
model = {'LightFM': LightFMWrapperModel(
              model=LightFM(**study_lightFM.best_params),
              epochs=NUM_EPOCHS,
              num_threads=NUM_THREADS
          )
        }

metric = {"MAP@10": MAP(k=10)}

In [116]:
n_splits = 3
cv = TimeRangeSplitter(test_size="7D",
                        n_splits=n_splits,
                        filter_cold_users=True,
                        filter_cold_items=True,
                        filter_already_seen=True)

In [117]:
validate(model, metrics, cv, dataset, K_RECOS=10)

  0%|          | 0/3 [00:00<?, ?it/s]


{'i_split': 0, 'start': Timestamp('2021-08-02 00:00:00', freq='7D'), 'end': Timestamp('2021-08-09 00:00:00', freq='7D'), 'train': 4266013, 'train_users': 797423, 'train_items': 15237, 'test': 263681, 'test_users': 98184, 'test_items': 6602}

{'i_split': 1, 'start': Timestamp('2021-08-09 00:00:00', freq='7D'), 'end': Timestamp('2021-08-16 00:00:00', freq='7D'), 'train': 4649162, 'train_users': 850489, 'train_items': 15415, 'test': 279422, 'test_users': 103511, 'test_items': 6698}

{'i_split': 2, 'start': Timestamp('2021-08-16 00:00:00', freq='7D'), 'end': Timestamp('2021-08-23 00:00:00', freq='7D'), 'train': 5051815, 'train_users': 906071, 'train_items': 15577, 'test': 298878, 'test_users': 110076, 'test_items': 6679}


Unnamed: 0_level_0,prec@1,recall@1,prec@5,recall@5,prec@10,recall@10,MAP@1,MAP@5,MAP@10,NDCG@1,NDCG@5,NDCG@10,MeanInvUserFreq@1,MeanInvUserFreq@5,MeanInvUserFreq@10,Serendipity@1,Serendipity@5,Serendipity@10
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
LightFM,0.087207,0.047629,0.052926,0.135297,0.036547,0.179547,0.047629,0.080945,0.087976,0.047629,0.080945,0.087976,3.049181,3.819135,4.503419,3.7e-05,3.8e-05,4.2e-05


### Обучение модели

In [118]:
model['LightFM'].fit(dataset)

<rectools.models.lightfm.LightFMWrapperModel at 0x7d565d3d68c0>

### ANN

In [120]:
user_vectors, item_vectors = model['LightFM'].get_vectors(dataset)

ann_model = UserToItemAnnRecommender(
    user_vectors=user_vectors,
    item_vectors=item_vectors,
    user_id_map=dataset.user_id_map,
    item_id_map=dataset.item_id_map,
)

ann_model.fit()

<rectools.tools.ann.UserToItemAnnRecommender at 0x7d5805820640>

In [121]:
list(ann_model.get_item_list_for_user(user_id=5, top_n=K_RECOS))

[5115, 4179, 7825, 14302, 1404, 13995, 7653, 12316, 12593, 13468]

In [122]:
with open('model.dill', 'wb') as f:
    dill.dump(model, f)