# Основы построения рекомендательных систем

## Домашнее задание №4
### Основные пункты оценки
1. значение метрики на лидерборде
2. ревью кода в ноутбуке
3. реализация сервиса для модели

Вы можете сделать **НЕ ВСЕ пункты и все равно получить 20 баллов**. Получение > 20 баллов будет расцениваться как 20.

### Подробности
#### 1. Побейте метрику на лидерборде map@10 = 0.075 c моделью из implicit, lightfm или rectools, в том числе используя ANN **(5 баллов)**
#### 2. Реализуйте эксперименты c моделями из implicit, lightfm или rectools, в том числе используя ANN. Результат - ноутбук(и) **(максимум 12 баллов)**
Что можно сделать в ноутбуке:
- Реализовать тюнинг гиперпараметров для моделей из implicit, lightfm или rectools **(3 балла)**
  - Для перебора гиперпараметров можно использовать [`Optuna`](https://github.com/optuna/optuna), [`Hyperopt`](https://github.com/hyperopt/hyperopt)
- Воспользоваться методом приближенного поиска соседей для выдачи рекомендаций. **(3 балла)**
    - Можно использовать любые удобные: [`Annoy`](https://github.com/spotify/annoy), [`nmslib`](https://github.com/nmslib/nmslib) и.т.д
- Сделать рекомендации для холодных пользователей используя их фичи (для кого нет фичей - там другим способом) **(3 балла)**
- Провести эксперименты с параметрами оффлайн валидации и сделать выводы **(3 балла)**

#### 3. Оберните модель в сервис **(максимум 12 баллов)**
- Онлайн вариант: обучаете модель в ноутбуке, сохраняете обученную модель (pickle, dill), при запуске сервиса ее поднимаете и запрашиваете рекомендации "на лету" **(12 баллов)**
- Оффлайн вариант: предварительно посчитайте рекомендации для всех пользователей, сохраните и запрашивайте их **(6 баллов)**

### Хороший pull request - это:
- наличие описания (в идеале что сделано - по пунктам)
- код по стандарту PEP8
- легкая читаемость и воспроизводимость кода
- комментарии и объяснения. В ipynb пользуйтесь силой маркдауна. В скриптах пишите комментарии и докстринг.
- обоснование схемы валидации
- анализ метрики качества


In [1]:
# установка библиотек
import sys
!{sys.executable} -m pip install rectools[lightfm]
!{sys.executable} -m pip install dill
!{sys.executable} -m pip install optuna -q
!{sys.executable} -m pip install nmslib

Collecting rectools[lightfm]
  Obtaining dependency information for rectools[lightfm] from https://files.pythonhosted.org/packages/f2/2c/548002983de17fba745d88eded49bf67007ef54db35272e3cf9605de50fb/rectools-0.4.2-py3-none-any.whl.metadata
  Downloading rectools-0.4.2-py3-none-any.whl.metadata (9.0 kB)
Collecting implicit<0.8.0,>=0.7.1 (from rectools[lightfm])
  Obtaining dependency information for implicit<0.8.0,>=0.7.1 from https://files.pythonhosted.org/packages/cd/cc/deac70cae8cc32c9885d0cd73bc66e1b3cbea36ae7080b8c83995eaf5322/implicit-0.7.2-cp310-cp310-manylinux2014_x86_64.whl.metadata
  Downloading implicit-0.7.2-cp310-cp310-manylinux2014_x86_64.whl.metadata (6.1 kB)
Collecting pandas<2.0.0,>=0.25.3 (from rectools[lightfm])
  Downloading pandas-1.5.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.1/12.1 MB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting lightfm<=1.17,>=

In [2]:
import os
import requests
from tqdm.auto import tqdm
import zipfile as zf
import dill

import pandas as pd
import numpy as np
from rectools import Columns
from rectools.dataset import Dataset
from rectools.metrics import MAP, MeanInvUserFreq, NDCG, Precision, Recall, Serendipity, calc_metrics
from rectools.model_selection import TimeRangeSplitter
from rectools.models import PopularModel, ImplicitALSWrapperModel, LightFMWrapperModel
from rectools.tools import UserToItemAnnRecommender

from implicit.als import AlternatingLeastSquares
from implicit.lmf import LogisticMatrixFactorization

from lightfm import LightFM
import optuna

import warnings
warnings.filterwarnings('ignore')

os.environ["OPENBLAS_NUM_THREADS"] = "1"  # For implicit ALS



In [3]:
# ВАЛИДАЦИЯ

# Функция валидирования моделей методом Leave-Time-out
def validate(models: dict, metrics: dict, splitter: TimeRangeSplitter, dataset: Dataset, K_RECOS: int):
    results = []

    # создаем итератор фолдов
    fold_iterator = splitter.split(dataset.interactions, collect_fold_stats=True)

    # обучение по фолдам
    for train_ids, test_ids, fold_info in tqdm((fold_iterator), total=splitter.n_splits):
        print(f"\n==================== Fold {fold_info['i_split']} ====================")
        print(fold_info)

        # тренировочная часть
        df_train = dataset.interactions.df.iloc[train_ids]
        train_dataset = Dataset.construct(df_train)

        # тестовая часть
        df_test = dataset.interactions.df.iloc[test_ids][Columns.UserItem]
        test_users = np.unique(df_test[Columns.User])

        # каталог
        catalog = df_train[Columns.Item].unique()

        # обучение моделей
        for model_name, model in models.items():
            # обучение и получение рекомендаций
            model.fit(train_dataset)
            recos = model.recommend(
                        users=test_users,
                        dataset=train_dataset,
                        k=K_RECOS,
                        filter_viewed=True,
                    )
            # подсчет метрик
            metric_values = calc_metrics(
                                metrics,
                                reco=recos,
                                interactions=df_test,
                                prev_interactions=df_train,
                                catalog=catalog
                            )
            # сохранение результатов
            res = {"fold": fold_info["i_split"], "model": model_name}
            res.update(metric_values)
            results.append(res)

    # форматирование в DataFrame
    pivot_results = pd.DataFrame(results)\
                      .drop(columns="fold")\
                      .groupby(["model"], sort=False)\
                      .agg("mean")
    # сохранение отчета
    pivot_results.to_csv('./report.csv')

    return pivot_results.style.highlight_max(color='green', axis=0)

In [4]:
# ПОДБОР ГИПЕРПАРАМЕТРОВ

# Функция подсчета метрики MAP@10
def calc_MAP10(model, dataset, train, test):
    metric = {"MAP@10": MAP(k=10) }
    
    test_users = np.unique(test[Columns.User])
    catalog = train[Columns.Item].unique()
    
    model.fit(dataset)
    recos = model.recommend(
        users=test_users,
        dataset=dataset,
        k=K_RECOS,
        filter_viewed=True
    )
    
    metric_values = calc_metrics(
        metric,
        reco=recos,
        interactions=test,
        prev_interactions=train,
        catalog=catalog,
      )
    
    return metric_values['MAP@10']


# Целевые функции Optuna
def objective_als(trial, dataset, train, test):
    factors=trial.suggest_int('factors', 10, 30, 10)
    iterations=trial.suggest_int('iterations', 10, 50, 10)
    
    model = ImplicitALSWrapperModel(
            model=AlternatingLeastSquares(
                factors=factors,
                iterations=iterations,
                num_threads=NUM_THREADS,
                random_state=RANDOM_STATE
            ),
        )
    return calc_MAP10(model, dataset, train, test)


def objective_logMF(trial, dataset, train, test):
    factors=trial.suggest_int('factors', 10, 30, 10)
    iterations=trial.suggest_int('iterations', 10, 50, 20)
    
    model = ImplicitALSWrapperModel(
            model=LogisticMatrixFactorization(
                factors=factors,
                iterations=iterations,
                num_threads=NUM_THREADS,
                random_state=RANDOM_STATE
            ),
        )
    return calc_MAP10(model, dataset, train, test)

        
def objective_lightFM(trial, dataset, train, test):
    no_components=trial.suggest_int('no_components', 10, 30, 10)
    loss=trial.suggest_categorical('loss', ['logistic', 'bpr', 'warp'])
    
    model = LightFMWrapperModel(
            LightFM(
                no_components=no_components,
                loss=loss,
                random_state=RANDOM_STATE,
            ),
            epochs=NUM_EPOCHS,
            num_threads=NUM_THREADS,
        )
    return calc_MAP10(model, dataset, train, test)

In [33]:
# Имплементация LightFM, PopularModel, ANN
class CustomLightFMWithANN():
    def __init__(self, params: dict, n_epochs: int = 10, n_threads: int = 4):
        self.lightFM =  LightFMWrapperModel(
                          model=LightFM(**params),
                          epochs=n_epochs,
                          num_threads=n_threads
                        )
        self.pop_model = PopularModel()
        self.ann_model = None
        self.is_fitted = False

    def get_mappings(self, train):
        self.users_inv_mapping = dict(enumerate(train['user_id'].unique()))
        self.users_mapping = {v: k for k, v in self.users_inv_mapping.items()}

        self.items_inv_mapping = dict(enumerate(train['item_id'].unique()))
        self.items_mapping = {v: k for k, v in self.items_inv_mapping.items()}

    def fit(self, dataset: Dataset):
        train = dataset.interactions.df
        self.get_mappings(train)

        # обучаем PopularModel, LightFM, ANN
        self.lightFM.fit(dataset)
        self.pop_model.fit(dataset)

        user_vectors, item_vectors = self.lightFM.get_vectors(dataset)

        self.ann_model = UserToItemAnnRecommender(
            user_vectors=user_vectors,
            item_vectors=item_vectors,
            user_id_map=dataset.user_id_map,
            item_id_map=dataset.item_id_map,
        )

        self.ann_model.fit()
        
        # список популярного
        popular_list = [self.users_inv_mapping[item] for item in self.pop_model.popularity_list[0]]
        popular_scores = self.pop_model.popularity_list[1]
        self.popular = pd.DataFrame([popular_list, popular_scores]).transpose()\
                         .rename(columns={0: "item_id", 1: "score"})\
                         .reset_index(drop=True)

        self.is_fitted = True

    def recommend(self, users: np.array, dataset: Dataset, k: int = 10, filter_viewed: bool = False):
        if not self.is_fitted:
                raise ValueError("Please call fit before predict")

        # выделяем "горячих" и "холодных" юзеров
        user_ids = dataset.user_id_map.external_ids
        hot_users_mask = np.isin(users, user_ids)
        cold_users = users[~hot_users_mask]
        hot_users = users[hot_users_mask]
        
        # рекомендуем "холодным" юзерам популярное
        cold_users = pd.DataFrame(cold_users, columns=['user_id'])
        pop_recs = pd.merge(cold_users, self.popular.iloc[:k], how='cross')
        pop_recs['rank'] = pop_recs.groupby('user_id').cumcount() + 1
        pop_recs.drop(columns=['score'], inplace=True)

        # ann рекомендации 
        ann_recs = self.ann_model.get_item_list_for_user_batch(user_ids=hot_users, top_n=k)
        ann_recs = pd.DataFrame({'user_id': hot_users,
                                 'item_id': ann_recs}).explode('item_id')
        ann_recs['rank'] = ann_recs.groupby('user_id').cumcount() + 1

        resc = pd.concat([ann_recs, pop_recs])
        resc = resc.astype({"user_id": int, "item_id": int, "rank": int})
        return resc

    def predict_single(self, user_id: int, N_recs: int = 10):
        if not self.is_fitted:
            raise ValueError("Please call fit before predict")
        try:
            recs = self.ann_model.get_item_list_for_user(user_id=user_id, top_n=N_recs)
        except:
            recs = self.popular.iloc[:N_recs]
        return recs

## Чтение данных

In [5]:
url = 'https://github.com/irsafilo/KION_DATASET/raw/f69775be31fa5779907cf0a92ddedb70037fb5ae/data_original.zip'
req = requests.get(url, stream=True)

with open('kion_train.zip', "wb") as fd:
    total_size_in_bytes = int(req.headers.get('Content-Length', 0))
    progress_bar = tqdm(desc='Downloading the kion dataset...',
                        total=total_size_in_bytes,
                        unit='iB', unit_scale=True)
    for chunk in req.iter_content(chunk_size=2 ** 20):
        progress_bar.update(len(chunk))
        fd.write(chunk)

Downloading the kion dataset...:   0%|          | 0.00/78.8M [00:00<?, ?iB/s]

In [6]:
# Разархивирование данных
files = zf.ZipFile('kion_train.zip','r')
files.extractall()
files.close()

In [7]:
# Чтение в DataFrame
users = pd.read_csv('data_original/users.csv')
items = pd.read_csv('data_original/items.csv')

interactions = pd.read_csv('data_original/interactions.csv', parse_dates=["last_watch_dt"])
interactions.rename(
    columns={
        'last_watch_dt': Columns.Datetime,
        'total_dur': Columns.Weight
    },
    inplace=True)

## Подбор гиперпараметров и валидация
Для экспериментов будет использоваться часть датасета из 1 000 000 записей о взаимодействиях с фильтрацией холодных пользователей.

In [8]:
# Тренеровочная и тестовая выборки
interactions_sm = interactions.iloc[:1000000]

max_date = interactions_sm[Columns.Datetime].max()

train = interactions_sm[interactions_sm[Columns.Datetime] < max_date - pd.Timedelta(days=7)].copy()
test = interactions_sm[interactions_sm[Columns.Datetime] >= max_date - pd.Timedelta(days=7)].copy()

print(f"train: {train.shape}")
print(f"test: {test.shape}")

train: (910259, 5)
test: (89741, 5)


In [9]:
# Отфильтруем холодных пользователей из теста
cold_users = set(test[Columns.User]) - set(train[Columns.User])
test.drop(test[test[Columns.User].isin(cold_users)].index, inplace=True)

In [10]:
# Сформируем датасет
dataset = Dataset.construct(interactions_df=train)

## 1. Подбор гиперпараметров моделей

In [11]:
RANDOM_STATE = 42
NUM_THREADS = 4
NUM_EPOCHS = 10
K_RECOS = 10

In [12]:
study_als = optuna.create_study(study_name='AlternatingLeastSquares Optuna Optimization', direction='maximize')
study_als.optimize(lambda trial: objective_als(trial, dataset, train, test), n_trials=20, n_jobs=-1)

[I 2023-12-10 21:19:45,812] A new study created in memory with name: AlternatingLeastSquares Optuna Optimization
[I 2023-12-10 21:20:31,329] Trial 3 finished with value: 0.013344147032424107 and parameters: {'factors': 20, 'iterations': 20}. Best is trial 3 with value: 0.013344147032424107.
[I 2023-12-10 21:20:48,534] Trial 2 finished with value: 0.01559216258506367 and parameters: {'factors': 30, 'iterations': 40}. Best is trial 2 with value: 0.01559216258506367.
[I 2023-12-10 21:20:48,855] Trial 0 finished with value: 0.01559216258506367 and parameters: {'factors': 30, 'iterations': 40}. Best is trial 2 with value: 0.01559216258506367.
[I 2023-12-10 21:20:49,214] Trial 1 finished with value: 0.017838438383127445 and parameters: {'factors': 20, 'iterations': 50}. Best is trial 1 with value: 0.017838438383127445.
[I 2023-12-10 21:20:55,213] Trial 4 finished with value: 0.012340126601384088 and parameters: {'factors': 10, 'iterations': 20}. Best is trial 1 with value: 0.0178384383831274

In [13]:
study_als.best_params

{'factors': 20, 'iterations': 50}

In [14]:
study_logMF = optuna.create_study(study_name='LogisticMatrixFactorization Optuna Optimization', direction='maximize')
study_logMF.optimize(lambda trial: objective_logMF(trial, dataset, train, test), n_trials=20, n_jobs=-1)

[I 2023-12-10 21:23:47,822] A new study created in memory with name: LogisticMatrixFactorization Optuna Optimization
[I 2023-12-10 21:24:10,651] Trial 2 finished with value: 0.0001458842225366093 and parameters: {'factors': 10, 'iterations': 10}. Best is trial 2 with value: 0.0001458842225366093.
[I 2023-12-10 21:24:23,905] Trial 0 finished with value: 0.00010454064237160288 and parameters: {'factors': 20, 'iterations': 10}. Best is trial 2 with value: 0.0001458842225366093.
[I 2023-12-10 21:24:36,519] Trial 3 finished with value: 0.0002117441707082885 and parameters: {'factors': 30, 'iterations': 10}. Best is trial 3 with value: 0.0002117441707082885.
[I 2023-12-10 21:25:08,103] Trial 6 finished with value: 0.0001458842225366093 and parameters: {'factors': 10, 'iterations': 10}. Best is trial 3 with value: 0.0002117441707082885.
[I 2023-12-10 21:25:08,220] Trial 5 finished with value: 0.00010454064237160288 and parameters: {'factors': 20, 'iterations': 10}. Best is trial 3 with value:

In [15]:
study_logMF.best_params

{'factors': 10, 'iterations': 50}

In [16]:
study_lightFM = optuna.create_study(study_name='LightFM Optuna Optimization', direction='maximize')
study_lightFM.optimize(lambda trial: objective_lightFM(trial, dataset, train, test), n_trials=20, n_jobs=-1)

[I 2023-12-10 21:29:21,710] A new study created in memory with name: LightFM Optuna Optimization
[I 2023-12-10 21:30:06,709] Trial 3 finished with value: 0.00010630773349803177 and parameters: {'no_components': 10, 'loss': 'logistic'}. Best is trial 3 with value: 0.00010630773349803177.
[I 2023-12-10 21:30:06,924] Trial 1 finished with value: 2.6444955904467597e-05 and parameters: {'no_components': 10, 'loss': 'logistic'}. Best is trial 3 with value: 0.00010630773349803177.
[I 2023-12-10 21:30:11,859] Trial 2 finished with value: 0.04131388177180395 and parameters: {'no_components': 10, 'loss': 'warp'}. Best is trial 2 with value: 0.04131388177180395.
[I 2023-12-10 21:30:16,491] Trial 0 finished with value: 0.00039330542209334974 and parameters: {'no_components': 30, 'loss': 'logistic'}. Best is trial 2 with value: 0.04131388177180395.
[I 2023-12-10 21:30:38,890] Trial 4 finished with value: 3.2210268633640407e-06 and parameters: {'no_components': 10, 'loss': 'bpr'}. Best is trial 2 wi

In [17]:
study_lightFM.best_params

{'no_components': 10, 'loss': 'warp'}

## 2. Валидация

In [18]:
metrics = {
    "prec@1": Precision(k=1),
    "prec@5": Precision(k=5),
    "prec@10": Precision(k=10),
    "recall@1": Recall(k=1),
    "recall@5": Recall(k=5),
    "recall@10": Recall(k=10),
    "MAP@1": MAP(k=1),
    "MAP@5": MAP(k=5),
    "MAP@10": MAP(k=10),
    "NDCG@1": MAP(k=1),
    "NDCG@5": MAP(k=5),
    "NDCG@10": MAP(k=10),
    "MeanInvUserFreq@1": MeanInvUserFreq(k=1),
    "MeanInvUserFreq@5": MeanInvUserFreq(k=5),
    "MeanInvUserFreq@10": MeanInvUserFreq(k=10),
    "Serendipity@1": Serendipity(k=1),
    "Serendipity@5": Serendipity(k=5),
    "Serendipity@10": Serendipity(k=10),
}


In [19]:
models = {'ALS': ImplicitALSWrapperModel(
                    model=AlternatingLeastSquares(**study_als.best_params)
                  ),
          'LogMF': ImplicitALSWrapperModel(
                      model=LogisticMatrixFactorization(**study_logMF.best_params)
                  ),
          'LightFM': LightFMWrapperModel(
                      model=LightFM(**study_lightFM.best_params),
                      epochs=NUM_EPOCHS,
                      num_threads=NUM_THREADS
                      )
         }

Эксперимент №1.
- Кол-во фолдов 3
- Размер тестового фолда 7D


In [20]:
n_splits = 3
cv = TimeRangeSplitter(test_size="7D",
                        n_splits=n_splits,
                        filter_cold_users=True,
                        filter_cold_items=True,
                        filter_already_seen=True)

In [21]:
validate(models, metrics, cv, dataset, K_RECOS=10)

  0%|          | 0/3 [00:00<?, ?it/s]


{'i_split': 0, 'start': Timestamp('2021-07-25 00:00:00', freq='7D'), 'end': Timestamp('2021-08-01 00:00:00', freq='7D'), 'train': 701298, 'train_users': 329509, 'train_items': 10774, 'test': 32728, 'test_users': 22701, 'test_items': 3788}

{'i_split': 1, 'start': Timestamp('2021-08-01 00:00:00', freq='7D'), 'end': Timestamp('2021-08-08 00:00:00', freq='7D'), 'train': 768221, 'train_users': 355896, 'train_items': 10931, 'test': 33489, 'test_users': 23467, 'test_items': 3769}

{'i_split': 2, 'start': Timestamp('2021-08-08 00:00:00', freq='7D'), 'end': Timestamp('2021-08-15 00:00:00', freq='7D'), 'train': 838173, 'train_users': 382886, 'train_items': 11184, 'test': 36223, 'test_users': 25023, 'test_items': 3887}


Unnamed: 0_level_0,prec@1,recall@1,prec@5,recall@5,prec@10,recall@10,MAP@1,MAP@5,MAP@10,NDCG@1,NDCG@5,NDCG@10,MeanInvUserFreq@1,MeanInvUserFreq@5,MeanInvUserFreq@10,Serendipity@1,Serendipity@5,Serendipity@10
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
ALS,0.011733,0.009422,0.007642,0.029906,0.005797,0.044922,0.009422,0.016564,0.018593,0.009422,0.016564,0.018593,6.292548,7.248624,7.769004,1.2e-05,1.6e-05,1.7e-05
LogMF,0.00017,0.000132,0.000242,0.000854,0.000232,0.001568,0.000132,0.000358,0.000452,0.000132,0.000358,0.000452,13.681289,13.684302,13.700211,6e-06,8e-06,8e-06
LightFM,0.025473,0.020987,0.019491,0.079261,0.013823,0.110527,0.020987,0.041045,0.04536,0.020987,0.041045,0.04536,4.052514,4.672829,5.441932,4e-06,4e-06,6e-06


По большенству метрик выигрывает LightFM[warp], в том числе по ключевой метрике MAP@10. LogisticMF показывает хорошие результаты для метрики novelty, а ALS - для serendipity.

Эксперимент №2.
- Кол-во фолдов 5
- Размер тестового фолда 7D

In [22]:
n_splits = 5
cv = TimeRangeSplitter(test_size="7D",
                        n_splits=n_splits,
                        filter_cold_users=True,
                        filter_cold_items=True,
                        filter_already_seen=True)

In [23]:
validate(models, metrics, cv, dataset, K_RECOS=10)

  0%|          | 0/5 [00:00<?, ?it/s]


{'i_split': 0, 'start': Timestamp('2021-07-11 00:00:00', freq='7D'), 'end': Timestamp('2021-07-18 00:00:00', freq='7D'), 'train': 583565, 'train_users': 282499, 'train_items': 10376, 'test': 27422, 'test_users': 19466, 'test_items': 3464}

{'i_split': 1, 'start': Timestamp('2021-07-18 00:00:00', freq='7D'), 'end': Timestamp('2021-07-25 00:00:00', freq='7D'), 'train': 640331, 'train_users': 305483, 'train_items': 10619, 'test': 29886, 'test_users': 20780, 'test_items': 3684}

{'i_split': 2, 'start': Timestamp('2021-07-25 00:00:00', freq='7D'), 'end': Timestamp('2021-08-01 00:00:00', freq='7D'), 'train': 701298, 'train_users': 329509, 'train_items': 10774, 'test': 32728, 'test_users': 22701, 'test_items': 3788}

{'i_split': 3, 'start': Timestamp('2021-08-01 00:00:00', freq='7D'), 'end': Timestamp('2021-08-08 00:00:00', freq='7D'), 'train': 768221, 'train_users': 355896, 'train_items': 10931, 'test': 33489, 'test_users': 23467, 'test_items': 3769}

{'i_split': 4, 'start': Timestamp('2021

Unnamed: 0_level_0,prec@1,recall@1,prec@5,recall@5,prec@10,recall@10,MAP@1,MAP@5,MAP@10,NDCG@1,NDCG@5,NDCG@10,MeanInvUserFreq@1,MeanInvUserFreq@5,MeanInvUserFreq@10,Serendipity@1,Serendipity@5,Serendipity@10
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
ALS,0.010779,0.008585,0.006731,0.026372,0.005255,0.040787,0.008585,0.014628,0.016546,0.008585,0.014628,0.016546,6.390254,7.387294,7.884815,1.2e-05,1.7e-05,1.9e-05
LogMF,0.000245,0.000181,0.000194,0.000695,0.000193,0.001353,0.000181,0.000345,0.000431,0.000181,0.000345,0.000431,13.521984,13.569366,13.597538,9e-06,6e-06,5e-06
LightFM,0.026544,0.022012,0.020575,0.08429,0.014811,0.119895,0.022012,0.043513,0.048409,0.022012,0.043513,0.048409,4.036561,4.694605,5.435315,5e-06,4e-06,5e-06


Изменение валидационных параметров не привело к новым лидерам по метрикам, все алгоритмы сохранили свои позиции. Сами же значения метрик так же не испытали значительных изменений.

Эксперимент №3
- Кол-во фолдов 5
- Размер тестового фолда 3D

In [24]:
n_splits = 5
cv = TimeRangeSplitter(test_size="3D",
                        n_splits=n_splits,
                        filter_cold_users=True,
                        filter_cold_items=True,
                        filter_already_seen=True)

In [25]:
validate(models, metrics, cv, dataset, K_RECOS=10)

  0%|          | 0/5 [00:00<?, ?it/s]


{'i_split': 0, 'start': Timestamp('2021-07-31 00:00:00', freq='3D'), 'end': Timestamp('2021-08-03 00:00:00', freq='3D'), 'train': 757473, 'train_users': 351548, 'train_items': 10905, 'test': 16405, 'test_users': 12855, 'test_items': 2894}

{'i_split': 1, 'start': Timestamp('2021-08-03 00:00:00', freq='3D'), 'end': Timestamp('2021-08-06 00:00:00', freq='3D'), 'train': 789369, 'train_users': 364141, 'train_items': 11015, 'test': 15104, 'test_users': 11911, 'test_items': 2814}

{'i_split': 2, 'start': Timestamp('2021-08-06 00:00:00', freq='3D'), 'end': Timestamp('2021-08-09 00:00:00', freq='3D'), 'train': 817604, 'train_users': 374994, 'train_items': 11123, 'test': 17042, 'test_users': 13390, 'test_items': 2927}

{'i_split': 3, 'start': Timestamp('2021-08-09 00:00:00', freq='3D'), 'end': Timestamp('2021-08-12 00:00:00', freq='3D'), 'train': 849356, 'train_users': 387156, 'train_items': 11202, 'test': 15966, 'test_users': 12594, 'test_items': 2901}

{'i_split': 4, 'start': Timestamp('2021

Unnamed: 0_level_0,prec@1,recall@1,prec@5,recall@5,prec@10,recall@10,MAP@1,MAP@5,MAP@10,NDCG@1,NDCG@5,NDCG@10,MeanInvUserFreq@1,MeanInvUserFreq@5,MeanInvUserFreq@10,Serendipity@1,Serendipity@5,Serendipity@10
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
ALS,0.009977,0.008681,0.006588,0.02803,0.005147,0.043498,0.008681,0.015394,0.017426,0.008681,0.015394,0.017426,6.308459,7.279139,7.807347,9e-06,1.4e-05,1.7e-05
LogMF,0.00011,9.4e-05,0.00014,0.00053,0.000152,0.001052,9.4e-05,0.000244,0.000313,9.4e-05,0.000244,0.000313,13.750177,13.766602,13.793113,4e-06,5e-06,4e-06
LightFM,0.022576,0.019936,0.017163,0.074914,0.012172,0.105503,0.019936,0.038894,0.043024,0.019936,0.038894,0.043024,4.074367,4.68381,5.447755,5e-06,4e-06,5e-06


Результаты эксперимента аналогичны эксперименту №2.\
Таким образом, по результам 3-х экспериментов лучшие значения для большинства метрик и для метрики MAP@10 показал алгоритм LightFM[warp].

## Обучение модели

Добавим user и item признаки в процесс обучения модели.\
Создадим класс CustomLightFMWithANN, включающий в себя алгоритм LightFM[warp], PopularModel и ANN.

### User features

In [27]:
users.fillna('Unknown', inplace=True)
users = users.loc[users[Columns.User].isin(interactions[Columns.User])].copy()

user_features_frames = []
for feature in ["sex", "age", "income"]:
    feature_frame = users.reindex(columns=[Columns.User, feature])
    feature_frame.columns = ["id", "value"]
    feature_frame["feature"] = feature
    user_features_frames.append(feature_frame)
user_features = pd.concat(user_features_frames)

### Item features

In [28]:
items = items.loc[items[Columns.Item].isin(interactions[Columns.Item])].copy()

items["genre"] = items["genres"].str.lower().str.replace(", ", ",", regex=False).str.split(",")
genre_feature = items[["item_id", "genre"]].explode("genre")
genre_feature.columns = ["id", "value"]
genre_feature["feature"] = "genre"

content_feature = items.reindex(columns=[Columns.Item, "content_type"])
content_feature.columns = ["id", "value"]
content_feature["feature"] = "content_type"

item_features = pd.concat((genre_feature, content_feature))

In [29]:
# Сформируем датасет с признаками users и items
dataset = Dataset.construct(
    interactions_df=interactions,
    user_features_df=user_features,
    cat_user_features=["sex", "age", "income"],
    item_features_df=item_features,
    cat_item_features=["genre", "content_type"],
)

### Валидация

In [34]:
model = {'CustomLightFM': CustomLightFMWithANN(
            params={'no_components': 10, 'loss': 'warp'},
            n_epochs=NUM_EPOCHS,
            n_threads=NUM_THREADS)
        }

metric = {"MAP@10": MAP(k=10)}

In [35]:
n_splits = 3
cv = TimeRangeSplitter(test_size="7D",
                        n_splits=n_splits,
                        filter_cold_users=True,
                        filter_cold_items=True,
                        filter_already_seen=True)

In [36]:
validate(model, metric, cv, dataset, K_RECOS=10)

  0%|          | 0/3 [00:00<?, ?it/s]


{'i_split': 0, 'start': Timestamp('2021-08-02 00:00:00', freq='7D'), 'end': Timestamp('2021-08-09 00:00:00', freq='7D'), 'train': 4266013, 'train_users': 797423, 'train_items': 15237, 'test': 263681, 'test_users': 98184, 'test_items': 6602}

{'i_split': 1, 'start': Timestamp('2021-08-09 00:00:00', freq='7D'), 'end': Timestamp('2021-08-16 00:00:00', freq='7D'), 'train': 4649162, 'train_users': 850489, 'train_items': 15415, 'test': 279422, 'test_users': 103511, 'test_items': 6698}

{'i_split': 2, 'start': Timestamp('2021-08-16 00:00:00', freq='7D'), 'end': Timestamp('2021-08-23 00:00:00', freq='7D'), 'train': 5051815, 'train_users': 906071, 'train_items': 15577, 'test': 298878, 'test_users': 110076, 'test_items': 6679}


Unnamed: 0_level_0,MAP@10
model,Unnamed: 1_level_1
CustomLightFM,0.044009


### Обучение и сохранение модели

In [37]:
model['CustomLightFM'].fit(dataset)

In [38]:
with open('lightFM_with_ann.dill', 'wb') as f:
    dill.dump(model['CustomLightFM'], f)