# Домашнее задание №4. Работа с моделями ALS, LightFM, ANN

In [None]:
!pip -q install implicit
!pip -q install rectools
!pip -q install lightfm
!pip -q install optuna

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.9/8.9 MB[0m [31m20.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m102.5/102.5 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.4/316.4 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for lightfm (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m413.4/413.4 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m230.6/230.6 kB[0m [31m25.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.6/78.6 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
!pip -q install nmslib

In [None]:
!pip -q install rectools[nmslib]

In [None]:
import os
from typing import List
from pathlib import Path
import pickle
import json
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from implicit.gpu.als import AlternatingLeastSquares
from implicit.als import AlternatingLeastSquares as CPUAlternatingLeastSquares
from implicit.lmf import LogisticMatrixFactorization
from lightfm import LightFM
from rectools import Columns
from rectools.dataset import Dataset
from rectools.metrics import (
    MAP,
    MeanInvUserFreq,
    Precision,
    Recall,
    Serendipity,
    calc_metrics,
)
from rectools.model_selection import TimeRangeSplitter, cross_validate
from rectools.models import ImplicitALSWrapperModel, LightFMWrapperModel
from rectools.tools import UserToItemAnnRecommender
from tqdm import tqdm

import optuna

warnings.filterwarnings('ignore')



In [None]:
os.environ["OPENBLAS_NUM_THREADS"] = "1"  # For implicit ALS

## Подготовка данных

### Загрузка данных

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
DATA_PATH = Path("/content/drive/MyDrive/recsys_course/data_kion")

In [None]:
%%time
users = pd.read_csv(DATA_PATH / 'users.csv')
items = pd.read_csv(DATA_PATH / 'items.csv')
interactions = pd.read_csv(DATA_PATH / 'interactions.csv')

CPU times: user 5.83 s, sys: 733 ms, total: 6.56 s
Wall time: 15.1 s


In [None]:
# Rename the raw date column to the name RecTools expects. Reassigning the
# library-wide ``Columns.Datetime`` constant (the previous approach) changes
# RecTools' global state for everything else running in this process.
interactions.rename(columns={"last_watch_dt": Columns.Datetime}, inplace=True)

# Drop rows whose date string is not exactly 10 characters long (the length of
# a YYYY-MM-DD value; missing dates are dropped as well), then parse the column.
interactions.drop(interactions[interactions[Columns.Datetime].str.len() != 10].index, inplace=True)
interactions[Columns.Datetime] = pd.to_datetime(interactions[Columns.Datetime], format="%Y-%m-%d")
max_date = interactions[Columns.Datetime].max()

# Interaction weight: 3 when watched_pct > 10, otherwise 1.
interactions[Columns.Weight] = np.where(interactions["watched_pct"] > 10, 3, 1)

In [None]:
# Split the data into train/test:
#   * cross-validation is run on train;
#   * test is used to compare models during hyperparameter tuning.
split_date = max_date - pd.Timedelta(days=7)
event_dates = interactions[Columns.Datetime]
train = interactions.loc[event_dates < split_date].copy()
test = interactions.loc[event_dates >= split_date].copy()

# Remove train interactions whose total_dur is below 300.
train.drop(index=train.loc[train["total_dur"] < 300].index, inplace=True)

# Filter cold users (those absent from train) out of test.
cold_users = set(test[Columns.User]).difference(train[Columns.User])
test.drop(index=test.loc[test[Columns.User].isin(cold_users)].index, inplace=True)

print(f"train: {train.shape}")
print(f"test: {test.shape}")

train: (3832711, 6)
test: (333026, 6)


### Подготовка признаков

In [None]:
def get_users_features(
    users: pd.DataFrame, interactions: pd.DataFrame, features_to_get: List[str]
) -> pd.DataFrame:
    """Build a long-format user feature table.

    Only users present in ``interactions`` are kept, missing values are
    replaced with ``'Unknown'`` and every requested column is stacked into
    rows with the columns ``id``, ``value`` and ``feature``.

    Args:
        users: Raw user table; must contain the ``Columns.User`` id column.
            It is not modified.
        interactions: Interactions whose ``Columns.User`` values select the
            users to keep.
        features_to_get: Names of the user columns to turn into features.

    Returns:
        The concatenation of one ``id``/``value``/``feature`` frame per
        requested feature, or an empty frame with those three columns when
        ``features_to_get`` is empty.
    """
    # Filter first and fill on the resulting copy: an in-place ``fillna`` on
    # the argument would silently rewrite the caller's ``users`` frame.
    seen_users = users.loc[users[Columns.User].isin(interactions[Columns.User])]
    seen_users = seen_users.fillna('Unknown')

    if not features_to_get:
        # ``pd.concat`` raises on an empty list; return a well-formed empty frame.
        return pd.DataFrame(columns=["id", "value", "feature"])

    user_features_frames = []
    for feature in features_to_get:
        feature_frame = seen_users.reindex(columns=[Columns.User, feature])
        feature_frame.columns = ["id", "value"]
        feature_frame["feature"] = feature
        user_features_frames.append(feature_frame)
    return pd.concat(user_features_frames)

In [None]:
def get_items_features(items: pd.DataFrame, interactions: pd.DataFrame):
    """Build a long-format item feature table (``id``, ``value``, ``feature``).

    Items that never occur in ``interactions`` are discarded. Four features
    are produced and concatenated in this order: ``genre``, ``content_type``,
    ``country`` and ``binned_r_year`` (release year cut into 10 quantile bins
    labelled 0..9).
    """
    catalog = items.loc[items[Columns.Item].isin(interactions[Columns.Item])].copy()

    def lowercase_tokens(column):
        # Lowercase a comma-separated text column and split it into a list per row.
        return catalog[column].str.lower().str.replace(", ", ",", regex=False).str.split(",")

    def to_flat(frame, feature_name):
        # Rename the two columns to id/value and tag every row with the feature name.
        flat = frame.copy()
        flat.columns = ["id", "value"]
        flat["feature"] = feature_name
        return flat

    # Genres: one row per (item, genre).
    catalog["genre"] = lowercase_tokens("genres")
    genre_feature = to_flat(catalog[["item_id", "genre"]].explode("genre"), "genre")

    # Content type: taken as is.
    content_feature = to_flat(
        catalog.reindex(columns=[Columns.Item, "content_type"]), "content_type"
    )

    # Production countries: one row per (item, country).
    catalog["country"] = lowercase_tokens("countries")
    country_feature = to_flat(catalog[["item_id", "country"]].explode("country"), "country")

    # Release year: quantile-binned into 10 buckets.
    catalog['binned_r_year'] = pd.qcut(catalog['release_year'], q=10, labels=list(range(10)))
    release_year_feature = to_flat(
        catalog.reindex(columns=[Columns.Item, "binned_r_year"]), "binned_r_year"
    )

    # Merge all features into a single frame.
    return pd.concat([genre_feature, content_feature, country_feature, release_year_feature])

In [None]:
user_features = get_users_features(users, train, ["sex", "age", "income"])
item_features = get_items_features(items, train)

In [None]:
user_features.head()

Unnamed: 0,id,value,feature
0,973171,М,sex
1,962099,М,sex
3,721985,Ж,sex
4,704055,Ж,sex
5,1037719,М,sex


In [None]:
item_features.head()

Unnamed: 0,id,value,feature
0,10711,драмы,genre
0,10711,зарубежные,genre
0,10711,детективы,genre
0,10711,мелодрамы,genre
1,2508,зарубежные,genre


### Инициализируем Dataset (Rectools)

In [None]:
dataset = Dataset.construct(
    interactions_df=train,
    user_features_df=user_features,
    cat_user_features=["sex", "age", "income"],
    item_features_df=item_features,
    cat_item_features=["genre", "content_type", "country", "binned_r_year"],
)

In [None]:
TEST_USERS = test[Columns.User].unique()

## Подбираем гиперпараметры для моделей

In [None]:
K_RECOS = 10
RANDOM_STATE = 1008
N_EPOCHS = 1 # Lightfm

In [None]:
metrics = {
    "map@10": MAP(k=10),
    "precision@10": Precision(k=10),
    "recall@10": Recall(k=10),
}

### ImplicitALSWrapperModel

In [None]:
def objective_ImplicitALSWrapperModel(trial):
    """Optuna objective for ALS: fit on ``dataset`` and score on the hold-out.

    Samples ``n_factors`` (32..128, step 32) and ``regularization``
    (0.01..0.51, step 0.1), fits an ``ImplicitALSWrapperModel`` with features
    fitted together, recommends ``K_RECOS`` unseen items for ``TEST_USERS``
    and passes the result with ``test`` and ``train`` to ``calc_metrics``.

    Relies on the module-level ``dataset``, ``TEST_USERS``, ``K_RECOS``,
    ``RANDOM_STATE``, ``metrics``, ``test`` and ``train``.
    NOTE: ``metrics`` is rebuilt later in the notebook with differently named
    keys (e.g. ``MAP@10``), so this objective has to run before that cell.

    Returns:
        Tuple ``(map@10, precision@10, recall@10)``.
    """
    # Dict entries are evaluated top to bottom, so the sampling order stays
    # n_factors -> regularization.
    als_params = {
        "factors": trial.suggest_int("n_factors", low=32, high=128, step=32),
        "regularization": trial.suggest_float("regularization", low=0.01, high=0.51, step=0.1),
        "random_state": RANDOM_STATE,
    }
    wrapped_als = ImplicitALSWrapperModel(
        model=AlternatingLeastSquares(**als_params),
        fit_features_together=True,
    )
    wrapped_als.fit(dataset)

    recommend_kwargs = {"users": TEST_USERS, "dataset": dataset, "k": K_RECOS, "filter_viewed": True}
    recos = wrapped_als.recommend(**recommend_kwargs)

    scores = calc_metrics(metrics, recos, test, train)
    return tuple(scores[name] for name in ("map@10", "precision@10", "recall@10"))

In [None]:
study = optuna.create_study(directions=["maximize", "maximize", "maximize"])
study.optimize(objective_ImplicitALSWrapperModel, n_trials=20)

[I 2023-12-10 14:07:11,122] A new study created in memory with name: no-name-5ef410c4-c98c-44f5-af74-48fc2ccd4cdc
[I 2023-12-10 14:08:45,079] Trial 0 finished with values: [0.07137507308676502, 0.029929985749311806, 0.1418593446154553] and parameters: {'n_factors': 96, 'regularization': 0.51}. 
[I 2023-12-10 14:10:00,933] Trial 1 finished with values: [0.07471662487622824, 0.030955858272037673, 0.14605920896136587] and parameters: {'n_factors': 32, 'regularization': 0.01}. 
[I 2023-12-10 14:11:24,416] Trial 2 finished with values: [0.07175445965761339, 0.030134452145126882, 0.14252391120828675] and parameters: {'n_factors': 64, 'regularization': 0.51}. 
[I 2023-12-10 14:12:57,050] Trial 3 finished with values: [0.07120716328543755, 0.030253945493330502, 0.14300302465062698] and parameters: {'n_factors': 64, 'regularization': 0.51}. 
[I 2023-12-10 14:14:43,291] Trial 4 finished with values: [0.07144256566219208, 0.029839701886224635, 0.1412178228769466] and parameters: {'n_factors': 128

In [None]:
# Print the metrics and parameters of every best trial of the study.
for best in study.best_trials:
    map10, precision10, recall10 = (round(value, 3) for value in best.values[:3])
    print(f"Лучшие метрики: 'map@10': {map10}, 'precision@10': {precision10}, 'recall@10': {recall10}")
    print(f"Лучшие параметры: {best.params}")

Лучшие метрики: 'map@10': 0.075, 'precision@10': 0.031, 'recall@10': 0.148
Лучшие параметры: {'n_factors': 32, 'regularization': 0.01}
Лучшие метрики: 'map@10': 0.074, 'precision@10': 0.032, 'recall@10': 0.15
Лучшие параметры: {'n_factors': 32, 'regularization': 0.51}
Лучшие метрики: 'map@10': 0.074, 'precision@10': 0.031, 'recall@10': 0.149
Лучшие параметры: {'n_factors': 64, 'regularization': 0.11}


### LightFMWrapperModel

In [None]:
def objective_LightFMWrapperModel(trial):
    """Optuna objective for LightFM: fit on ``dataset`` and score on the hold-out.

    Samples ``n_factors`` (16..64, step 16), ``loss`` (logistic or warp),
    ``lr`` (1e-3..1e-1, log scale), ``item_alpha`` and ``user_alpha`` (0..1),
    trains a ``LightFMWrapperModel`` for ``N_EPOCHS`` epochs on 2 threads,
    recommends ``K_RECOS`` unseen items for ``TEST_USERS`` and passes the
    result with ``test`` and ``train`` to ``calc_metrics``.

    Relies on the module-level ``dataset``, ``TEST_USERS``, ``K_RECOS``,
    ``RANDOM_STATE``, ``N_EPOCHS``, ``metrics``, ``test`` and ``train``.
    NOTE: ``metrics`` is rebuilt later in the notebook with differently named
    keys (e.g. ``MAP@10``), so this objective has to run before that cell.

    Returns:
        Tuple ``(map@10, precision@10, recall@10)``.
    """
    # Sampling order is kept: n_factors, loss, lr, item_alpha, user_alpha.
    no_components = trial.suggest_int("n_factors", low=16, high=64, step=16)
    loss_name = trial.suggest_categorical("loss", choices=['logistic', 'warp'])
    learning_rate = trial.suggest_float("lr", 1e-3, 1e-1, log=True)
    item_reg = trial.suggest_float("item_alpha", 0, 1)
    user_reg = trial.suggest_float("user_alpha", 0, 1)

    lightfm_core = LightFM(
        no_components=no_components,
        loss=loss_name,
        random_state=RANDOM_STATE,
        learning_rate=learning_rate,
        user_alpha=user_reg,
        item_alpha=item_reg,
    )
    wrapped_lightfm = LightFMWrapperModel(model=lightfm_core, epochs=N_EPOCHS, num_threads=2)
    wrapped_lightfm.fit(dataset)

    recommend_kwargs = {"users": TEST_USERS, "dataset": dataset, "k": K_RECOS, "filter_viewed": True}
    recos = wrapped_lightfm.recommend(**recommend_kwargs)

    scores = calc_metrics(metrics, recos, test, train)
    return tuple(scores[name] for name in ("map@10", "precision@10", "recall@10"))

In [None]:
study = optuna.create_study(directions=["maximize", "maximize", "maximize"])
study.optimize(objective_LightFMWrapperModel, n_trials=20)

[I 2023-12-11 15:10:21,895] A new study created in memory with name: no-name-f7318147-e627-402f-9683-f72a00321459
[I 2023-12-11 15:11:08,959] Trial 0 finished with values: [0.07083245185148171, 0.03158164936225958, 0.1489910176822698] and parameters: {'n_factors': 16, 'loss': 'warp', 'lr': 0.047056730749378564, 'item_alpha': 0.5319631978329716, 'user_alpha': 0.7306057571768695}. 
[I 2023-12-11 15:11:59,769] Trial 1 finished with values: [0.00040980252214975173, 0.00035936518052346933, 0.0011236202314321225] and parameters: {'n_factors': 48, 'loss': 'logistic', 'lr': 0.0016031485500636901, 'item_alpha': 0.6666520284934133, 'user_alpha': 0.7645103609090356}. 
[I 2023-12-11 15:13:31,930] Trial 2 finished with values: [0.073977362094152, 0.03310673853970278, 0.15843010652208717] and parameters: {'n_factors': 64, 'loss': 'warp', 'lr': 0.009062298232854862, 'item_alpha': 0.8764324952261785, 'user_alpha': 0.8847573526935165}. 
[I 2023-12-11 15:17:12,563] Trial 3 finished with values: [0.00020

In [None]:
# Print the metrics and parameters of every best trial of the study.
for best in study.best_trials:
    map10, precision10, recall10 = (round(value, 3) for value in best.values[:3])
    print(f"Лучшие метрики: 'map@10': {map10}, 'precision@10': {precision10}, 'recall@10': {recall10}")
    print(f"Лучшие параметры: {best.params}")

Лучшие метрики: 'map@10': 0.078, 'precision@10': 0.036, 'recall@10': 0.172
Лучшие параметры: {'n_factors': 64, 'loss': 'warp', 'lr': 0.013681895729046522, 'item_alpha': 0.5801959801737767, 'user_alpha': 0.4233951257813444}


## Кросс-валидация моделей с лучшими параметрами

In [None]:
# Models to cross-validate, configured with hyperparameters taken from the
# Optuna best-trial printouts.
als_candidate = ImplicitALSWrapperModel(
    model=AlternatingLeastSquares(factors=32, regularization=0.01, random_state=RANDOM_STATE),
    fit_features_together=True,
)
lightfm_candidate = LightFMWrapperModel(
    LightFM(
        no_components=64,
        loss="warp",
        random_state=RANDOM_STATE,
        learning_rate=0.013681895729046522,
        user_alpha=0.4233951257813444,
        item_alpha=0.5801959801737767,
    ),
    epochs=N_EPOCHS,
    num_threads=2,
)
models = {"ALS": als_candidate, "LightFM": lightfm_candidate}

In [None]:
# Metrics: every metric class below is instantiated at k = 1, 5 and 10 and
# stored under the key "<name>@<k>".
metrics_name = {
    "precision": Precision,
    "recall": Recall,
    "MAP": MAP,
    "novelty": MeanInvUserFreq,
    "serendipity": Serendipity,
}

metrics = {
    f"{metric_name}@{k}": metric(k=k)
    for metric_name, metric in metrics_name.items()
    for k in [1, 5, 10]
}

metrics

{'precision@1': Precision(k=1),
 'precision@5': Precision(k=5),
 'precision@10': Precision(k=10),
 'recall@1': Recall(k=1),
 'recall@5': Recall(k=5),
 'recall@10': Recall(k=10),
 'MAP@1': MAP(k=1, divide_by_k=False),
 'MAP@5': MAP(k=5, divide_by_k=False),
 'MAP@10': MAP(k=10, divide_by_k=False),
 'novelty@1': MeanInvUserFreq(k=1),
 'novelty@5': MeanInvUserFreq(k=5),
 'novelty@10': MeanInvUserFreq(k=10),
 'serendipity@1': Serendipity(k=1),
 'serendipity@5': Serendipity(k=5),
 'serendipity@10': Serendipity(k=10)}

In [None]:
# Split the data into folds, holding out 2 weeks for validation in each fold.
TEST_SIZE = "14D"
N_SPLITS = 5

# All three filters are switched on: already-seen pairs, cold items, cold users.
fold_filters = dict.fromkeys(
    ("filter_already_seen", "filter_cold_items", "filter_cold_users"), True
)
splitter = TimeRangeSplitter(test_size=TEST_SIZE, n_splits=N_SPLITS, **fold_filters)

splitter.get_test_fold_borders(dataset.interactions)

[(Timestamp('2021-06-06 00:00:00', freq='14D'),
  Timestamp('2021-06-20 00:00:00', freq='14D')),
 (Timestamp('2021-06-20 00:00:00', freq='14D'),
  Timestamp('2021-07-04 00:00:00', freq='14D')),
 (Timestamp('2021-07-04 00:00:00', freq='14D'),
  Timestamp('2021-07-18 00:00:00', freq='14D')),
 (Timestamp('2021-07-18 00:00:00', freq='14D'),
  Timestamp('2021-08-01 00:00:00', freq='14D')),
 (Timestamp('2021-08-01 00:00:00', freq='14D'),
  Timestamp('2021-08-15 00:00:00', freq='14D'))]

In [None]:
%%time

results = cross_validate(dataset, splitter, metrics, models, k=10, filter_viewed=True)

CPU times: user 10min 59s, sys: 1min 55s, total: 12min 55s
Wall time: 9min 8s


In [None]:
# Average the metrics over the folds: group the per-fold metric rows by model
# (keeping the models' original order), take the mean of every column and
# drop the fold index column "i_split".
pivot_results = (
    pd.DataFrame.from_dict(results["metrics"]).groupby(["model"], sort=False).agg(["mean"]).drop("i_split", axis=1)
)
# Column labels are (metric, aggregation) pairs; select the "mean" ones for styling.
mean_metric_subset = [(metric, agg) for metric, agg in pivot_results.columns if agg == 'mean']
# Per column (axis=0): the minimum is highlighted in light coral, the maximum
# in light green. The Styler is the cell's last expression, so it is displayed.
(
    pivot_results.style
    .highlight_min(subset=mean_metric_subset, color='lightcoral', axis=0)
    .highlight_max(subset=mean_metric_subset, color='lightgreen', axis=0)
)

Unnamed: 0_level_0,precision@1,recall@1,precision@5,recall@5,precision@10,recall@10,MAP@1,MAP@5,MAP@10,novelty@1,novelty@5,novelty@10,serendipity@1,serendipity@5,serendipity@10
Unnamed: 0_level_1,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean
model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2
ALS,0.11064,0.059418,0.065549,0.16406,0.045571,0.220702,0.059418,0.10008,0.109324,3.736479,4.548895,5.331623,4.5e-05,3.5e-05,3.7e-05
LightFM,0.089457,0.04387,0.068983,0.168781,0.049982,0.24539,0.04387,0.090271,0.103854,3.015844,4.033761,4.943282,6e-06,9e-06,2.4e-05


По большинству метрик лучшие результаты показала **модель ALS со следующими параметрами: {'n_factors': 32, 'regularization': 0.01}**

## Обучение модели LightFM на всем датасете

В экспериментах использовалась GPU-версия модели ALS из implicit (работа с ноутбуком велась в Google Colab), однако в сервисе возможности использовать GPU нет. Поскольку CPU-версия обучается слишком долго, в сервисе все же будем использовать LightFM.

In [None]:
MODELS_PATH = "/content/drive/MyDrive/recsys_course/models"
RECOS_PATH = "/content/drive/MyDrive/recsys_course/recommendations"

In [None]:
# подготовка признаков
user_features = get_users_features(users, interactions, ["sex", "age", "income"])
item_features = get_items_features(items, interactions)

In [None]:
display(user_features.head())
display(item_features.head())

Unnamed: 0,id,value,feature
0,973171,М,sex
1,962099,М,sex
3,721985,Ж,sex
4,704055,Ж,sex
5,1037719,М,sex


Unnamed: 0,id,value,feature
0,10711,драмы,genre
0,10711,зарубежные,genre
0,10711,детективы,genre
0,10711,мелодрамы,genre
1,2508,зарубежные,genre


In [None]:
dataset = Dataset.construct(
    interactions_df=interactions,
    user_features_df=user_features,
    cat_user_features=["sex", "age", "income"],
    item_features_df=item_features,
    cat_item_features=["genre", "content_type", "country", "binned_r_year"],
)

In [None]:
lightfm_model = LightFMWrapperModel(
    LightFM(
        no_components=64,
        loss="warp",
        random_state=RANDOM_STATE,
        learning_rate=0.013681895729046522,
        user_alpha=0.4233951257813444,
        item_alpha=0.5801959801737767,
    ),
    epochs=N_EPOCHS,
    num_threads=2,
)

In [None]:
%%time

lightfm_model.fit(dataset)

CPU times: user 3min 10s, sys: 746 ms, total: 3min 10s
Wall time: 2min 2s


<rectools.models.lightfm.LightFMWrapperModel at 0x798c893f2ec0>

In [None]:
# Persist the fitted model. The context manager guarantees the file is flushed
# and closed (a bare ``open(...)`` passed to ``pickle.dump`` is never closed).
with open(f"{MODELS_PATH}/lightfm_warp_64.pkl", "wb") as model_file:
    pickle.dump(lightfm_model, model_file)

In [None]:
# Sanity check: the model restored from disk still produces recommendations.
# NOTE: unpickling executes arbitrary code; only load files written by this notebook.
with open(f"{MODELS_PATH}/lightfm_warp_64.pkl", "rb") as model_file:
    pickled_model = pickle.load(model_file)

pickled_model.recommend(
    [864613],
    dataset=dataset,
    k=K_RECOS,
    filter_viewed=True
)

Unnamed: 0,user_id,item_id,score,rank
0,864613,10440,-0.001715,1
1,864613,13865,-0.001718,2
2,864613,9728,-0.001719,3
3,864613,3734,-0.001724,4
4,864613,2657,-0.001729,5
5,864613,8636,-0.001729,6
6,864613,7829,-0.001731,7
7,864613,14431,-0.001731,8
8,864613,9996,-0.001732,9
9,864613,11237,-0.001732,10


### Получим рекомендации для всех пользователей (оффлайн вариант)

In [None]:
ALL_USERS = interactions[Columns.User].unique()

In [None]:
%%time

# Offline batch: K_RECOS recommendations per known user (the shared constant
# replaces a hard-coded 10 so this stays in sync with the other recommend calls).
recos = pickled_model.recommend(
    users=ALL_USERS,
    dataset=dataset,
    k=K_RECOS,
    filter_viewed=True,
)
# Collapse to {user_id: [item_id, ...]}; groupby keeps the row order of `recos`
# inside every group (presumably rank order — verify against the model output).
lightfm_recos = recos.groupby("user_id")["item_id"].agg(list).to_dict()

CPU times: user 5min 58s, sys: 1min 48s, total: 7min 46s
Wall time: 5min 33s


In [None]:
# сохраним рекомендации
with open(f"{RECOS_PATH}/lightfm_recos.json", "w") as f:
    json.dump(lightfm_recos, f)

## Добавим ANN

Используем UserToItemAnnRecommender из rectools

In [None]:
%%time

user_vectors, item_vectors = pickled_model.get_vectors(dataset)
ann_lightfm = UserToItemAnnRecommender(
    user_vectors=user_vectors,
    item_vectors=item_vectors,
    user_id_map=dataset.user_id_map,
    item_id_map=dataset.item_id_map,
)
ann_lightfm.fit()

CPU times: user 28min 54s, sys: 2.03 s, total: 28min 56s
Wall time: 18min 23s


<rectools.tools.ann.UserToItemAnnRecommender at 0x7969e0586fe0>

In [18]:
# Sanity check: the fitted ANN recommender returns items for a known user.
# NOTE(review): compared with the exact LightFM output for the same user
# (recommend(..., filter_viewed=True)), this list additionally contains items
# 15297 and 4151 while the remaining items keep their order — presumably the
# ANN lookup does not filter already-watched items. Confirm, and filter viewed
# items before serving these results.
ann_lightfm.get_item_list_for_user(864613, top_n=10).tolist()

[15297, 10440, 13865, 9728, 3734, 4151, 2657, 8636, 7829, 14431]

In [23]:
# Persist the ANN recommender. The context manager guarantees the file is
# flushed and closed (a bare ``open(...)`` passed to ``pickle.dump`` is never closed).
with open(f"{MODELS_PATH}/ann_lightfm_warp_64.pkl", "wb") as index_file:
    pickle.dump(ann_lightfm, index_file)

In [24]:
# Sanity check: the recommender restored from disk still returns items.
# NOTE: unpickling executes arbitrary code; only load files written by this notebook.
with open(f"{MODELS_PATH}/ann_lightfm_warp_64.pkl", "rb") as index_file:
    ann_lightfm_load = pickle.load(index_file)

ann_lightfm_load.get_item_list_for_user(864613, top_n=10).tolist()

[15297, 10440, 13865, 9728, 3734, 4151, 2657, 8636, 7829, 14431]