In [None]:
!pip install rectools
!pip install lightfm
!pip install optuna

In [60]:
import os

In [None]:
import requests
import pandas as pd 
import numpy as np

from rectools.metrics import Precision, Recall, MAP, calc_metrics
from rectools import Columns
from rectools.dataset import Dataset
from rectools.models import LightFMWrapperModel

import matplotlib.pyplot as plt

import typing as tp
from tqdm import tqdm

from lightfm import LightFM

import optuna

In [None]:
# Download the dataset archive in 1 MiB chunks, reporting progress with tqdm.
url = "https://storage.yandexcloud.net/itmo-recsys-public-data/kion_train.zip"

# The context manager releases the HTTP connection even if the download fails midway.
with requests.get(url, stream=True) as req:
    # Fail fast on an HTTP error instead of saving an error page as the archive.
    req.raise_for_status()
    total_size_in_bytes = int(req.headers.get('Content-Length', 0))

    # The bar is closed on exit from `with`; the unclosed bar appears to be what
    # leaked "kion dataset download" lines into the outputs of later cells.
    with open('kion_train.zip', "wb") as fd, tqdm(
        desc='kion dataset download',
        total=total_size_in_bytes,
        unit='iB',
        unit_scale=True,
    ) as progress_bar:
        for chunk in req.iter_content(chunk_size=2 ** 20):
            fd.write(chunk)
            progress_bar.update(len(chunk))


kion dataset download: 100%|█████████▉| 78.6M/78.8M [00:05<00:00, 17.5MiB/s]

In [None]:
!unzip kion_train.zip

Archive:  kion_train.zip
replace kion_train/interactions.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [None]:
# Read the three CSV tables extracted from the archive.
interactions, users, items = map(
    pd.read_csv,
    (
        'kion_train/interactions.csv',
        'kion_train/users.csv',
        'kion_train/items.csv',
    ),
)

## Normalization

In [None]:
# Encode the `sex` column as numbers; values absent from the mapping become NaN.
# NOTE: re-running this cell on already-encoded data turns the whole column into NaN.
sex_codes = {'Ж': 1, 'М': 0}
users['sex'] = users['sex'].map(sex_codes)

In [None]:
# Show how many distinct users and items occur in the interactions table.
print('Кол-во уникальных пользователей и айтемов в интеракциях')
user_item_columns = interactions[Columns.UserItem]
user_item_columns.nunique()

Кол-во уникальных пользователей и айтемов в интеракциях


user_id    962179
item_id     15706
dtype: int64

In [None]:
# Override the datetime column name on rectools' `Columns` so that every later
# `Columns.Datetime` lookup refers to the dataset's 'last_watch_dt' column.
# NOTE(review): this mutates a library class attribute for the whole kernel session;
# presumably rectools reads `Columns.Datetime` at call time — confirm for the installed version.
Columns.Datetime = 'last_watch_dt'

In [None]:
# Remove (in place) every row whose date string is not exactly 10 characters long,
# i.e. does not have the length of a 'YYYY-MM-DD' value.
has_bad_date = interactions[Columns.Datetime].str.len() != 10
bad_date_rows = interactions.loc[has_bad_date]
interactions.drop(index=bad_date_rows.index, inplace=True)

## Train/test split

In [None]:
# Parse the '%Y-%m-%d' date strings into pandas datetimes.
raw_dates = interactions[Columns.Datetime]
interactions[Columns.Datetime] = pd.to_datetime(raw_dates, format='%Y-%m-%d')

In [None]:
# Latest watch date in the data; the train/test boundary is computed relative to it.
max_date = interactions[Columns.Datetime].max()

In [None]:
# Interaction weight: 3 when `watched_pct` is above 10, otherwise 1.
watched_enough = interactions['watched_pct'] > 10
interactions[Columns.Weight] = np.where(watched_enough, 3, 1)

In [None]:
# Time-based split: rows dated within 7 days of the latest date (boundary included)
# form the test set, everything earlier is the training set.
split_date = max_date - pd.Timedelta(days=7)
watch_dates = interactions[Columns.Datetime]

train = interactions.loc[watch_dates < split_date].copy()
test = interactions.loc[watch_dates >= split_date].copy()

for split_name, split_frame in (("train", train), ("test", test)):
    print(f"{split_name}: {split_frame.shape}")

train: (4985269, 6)
test: (490982, 6)


In [None]:
# Remove (in place) the training rows whose `total_dur` is below 300.
short_views = train.query("total_dur < 300")
train.drop(index=short_views.index, inplace=True)

In [None]:
# Cold users: present in test but absent from train; they are filtered out of the test set.
cold_users = set(test[Columns.User]).difference(train[Columns.User])

In [None]:
# Remove (in place) the test rows that belong to cold users.
cold_user_rows = test.loc[test[Columns.User].isin(cold_users)]
test.drop(index=cold_user_rows.index, inplace=True)

# MODELS

In [None]:
K_RECOS = 10  # recommendations requested per user; also the k of every metric
RANDOM_STATE = 42  # seed passed to LightFM as random_state
NUM_THREADS = 16  # passed to LightFMWrapperModel as num_threads
N_EPOCHS = 10  # passed to LightFMWrapperModel as epochs

N_TRIALS = 2  # number of Optuna trials per study (hyperparameter tuning iterations)

In [None]:
# Build the rectools dataset from the training interactions only.
dataset = Dataset.construct(interactions_df=train)

Были взяты три модели на основе LightFM; для каждой подбирались гиперпараметры: learning rate, количество компонент и функция потерь. Каждая модель оптимизировалась под свою метрику: "MAP@10", "Precision@10" и "Recall@10" соответственно.

Ниже представлена модель, для которой гиперпараметры подбирались для максимизации метрики Mean Average Precision (при k=10):

In [None]:
# Accumulates the metric dict of each trial whose objective appends to it
# (keys look like 'MAP@10'); the reporting cells search this list for the best trial.
metric_results: tp.List[tp.Dict[str, float]] = []

In [None]:
def objective_MAP10(trial, dataset):
    """Optuna objective: fit LightFM with sampled hyperparameters and return MAP@K_RECOS.

    Relies on notebook globals: the ``train`` and ``test`` interaction frames,
    ``K_RECOS``, ``N_EPOCHS``, ``NUM_THREADS``, ``RANDOM_STATE`` and the
    ``metric_results`` list, to which the metric dict of every trial is appended.

    Args:
        trial: Optuna trial used to sample ``learning_rate``, ``no_components`` and ``loss``.
        dataset: rectools ``Dataset`` the model is fitted on and recommends from.

    Returns:
        The MAP@K_RECOS value computed for the users of ``test``.
    """
    param_grid = {
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.4),
        "no_components": trial.suggest_int("no_components", 4, 32, step=4),
        "loss": trial.suggest_categorical('loss', ['logistic', 'bpr', 'warp']),
    }

    metrics_name = {
        'Precision': Precision,
        'Recall': Recall,
        'MAP': MAP,
    }
    # Metric keys are derived from K_RECOS, e.g. 'MAP@10' when K_RECOS == 10.
    metrics = {
        f'{metric_name}@{K_RECOS}': metric(k=K_RECOS)
        for metric_name, metric in metrics_name.items()
    }

    LightFM_model = LightFMWrapperModel(
        LightFM(
            **param_grid,
            random_state=RANDOM_STATE,
        ),
        epochs=N_EPOCHS,
        num_threads=NUM_THREADS,
    )
    LightFM_model.fit(dataset)

    recos = LightFM_model.recommend(
        users=test[Columns.User].unique(),
        dataset=dataset,
        k=K_RECOS,
        filter_viewed=True,
    )
    metric_values = calc_metrics(metrics, recos, test, train)
    # Keep all metrics of the trial so the reporting cell can show the non-optimised ones.
    metric_results.append(metric_values)
    # Build the key the same way as above; a literal 'MAP@10' breaks when K_RECOS changes.
    return metric_values[f'MAP@{K_RECOS}']
  

In [None]:
# Run the Optuna study that maximises MAP@10 over N_TRIALS trials.
def func(trial):
    """Bind the notebook-level `dataset` to the MAP objective."""
    return objective_MAP10(trial, dataset)


study_map = optuna.create_study(study_name="LightFM_MAP", direction="maximize")
study_map.optimize(func, show_progress_bar=True, n_trials=N_TRIALS)

[32m[I 2022-12-06 15:27:53,024][0m A new study created in memory with name: LightFM_MAP[0m
  self._init_valid()


  0%|          | 0/2 [00:00<?, ?it/s]

kion dataset download: 100%|██████████| 78.8M/78.8M [14:21<00:00, 17.5MiB/s]

[32m[I 2022-12-06 15:30:59,526][0m Trial 0 finished with value: 0.02298440843986701 and parameters: {'learning_rate': 0.20112030591211583, 'no_components': 32, 'loss': 'warp'}. Best is trial 0 with value: 0.02298440843986701.[0m


kion dataset download: 100%|██████████| 78.8M/78.8M [16:45<00:00, 17.5MiB/s]

[32m[I 2022-12-06 15:33:23,241][0m Trial 1 finished with value: 2.2130234387561793e-06 and parameters: {'learning_rate': 0.3911756952938731, 'no_components': 8, 'loss': 'warp'}. Best is trial 0 with value: 0.02298440843986701.[0m


Ниже представлена модель, для которой гиперпараметры подбирались для максимизации метрики Precision (при k=10):

In [None]:
def objective_Precision10(trial, dataset):
    """Optuna objective: fit LightFM with sampled hyperparameters and return Precision@K_RECOS.

    Relies on notebook globals: the ``train`` and ``test`` interaction frames,
    ``K_RECOS``, ``N_EPOCHS``, ``NUM_THREADS``, ``RANDOM_STATE`` and the
    ``metric_results`` list, to which the metric dict of every trial is appended.

    Args:
        trial: Optuna trial used to sample ``learning_rate``, ``no_components`` and ``loss``.
        dataset: rectools ``Dataset`` the model is fitted on and recommends from.

    Returns:
        The Precision@K_RECOS value computed for the users of ``test``.
    """
    param_grid = {
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.4),
        "no_components": trial.suggest_int("no_components", 4, 32, step=4),
        "loss": trial.suggest_categorical('loss', ['logistic', 'bpr', 'warp']),
    }

    metrics_name = {
        'Precision': Precision,
        'Recall': Recall,
        'MAP': MAP,
    }
    # Metric keys are derived from K_RECOS, e.g. 'Precision@10' when K_RECOS == 10.
    metrics = {
        f'{metric_name}@{K_RECOS}': metric(k=K_RECOS)
        for metric_name, metric in metrics_name.items()
    }

    LightFM_model = LightFMWrapperModel(
        LightFM(
            **param_grid,
            random_state=RANDOM_STATE,
        ),
        epochs=N_EPOCHS,
        num_threads=NUM_THREADS,
    )
    LightFM_model.fit(dataset)

    recos = LightFM_model.recommend(
        users=test[Columns.User].unique(),
        dataset=dataset,
        k=K_RECOS,
        filter_viewed=True,
    )
    metric_values = calc_metrics(metrics, recos, test, train)
    # Record all metrics of the trial (as objective_MAP10 does); without this the
    # reporting cell cannot find the best trial's MAP and Recall in metric_results.
    metric_results.append(metric_values)
    # Build the key the same way as above; a literal 'Precision@10' breaks when K_RECOS changes.
    return metric_values[f'Precision@{K_RECOS}']
  

In [None]:
# Run the Optuna study that maximises Precision@10 over N_TRIALS trials.
def func(trial):
    """Bind the notebook-level `dataset` to the Precision objective."""
    return objective_Precision10(trial, dataset)


study_precision = optuna.create_study(study_name="LightFM_Precision", direction="maximize")
study_precision.optimize(func, show_progress_bar=True, n_trials=N_TRIALS)

[32m[I 2022-12-06 14:45:23,162][0m A new study created in memory with name: LightFM_Precision[0m


  0%|          | 0/2 [00:00<?, ?it/s]

kion dataset download: 100%|██████████| 78.8M/78.8M [32:03<00:00, 16.9MiB/s]

[32m[I 2022-12-06 14:47:47,573][0m Trial 0 finished with value: 0.008353027607389115 and parameters: {'learning_rate': 0.18159034398507715, 'no_components': 16, 'loss': 'warp'}. Best is trial 0 with value: 0.008353027607389115.[0m


kion dataset download: 100%|██████████| 78.8M/78.8M [34:46<00:00, 16.9MiB/s]

[32m[I 2022-12-06 14:50:30,079][0m Trial 1 finished with value: 0.009028386308717705 and parameters: {'learning_rate': 0.2802825757103522, 'no_components': 24, 'loss': 'bpr'}. Best is trial 1 with value: 0.009028386308717705.[0m


Ниже представлена модель, для которой гиперпараметры подбирались для максимизации метрики Recall (при k=10):

In [None]:
def objective_Recall10(trial, dataset):
    """Optuna objective: fit LightFM with sampled hyperparameters and return Recall@K_RECOS.

    Relies on notebook globals: the ``train`` and ``test`` interaction frames,
    ``K_RECOS``, ``N_EPOCHS``, ``NUM_THREADS``, ``RANDOM_STATE`` and the
    ``metric_results`` list, to which the metric dict of every trial is appended.

    Args:
        trial: Optuna trial used to sample ``learning_rate``, ``no_components`` and ``loss``.
        dataset: rectools ``Dataset`` the model is fitted on and recommends from.

    Returns:
        The Recall@K_RECOS value computed for the users of ``test``.
    """
    param_grid = {
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.4),
        "no_components": trial.suggest_int("no_components", 4, 32, step=4),
        "loss": trial.suggest_categorical('loss', ['logistic', 'bpr', 'warp']),
    }

    metrics_name = {
        'Precision': Precision,
        'Recall': Recall,
        'MAP': MAP,
    }
    # Metric keys are derived from K_RECOS, e.g. 'Recall@10' when K_RECOS == 10.
    metrics = {
        f'{metric_name}@{K_RECOS}': metric(k=K_RECOS)
        for metric_name, metric in metrics_name.items()
    }

    LightFM_model = LightFMWrapperModel(
        LightFM(
            **param_grid,
            random_state=RANDOM_STATE,
        ),
        epochs=N_EPOCHS,
        num_threads=NUM_THREADS,
    )
    LightFM_model.fit(dataset)

    recos = LightFM_model.recommend(
        users=test[Columns.User].unique(),
        dataset=dataset,
        k=K_RECOS,
        filter_viewed=True,
    )
    metric_values = calc_metrics(metrics, recos, test, train)
    # Record all metrics of the trial (as objective_MAP10 does); without this the
    # reporting cell cannot find the best trial's MAP and Precision in metric_results.
    metric_results.append(metric_values)
    # Build the key the same way as above; a literal 'Recall@10' breaks when K_RECOS changes.
    return metric_values[f'Recall@{K_RECOS}']
  

In [None]:
# Run the Optuna study that maximises Recall@10 over N_TRIALS trials.
def func(trial):
    """Bind the notebook-level `dataset` to the Recall objective."""
    return objective_Recall10(trial, dataset)


study_recall = optuna.create_study(study_name="LightFM_Recall", direction="maximize")
study_recall.optimize(func, show_progress_bar=True, n_trials=N_TRIALS)

[32m[I 2022-12-06 14:50:54,240][0m A new study created in memory with name: LightFM_Recall[0m
  self._init_valid()


  0%|          | 0/2 [00:00<?, ?it/s]

kion dataset download: 100%|██████████| 78.8M/78.8M [37:25<00:00, 16.9MiB/s]

[32m[I 2022-12-06 14:53:09,502][0m Trial 0 finished with value: 0.05311648294380387 and parameters: {'learning_rate': 0.17619995265013697, 'no_components': 8, 'loss': 'bpr'}. Best is trial 0 with value: 0.05311648294380387.[0m


kion dataset download: 100%|██████████| 78.8M/78.8M [39:58<00:00, 16.9MiB/s]

[32m[I 2022-12-06 14:55:42,695][0m Trial 1 finished with value: 0.16019814204505362 and parameters: {'learning_rate': 0.06372281384932382, 'no_components': 24, 'loss': 'warp'}. Best is trial 1 with value: 0.16019814204505362.[0m


Вывод лучших значений метрик и соответствующих параметров:

In [None]:
# Report the best MAP@10 trial: its value, its hyperparameters and the other
# two metrics recorded for the same trial in `metric_results`.
print(f"\tBest value (MAP@10): {study_map.best_value:.5f}")
print("\tBest params (MAP@10):")

for key, value in study_map.best_params.items():
    print(f"\t\t{key}: {value}")

print("\tPrecision@10 in best result:")
# The recorded trial is identified by having exactly the study's best MAP@10 value.
matching_results = [
    result for result in metric_results
    if result['MAP@10'] == study_map.best_value
]
best_result = matching_results[0]
print(f"\t\t{best_result['Precision@10']}")
print("\tRecall@10 in best result:")
print(f"\t\t{best_result['Recall@10']}")

	Best value (MAP@10): 0.02298
	Best params (MAP@10):
		learning_rate: 0.20112030591211583
		no_components: 32
		loss: warp
	Precision@10 in best result:
		0.008006939465554936
	Recall@10 in best result:
		0.04558223359423488


In [None]:
# Report the best Precision@10 trial: its value, its hyperparameters and, when
# available, the other two metrics recorded for the same trial in `metric_results`.
print(f"\tBest value (Precision@10): {study_precision.best_value:.5f}")
print("\tBest params (Precision@10):")

for key, value in study_precision.best_params.items():
    print(f"\t\t{key}: {value}")

# `metric_results` only contains trials whose objective appended to it, so the
# lookup may find nothing; report that instead of failing with an IndexError.
best_result = next(
    (
        result for result in metric_results
        if result['Precision@10'] == study_precision.best_value
    ),
    None,
)
if best_result is None:
    print("\tNo entry in metric_results matches the best trial; other metrics are unavailable.")
else:
    print("\tMAP@10 in best result:")
    print(f"\t\t{best_result['MAP@10']}")
    print("\tRecall@10 in best result:")
    print(f"\t\t{best_result['Recall@10']}")

NameError: ignored

In [None]:
# Report the best Recall@10 trial: its value, its hyperparameters and, when
# available, the other two metrics recorded for the same trial in `metric_results`.
print(f"\tBest value (Recall@10): {study_recall.best_value:.5f}")
print("\tBest params (Recall@10):")

# The parameters come from the recall study's `best_params`
# (previously this read a nonexistent `best_recall` attribute of the precision study).
for key, value in study_recall.best_params.items():
    print(f"\t\t{key}: {value}")

# `metric_results` only contains trials whose objective appended to it, so the
# lookup may find nothing; report that instead of failing with an IndexError.
best_result = next(
    (
        result for result in metric_results
        if result['Recall@10'] == study_recall.best_value
    ),
    None,
)
if best_result is None:
    print("\tNo entry in metric_results matches the best trial; other metrics are unavailable.")
else:
    print("\tMAP@10 in best result:")
    print(f"\t\t{best_result['MAP@10']}")
    print("\tPrecision@10 in best result:")
    print(f"\t\t{best_result['Precision@10']}")

На графике гиперпараметры отсортированы по значимости (для модели, у которой гиперпараметры подбирались по **MAP@10**):

In [None]:
# Plot hyperparameter importances for the MAP@10 study and render the figure.
fig = optuna.visualization.plot_param_importances(study_map)
fig.show()

AttributeError: ignored

На графике гиперпараметры отсортированы по значимости (для модели, у которой гиперпараметры подбирались по **Precision@10**):

In [None]:
# Plot hyperparameter importances for the Precision@10 study and render the figure.
fig = optuna.visualization.plot_param_importances(study_precision)
fig.show()

На графике гиперпараметры отсортированы по значимости (для модели, у которой гиперпараметры подбирались по **Recall@10**):

In [None]:
# Plot hyperparameter importances for the Recall@10 study and render the figure.
fig = optuna.visualization.plot_param_importances(study_recall)
fig.show()