In [None]:
!pip install rectools
!pip install lightfm
!pip install optuna

In [90]:
import os

In [91]:
import requests
import pandas as pd 
import numpy as np

from rectools.metrics import Precision, Recall, MAP, calc_metrics
from rectools import Columns
from rectools.dataset import Dataset
from rectools.models import LightFMWrapperModel

import matplotlib.pyplot as plt

import typing as tp
from tqdm import tqdm

from lightfm import LightFM

import optuna

In [92]:
# download dataset by chunks
url = "https://storage.yandexcloud.net/itmo-recsys-public-data/kion_train.zip"

req = requests.get(url, stream=True)

with open('kion_train.zip', "wb") as fd:
    total_size_in_bytes = int(req.headers.get('Content-Length', 0))
    progress_bar = tqdm(desc='kion dataset download', total=total_size_in_bytes, unit='iB', unit_scale=True)
    for chunk in req.iter_content(chunk_size=2 ** 20):
        progress_bar.update(len(chunk))
        fd.write(chunk)


kion dataset download: 100%|██████████| 78.8M/78.8M [42:39<00:00, 30.8kiB/s]
kion dataset download:  98%|█████████▊| 77.6M/78.8M [00:05<00:00, 16.4MiB/s]

In [None]:
!unzip kion_train.zip

In [93]:
interactions = pd.read_csv('kion_train/interactions.csv')
users = pd.read_csv('kion_train/users.csv')
items = pd.read_csv('kion_train/items.csv')

## Normalization

In [96]:
users['sex'] = users['sex'].map({'Ж': 1, 'М': 0})

In [99]:
Columns.Datetime = 'last_watch_dt'

In [100]:
interactions.drop(interactions[interactions[Columns.Datetime].str.len() != 10].index, inplace=True)

## Train/test split

In [101]:
interactions[Columns.Datetime] = pd.to_datetime(interactions[Columns.Datetime], format='%Y-%m-%d')

In [102]:
max_date = interactions[Columns.Datetime].max()

In [103]:
interactions[Columns.Weight] = np.where(interactions['watched_pct'] > 10, 3, 1)

In [104]:
train = interactions[interactions[Columns.Datetime] < max_date - pd.Timedelta(days=7)].copy()
test = interactions[interactions[Columns.Datetime] >= max_date - pd.Timedelta(days=7)].copy()

print(f"train: {train.shape}")
print(f"test: {test.shape}")

train: (4985269, 6)
test: (490982, 6)


In [105]:
train.drop(train.query("total_dur < 300").index, inplace=True)

In [106]:
# отфильтруем холодных пользователей из теста
cold_users = set(test[Columns.User]) - set(train[Columns.User])

In [107]:
test.drop(test[test[Columns.User].isin(cold_users)].index, inplace=True)

# MODELS

In [109]:
K_RECOS = 10
RANDOM_STATE = 42
NUM_THREADS = 16
N_EPOCHS = 10

N_TRIALS = 2 # number of iterations for optuna to tune hyperparameters 

In [110]:
dataset = Dataset.construct(
    interactions_df=train
)

Было взято 3 модели, созданные при помощи LightFM и подобраны гиперпараметры learning rate, количество компонентов и функция потерь. Каждая модель оптимизировалась под различные метрики: "MAP@10", "Precision@10", "Recall@10" соответственно.

Ниже представлена модель, для которой гиперпараметры подбирались для максимизации метрики Mean Average Precision (при k=10):

In [111]:
metric_results = []

In [112]:
def objective_MAP10(trial, dataset):
  param_grid = {
      "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.4),
      "no_components": trial.suggest_int("no_components", 4, 32, step=4),
      "loss": trial.suggest_categorical('loss', ['logistic', 'bpr', 'warp'])
  }

  metrics_name = {
    'Precision': Precision,
    'Recall': Recall,
    'MAP': MAP,
  }

  LightFM_model = LightFMWrapperModel(
      LightFM(
          **param_grid,
          random_state=RANDOM_STATE,
      ),
        epochs=N_EPOCHS,
        num_threads=NUM_THREADS,
  )
  
  metrics = {}

  for metric_name, metric in metrics_name.items():
      metrics[f'{metric_name}@{K_RECOS}'] = metric(k=K_RECOS)

  LightFM_model.fit(dataset)

  recos = LightFM_model.recommend(
        users=test[Columns.User].unique(),
        dataset=dataset,
        k=K_RECOS,
        filter_viewed=True,
  )
  metric_values = calc_metrics(metrics, recos, test, train)
  metric_results.append(metric_values)
  return metric_values['MAP@10']
  

In [113]:
study_map = optuna.create_study(direction = "maximize", study_name = "LightFM_MAP")  # Create a new study.
func = lambda trial: objective_MAP10(trial, dataset)
study_map.optimize(func, n_trials=N_TRIALS, show_progress_bar=True)

[32m[I 2022-12-06 16:39:40,283][0m A new study created in memory with name: LightFM_MAP[0m

Progress bar is experimental (supported from v1.2.0). The interface can change in the future.



  0%|          | 0/2 [00:00<?, ?it/s]

kion dataset download: 100%|██████████| 78.8M/78.8M [05:07<00:00, 16.4MiB/s]

[32m[I 2022-12-06 16:41:56,615][0m Trial 0 finished with value: 0.07457595806086346 and parameters: {'learning_rate': 0.37010448462119316, 'no_components': 24, 'loss': 'logistic'}. Best is trial 0 with value: 0.07457595806086346.[0m


kion dataset download: 100%|██████████| 78.8M/78.8M [07:27<00:00, 16.4MiB/s]

[32m[I 2022-12-06 16:44:16,719][0m Trial 1 finished with value: 0.07478512050655896 and parameters: {'learning_rate': 0.3525137729246125, 'no_components': 24, 'loss': 'logistic'}. Best is trial 1 with value: 0.07478512050655896.[0m


Ниже представлена модель, для которой гиперпараметры подбирались для максимизации метрики Precision (при k=10):

In [114]:
metric_results_pr = []

In [115]:
def objective_Precision10(trial, dataset):
  param_grid = {
      "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.4),
      "no_components": trial.suggest_int("no_components", 4, 32, step=4),
      "loss": trial.suggest_categorical('loss', ['logistic', 'bpr', 'warp'])
  }

  metrics_name = {
    'Precision': Precision,
    'Recall': Recall,
    'MAP': MAP,
  }

  LightFM_model = LightFMWrapperModel(
      LightFM(
          **param_grid,
          random_state=RANDOM_STATE,
      ),
        epochs=N_EPOCHS,
        num_threads=NUM_THREADS,
  )
  
  metrics = {}

  for metric_name, metric in metrics_name.items():
      metrics[f'{metric_name}@{K_RECOS}'] = metric(k=K_RECOS)

  LightFM_model.fit(dataset)

  recos = LightFM_model.recommend(
        users=test[Columns.User].unique(),
        dataset=dataset,
        k=K_RECOS,
        filter_viewed=True,
  )
  metric_values = calc_metrics(metrics, recos, test, train)
  metric_results_pr.append(metric_values)

  return metric_values['Precision@10']
  

In [116]:
study_precision = optuna.create_study(direction = "maximize", study_name = "LightFM_Precision")  # Create a new study.
func = lambda trial: objective_Precision10(trial, dataset)
study_precision.optimize(func, n_trials=N_TRIALS, show_progress_bar=True)

[32m[I 2022-12-06 16:44:16,771][0m A new study created in memory with name: LightFM_Precision[0m

Progress bar is experimental (supported from v1.2.0). The interface can change in the future.



  0%|          | 0/2 [00:00<?, ?it/s]

kion dataset download: 100%|██████████| 78.8M/78.8M [10:13<00:00, 16.4MiB/s]

[32m[I 2022-12-06 16:47:02,533][0m Trial 0 finished with value: 0.004499145843844322 and parameters: {'learning_rate': 0.2328436993091849, 'no_components': 24, 'loss': 'warp'}. Best is trial 0 with value: 0.004499145843844322.[0m


kion dataset download: 100%|██████████| 78.8M/78.8M [12:10<00:00, 16.4MiB/s]

[32m[I 2022-12-06 16:49:00,114][0m Trial 1 finished with value: 0.03178965630172513 and parameters: {'learning_rate': 0.11910412442201511, 'no_components': 16, 'loss': 'logistic'}. Best is trial 1 with value: 0.03178965630172513.[0m


Ниже представлена модель, для которой гиперпараметры подбирались для максимизации метрики Recall (при k=10):

In [123]:
metric_results_rec = []

In [126]:
def objective_Recall10(trial, dataset):
  param_grid = {
      "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.4),
      "no_components": trial.suggest_int("no_components", 4, 32, step = 4),
      "loss": trial.suggest_categorical('loss', ['logistic', 'bpr', 'warp'])
  }

  metrics_name = {
    'Precision': Precision,
    'Recall': Recall,
    'MAP': MAP,
  }

  LightFM_model = LightFMWrapperModel(
      LightFM(
          **param_grid,
          random_state=RANDOM_STATE,
      ),
        epochs=N_EPOCHS,
        num_threads=NUM_THREADS,
  )
  
  metrics = {}

  for metric_name, metric in metrics_name.items():
      metrics[f'{metric_name}@{K_RECOS}'] = metric(k=K_RECOS)

  LightFM_model.fit(dataset)

  recos = LightFM_model.recommend(
        users=test[Columns.User].unique(),
        dataset=dataset,
        k=K_RECOS,
        filter_viewed=True,
  )
  metric_values = calc_metrics(metrics, recos, test, train)
  metric_results_rec.append(metric_values)

  return metric_values['Recall@10']
  

In [127]:
study_recall = optuna.create_study(direction = "maximize", study_name = "LightFM_Recall")  # Create a new study.
func = lambda trial: objective_Recall10(trial, dataset)
study_recall.optimize(func, n_trials=N_TRIALS, show_progress_bar=True)

[32m[I 2022-12-06 16:52:30,456][0m A new study created in memory with name: LightFM_Recall[0m


  0%|          | 0/2 [00:00<?, ?it/s]

kion dataset download: 100%|██████████| 78.8M/78.8M [18:07<00:00, 16.4MiB/s]

[32m[I 2022-12-06 16:54:56,868][0m Trial 0 finished with value: 0.1594849059356297 and parameters: {'learning_rate': 0.07125343624855533, 'no_components': 20, 'loss': 'warp'}. Best is trial 0 with value: 0.1594849059356297.[0m


kion dataset download: 100%|██████████| 78.8M/78.8M [20:17<00:00, 16.4MiB/s]

[32m[I 2022-12-06 16:57:06,463][0m Trial 1 finished with value: 0.15976853999270144 and parameters: {'learning_rate': 0.06147140985707929, 'no_components': 16, 'loss': 'warp'}. Best is trial 1 with value: 0.15976853999270144.[0m


Вывод лучших значений метрик и соответствующих параметров:

In [128]:
print(f"\tBest value (MAP@10): {study_map.best_value:.5f}")
print(f"\tBest params (MAP@10):")

for key, value in study_map.best_params.items():
    print(f"\t\t{key}: {value}")

print(f"\tPrecision@10 in best result:")
best_result = list(filter(lambda best_result: best_result['MAP@10'] == study_map.best_value, metric_results))[0]
print(f"\t\t{best_result['Precision@10']}")
print(f"\tRecall@10 in best result:")
print(f"\t\t{best_result['Recall@10']}")

	Best value (MAP@10): 0.07479
	Best params (MAP@10):
		learning_rate: 0.3525137729246125
		no_components: 24
		loss: logistic
	Precision@10 in best result:
		0.031959602396948054
	Recall@10 in best result:
		0.15125232116485018


In [129]:
print(f"\tBest value (Precision@10): {study_precision.best_value:.5f}")
print(f"\tBest params (Precision@10):")

for key, value in study_precision.best_params.items():
    print(f"\t\t{key}: {value}")

print(f"\tMAP@10 in best result:")
best_result = list(filter(lambda best_result: best_result['Precision@10'] == study_precision.best_value, metric_results_pr))[0]
print(f"\t\t{best_result['MAP@10']}")
print(f"\tRecall@10 in best result:")
print(f"\t\t{best_result['Recall@10']}")

	Best value (Precision@10): 0.03179
	Best params (Precision@10):
		learning_rate: 0.11910412442201511
		no_components: 16
		loss: logistic
	MAP@10 in best result:
		0.07465463988175776
	Recall@10 in best result:
		0.15025238797676735


In [131]:
print(f"\tBest value (Recall@10): {study_recall.best_value:.5f}")
print(f"\tBest params (Recall@10):")

for key, value in study_precision.best_params.items():
    print(f"\t\t{key}: {value}")

print(f"\tMAP@10 in best result:")
best_result = list(filter(lambda best_result: best_result['Recall@10'] == study_recall.best_value, metric_results_rec))[0]
print(f"\t\t{best_result['MAP@10']}")
print(f"\tPrecision@10 in best result:")
print(f"\t\t{best_result['Precision@10']}")

	Best value (Recall@10): 0.15977
	Best params (Recall@10):
		learning_rate: 0.11910412442201511
		no_components: 16
		loss: logistic
	MAP@10 in best result:
		0.07719713208628048
	Precision@10 in best result:
		0.03529125397204741


На графике гиперпараметры отсортированы по значимости (для модели, у которой гиперпараметры подбирались по **MAP@10**):

In [132]:
fig = optuna.visualization.plot_param_importances(study_map)
fig.show()

На графике гиперпараметры отсортированы по значимости (для модели, у которой гиперпараметры подбирались по **Precision@10**):

In [133]:
fig = optuna.visualization.plot_param_importances(study_precision)
fig.show()

На графике гиперпараметры отсортированы по значимости (для модели, у которой гиперпараметры подбирались по **Recall@10**):

In [134]:
fig = optuna.visualization.plot_param_importances(study_recall)
fig.show()

In [None]:
interactions[interactions[Columns.Datetime]]