# Домашнее задание №6. Обучение LightFM модели 1-го уровня

In [1]:
!pip -q install rectools
!pip -q install lightfm

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m102.5/102.5 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.9/8.9 MB[0m [31m22.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.4/316.4 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for lightfm (setup.py) ... [?25l[?25hdone


In [2]:
import os
from typing import List
from pathlib import Path
import pickle
import warnings

import numpy as np
import pandas as pd
from lightfm import LightFM
from rectools import Columns
from rectools.dataset import Dataset
from rectools.metrics import (
    MAP,
    MeanInvUserFreq,
    Precision,
    Recall,
    Serendipity,
    calc_metrics,
)
from rectools.models import LightFMWrapperModel

warnings.filterwarnings('ignore')

In [3]:
# Restrict OpenBLAS to a single thread.
# NOTE(review): the original comment said "For implicit ALS", but this notebook
# trains LightFM; numpy is also imported in an earlier cell, so this variable
# may be set too late to take effect — TODO confirm.
os.environ["OPENBLAS_NUM_THREADS"] = "1"

## Подготовка данных

### Загрузка данных

In [4]:
# Mount Google Drive into the Colab filesystem at /content/drive; the data
# directory and the output directory used by this notebook live under it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# Directory on the mounted Drive from which users.csv, items.csv and
# interactions.csv are read.
DATA_PATH = Path("/content/drive/MyDrive/recsys_course/data_kion")

In [6]:
%%time
# Load the three raw tables: users and items feed the feature builders,
# interactions is the training log.
users = pd.read_csv(DATA_PATH / 'users.csv')
items = pd.read_csv(DATA_PATH / 'items.csv')
interactions = pd.read_csv(DATA_PATH / 'interactions.csv')

CPU times: user 3.74 s, sys: 758 ms, total: 4.49 s
Wall time: 7.75 s


In [7]:
# Reassign the Columns.Datetime attribute so that every later use of it in this
# notebook refers to the raw "last_watch_dt" column.
Columns.Datetime = "last_watch_dt"
# Drop, in place, rows whose date string is not exactly 10 characters long;
# the remaining values are parsed with the "%Y-%m-%d" format on the next line.
interactions.drop(interactions[interactions[Columns.Datetime].str.len() != 10].index, inplace=True)
interactions[Columns.Datetime] = pd.to_datetime(interactions[Columns.Datetime], format="%Y-%m-%d")
# Latest interaction date (computed again in the data-split cell).
max_date = interactions[Columns.Datetime].max()
# Interaction weight: 3 for rows where watched_pct > 10, 1 for all other rows.
interactions[Columns.Weight] = np.where(interactions["watched_pct"] > 10, 3, 1)

### Подготовка признаков

In [8]:
def get_users_features(users: pd.DataFrame, interactions: pd.DataFrame, features_to_get: List[str]) -> pd.DataFrame:
  """Build a long-format user feature table.

  Only users that occur in ``interactions`` are kept, missing values are
  replaced with the string ``'Unknown'``, and every requested feature column
  is stacked into rows of ``(id, value, feature)``.

  The caller's ``users`` frame is left untouched: filtering and filling are
  done on a new frame rather than with ``fillna(..., inplace=True)``.

  Args:
    users: Users table with the ``Columns.User`` column and the feature columns.
    interactions: Interactions table; its ``Columns.User`` values decide which
      users are kept.
    features_to_get: Names of the ``users`` columns to export as features.
      A name that is not a column of ``users`` yields an all-NaN ``value``
      column (``reindex`` semantics).

  Returns:
    DataFrame with columns ``id``, ``value`` and ``feature``; one row per
    kept user per requested feature.

  Raises:
    ValueError: If ``features_to_get`` is empty (``pd.concat`` gets no frames).
  """
  seen_in_interactions = users[Columns.User].isin(interactions[Columns.User])
  # Boolean-mask selection and fillna both return new objects, so the input
  # frame is never modified.
  users = users.loc[seen_in_interactions].fillna('Unknown')
  user_features_frames = []
  for feature in features_to_get:
    feature_frame = users.reindex(columns=[Columns.User, feature])
    feature_frame.columns = ["id", "value"]
    feature_frame["feature"] = feature
    user_features_frames.append(feature_frame)
  return pd.concat(user_features_frames)

In [9]:
def get_items_features(items: pd.DataFrame, interactions: pd.DataFrame):
  """Build a long-format item feature table.

  Items that never occur in ``interactions`` are discarded. Four features are
  produced and stacked into rows of ``(id, value, feature)``:

  * ``genre`` - one row per entry of the comma-separated ``genres`` string,
    lower-cased;
  * ``content_type`` - the ``content_type`` column as is;
  * ``country`` - one row per entry of the comma-separated ``countries``
    string, lower-cased;
  * ``binned_r_year`` - ``release_year`` cut into 10 quantile bins labelled
    0..9.

  Args:
    items: Items table with ``genres``, ``content_type``, ``countries`` and
      ``release_year`` columns.
    interactions: Interactions table; its ``Columns.Item`` values decide which
      items are kept.

  Returns:
    DataFrame with columns ``id``, ``value`` and ``feature``.
  """

  def _split_list_column(raw):
    # "A, B" -> ["a", "b"]: lower-case, remove the space after commas, split.
    return raw.str.lower().str.replace(", ", ",", regex=False).str.split(",")

  def _as_feature(frame, feature_name):
    # Rename the two selected columns to (id, value) and tag rows with the feature name.
    frame.columns = ["id", "value"]
    frame["feature"] = feature_name
    return frame

  # Work on a copy restricted to items present in the interactions.
  present_in_interactions = items[Columns.Item].isin(interactions[Columns.Item])
  items = items.loc[present_in_interactions].copy()

  # Genres: one row per (item, genre).
  items["genre"] = _split_list_column(items["genres"])
  genre_feature = _as_feature(items[["item_id", "genre"]].explode("genre"), "genre")

  # Content type: a single value per item.
  content_feature = _as_feature(
      items.reindex(columns=[Columns.Item, "content_type"]), "content_type"
  )

  # Production countries: one row per (item, country).
  items["country"] = _split_list_column(items["countries"])
  country_feature = _as_feature(items[["item_id", "country"]].explode("country"), "country")

  # Release year, discretised into 10 quantile bins labelled 0..9.
  items['binned_r_year'] = pd.qcut(items['release_year'], q=10, labels=list(range(10)))
  release_year_feature = _as_feature(
      items.reindex(columns=[Columns.Item, "binned_r_year"]), "binned_r_year"
  )

  # Stack all features into a single frame.
  return pd.concat([genre_feature, content_feature, country_feature, release_year_feature])

### Разбиение данных

In [10]:
# Date span covered by the interactions log.
min_date = interactions[Columns.Datetime].min()
max_date = interactions[Columns.Datetime].max()

print(f'min дата в interactions: {min_date}')
print(f'max дата в interactions: {max_date}')
print(f'Продолжительность: {max_date - min_date}')

# Number of most recent days excluded from the data used in the cells below
# (presumably reserved for the ranker, as the name suggests).
ranker_days_count = 30
first_level_end = max_date - pd.Timedelta(days=ranker_days_count)

# Keep only interactions strictly before the cut-off date.
interactions = interactions.loc[interactions[Columns.Datetime] < first_level_end]

min дата в interactions: 2021-03-13 00:00:00
max дата в interactions: 2021-08-22 00:00:00
Продолжительность: 162 days 00:00:00


In [11]:
# Build long-format (id, value, feature) tables for the users and items that
# occur in the truncated interactions.
user_features = get_users_features(users, interactions, ["sex", "age", "income"])
item_features = get_items_features(items, interactions)

In [12]:
# Assemble the RecTools dataset from the interactions and both feature tables;
# every user and item feature is declared categorical.
dataset = Dataset.construct(
    interactions_df=interactions,
    user_features_df=user_features,
    cat_user_features=["sex", "age", "income"],
    item_features_df=item_features,
    cat_item_features=["genre", "content_type", "country", "binned_r_year"],
)

## Обучение модели

Гиперпараметры модели были подобраны в домашнем задании №4 (ноутбук hw4_ALS_FM_ANN_experiments.ipynb).

In [13]:
# First-level model: a LightFM instance wrapped in RecTools' LightFMWrapperModel.
# The LightFM hyperparameters are the ones the notebook text attributes to
# homework 4.
lightfm_model = LightFMWrapperModel(
    LightFM(
        no_components=64,
        loss="warp",
        random_state=1008,  # fixed seed
        learning_rate=0.013681895729046522,
        user_alpha=0.4233951257813444,
        item_alpha=0.5801959801737767,
    ),
    epochs=1,  # a single training epoch
    num_threads=2,
    verbose=2
)

In [14]:
%%time

# Train the wrapped LightFM model on the dataset (interactions plus features).
lightfm_model.fit(dataset)

Epoch: 100%|██████████| 1/1 [01:45<00:00, 105.51s/it]

CPU times: user 2min 52s, sys: 413 ms, total: 2min 52s
Wall time: 1min 46s





<rectools.models.lightfm.LightFMWrapperModel at 0x7d77e3e09390>

## Генерируем кандидатов для ранкера

In [15]:
%%time

# Number of candidates generated per user (top-50).
top_N = 50
# Recommend for every user known to the dataset, skipping already viewed items.
candidates = lightfm_model.recommend(
    users=dataset.user_id_map.external_ids,
    dataset=dataset,
    k=top_N,
    filter_viewed=True,
)

CPU times: user 7min 44s, sys: 1min 35s, total: 9min 19s
Wall time: 6min 40s


In [16]:
# Prefix the score and rank columns with "lfm_" so they are identifiable as
# LightFM outputs.
candidates = candidates.rename(columns={"score": "lfm_score", "rank": "lfm_rank"})
candidates.head()

Unnamed: 0,user_id,item_id,lfm_score,lfm_rank
0,176549,15297,-0.000603,1
1,176549,10440,-0.000603,2
2,176549,13865,-0.000604,3
3,176549,4151,-0.000604,4
4,176549,2657,-0.000604,5


## Сохраняем кандидатов

In [17]:
# Output directory on the mounted Drive (kept as a plain string).
RECOS_PATH = "/content/drive/MyDrive/recsys_course/recommendations"
# Create the directory first: to_csv does not create missing parent folders,
# and a failure here would come only after the long recommend step.
Path(RECOS_PATH).mkdir(parents=True, exist_ok=True)
# Write the candidates without the DataFrame index.
candidates.to_csv(f"{RECOS_PATH}/candidates_lightfm.csv", index=False)