In [61]:
import pickle
import numpy as np
import pandas as pd
from implicit.bpr import BayesianPersonalizedRanking
from rectools.dataset import Interactions, Dataset
from rectools.model_selection import TimeRangeSplitter
from rectools.metrics import MAP, MeanInvUserFreq, calc_metrics, MRR, NDCG

from implicit.als import AlternatingLeastSquares

from rectools.metrics import Precision, Recall, MAP, calc_metrics
from rectools import Columns
from rectools.dataset import Dataset
from rectools.models import ImplicitALSWrapperModel, LightFMWrapperModel, PopularModel
from tqdm import tqdm

from lightfm import LightFM

import optuna

In [51]:
interactions_df = pd.read_csv('../artifacts/data_original/interactions.csv')
users = pd.read_csv('../artifacts/data_original/users.csv')
items = pd.read_csv('../artifacts/data_original/items.csv')

interactions_df.rename(columns={'last_watch_dt': Columns.Datetime,
                                'total_dur': Columns.Weight}, inplace=True) 

# Мы хотим учитывать юзеров с 1 или двумя просмотрами, так как этого недостаточно чтобы говорить о пользователях хоть что-нибудь
user_ids_all = interactions_df.groupby(Columns.User)[Columns.Item].nunique().reset_index(name='unique_items_count')
hot_users = user_ids_all[user_ids_all['unique_items_count'] > 20][Columns.User]
interactions_df_hot_users = interactions_df[interactions_df[Columns.User].isin(hot_users)]


users = users[users[Columns.User].isin(interactions_df_hot_users[Columns.User])]
interactions_df_hot_users = interactions_df_hot_users[interactions_df_hot_users[Columns.User].isin(users[Columns.User])]
items = items[items[Columns.Item].isin(interactions_df_hot_users[Columns.Item])]

interactions = Interactions(interactions_df_hot_users)   
catalog = interactions.df[Columns.Item].unique()

interactions.df

Unnamed: 0,user_id,item_id,datetime,weight,watched_pct
0,176549,9506,2021-05-11,4250.0,72.0
1,699317,1659,2021-05-29,8317.0,100.0
6,1016458,354,2021-08-14,1672.0,25.0
14,5324,8437,2021-04-18,6598.0,92.0
18,927973,9617,2021-06-19,8422.0,100.0
...,...,...,...,...,...
5476213,706423,3384,2021-03-13,5937.0,100.0
5476223,489587,6945,2021-05-18,229.0,4.0
5476226,435089,13475,2021-07-06,4624.0,85.0
5476239,610017,7107,2021-05-10,1133.0,75.0


In [54]:
N_SPLITS = 3
TEST_SIZE = '7D'

cv = TimeRangeSplitter(
    test_size=TEST_SIZE,
    n_splits=N_SPLITS,
    filter_already_seen=True,
    filter_cold_items=True,
    filter_cold_users=True,
)
cv.get_test_fold_borders(interactions)

[(Timestamp('2021-07-12 00:00:00', freq='14D'),
  Timestamp('2021-07-26 00:00:00', freq='14D')),
 (Timestamp('2021-07-26 00:00:00', freq='14D'),
  Timestamp('2021-08-09 00:00:00', freq='14D')),
 (Timestamp('2021-08-09 00:00:00', freq='14D'),
  Timestamp('2021-08-23 00:00:00', freq='14D'))]

In [72]:
K_RECOS = 10
RANDOM_STATE = 42
NUM_THREADS = 8
N_FACTORS = 8
N_EPOCHS = 1 # Lightfm
LEARNING_RATE = 0.001 # Lightfm

metrics = {
    "precision@10": Precision(k=10),
    "accuracy@10": Recall(k=10),
    "map@10": MAP(k=10),
    "mrr@10": MRR(k=10),
    "ndcg@10": NDCG(k=10),
    "precision@5": Precision(k=5),
    "accuracy@5": Recall(k=5),
    "map@5": MAP(k=5),
    "mrr@5": MRR(k=5),
    "ndcg@5": NDCG(k=5),
    "precision@1": Precision(k=1),
    "accuracy@1": Recall(k=1),
    "map@1": MAP(k=1),
    "mrr@1": MRR(k=1),
    "ndcg@1": NDCG(k=1),
}

models = {
    "popular": PopularModel(),
}

models["ALS"] = ImplicitALSWrapperModel(
    model=AlternatingLeastSquares(
        factors=N_FACTORS, 
        random_state=RANDOM_STATE, 
        num_threads=NUM_THREADS
    )
)
        

lightfm_losses = 'warp'
models[f"LightFM"] = LightFMWrapperModel(
    LightFM(
        no_components=N_FACTORS, 
        loss=lightfm_losses, 
        random_state=RANDOM_STATE,
    ),
    epochs=N_EPOCHS,
    num_threads=NUM_THREADS,
)

[I 2023-12-05 22:20:02,047] A new study created in memory with name: no-name-69deb420-c0a8-4eac-b044-e097748227c4
[W 2023-12-05 22:20:02,050] Trial 0 failed with parameters: {'no_components': 8, 'learning_rate': 0.05} because of the following error: The value None could not be cast to float..
[W 2023-12-05 22:20:02,050] Trial 0 failed with value None.
[W 2023-12-05 22:20:02,050] Trial 1 failed with parameters: {'no_components': 4, 'learning_rate': 0.05} because of the following error: The value None could not be cast to float..
[W 2023-12-05 22:20:02,051] Trial 1 failed with value None.
[W 2023-12-05 22:20:02,051] Trial 2 failed with parameters: {'no_components': 4, 'learning_rate': 0.01} because of the following error: The value None could not be cast to float..
[W 2023-12-05 22:20:02,051] Trial 2 failed with value None.
[W 2023-12-05 22:20:02,052] Trial 3 failed with parameters: {'no_components': 8, 'learning_rate': 0.002} because of the following error: The value None could not be c

In [84]:
# Разделяем на train и test
interactions.df[Columns.Datetime] = pd.to_datetime(interactions.df[Columns.Datetime], format='%Y-%m-%d')
max_date = interactions.df[Columns.Datetime].max()

train = interactions.df[interactions.df[Columns.Datetime] < max_date - pd.Timedelta(days=7)].copy()
test = interactions.df[interactions.df[Columns.Datetime] >= max_date - pd.Timedelta(days=7)].copy()
train.drop(train.query("weight < 300").index, inplace=True)
cold_users = set(test[Columns.User]) - set(train[Columns.User])
test.drop(test[test[Columns.User].isin(cold_users)].index, inplace=True)
TEST_USERS = test[Columns.User].unique()

users.fillna('Unknown', inplace=True)
users = users.loc[users[Columns.User].isin(train[Columns.User])].copy()
user_features_frames = []
for feature in ["sex", "age", "income"]:
    feature_frame = users.reindex(columns=[Columns.User, feature])
    feature_frame.columns = ["id", "value"]
    feature_frame["feature"] = feature
    user_features_frames.append(feature_frame)
user_features = pd.concat(user_features_frames)
user_features.head()

items.fillna('Unknown', inplace=True)
items = items.loc[items[Columns.Item].isin(train[Columns.Item])].copy()
items["genre"] = items["genres"].str.lower().str.replace(", ", ",", regex=False).str.split(",")
genre_feature = items[["item_id", "genre"]].explode("genre")
genre_feature.columns = ["id", "value"]
genre_feature["feature"] = "genre"
genre_feature.head()
content_feature = items.reindex(columns=[Columns.Item, "content_type"])
content_feature.columns = ["id", "value"]
content_feature["feature"] = "content_type"
countries_feature = items.reindex(columns=[Columns.Item, "countries"])
countries_feature.columns = ["id", "value"]
countries_feature["feature"] = "countries"
item_features = pd.concat((genre_feature, content_feature, countries_feature))

dataset = Dataset.construct(
    interactions_df=train,
    user_features_df=user_features,
    cat_user_features=["sex", "age", "income"],
    item_features_df=item_features,
    cat_item_features=["genre", "content_type", "countries"],
)

In [87]:
def ials_objective(trial):

    factors = trial.suggest_categorical('factors', [4, 8, 16])
    model = ImplicitALSWrapperModel(
        model=AlternatingLeastSquares(
            factors=factors,
            random_state=42, 
            num_threads=8,
        ),
        fit_features_together=True,
    )

    model.fit(dataset)
    recos = model.recommend(
        users=TEST_USERS,
        dataset=dataset,
        k=10,
        filter_viewed=True,
    )

    map10 = calc_metrics(metrics, recos, test, train)['map@10']
    return map10

In [88]:
study = optuna.create_study(direction='maximize')
study.optimize(ials_objective, n_trials=10)
study.best_params

[I 2023-12-05 23:22:39,843] A new study created in memory with name: no-name-eef8171d-b40b-40b9-9edf-e888084de39a
[I 2023-12-05 23:24:34,432] Trial 0 finished with value: 0.0072958207763013415 and parameters: {'factors': 8}. Best is trial 0 with value: 0.0072958207763013415.
[I 2023-12-05 23:26:27,971] Trial 1 finished with value: 0.007247593935454447 and parameters: {'factors': 4}. Best is trial 0 with value: 0.0072958207763013415.
[I 2023-12-05 23:28:23,215] Trial 2 finished with value: 0.0072958207763013415 and parameters: {'factors': 8}. Best is trial 0 with value: 0.0072958207763013415.
[I 2023-12-05 23:30:18,119] Trial 3 finished with value: 0.0072958207763013415 and parameters: {'factors': 8}. Best is trial 0 with value: 0.0072958207763013415.
[I 2023-12-05 23:32:12,198] Trial 4 finished with value: 0.00726333031524715 and parameters: {'factors': 16}. Best is trial 0 with value: 0.0072958207763013415.
[I 2023-12-05 23:34:02,461] Trial 5 finished with value: 0.007247593935454447 

{'factors': 8}

In [89]:
def lfm_objective(trial):

    no_components = trial.suggest_categorical('no_components', [4, 8, 16])
    learning_rate = trial.suggest_float('learning_rate', 0.005, 0.05, log=True)
    rho = trial.suggest_float('rho', 0.9, 0.99, log=True)
    epsilon = trial.suggest_float('epsilon', 1e-6, 1e-5, log=True)
    
    model = LightFMWrapperModel(
        LightFM(
            no_components=no_components,
            learning_rate=learning_rate, 
            loss='warp',
            rho=rho,
            epsilon=epsilon,
            user_alpha=0,
            item_alpha=0,
            random_state=42,
        ),
        epochs=1,
        num_threads=8,
    )
    model.fit(dataset)
    recos = model.recommend(
        users=TEST_USERS,
        dataset=dataset,
        k=10,
        filter_viewed=True,
    )

    map10 = calc_metrics(metrics, recos, test, train)['map@10']
    return map10

In [90]:
study = optuna.create_study(direction='maximize')
study.optimize(lfm_objective, n_trials=10)
study.best_params

[I 2023-12-05 23:41:37,474] A new study created in memory with name: no-name-234188b2-c773-4510-993d-b542f53cadc8
[I 2023-12-05 23:41:39,897] Trial 0 finished with value: 0.034777984887692175 and parameters: {'no_components': 4, 'learning_rate': 0.010481176460137137, 'rho': 0.9861248015809704, 'epsilon': 6.32391713250286e-06}. Best is trial 0 with value: 0.034777984887692175.
[I 2023-12-05 23:41:42,329] Trial 1 finished with value: 0.037011771696474766 and parameters: {'no_components': 4, 'learning_rate': 0.012757327436289807, 'rho': 0.9447918999506748, 'epsilon': 2.7932521343184393e-06}. Best is trial 1 with value: 0.037011771696474766.
[I 2023-12-05 23:41:45,219] Trial 2 finished with value: 0.03782074408262104 and parameters: {'no_components': 16, 'learning_rate': 0.012534737273083126, 'rho': 0.9720554547981552, 'epsilon': 5.607595823796713e-06}. Best is trial 2 with value: 0.03782074408262104.
[I 2023-12-05 23:41:48,040] Trial 3 finished with value: 0.03711725036836217 and paramete

{'no_components': 16,
 'learning_rate': 0.012534737273083126,
 'rho': 0.9720554547981552,
 'epsilon': 5.607595823796713e-06}