In [59]:
import pickle
import numpy as np
import pandas as pd
from implicit.bpr import BayesianPersonalizedRanking
from rectools.dataset import Interactions, Dataset
from rectools.model_selection import TimeRangeSplitter
from rectools.metrics import MAP, MeanInvUserFreq, calc_metrics, MRR, NDCG

from implicit.als import AlternatingLeastSquares

from rectools.metrics import Precision, Recall, MAP, calc_metrics
from rectools import Columns
from rectools.dataset import Dataset
from rectools.models import ImplicitALSWrapperModel, LightFMWrapperModel, PopularModel
from tqdm import tqdm

from lightfm import LightFM

import optuna

In [60]:
# A user must have interacted with MORE than this many distinct items to be kept.
MIN_UNIQUE_ITEMS = 20

# Load the raw tables. The interactions columns are renamed to the RecTools
# column names as part of the load, so the frame is never mutated in place.
interactions_df = pd.read_csv('../artifacts/data_original/interactions.csv').rename(
    columns={'last_watch_dt': Columns.Datetime, 'total_dur': Columns.Weight}
)
users = pd.read_csv('../artifacts/data_original/users.csv')
items = pd.read_csv('../artifacts/data_original/items.csv')

# We do NOT want to keep users with only a few views: that is not enough
# history to say anything about them. Only "hot" users, i.e. those with more
# than MIN_UNIQUE_ITEMS distinct items, are retained.
user_ids_all = (
    interactions_df.groupby(Columns.User)[Columns.Item]
    .nunique()
    .reset_index(name='unique_items_count')
)
hot_users = user_ids_all.loc[
    user_ids_all['unique_items_count'] > MIN_UNIQUE_ITEMS, Columns.User
]
interactions_df_hot_users = interactions_df[interactions_df[Columns.User].isin(hot_users)]

# Keep the three tables mutually consistent: only users that have interactions,
# only interactions of users present in the users table, only items that were
# actually interacted with. `.copy()` makes `users` / `items` independent frames
# so that adding columns to them later does not raise SettingWithCopyWarning.
users = users[users[Columns.User].isin(interactions_df_hot_users[Columns.User])].copy()
interactions_df_hot_users = interactions_df_hot_users[
    interactions_df_hot_users[Columns.User].isin(users[Columns.User])
]
items = items[items[Columns.Item].isin(interactions_df_hot_users[Columns.Item])].copy()

interactions = Interactions(interactions_df_hot_users)
catalog = interactions.df[Columns.Item].unique()

interactions.df

Unnamed: 0,user_id,item_id,datetime,weight,watched_pct
0,176549,9506,2021-05-11,4250.0,72.0
1,699317,1659,2021-05-29,8317.0,100.0
6,1016458,354,2021-08-14,1672.0,25.0
14,5324,8437,2021-04-18,6598.0,92.0
18,927973,9617,2021-06-19,8422.0,100.0
...,...,...,...,...,...
5476213,706423,3384,2021-03-13,5937.0,100.0
5476223,489587,6945,2021-05-18,229.0,4.0
5476226,435089,13475,2021-07-06,4624.0,85.0
5476239,610017,7107,2021-05-10,1133.0,75.0


In [68]:
# Cross-validation layout: N_SPLITS time-based test folds of TEST_SIZE each.
TEST_SIZE = '14D'
N_SPLITS = 3

cv = TimeRangeSplitter(
    n_splits=N_SPLITS,
    test_size=TEST_SIZE,
    filter_cold_users=False,
    filter_cold_items=True,
    filter_already_seen=True,
)

# Display the (start, end) borders of every test fold as a sanity check.
cv.get_test_fold_borders(interactions)

[(Timestamp('2021-07-12 00:00:00', freq='14D'),
  Timestamp('2021-07-26 00:00:00', freq='14D')),
 (Timestamp('2021-07-26 00:00:00', freq='14D'),
  Timestamp('2021-08-09 00:00:00', freq='14D')),
 (Timestamp('2021-08-09 00:00:00', freq='14D'),
  Timestamp('2021-08-23 00:00:00', freq='14D'))]

In [62]:
# --- Experiment-wide settings -------------------------------------------------
K_RECOS = 10          # number of recommendations requested per user
RANDOM_STATE = 42
NUM_THREADS = 8

# --- ALS hyper-parameters -----------------------------------------------------
N_FACTORS = 8

# --- LightFM hyper-parameters -------------------------------------------------
NO_COMPONENTS = 16
N_EPOCHS = 10
LEARNING_RATE = 0.012
RHO = 0.972
EPSILON = 5.6 * 1e-5

# Every metric is evaluated at cut-offs 10, 5 and 1. Keys follow the
# "<name>@<k>" pattern; note that the "accuracy@k" entries hold Recall(k).
metrics = {
    f"{label}@{k}": metric_cls(k=k)
    for k in (10, 5, 1)
    for label, metric_cls in (
        ("precision", Precision),
        ("accuracy", Recall),
        ("map", MAP),
        ("mrr", MRR),
        ("ndcg", NDCG),
    )
}

lightfm_losses = 'warp'

# Registry of the models compared in cross-validation, keyed by display name.
models = {
    "popular": PopularModel(),
    "ALS": ImplicitALSWrapperModel(
        model=AlternatingLeastSquares(
            factors=N_FACTORS,
            random_state=RANDOM_STATE,
            num_threads=NUM_THREADS,
        )
    ),
    "LightFM": LightFMWrapperModel(
        LightFM(
            no_components=NO_COMPONENTS,
            loss=lightfm_losses,
            random_state=RANDOM_STATE,
            learning_rate=LEARNING_RATE,
            rho=RHO,
            epsilon=EPSILON,
        ),
        epochs=N_EPOCHS,
        num_threads=NUM_THREADS,
    ),
}

In [63]:
# Reshape the wide users table into the long (id, value, feature) layout:
# one frame per user attribute, stacked on top of each other.
user_features_frames = [
    users.reindex(columns=[Columns.User, feature_name])
    .set_axis(["id", "value"], axis=1)
    .assign(feature=feature_name)
    for feature_name in ("sex", "age", "income")
]
user_features = pd.concat(user_features_frames)
user_features.head()

Unnamed: 0,id,value,feature
24,269408,Ж,sex
27,384532,М,sex
66,216495,М,sex
81,515668,М,sex
136,824452,М,sex


In [64]:
# Split the comma-separated "genres" string into a list of lower-cased genres.
# `assign` rebinds `items` to a new frame instead of writing a column into what
# is a boolean-filtered slice of the original table, which avoids pandas'
# SettingWithCopyWarning and keeps the cell idempotent on re-run.
items = items.assign(
    genre=items["genres"].str.lower().str.replace(", ", ",", regex=False).str.split(",")
)

# Long-format genre feature: one row per (item, genre) pair.
genre_feature = items[["item_id", "genre"]].explode("genre")
genre_feature.columns = ["id", "value"]
genre_feature["feature"] = "genre"

# Long-format content-type feature: one row per item.
content_feature = items.reindex(columns=[Columns.Item, "content_type"])
content_feature.columns = ["id", "value"]
content_feature["feature"] = "content_type"

# Stack both features into the single (id, value, feature) table.
item_features = pd.concat((genre_feature, content_feature))
item_features

Unnamed: 0,id,value,feature
0,10711,драмы,genre
0,10711,зарубежные,genre
0,10711,детективы,genre
0,10711,мелодрамы,genre
1,2508,зарубежные,genre
...,...,...,...
15958,6443,series,content_type
15959,2367,series,content_type
15960,10632,series,content_type
15961,4538,series,content_type


In [69]:
%%time
    
cv.get_test_fold_borders(interactions)

results = []

fold_iterator = cv.split(interactions, collect_fold_stats=True)

for train_ids, test_ids, fold_info in tqdm((fold_iterator), total=cv.n_splits):
    df_train = interactions.df.iloc[train_ids]
    df_test = interactions.df.iloc[test_ids]
    test_users = np.unique(df_test[Columns.User])
    
    catalog = interactions.df[Columns.Item].unique()
    
    user_features_train = user_features[user_features["id"].isin(df_train[Columns.User])]
    item_features_train = item_features[item_features["id"].isin(df_train[Columns.Item])]   
    
    user_features_test = user_features[user_features["id"].isin(df_test[Columns.User])]
    item_features_test = item_features[item_features["id"].isin(df_test[Columns.Item])] 

    dataset_train = Dataset.construct(
        interactions_df=df_train,
        user_features_df=user_features_train,
        cat_user_features=["sex", "age", "income"],
        item_features_df=item_features_train,
        cat_item_features=["genre", "content_type"],
    )
    dataset_test = Dataset.construct(
        interactions_df=df_test,
        user_features_df=user_features_test,
        cat_user_features=["sex", "age", "income"],
        item_features_df=item_features_test,
        cat_item_features=["genre", "content_type"],
    )

    for model_name, model in models.items():
        print(f"Fitting model {model_name}...")
    
        model.fit(dataset_train)
        recos = model.recommend(
            users=test_users,
            dataset=dataset_test,
            k=K_RECOS,
            filter_viewed=True,
        )
        
        metric_values = calc_metrics(
            metrics,
            reco=recos,
            interactions=interactions.df,
            prev_interactions=interactions.df,
            catalog=catalog,
        )

        res = {"fold": fold_info["i_split"], "model": model_name}
        res.update(metric_values)
        results.append(res)
results

  0%|          | 0/3 [00:00<?, ?it/s]

Fitting model popular...
Fitting model ALS...




  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:06<?, ?it/s]


KeyError: 'Some indices do not exist'

In [66]:
# Aggregate the per-fold metric values into mean / std per model,
# keeping the models in the order in which they were evaluated.
pivot_results = (
    pd.DataFrame(results)
    .drop(columns="fold")
    .groupby(["model"], sort=False)
    .agg(["mean", "std"])
)
pivot_results

Unnamed: 0_level_0,precision@10,precision@10,accuracy@10,accuracy@10,precision@5,precision@5,accuracy@5,accuracy@5,precision@1,precision@1,...,mrr@5,mrr@5,mrr@1,mrr@1,map@10,map@10,map@5,map@5,map@1,map@1
Unnamed: 0_level_1,mean,std,mean,std,mean,std,mean,std,mean,std,...,mean,std,mean,std,mean,std,mean,std,mean,std
model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
popular,0.0,,0.0,,0.0,,0.0,,0.0,,...,0.0,,0.0,,0.0,,0.0,,0.0,
