In [34]:
import pickle
import numpy as np
import pandas as pd
from implicit.bpr import BayesianPersonalizedRanking
from rectools.dataset import Interactions, Dataset
from rectools.model_selection import TimeRangeSplitter
from rectools.metrics import MAP, MeanInvUserFreq, calc_metrics, MRR, NDCG

from implicit.als import AlternatingLeastSquares

from rectools.metrics import Precision, Recall, MAP, calc_metrics
from rectools import Columns
from rectools.dataset import Dataset
from rectools.models import ImplicitALSWrapperModel, LightFMWrapperModel, PopularModel
from tqdm import tqdm

from lightfm import LightFM

import optuna

In [35]:
# A user is kept only if they interacted with MORE than this many distinct items.
MIN_UNIQUE_ITEMS = 5

# Load the raw tables; the interaction columns are renamed to the names exposed by rectools' Columns.
interactions_df = pd.read_csv('../artifacts/data_original/interactions.csv').rename(
    columns={'last_watch_dt': Columns.Datetime, 'total_dur': Columns.Weight}
)
users = pd.read_csv('../artifacts/data_original/users.csv')
items = pd.read_csv('../artifacts/data_original/items.csv')

# We do NOT want to keep users with only a handful of views: that is too little
# history to say anything about them. Only users with more than MIN_UNIQUE_ITEMS
# distinct items ("hot" users) are retained.
user_ids_all = interactions_df.groupby(Columns.User)[Columns.Item].nunique().reset_index(name='unique_items_count')
hot_users = user_ids_all[user_ids_all['unique_items_count'] > MIN_UNIQUE_ITEMS][Columns.User]
interactions_df_hot_users = interactions_df[interactions_df[Columns.User].isin(hot_users)]

# Keep the three tables mutually consistent:
#   users        -> only those that still have interactions,
#   interactions -> only users that are present in the users table,
#   items        -> only items that appear in the remaining interactions.
users = users[users[Columns.User].isin(interactions_df_hot_users[Columns.User])]
interactions_df_hot_users = interactions_df_hot_users[interactions_df_hot_users[Columns.User].isin(users[Columns.User])]
items = items[items[Columns.Item].isin(interactions_df_hot_users[Columns.Item])]

interactions = Interactions(interactions_df_hot_users)
# Every distinct item id left after filtering.
catalog = interactions.df[Columns.Item].unique()

interactions.df

Unnamed: 0,user_id,item_id,datetime,weight,watched_pct
0,176549,9506,2021-05-11,4250.0,72.0
1,699317,1659,2021-05-29,8317.0,100.0
3,864613,7638,2021-07-05,14483.0,100.0
6,1016458,354,2021-08-14,1672.0,25.0
7,884009,693,2021-08-04,703.0,14.0
...,...,...,...,...,...
5476243,497899,9629,2021-05-29,45.0,1.0
5476244,438585,7829,2021-08-02,6804.0,100.0
5476245,786732,4880,2021-05-12,753.0,0.0
5476247,546862,9673,2021-04-13,2308.0,49.0


In [36]:
# Distinct user ids that survived the activity filter; the first occurrence of
# each id is the one kept, so the original row index is preserved.
hot_user_column = interactions_df_hot_users[Columns.User]
hot_user_column[~hot_user_column.duplicated()]

0           176549
1           699317
3           864613
6          1016458
7           884009
            ...   
4189944     503172
4196639     904576
4226004     616289
4228658     624995
4481301     265537
Name: user_id, Length: 207228, dtype: int64

In [37]:
# Cross-validation layout: N_SPLITS test windows, each TEST_SIZE long.
N_SPLITS = 10
TEST_SIZE = '14D'

# Splitter options are collected in one place and passed through unchanged.
splitter_options = {
    "test_size": TEST_SIZE,
    "n_splits": N_SPLITS,
    "filter_already_seen": True,
    "filter_cold_items": True,
    "filter_cold_users": False,
}
cv = TimeRangeSplitter(**splitter_options)

# Show the (start, end) border of every test fold for the loaded interactions.
cv.get_test_fold_borders(interactions)

[(Timestamp('2021-04-05 00:00:00', freq='14D'),
  Timestamp('2021-04-19 00:00:00', freq='14D')),
 (Timestamp('2021-04-19 00:00:00', freq='14D'),
  Timestamp('2021-05-03 00:00:00', freq='14D')),
 (Timestamp('2021-05-03 00:00:00', freq='14D'),
  Timestamp('2021-05-17 00:00:00', freq='14D')),
 (Timestamp('2021-05-17 00:00:00', freq='14D'),
  Timestamp('2021-05-31 00:00:00', freq='14D')),
 (Timestamp('2021-05-31 00:00:00', freq='14D'),
  Timestamp('2021-06-14 00:00:00', freq='14D')),
 (Timestamp('2021-06-14 00:00:00', freq='14D'),
  Timestamp('2021-06-28 00:00:00', freq='14D')),
 (Timestamp('2021-06-28 00:00:00', freq='14D'),
  Timestamp('2021-07-12 00:00:00', freq='14D')),
 (Timestamp('2021-07-12 00:00:00', freq='14D'),
  Timestamp('2021-07-26 00:00:00', freq='14D')),
 (Timestamp('2021-07-26 00:00:00', freq='14D'),
  Timestamp('2021-08-09 00:00:00', freq='14D')),
 (Timestamp('2021-08-09 00:00:00', freq='14D'),
  Timestamp('2021-08-23 00:00:00', freq='14D'))]

In [38]:
# --- Experiment configuration -------------------------------------------------
K_RECOS = 10            # number of recommendations requested per user
RANDOM_STATE = 42       # seed handed to LightFM
NUM_THREADS = 8         # threads handed to the rectools LightFM wrapper
N_FACTORS = 8           # not referenced by the cells visible here
NO_COMPONENTS = 16      # LightFM `no_components`
N_EPOCHS = 10           # epochs handed to the rectools LightFM wrapper
LEARNING_RATE = 0.012   # LightFM `learning_rate`
RHO = 0.972             # LightFM `rho`
EPSILON = 5.6 * 1e-5    # LightFM `epsilon`

# --- Metrics --------------------------------------------------------------------
# One instance of every metric for each cutoff, keyed as "<label>@<k>".
# NOTE: the "accuracy@k" entries hold Recall(k); the key is kept as-is.
_METRIC_CUTOFFS = (10, 5, 1)
_METRIC_CLASSES = (
    ("precision", Precision),
    ("accuracy", Recall),
    ("map", MAP),
    ("mrr", MRR),
    ("ndcg", NDCG),
)
metrics = {
    f"{label}@{cutoff}": metric_cls(k=cutoff)
    for cutoff in _METRIC_CUTOFFS
    for label, metric_cls in _METRIC_CLASSES
}

# Loss used by the LightFM estimator.
lightfm_losses = 'warp'

# Bare LightFM estimator configured from the constants above.
# NOTE(review): `rho` and `epsilon` look like adadelta-schedule parameters while
# no `learning_schedule` is passed here - confirm they actually take effect.
lightfm_estimator = LightFM(
    no_components=NO_COMPONENTS,
    loss=lightfm_losses,
    random_state=RANDOM_STATE,
    learning_rate=LEARNING_RATE,
    rho=RHO,
    epsilon=EPSILON,
)

# rectools wrapper around the estimator; this is the object fitted and pickled later.
model = LightFMWrapperModel(lightfm_estimator, epochs=N_EPOCHS, num_threads=NUM_THREADS)

In [39]:
# Reshape the wide users table into the long (id, value, feature) layout:
# one frame per user attribute, stacked on top of each other.
user_features_frames = [
    users.reindex(columns=[Columns.User, feature_name])
    .set_axis(["id", "value"], axis=1)
    .assign(feature=feature_name)
    for feature_name in ("sex", "age", "income")
]
user_features = pd.concat(user_features_frames)
user_features.head()

Unnamed: 0,id,value,feature
1,962099,М,sex
3,721985,Ж,sex
4,704055,Ж,sex
8,846063,Ж,sex
9,401219,Ж,sex


In [40]:
# Normalise the comma-separated genre string into a list of lower-case genres.
# `items` is rebound to a new frame instead of assigning a column into the
# filtered slice, which avoids pandas' SettingWithCopyWarning and keeps the cell
# safe to re-run.
items = items.assign(
    genre=items["genres"].str.lower().str.replace(", ", ",", regex=False).str.split(",")
)

# Long (id, value, feature) frame with one row per (item, genre) pair.
genre_feature = items[[Columns.Item, "genre"]].explode("genre")
genre_feature.columns = ["id", "value"]
genre_feature["feature"] = "genre"

# Long (id, value, feature) frame with the content type of every item.
content_feature = items.reindex(columns=[Columns.Item, "content_type"])
content_feature.columns = ["id", "value"]
content_feature["feature"] = "content_type"

item_features = pd.concat((genre_feature, content_feature))
# Preview as the last expression so that it is actually rendered.
item_features.head()

In [41]:
%%time
    
cv.get_test_fold_borders(interactions)

results = []

fold_iterator = cv.split(interactions, collect_fold_stats=True)

for train_ids, test_ids, fold_info in tqdm((fold_iterator), total=cv.n_splits):
    df_train = interactions.df.iloc[train_ids]
    df_test = interactions.df[Columns.UserItem]
    test_users = np.unique(df_test[Columns.User])
    
    catalog = interactions.df[Columns.Item].unique()
    
    user_features_train = user_features[user_features["id"].isin(interactions.df[Columns.User])]
    item_features_train = item_features[item_features["id"].isin(interactions.df[Columns.Item])]    

    dataset = Dataset.construct(
        interactions_df=interactions.df,
        user_features_df=user_features,
        cat_user_features=["sex", "age", "income"],
        item_features_df=item_features,
        cat_item_features=["genre", "content_type"],
    )
    
    model.fit(dataset)

100%|██████████| 10/10 [05:11<00:00, 31.18s/it]

CPU times: user 39min 41s, sys: 15 ms, total: 39min 41s
Wall time: 5min 11s





In [42]:
# Persist the fitted model wrapper to the artifacts directory.
model_dump_path = "../artifacts/task4_cropped5_experiment_LFM.pkl"
with open(model_dump_path, "wb") as dump_file:
    pickle.dump(model, dump_file)

In [46]:
# Round-trip check: load the artifact written by the pickle.dump cell and make
# sure the restored model can produce recommendations.
# SECURITY: pickle.load can execute arbitrary code - only load artifacts that
# were produced by this project.
with open("../artifacts/task4_cropped5_experiment_LFM.pkl", "rb") as file:
    model_1 = pickle.load(file)

# Dataset built from all interactions and the full feature frames.
dataset = Dataset.construct(
    interactions_df=interactions.df,
    user_features_df=user_features,
    cat_user_features=["sex", "age", "income"],
    item_features_df=item_features,
    cat_item_features=["genre", "content_type"],
)

# Use the LOADED model (not the in-memory `model`), otherwise the pickle
# round trip is never exercised.
recos = model_1.recommend(
    users=[3721],
    dataset=dataset,
    k=K_RECOS,
    filter_viewed=True,
)

recos

Unnamed: 0,user_id,item_id,score,rank
0,3721,10440,-135.853378,1
1,3721,13865,-135.976929,2
2,3721,15297,-136.082047,3
3,3721,4151,-136.146973,4
4,3721,4880,-136.153809,5
5,3721,3734,-136.3992,6
6,3721,7829,-136.466263,7
7,3721,9996,-136.6604,8
8,3721,12192,-136.680435,9
9,3721,11237,-136.754456,10
