In [86]:
import pickle
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

from rectools.models import RandomModel, PopularModel
from rectools.dataset import Interactions, Dataset
from rectools.model_selection import TimeRangeSplitter
from rectools import Columns
from rectools.metrics import (
    Precision,
    Accuracy,
    MAP,
    MRR, 
    NDCG,
    calc_metrics,
)
from rectools.models import ImplicitItemKNNWrapperModel

In [87]:
interactions = pd.read_csv("../artifacts/interactions.csv")
interactions.rename(columns={'last_watch_dt': Columns.Datetime, 'total_dur': Columns.Weight}, inplace=True)
dataset = Dataset.construct(interactions)

# Выбираем самое популярное
items_ids_all = interactions.groupby(Columns.Item)[Columns.User].nunique().reset_index(name='unique_users_count')
popular_items = items_ids_all.sort_values(by='unique_users_count', ascending=False).head(10)[Columns.Item]

# Запоминаем отсутствующих юзеров
cold_users = set(range(1100000)).difference(set(interactions[Columns.User]))

# Сохраняем список горячих юзеров
user_ids_all = interactions.groupby(Columns.User)[Columns.Item].nunique().reset_index(name='unique_items_count')
hot_users = user_ids_all[user_ids_all['unique_items_count'] > 12][Columns.User]
print(f"Hot users cout: {hot_users.shape[0]}")

# Десереализуем холодную модель
with open("../artifacts/first_experiment_popular.pkl", "rb") as file:
    warm_model = pickle.load(file)

# Десереализуем горячую модель
with open("../artifacts/task3_cropped12_experiment_tfidf_userknn.pkl", "rb") as file:
    hot_model = pickle.load(file)

# df_hot = interactions[interactions[Columns.User].isin(hot_users)][Columns.User]

df_hot = pd.DataFrame({Columns.User: interactions[interactions[Columns.User].isin(hot_users)][Columns.User]})
recos_hot = hot_model.predict(df_hot)
df_warm = interactions[~interactions[Columns.User].isin(df_hot[Columns.User])].drop_duplicates(subset=Columns.User)

# df_warm = set(df_hot[Columns.User]).difference(set(interactions[Columns.User])
recos_warm = warm_model.recommend(
    users=df_warm[Columns.User],
    dataset=dataset,
    k=10,
    filter_viewed=True,
)
recos_cold = popular_items    

In [88]:
interactions_df = pd.read_csv('../artifacts/interactions.csv')
interactions_df.rename(
    columns={
        'last_watch_dt': Columns.Datetime,
        'total_dur': Columns.Weight
    }, 
    inplace=True) 
# Мы хотим учитывать юзеров с 1 или двумя просмотрами, так как этого недостаточно чтобы говорить о пользователях хоть что-нибудь
user_ids_all = interactions_df.groupby(Columns.User)[Columns.Item].nunique().reset_index(name='unique_items_count')
hot_users = user_ids_all[user_ids_all['unique_items_count'] > 10][Columns.User]
interactions_df_hot_users = interactions_df[interactions_df[Columns.User].isin(hot_users)]
interactions = Interactions(interactions_df)

selected_columns = [Columns.Item, 'title', 'release_year', 'genres', 'countries']
item_data = pd.read_csv('../artifacts/items.csv', usecols=selected_columns)

In [89]:
item_data

Unnamed: 0,item_id,title,release_year,genres,countries
0,10711,Поговори с ней,2002.0,"драмы, зарубежные, детективы, мелодрамы",Испания
1,2508,Голые перцы,2014.0,"зарубежные, приключения, комедии",США
2,10716,Тактическая сила,2011.0,"криминал, зарубежные, триллеры, боевики, комедии",Канада
3,7868,45 лет,2015.0,"драмы, зарубежные, мелодрамы",Великобритания
4,16268,Все решает мгновение,1978.0,"драмы, спорт, советские, мелодрамы",СССР
...,...,...,...,...,...
15958,6443,Полярный круг,2018.0,"драмы, триллеры, криминал","Финляндия, Германия"
15959,2367,Надежда,2020.0,"драмы, боевики",Россия
15960,10632,Сговор,2017.0,"драмы, триллеры, криминал",Россия
15961,4538,Среди камней,2019.0,"драмы, спорт, криминал",Россия


In [90]:
pd.concat([interactions.df.head(), interactions.df.tail()])

Unnamed: 0,user_id,item_id,datetime,weight,watched_pct
0,176549,9506,2021-05-11,4250.0,72.0
1,699317,1659,2021-05-29,8317.0,100.0
2,656683,7107,2021-05-09,10.0,0.0
3,864613,7638,2021-07-05,14483.0,100.0
4,964868,9506,2021-04-30,6725.0,100.0
5476246,648596,12225,2021-08-13,76.0,0.0
5476247,546862,9673,2021-04-13,2308.0,49.0
5476248,697262,15297,2021-08-20,18307.0,63.0
5476249,384202,16197,2021-04-19,6203.0,100.0
5476250,319709,4436,2021-08-15,3921.0,45.0


In [91]:
interactions.df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5476251 entries, 0 to 5476250
Data columns (total 5 columns):
 #   Column       Dtype         
---  ------       -----         
 0   user_id      int64         
 1   item_id      int64         
 2   datetime     datetime64[ns]
 3   weight       float64       
 4   watched_pct  float64       
dtypes: datetime64[ns](1), float64(2), int64(2)
memory usage: 208.9 MB


In [92]:
models = {
    # "random": RandomModel(random_state=42),
    "popular": PopularModel(),
    # "most_raited": PopularModel(popularity="sum_weight")
}

In [93]:
metrics = {
    # "precision@10": Precision(k=10),
    "accuracy@10": Accuracy(k=10),
    # "map@10": MAP(k=10),
    # "mrr@10": MRR(k=10),
    # "ndcg@10": NDCG(k=10),
    # "precision@5": Precision(k=5),
    # "accuracy@5": Accuracy(k=5),
    # "map@5": MAP(k=5),
    # "mrr@5": MRR(k=5),
    # "ndcg@5": NDCG(k=5),
    # "precision@1": Precision(k=1),
    # "accuracy@1": Accuracy(k=1),
    # "map@1": MAP(k=1),
    # "mrr@1": MRR(k=1),
    # "ndcg@1": NDCG(k=1),
}

In [94]:
n_splits = 3

splitter = TimeRangeSplitter(
    test_size="14D",
    n_splits=n_splits,
    filter_already_seen=True,
    filter_cold_items=True,
    filter_cold_users=False,
)

Расчёт метрик:

In [95]:
def get_metrics(models, metrics, splitter, k):
    
    splitter.get_test_fold_borders(interactions)
    
    results = []
    
    fold_iterator = splitter.split(interactions, collect_fold_stats=True)
    
    for train_ids, test_ids, fold_info in tqdm((fold_iterator), total=splitter.n_splits):
        print(f"\n==================== Fold {fold_info['i_split']}")
        print(fold_info)
        
        dataset = Dataset.construct(interactions.df)
    
        df_train = interactions.df.iloc[train_ids]
        dataset_train = Dataset.construct(df_train)
    
        df_test = interactions.df.iloc[test_ids][Columns.UserItem]
        test_users = np.unique(df_test[Columns.User])
    
        catalog = interactions.df[Columns.Item].unique()
    
        for model_name, model in models.items():
            
            model.fit(dataset_train)
            recos = model.recommend(
                users=test_users,
                dataset=dataset,
                k=k,
                filter_viewed=True,
            )
            
            metric_values = calc_metrics(
                metrics,
                reco=recos,
                interactions=df_test,
                prev_interactions=df_train,
                catalog=catalog,
            )
            
            res = {"fold": fold_info["i_split"], "model": model_name}
            res.update(metric_values)
            results.append(res)
            
    return results

In [96]:
def visual_analys(model, interactions, user_ids, item_data):
    
    dataset = Dataset.construct(interactions.df)
    recos = model.recommend(
    users=user_ids,
    dataset=dataset,
    k=10,
    filter_viewed=True,
    )
    
    recos = pd.merge(recos, item_data, on=Columns.Item, how='left')
    
    print(recos.to_string(index=False, max_colwidth=40))

In [97]:
results = get_metrics(models, metrics, splitter, k=10)
results

  0%|          | 0/3 [00:00<?, ?it/s]


{'i_split': 0, 'start': Timestamp('2021-07-12 00:00:00', freq='14D'), 'end': Timestamp('2021-07-26 00:00:00', freq='14D'), 'train': 3239125, 'train_users': 646423, 'train_items': 14730, 'test': 637836, 'test_users': 216920, 'test_items': 7851}

{'i_split': 1, 'start': Timestamp('2021-07-26 00:00:00', freq='14D'), 'end': Timestamp('2021-08-09 00:00:00', freq='14D'), 'train': 3892558, 'train_users': 742256, 'train_items': 15085, 'test': 726066, 'test_users': 241149, 'test_items': 8191}

{'i_split': 2, 'start': Timestamp('2021-08-09 00:00:00', freq='14D'), 'end': Timestamp('2021-08-23 00:00:00', freq='14D'), 'train': 4649162, 'train_users': 850489, 'train_items': 15415, 'test': 787191, 'test_users': 257877, 'test_items': 8115}


[{'fold': 0, 'model': 'popular', 'accuracy@10': 0.9991760842715209},
 {'fold': 1, 'model': 'popular', 'accuracy@10': 0.9991715993565945},
 {'fold': 2, 'model': 'popular', 'accuracy@10': 0.9991689428748076}]

In [98]:
user_ids = [666262, 672861, 955527]
for model_name, model in models.items():
    print(f'\nmodel "{model_name}":')
    visual_analys(model, interactions, user_ids, item_data)


model "popular":
 user_id  item_id    score  rank                   title  release_year                                   genres           countries
  666262    15297 175949.0     1         Клиника счастья        2021.0                         драмы, мелодрамы              Россия
  666262    10152 168500.0     2                  Гамбит        2012.0                        криминал, комедии                 США
  666262     3043 108966.0     3      Княжна из хрущёвки        2012.0                       мелодрамы, комедии              Россия
  666262     3190 107640.0     4 Ещё одна из рода Болейн        2008.0                         драмы, мелодрамы Великобритания, США
  666262    15423  80649.0     5       Легенда о Нараяме        1983.0                                    драмы              Япония
  666262     3017  64809.0     6                Затмение        2017.0                     фэнтези, приключения              Россия
  666262     2293  63183.0     7           Шалом, папик!  

In [99]:
pivot_results = pd.DataFrame(results).drop(columns="fold").groupby(["model"], sort=False).agg(["mean", "std"])
pivot_results

Unnamed: 0_level_0,accuracy@10,accuracy@10
Unnamed: 0_level_1,mean,std
model,Unnamed: 1_level_2,Unnamed: 2_level_2
popular,0.999172,4e-06


In [100]:
pivot_results.to_csv("../artifacts/first_reco_result.csv")

In [101]:
for model_name, model in models.items():
    with open(f"../artifacts/first_experiment_{model_name}.pkl", "wb") as file:
        pickle.dump(model, file)

In [102]:
# Opening saved model
with open("../artifacts/first_experiment_most_raited.pkl", "rb") as file:
    model = pickle.load(file)

# The model has now been deserialized, next is to make use of it as you normally would.
recos = model.recommend(
    users=user_ids,
    dataset=dataset,
    k=10,
    filter_viewed=True,
)
    

NameError: name 'dataset' is not defined

In [None]:
recos

In [None]:
interactions_df = pd.read_csv('../artifacts/interactions.csv')
users = pd.read_csv('../artifacts/users.csv')
items = pd.read_csv('../artifacts/items.csv')

interactions_df.rename(columns={'last_watch_dt': Columns.Datetime,
                                'total_dur': Columns.Weight}, inplace=True) 

# Мы хотим учитывать юзеров с 1 или двумя просмотрами, так как этого недостаточно чтобы говорить о пользователях хоть что-нибудь
user_ids_all = interactions_df.groupby(Columns.User)[Columns.Item].nunique().reset_index(name='unique_items_count')
hot_users = user_ids_all[user_ids_all['unique_items_count'] > 10][Columns.User]
interactions_df_hot_users = interactions_df[interactions_df[Columns.User].isin(hot_users)]
interactions_df_hot_users