In [2]:
from rectools.dataset import Interactions, Dataset
from rectools import Columns

import pandas as pd
import pickle

In [43]:
interactions = pd.read_csv("../artifacts/interactions.csv")
interactions.rename(columns={'last_watch_dt': Columns.Datetime, 'total_dur': Columns.Weight}, inplace=True)
dataset = Dataset.construct(interactions)

# Выбираем самое популярное
items_ids_all = interactions.groupby(Columns.Item)[Columns.User].nunique().reset_index(name='unique_users_count')
popular_items = items_ids_all.sort_values(by='unique_users_count', ascending=False).head(10)[Columns.Item]

# Запоминаем отсутствующих юзеров
cold_users = set(range(1100000)).difference(set(interactions[Columns.User]))

# Сохраняем список горячих юзеров
user_ids_all = interactions.groupby(Columns.User)[Columns.Item].nunique().reset_index(name='unique_items_count')
hot_users = user_ids_all[user_ids_all['unique_items_count'] > 12][Columns.User]
print(f"Hot users cout: {hot_users.shape[0]}")

# Десереализуем холодную модель
with open("../artifacts/first_experiment_popular.pkl", "rb") as file:
    warm_model = pickle.load(file)

# Десереализуем горячую модель
with open("../artifacts/task3_cropped12_experiment_tfidf_userknn.pkl", "rb") as file:
    hot_model = pickle.load(file)

# df_hot = interactions[interactions[Columns.User].isin(hot_users)][Columns.User]

df_hot = pd.DataFrame({Columns.User: interactions[interactions[Columns.User].isin(hot_users)]["user_id"]})
recos_hot = hot_model.predict(df_hot)
df_warm = interactions[~interactions[Columns.User].isin(df_hot[Columns.User])].drop_duplicates(subset=Columns.User)

# df_warm = set(df_hot[Columns.User]).difference(set(interactions[Columns.User])
recos_warm = warm_model.recommend(
    users=df_warm[Columns.User],
    dataset=dataset,
    k=10,
    filter_viewed=True,
)
recos_cold = popular_items    

Hot users cout: 103211


In [23]:
# df_warm[df_warm[Columns.User] == 774973]

In [24]:
df_warm

Unnamed: 0,user_id,item_id,datetime,weight,watched_pct
2,656683,7107,2021-05-09,10,0.0
3,864613,7638,2021-07-05,14483,100.0
4,964868,9506,2021-04-30,6725,100.0
5,1032142,6686,2021-05-13,11286,100.0
8,648682,1449,2021-06-13,26246,75.0
...,...,...,...,...,...
5476245,786732,4880,2021-05-12,753,0.0
5476246,648596,12225,2021-08-13,76,0.0
5476247,546862,9673,2021-04-13,2308,49.0
5476248,697262,15297,2021-08-20,18307,63.0


In [29]:
user_id = 2421

# Горячий
if hot_users.isin([user_id]).any():
    # return list(range(k_recs))
    print(f"user_id {user_id} hot start predict")
    # user_id_kostyl = pd.DataFrame({Columns.User: [user_id]})
    # recos = hot_model.predict(user_id_kostyl)

    recos = recos_hot[ recos_hot[Columns.User].isin([user_id])]["item_id"]
    print(f"user_id {user_id} is hot; recos {recos}; len{len(recos)}")

# Теплый
elif user_id not in cold_users:
    # return list(range(k_recs))
    print(f"user_id {user_id} warm start predict")
    # recos = warm_model.recommend(
    #     users=[user_id],
    #     dataset=dataset,
    #     k=k_recs,
    #     filter_viewed=True)
    recos = recos_warm[recos_warm[Columns.User].isin([user_id])]["item_id"]
    print(f"user_id {user_id} is warm; recos {recos}; len{len(recos)}")

# Холодный
else:
    print(f"user_id {user_id} cold start predict")
    recos = recos_cold
    print(f"user_id {user_id} is cold; recos {recos}; len{len(recos)}")
    

user_id 2421 warm start predict
user_id 2421 is warm; recos 2786300     10152
2786301      3043
2786302      3190
2786303     15423
2786304      3017
2786305      2293
2786306      9851
2786307     14817
2786308       657
2786309      1819
9077560     10152
9077561      3043
9077562      3190
9077563     15423
9077564      3017
9077565      2293
9077566      9851
9077567     14817
9077568       657
9077569      1819
11548220    10152
11548221     3043
11548222     3190
11548223    15423
11548224     3017
11548225     2293
11548226     9851
11548227    14817
11548228      657
11548229     1819
12898030    10152
12898031     3043
12898032     3190
12898033    15423
12898034     3017
12898035     2293
12898036     9851
12898037    14817
12898038      657
12898039     1819
15368200    10152
15368201     3043
15368202     3190
15368203    15423
15368204     3017
15368205     2293
15368206     9851
15368207    14817
15368208      657
15368209     1819
19328210    10152
19328211     3043
1932

In [35]:
recos_warm

Unnamed: 0,user_id,item_id,score,rank
0,656683,15297,175949.0,1
1,656683,10152,168500.0,2
2,656683,3043,108966.0,3
3,656683,3190,107640.0,4
4,656683,15423,80649.0,5
...,...,...,...,...
26749305,319709,3017,64809.0,36
26749306,319709,2293,63183.0,37
26749307,319709,9851,51416.0,38
26749308,319709,14817,40663.0,39


In [32]:
recos

2786300     10152
2786301      3043
2786302      3190
2786303     15423
2786304      3017
2786305      2293
2786306      9851
2786307     14817
2786308       657
2786309      1819
9077560     10152
9077561      3043
9077562      3190
9077563     15423
9077564      3017
9077565      2293
9077566      9851
9077567     14817
9077568       657
9077569      1819
11548220    10152
11548221     3043
11548222     3190
11548223    15423
11548224     3017
11548225     2293
11548226     9851
11548227    14817
11548228      657
11548229     1819
12898030    10152
12898031     3043
12898032     3190
12898033    15423
12898034     3017
12898035     2293
12898036     9851
12898037    14817
12898038      657
12898039     1819
15368200    10152
15368201     3043
15368202     3190
15368203    15423
15368204     3017
15368205     2293
15368206     9851
15368207    14817
15368208      657
15368209     1819
19328210    10152
19328211     3043
19328212     3190
19328213    15423
19328214     3017
19328215  

In [28]:
recos_warm

Unnamed: 0,user_id,item_id,score,rank
0,656683,15297,175949.0,1
1,656683,10152,168500.0,2
2,656683,3043,108966.0,3
3,656683,3190,107640.0,4
4,656683,15423,80649.0,5
...,...,...,...,...
26749305,319709,3017,64809.0,36
26749306,319709,2293,63183.0,37
26749307,319709,9851,51416.0,38
26749308,319709,14817,40663.0,39


In [42]:
aa = df_warm.drop_duplicates(subset=Columns.User)
aa[aa[Columns.User].isin([2421])]

Unnamed: 0,user_id,item_id,datetime,weight,watched_pct
569754,2421,9728,2021-08-13,1382,20.0


In [None]:
users_csv = pd.read_csv("../artifacts/users.csv")
users_csv

In [None]:
interactions[Columns.User].isin([302]).any()

In [38]:
recos_warm.head(10)

0    15297
1    10152
2     3043
3     3190
4    15423
5     3017
6     2293
7     9851
8    14817
9      657
Name: item_id, dtype: int64

In [70]:
items_ids_all

Unnamed: 0,item_id,unique_users_count
0,0,37
1,1,23
2,2,38
3,3,18
4,4,6
...,...,...
15701,16514,2
15702,16515,1
15703,16516,74
15704,16517,1


In [74]:
top_items_for_highest_users = items_ids_all.sort_values(by='unique_users_count', ascending=False).head(1000)[Columns.Item]
top_items_for_highest_users

9906     10440
14534    15297
9242      9728
13171    13865
3941      4151
         ...  
214        218
3511      3697
10801    11382
13549    14263
5206      5469
Name: item_id, Length: 1000, dtype: int64

In [7]:
interactions[Columns.User] 

0          176549
1          699317
2          656683
3          864613
4          964868
            ...  
5476246    648596
5476247    546862
5476248    697262
5476249    384202
5476250    319709
Name: user_id, Length: 5476251, dtype: int64

In [10]:
hot_users

2               2
3               3
5               5
10             11
12             13
           ...   
962133    1097508
962138    1097513
962141    1097516
962146    1097521
962165    1097544
Name: user_id, Length: 103211, dtype: int64

In [11]:
df_test = interactions[interactions[Columns.User].isin([666262])]["item_id"]
df_test

2686271     7957
3293190     4785
3393617    12981
Name: item_id, dtype: int64