In [1]:
import os
from copy import deepcopy

from implicit.nearest_neighbours import CosineRecommender, TFIDFRecommender, BM25Recommender
import pandas as pd

from rectools.dataset import Dataset
from rectools.model_selection import TimeRangeSplitter
from rectools.models import PopularModel, RandomModel
from rectools.metrics import MAP, NDCG, Precision, Recall, MeanInvUserFreq, Serendipity
import warnings

from dev_eval import calculate_metrics, read_kion_dataset, visualize
from userknn import UserKnn

pd.set_option("display.max_columns", None)
warnings.filterwarnings("ignore")

# Data

Подгружаем датасет кион используя дополнительную функцию

In [2]:
kion_data = read_kion_dataset(fast_check=1)
interactions = kion_data["interactions"]
users = kion_data["users"]
items = kion_data["items"]

# Гиперпараметры

Смотреть только на MAP скучно, поэтому давайте смотреть на все метрики, которые брали в прошлый раз, но только для `k_recos=10`, так как все равно рекомендуем 10.

In [3]:
metrics = {
    "MAP@10": MAP(k=10),
    "NDCG@10": NDCG(k=10),
    "precision@10": Precision(k=10),
    "recall@10": Recall(k=10),
    "novelty@10": MeanInvUserFreq(k=10),
    "serendipity@10": Serendipity(k=10),
}

Сплиттер: 
* `rectools.model_selection.TimeRangeSplitter`,
* 3 фолда на кросс-валидации
* попробуем 7 и 14 дней


In [4]:
cv_7d = TimeRangeSplitter(
    test_size="7D",  # по неделе
    n_splits=3,  # 4 фолда для кросс-валидации
    filter_already_seen=True,  # исключение просмотренных айтемов
    filter_cold_items=True,  # исключение холодных айтемов
    filter_cold_users=True,  #  исключение холодных юзеров
)
cv_14d = TimeRangeSplitter(
    test_size="14D",  # по неделе
    n_splits=3,  # 4 фолда для кросс-валидации
    filter_already_seen=True,  # исключение просмотренных айтемов
    filter_cold_items=True,  # исключение холодных айтемов
    filter_cold_users=True,  #  исключение холодных юзеров
)

In [5]:
k_recos = 10

# Популярное

Будем использовать популярное:
- как бейзлайн
- лучший из популярного будем использовать в дополнение к юзеркнн для холодных порльзователей и в случае недостаточности рекомендаций.

In [6]:
models_popular = {
    "popular__n_users": PopularModel(popularity="n_users"),
    "popular__n_interactions": PopularModel(popularity="n_interactions"),
    "popular__mean_weight": PopularModel(popularity="mean_weight"),
}

In [7]:
%%time
result_data = calculate_metrics(models_popular, kion_data, metrics, cv_7d, k_recos=k_recos, style=True, verbose=0)
display(result_data)

  0%|          | 0/3 [00:00<?, ?it/s]

Metric,MAP,MAP,NDCG,NDCG,precision,precision,recall,recall,novelty,novelty,serendipity,serendipity
At,10,10,10,10,10,10,10,10,10,10,10,10
Stat,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std
model,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3
popular__mean_weight,4e-06,3e-06,2e-06,1e-06,3e-06,0.0,1.6e-05,1e-05,18.563409,0.051314,0.0,0.0
popular__n_interactions,0.068934,0.004929,0.036913,0.002218,0.031054,0.001912,0.159616,0.010803,3.437632,0.004487,0.0,0.0
popular__n_users,0.068934,0.004929,0.036913,0.002218,0.031054,0.001912,0.159616,0.010803,3.437632,0.004487,0.0,0.0


CPU times: user 47.8 s, sys: 989 ms, total: 48.8 s
Wall time: 48.7 s


In [8]:
%%time
result_data = calculate_metrics(models_popular, kion_data, metrics, cv_14d, k_recos=k_recos, style=True, verbose=0)
display(result_data)

  0%|          | 0/3 [00:00<?, ?it/s]

Metric,MAP,MAP,NDCG,NDCG,precision,precision,recall,recall,novelty,novelty,serendipity,serendipity
At,10,10,10,10,10,10,10,10,10,10,10,10
Stat,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std
model,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3
popular__mean_weight,3e-06,3e-06,2e-06,2e-06,2e-06,2e-06,1.1e-05,1.2e-05,18.433416,0.191683,0.0,0.0
popular__n_interactions,0.080294,0.010014,0.049524,0.00514,0.042783,0.005425,0.191317,0.031802,3.470736,0.044057,0.0,0.0
popular__n_users,0.080294,0.010014,0.049524,0.00514,0.042783,0.005425,0.191317,0.031802,3.470736,0.044057,0.0,0.0


CPU times: user 53 s, sys: 911 ms, total: 53.9 s
Wall time: 53.8 s


Популярное на взаимодействиях работает одинаково с популярным на пользователях, воспользуемся первым. Но сначала проверим, все ли с ним хорошо.

In [9]:
item_data = ["title", "genres"]
k_recos = 10
users_list = [
    79446,
    1074610,
]
dataset_for_train = Dataset.construct(interactions.df)

In [10]:
model = deepcopy(models_popular["popular__n_interactions"])
model.fit(dataset_for_train)

<rectools.models.popular.PopularModel at 0x7f63035eaf10>

In [11]:
visualize(model=model, dataset=kion_data, user_list=users_list, item_data=item_data, k_recos=20, display=display)

Visual report
----------------------------------------------------------
User: 79446
Already watched films amount: 33
Display last 10 watched:



Unnamed: 0,item_id,datetime,weight,watched_pct,item_id_x,title,genres,item_id_y
15,512,2021-08-15,3303.0,58.0,512,Рядовой Чээрин,военные,10230
32,1896,2021-08-01,3720.0,57.0,1896,Явление,"драмы, военные",674
19,7597,2021-08-01,5752.0,100.0,7597,Препод: История Галатеи,"драмы, триллеры, криминал",1717
9,16415,2021-07-28,6847.0,100.0,16415,Весна,"фантастика, ужасы, мелодрамы",624
7,4880,2021-07-25,2634.0,9.0,4880,Афера,комедии,55043
14,12356,2021-07-18,4071.0,77.0,12356,13 грехов,"ужасы, триллеры",6874
30,9194,2021-07-17,6760.0,95.0,9194,Роберт — король Шотландии,боевики,8030
18,9728,2021-07-17,52.0,1.0,9728,Гнев человеческий,"боевики, триллеры",132865
5,10240,2021-07-16,6126.0,100.0,10240,Клаустрофобы,триллеры,4336
25,10464,2021-07-13,8.0,0.0,10464,Вирус страха,"драмы, триллеры",10375



Recommended films amount: 20
(Amount of all films: 15706)
Display first 10 recommendations:


Unnamed: 0,item_id,score,rank,item_id_x,title,genres,item_id_y
0,10440,202457.0,1,10440,Хрустальный,"триллеры, детективы",202457
1,15297,193123.0,2,15297,Клиника счастья,"драмы, мелодрамы",193123
2,4151,91167.0,3,4151,Секреты семейной жизни,комедии,91167
3,3734,74803.0,4,3734,Прабабушка легкого поведения,комедии,74803
4,2657,68581.0,5,2657,Подслушано,"драмы, триллеры",68581
5,142,45367.0,6,142,Маша,"драмы, триллеры",45367
6,6809,40372.0,7,6809,Дуров,документальное,40372
7,12192,38242.0,8,12192,Фемида видит,"драмы, детективы, комедии",38242
8,8636,35631.0,9,8636,Белый снег,"драмы, спорт",35631
9,4740,34325.0,10,4740,Сахаров. Две жизни,документальное,34325


----------------------------------------------------------
User: 1074610
Already watched films amount: 1
Display last 10 watched:



Unnamed: 0,item_id,datetime,weight,watched_pct,item_id_x,title,genres,item_id_y
0,15297,2021-07-28,1402.0,13.0,15297,Клиника счастья,"драмы, мелодрамы",193123



Recommended films amount: 20
(Amount of all films: 15706)
Display first 10 recommendations:


Unnamed: 0,item_id,score,rank,item_id_x,title,genres,item_id_y
0,10440,202457.0,1,10440,Хрустальный,"триллеры, детективы",202457
1,9728,132865.0,2,9728,Гнев человеческий,"боевики, триллеры",132865
2,13865,122119.0,3,13865,Девятаев,"драмы, военные, приключения",122119
3,4151,91167.0,4,4151,Секреты семейной жизни,комедии,91167
4,3734,74803.0,5,3734,Прабабушка легкого поведения,комедии,74803
5,2657,68581.0,6,2657,Подслушано,"драмы, триллеры",68581
6,4880,55043.0,7,4880,Афера,комедии,55043
7,142,45367.0,8,142,Маша,"драмы, триллеры",45367
8,6809,40372.0,9,6809,Дуров,документальное,40372
9,12192,38242.0,10,12192,Фемида видит,"драмы, детективы, комедии",38242


# UserKnn

Будем использовать ряд моделей UserKnn с популярной для дополнения недостающих значений.

Пробовал разные метрики расстояний (косинусное, tf-idf и bm25) с разным количеством users для построения kNN (30 и 70).

In [12]:
models_userknn = {
    "userknn__cos_70": UserKnn(
        model=CosineRecommender(), popular_model=deepcopy(models_popular["popular__n_interactions"]), N_users=70
    ),
    "userknn__cos_30": UserKnn(
        model=CosineRecommender(), popular_model=deepcopy(models_popular["popular__n_interactions"]), N_users=30
    ),
    "userknn__bm25_70": UserKnn(
        model=BM25Recommender(), popular_model=deepcopy(models_popular["popular__n_interactions"]), N_users=70
    ),
    "userknn__bm25_30": UserKnn(
        model=BM25Recommender(), popular_model=deepcopy(models_popular["popular__n_interactions"]), N_users=30
    ),
    "userknn__tfidf_70": UserKnn(
        model=TFIDFRecommender(), popular_model=deepcopy(models_popular["popular__n_interactions"]), N_users=70
    ),
    "userknn__tfidf_30": UserKnn(
        model=TFIDFRecommender(), popular_model=deepcopy(models_popular["popular__n_interactions"]), N_users=30
    ),
}

In [13]:
%%time
result_data = calculate_metrics(models_userknn, kion_data, metrics, cv_7d, k_recos=k_recos, style=True, verbose=0)
display(result_data)

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/797423 [00:00<?, ?it/s]

  0%|          | 0/797423 [00:00<?, ?it/s]

  0%|          | 0/797423 [00:00<?, ?it/s]

  0%|          | 0/797423 [00:00<?, ?it/s]

  0%|          | 0/797423 [00:00<?, ?it/s]

  0%|          | 0/797423 [00:00<?, ?it/s]

  0%|          | 0/850489 [00:00<?, ?it/s]

  0%|          | 0/850489 [00:00<?, ?it/s]

  0%|          | 0/850489 [00:00<?, ?it/s]

  0%|          | 0/850489 [00:00<?, ?it/s]

  0%|          | 0/850489 [00:00<?, ?it/s]

  0%|          | 0/850489 [00:00<?, ?it/s]

  0%|          | 0/906071 [00:00<?, ?it/s]

  0%|          | 0/906071 [00:00<?, ?it/s]

  0%|          | 0/906071 [00:00<?, ?it/s]

  0%|          | 0/906071 [00:00<?, ?it/s]

  0%|          | 0/906071 [00:00<?, ?it/s]

  0%|          | 0/906071 [00:00<?, ?it/s]

Metric,MAP,MAP,NDCG,NDCG,precision,precision,recall,recall,novelty,novelty,serendipity,serendipity
At,10,10,10,10,10,10,10,10,10,10,10,10
Stat,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std
model,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3
userknn__bm25_30,0.030053,0.0022,0.018727,0.001299,0.018627,0.001266,0.095763,0.00628,15.151622,0.422586,0.0,0.0
userknn__bm25_70,0.030053,0.0022,0.018727,0.001299,0.018627,0.001266,0.095763,0.00628,15.151622,0.422586,0.0,0.0
userknn__cos_30,0.031749,0.002105,0.020301,0.001182,0.02077,0.001075,0.106955,0.005425,15.119545,0.510257,0.0,0.0
userknn__cos_70,0.031749,0.002105,0.020301,0.001182,0.02077,0.001075,0.106955,0.005425,15.119545,0.510257,0.0,0.0
userknn__tfidf_30,0.031154,0.002076,0.019811,0.001152,0.020152,0.001053,0.103729,0.005442,15.130861,0.585134,0.0,0.0
userknn__tfidf_70,0.031154,0.002076,0.019811,0.001152,0.020152,0.001053,0.103729,0.005442,15.130861,0.585134,0.0,0.0


CPU times: user 2d 52min 32s, sys: 4min, total: 2d 56min 33s
Wall time: 3h 50min 14s


Давайте проверим лучший: "userknn__cos_70".

In [14]:
model = deepcopy(models_userknn["userknn__cos_70"])
model.fit(dataset_for_train)

  0%|          | 0/962179 [00:00<?, ?it/s]

In [15]:
%%time
visualize(model=model, dataset=kion_data, user_list=users_list, item_data=item_data, k_recos=10, display=display)

Visual report
----------------------------------------------------------
User: 79446
Already watched films amount: 33
Display last 10 watched:



Unnamed: 0,item_id,datetime,weight,watched_pct,item_id_x,title,genres,item_id_y
15,512,2021-08-15,3303.0,58.0,512,Рядовой Чээрин,военные,10230
32,1896,2021-08-01,3720.0,57.0,1896,Явление,"драмы, военные",674
19,7597,2021-08-01,5752.0,100.0,7597,Препод: История Галатеи,"драмы, триллеры, криминал",1717
9,16415,2021-07-28,6847.0,100.0,16415,Весна,"фантастика, ужасы, мелодрамы",624
7,4880,2021-07-25,2634.0,9.0,4880,Афера,комедии,55043
14,12356,2021-07-18,4071.0,77.0,12356,13 грехов,"ужасы, триллеры",6874
30,9194,2021-07-17,6760.0,95.0,9194,Роберт — король Шотландии,боевики,8030
18,9728,2021-07-17,52.0,1.0,9728,Гнев человеческий,"боевики, триллеры",132865
5,10240,2021-07-16,6126.0,100.0,10240,Клаустрофобы,триллеры,4336
25,10464,2021-07-13,8.0,0.0,10464,Вирус страха,"драмы, триллеры",10375



Recommended films amount: 10
(Amount of all films: 15706)
Display first 10 recommendations:


Unnamed: 0,item_id,score,rank,item_id_x,title,genres,item_id_y
0,4352,3.81791,1,4352.0,Картина маслом,комедии,464.0
1,1646,3.755157,2,1646.0,Девушка грез,"драмы, комедии",573.0
2,925,3.715795,3,925.0,Дальневосточная экспедиция,"русские, познавательные, документальное",1.0
3,1746,3.702636,4,1746.0,Дорога домой 2: Затерянные в Сан-Франциско,"мелодрамы, семейное, комедии",15.0
4,1830,3.643151,5,,,,
5,2288,3.62474,6,2288.0,Операция «Колибри»,"драмы, зарубежные, триллеры",4.0
6,2402,3.44575,7,2402.0,Мир Аоту,мультсериалы,59.0
7,1195,3.373776,8,1195.0,"[4К] Розалила, уникальный храм. Гондурас",документальное,1.0
8,595,3.326166,9,595.0,Астана – любовь моя,"драмы, мелодрамы",1.0
9,1079,3.250239,10,1079.0,Посчитай скрытые предметы,"развитие, сериалы, хочу всё знать",9.0


----------------------------------------------------------
User: 1074610
Already watched films amount: 1
Display last 10 watched:



Unnamed: 0,item_id,datetime,weight,watched_pct,item_id_x,title,genres,item_id_y
0,15297,2021-07-28,1402.0,13.0,15297,Клиника счастья,"драмы, мелодрамы",193123



Recommended films amount: 10
(Amount of all films: 15706)
Display first 10 recommendations:


Unnamed: 0,item_id,score,rank,item_id_x,title,genres,item_id_y
0,15297,0.001482,1,15297,Клиника счастья,"драмы, мелодрамы",193123
1,13865,0.001413,2,13865,Девятаев,"драмы, военные, приключения",122119
2,10440,0.001344,3,10440,Хрустальный,"триллеры, детективы",202457
3,9728,0.001344,4,9728,Гнев человеческий,"боевики, триллеры",132865
4,3734,0.001234,5,3734,Прабабушка легкого поведения,комедии,74803
5,4880,0.001183,6,4880,Афера,комедии,55043
6,6809,0.001094,7,6809,Дуров,документальное,40372
7,2657,0.000986,8,2657,Подслушано,"драмы, триллеры",68581
8,4151,0.000973,9,4151,Секреты семейной жизни,комедии,91167
9,142,0.000461,10,142,Маша,"драмы, триллеры",45367


CPU times: user 2min 4s, sys: 10.3 s, total: 2min 14s
Wall time: 2min 12s


In [16]:
%%time
model.recommend_for_user(dataset=dataset_for_train, user_id=-10)

CPU times: user 27.5 ms, sys: 2 µs, total: 27.5 ms
Wall time: 25.7 ms


[10440, 15297, 9728, 13865, 4151, 3734, 2657, 4880, 142, 6809]

Вроде норм

# Сохраним модель

In [17]:
import pickle

In [18]:
model_name = "../models/pickle_data/user_cos_70.pickle"
pickle.dump(model, open(model_name, "wb"))

# Проверка работоспособности

In [19]:
name_cos_70 = "../models/pickle_data/user_cos_70.pickle"
loaded_cos_70 = pickle.load(open(name_cos_70, "rb"))

kion_data = read_kion_dataset(fast_check=1)
interactions = kion_data["interactions"]
data_for_predict = Dataset.construct(interactions.df)

In [20]:
user_id = 5

model.recommend_for_user(dataset=data_for_predict, user_id=user_id)

[2068, 482, 1138, 4, 1572, 51, 11, 32, 10440, 15297]