# Baseline

Задачи на этап:
* Выбрать метрику оценки качества и обосновать выбор
* Разработать baseline (может быть несколько алгоритмов)
* Реализовать выбранное решение/я
* Протестировать работу baseline
* Выбрать итоговое решение для дальнейшей оптимизации и обосновать выбор

In [64]:
import numpy as np
import pandas as pd
import matplotlib as plt
import random
import statistics
import seaborn as sns
from scipy.sparse import csr_matrix
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.metrics import mean_absolute_percentage_error

In [4]:
df = pd.read_csv('wb_school_task_1.csv')

In [5]:
df

Unnamed: 0,user_id,item_id,order_ts
0,550614,264,2023-01-01 00:28:09.000000
1,571051,580,2023-01-01 00:41:47.000000
2,571051,180,2023-01-01 00:41:47.000000
3,47164,5135,2023-01-01 00:53:35.000000
4,219072,2668,2023-01-01 01:02:29.000000
...,...,...,...
21265779,249269,348,2023-03-31 23:55:37.182803
21265780,373951,3835,2023-03-31 23:59:24.148327
21265781,505474,1340,2023-03-31 23:59:24.161567
21265782,1011023,153,2023-03-31 23:59:38.371329


### Выбранные метрики

* Первая и самая простая метрика это hit-rate: был ли куплен или было ли взаимодействие хоть с одним товаром из списка рекомендуемых. Иными словами, метрика описывает, был ли хотя бы один релевантный товар среди показанных в блоке рекомендаций.
* Hit rate слишком простая метрика, так как она бинарная. На выручку придет вторая метрика Precision, она же точность. Считается для каждого пользователя. Precision наиболее приближена к бизнесу. Показывает, взаимодействовал ли пользователь с рекомендуемым объектом. Если Hit rate возвращал 1 или 0 для каждого пользователя, то Precision показывает долю релевантных товаров среди рекомендованных, другими словами, какой % рекомендованных товаров юзер купил.
* Третья метрика recall. Похожа на precision. Метрика отвечает за кол-во товаров, релевантных пользователю.
* F-measure (мера Френцеля)- это комбинированный показатель, учитывающий одновременно точность и полноту. Она рассчитывается как среднее геометрическое этих двух показателей.

## 1.MostPopular алгоритм в качестве Baseline

In [6]:
baseline = df.copy()
baseline['order_ts'] = pd.to_datetime(baseline['order_ts'])
baseline.head()


Unnamed: 0,user_id,item_id,order_ts
0,550614,264,2023-01-01 00:28:09
1,571051,580,2023-01-01 00:41:47
2,571051,180,2023-01-01 00:41:47
3,47164,5135,2023-01-01 00:53:35
4,219072,2668,2023-01-01 01:02:29


In [7]:
class MostPopularRecommender:
    def __init__(self):
        self.top_items = {}

    def fit(self, df):
        for month in df['order_ts'].dt.to_period('M').unique():
            month_str = str(month)
            top_items_month = df[df['order_ts'].dt.to_period('M') == month].groupby('item_id').size().nlargest(100).index.tolist()
            self.top_items[month_str] = top_items_month

    def predict(self, df):
        result = []
        for month in df['order_ts'].dt.to_period('M').unique():
            month_str = str(month)
            if month_str in self.top_items:
                result.append(random.sample(self.top_items[month_str],10))
        return result

In [8]:
from sklearn. model_selection import train_test_split

train, test = train_test_split(df, test_size= 0.16 , random_state= 0 )
train['order_ts'] = pd.to_datetime(train['order_ts'])
test['order_ts'] = pd.to_datetime(test['order_ts'])
test_idx = list(sorted(set(test.user_id)))


In [9]:
recommender = MostPopularRecommender()
recommender.fit(train)

In [10]:
recommender.top_items.keys()

dict_keys(['2023-03', '2023-02', '2023-01'])

In [11]:
test_preds = recommender.predict(test)



In [12]:
test_preds

[[1312, 82, 38, 177, 5204, 2803, 350, 170, 162, 7246],
 [351, 1834, 150, 94, 381, 203, 201, 148, 403, 356],
 [436, 185, 163, 192, 362, 170, 177, 347, 351, 352]]

### Расчет метрик

In [13]:
test['month'] = pd.to_datetime(test['order_ts']).dt.month

# Теперь создаем новый DataFrame, содержащий только покупки пользователей из теста
test_buys = test[test['user_id'].isin(test_idx)]
grouped_df = test_buys.groupby('user_id')

# Добавляем новую колонку, которая содержит список купленных предметов для каждого пользователя и колонку с месяцами покупок
fact = grouped_df.apply(lambda x: x['item_id'].tolist()).reset_index(name='list_items')
fact1 = grouped_df.apply(lambda x: x['month'].tolist()).reset_index(name='list_month')
fact = fact.merge(fact1, on='user_id')

In [14]:
fact

Unnamed: 0,user_id,list_items,list_month
0,1,[300],[2]
1,3,"[11, 185, 149, 93]","[2, 3, 1, 3]"
2,4,"[362, 357]","[3, 1]"
3,5,"[347, 348, 354, 133]","[3, 3, 3, 3]"
4,7,[1274],[3]
...,...,...,...
758976,1057261,"[1444, 2457]","[3, 3]"
758977,1057262,"[1571, 437, 89]","[1, 1, 1]"
758978,1057263,"[4668, 381]","[2, 3]"
758979,1057264,[457],[2]


In [15]:
def hit_rate_at_k(test_preds, fact, k=10):
    result = []
    for i in range(len(fact['list_items'])):
        final_bought_list = np.array(fact['list_items'][i])
        recommended_list = np.array(test_preds[fact['list_month'][0][0]-1])
        flags = np.isin(final_bought_list, recommended_list[:k] )
        hit_rate = int(flags.sum() > 0)
        result.append(hit_rate)
    return statistics.mean(result)

In [16]:
hit_rate = hit_rate_at_k(test_preds, fact, k=10)

In [17]:
hit_rate

0.14418806267877587

In [18]:
def precision_at_k(test_preds, fact, k=10):
    result = []
    for i in range(len(fact['list_items'])):
        final_bought_list = np.array(fact['list_items'][i])
        recommended_list = np.array(test_preds[fact['list_month'][0][0]-1])

        final_bought_list = final_bought_list
        recommended_list = recommended_list[:k]

        flags = np.isin(recommended_list, final_bought_list)
        precision = flags.sum() / len(recommended_list)
        result.append(precision)
    return statistics.mean(result)

In [19]:
precision = precision_at_k(test_preds, fact, k=10)

In [20]:
precision

0.01612925751764537

In [21]:
def recall_at_k(test_preds, fact, k=10):
    result = []
    for i in range(len(fact['list_items'])):
        final_bought_list = np.array(fact['list_items'][i])
        recommended_list = np.array(test_preds[fact['list_month'][0][0]-1])

        final_bought_list = final_bought_list
        recommended_list = recommended_list[:k]

        flags = np.isin(final_bought_list, recommended_list)
        recall = flags.sum() / len(final_bought_list)
        result.append(recall)
    return statistics.mean(result)


In [22]:
recall = recall_at_k(test_preds,fact,k=10)

In [23]:
recall

0.042330546039298825

In [24]:
def f_measure(precision,recall):
    f = 2  *  (precision  *  recall) / (precision + recall)
    return f

In [25]:
f = f_measure(precision,recall)

In [26]:
f

0.023358281635870864

In [65]:
# Реализованные методы
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import accuracy_score

In [34]:
# Precision, recall на встроенных методах в sklearn
precision_at_k = precision_score(fact, test_preds, average='samples')

recall_at_k = recall_score(fact, test_preds, average='samples')


In [37]:
precision_at_k

0.01612925751764537

In [38]:

recall_at_k

0.042330546039298825

# 2. NCF модель

In [3]:
wb_data = pd.read_csv('wb_school_task_1.csv')

In [5]:
# Преобразование столбца с временем заказа в формат datetime
wb_data['order_ts'] = pd.to_datetime(wb_data['order_ts'])

# Извлечение месяца из времени заказа
wb_data['month'] = wb_data['order_ts'].dt.month

# Группировка по пользователю и подсчет количества уникальных месяцев покупок
user_purchase_frequency = wb_data.groupby('user_id')['month'].nunique()

# Отфильтровываем пользователей, у которых покупки реже чем раз в 3 месяца
active_users = user_purchase_frequency[user_purchase_frequency >= 3].index

# Фильтрация исходного DataFrame по активным пользователям
df_filtered = wb_data[wb_data['user_id'].isin(active_users)]

# Группировка по пользователю, месяцу и подсчет количества заказанных товаров
items_per_user = df_filtered.groupby(['user_id', 'month']).size().reset_index(name='num_items_bought')

print(items_per_user)

         user_id  month  num_items_bought
0              3      1                 7
1              3      2                14
2              3      3                22
3             15      1                 7
4             15      2                 4
...          ...    ...               ...
1624237  1057263      2                 9
1624238  1057263      3                19
1624239  1057265      1                 4
1624240  1057265      2                 1
1624241  1057265      3                 7

[1624242 rows x 3 columns]


In [6]:

#items_per_user_filtered = items_per_user[items_per_user['num_items_bought'] <= 30]
items = items_per_user[(items_per_user['num_items_bought'] >= 5) & (items_per_user['num_items_bought'] <= 50)]

# Группировка по пользователю и подсчет количества уникальных месяцев покупок
user_purchase_frequency = items.groupby('user_id')['month'].nunique()

# Отфильтровываем пользователей, у которых покупки реже чем раз в 3 месяца
active_users = user_purchase_frequency[user_purchase_frequency >= 3].index

items_res = items[items['user_id'].isin(active_users)]
print(items_res)

         user_id  month  num_items_bought
0              3      1                 7
1              3      2                14
2              3      3                22
6             16      1                 6
7             16      2                15
...          ...    ...               ...
1624222  1057256      2                 7
1624223  1057256      3                 6
1624236  1057263      1                13
1624237  1057263      2                 9
1624238  1057263      3                19

[684570 rows x 3 columns]


In [7]:
n = len(pd.unique(items_res['user_id']))

print("Количество пользователей, покупающих каждый месяц от 8 до 11 товаров :",n)

Количество пользователей, покупающих каждый месяц от 8 до 11 товаров : 228190


In [8]:
usefull_users = items_res['user_id'].unique().tolist()
print(len(usefull_users))

228190


In [35]:
df = wb_data[wb_data['user_id'].isin(usefull_users)]
# Добавляем столбец с днями месяца
#df['day_of_month'] = df['order_ts'].dt.day
print(df)

          user_id  item_id                   order_ts  month
0          550614      264 2023-01-01 00:28:09.000000      1
3           47164     5135 2023-01-01 00:53:35.000000      1
6          124741      437 2023-01-01 01:15:03.000000      1
8          757324      721 2023-01-01 01:46:57.000000      1
10         462564      286 2023-01-01 01:53:13.000000      1
...           ...      ...                        ...    ...
21265779   249269      348 2023-03-31 23:55:37.182803      3
21265780   373951     3835 2023-03-31 23:59:24.148327      3
21265781   505474     1340 2023-03-31 23:59:24.161567      3
21265782  1011023      153 2023-03-31 23:59:38.371329      3
21265783   756570     6623 2023-03-31 23:59:51.669277      3

[10780152 rows x 4 columns]


## Test

In [36]:
df_fit = df.copy()
df_fit.drop(['order_ts','month'],axis=1,inplace=True)
df_fit['target'] = 1

In [37]:
df_fit

Unnamed: 0,user_id,item_id,target
0,550614,264,1
3,47164,5135,1
6,124741,437,1
8,757324,721,1
10,462564,286,1
...,...,...,...
21265779,249269,348,1
21265780,373951,3835,1
21265781,505474,1340,1
21265782,1011023,153,1


In [38]:
n = len(pd.unique(df['item_id']))

print("Количество уникальных товаров, купленных выбранными юзерами:", n , "Количество товаров в исходной выборке: ", len(wb_data['item_id'].unique()))

Количество уникальных товаров, купленных выбранными юзерами: 6339 Количество товаров в исходной выборке:  6562


In [60]:
# Группируем данные по user_id
grouped_df = df.groupby('user_id')

# Применяем функцию к каждой группе, чтобы получить список item_id
df = grouped_df.apply(lambda x: x['item_id'].tolist()).reset_index(name='list_items')
df['target'] = 1
print(df)

        user_id                                         list_items  target
0             3  [363, 3835, 41, 11, 165, 149, 611, 11, 11, 477...       1
1            16  [3637, 104, 1834, 105, 104, 104, 133, 133, 133...       1
2            29  [1109, 2171, 157, 451, 1248, 1248, 451, 999, 3...       1
3            34  [1939, 1069, 1069, 667, 5346, 871, 838, 787, 3...       1
4            35  [812, 159, 203, 213, 5116, 1594, 184, 217, 99,...       1
...         ...                                                ...     ...
228185  1057241  [5136, 191, 1076, 347, 357, 344, 150, 52, 41, ...       1
228186  1057242  [2567, 1571, 717, 2576, 363, 212, 594, 358, 36...       1
228187  1057245  [358, 1853, 4943, 447, 50, 1352, 593, 4943, 12...       1
228188  1057256  [342, 4753, 342, 541, 4378, 105, 336, 440, 317...       1
228189  1057263  [4556, 721, 69, 457, 215, 4088, 855, 4088, 115...       1

[228190 rows x 3 columns]


In [14]:

import pandas as pd
import numpy as np
import sklearn
import datetime
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns

In [15]:
# DL библиотеки
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers




In [16]:
from keras.models import Model, Sequential
from keras.layers import Embedding, Flatten, Input, Dropout, Dense, BatchNormalization, concatenate, dot
from keras.optimizers import Adam
from keras.utils import plot_model, model_to_dot
from keras.constraints import non_neg
from IPython.display import SVG
from sklearn.metrics import mean_squared_error
import os

In [40]:
user_ids = df_fit["user_id"].unique().tolist()
num_all_user = len(user_ids)

In [43]:
rand_userid = np.random.choice(user_ids, size = int(num_all_user * 0.1), replace=False)
sample_df = df_fit.loc[df_fit['user_id'].isin(rand_userid)]

# userid, itemid энкодинг для индексов
user_ids = df_fit['user_id'].unique()
num_users = len(user_ids)
user2user_encoded = {x: i for i, x in enumerate(user_ids)}
user_encoded2user = {i: x for i, x in enumerate(user_ids)}
sample_df['user_encoded'] = sample_df['user_id'].map(user2user_encoded)

item_ids = sample_df['item_id'].unique()
num_items = len(item_ids)
item2item_encoded = {x: i for i, x in enumerate(item_ids)}
item_encoded2item = {i: x for i, x in enumerate(item_ids)}
sample_df['item_encoded'] = sample_df['item_id'].map(item2item_encoded)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_df['user_encoded'] = sample_df['user_id'].map(user2user_encoded)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_df['item_encoded'] = sample_df['item_id'].map(item2item_encoded)


In [44]:
train, test = train_test_split(sample_df, test_size = 0.2, random_state=123)

num_train_user = len(np.unique(train['user_encoded']))
num_train_item = len(np.unique(train['item_encoded']))

print(f'total item: {num_items}',
      f'\nitem in train: {num_train_item}',
      f'\nitem not in train: {num_items - num_train_item} ({1 - num_train_item / num_items :.2f})',
      f'\n\ntotal user: {num_users}',
      f'\nuser in train: {num_train_user}',
      f'\nuser not in train: {num_users - num_train_user} ({1 - num_train_user / num_users :.2f})'
     )

total item: 5275 
item in train: 5140 
item not in train: 135 (0.03) 

total user: 228190 
user in train: 22819 
user not in train: 205371 (0.90)


In [45]:
def NCF_model(embed_size = 10, drop_out_prob = 0.2):
    """
    Описываем архитектуру модели

    Embedding слои
    Drop Out слои


    """
    # определить входные данные
    item_input = Input(shape=[1],name='item-input')
    user_input = Input(shape=[1], name='user-input')

    # MLP embeddings для пользователей и элементов
    item_embedding_mlp = Embedding(num_items, embed_size,
                                    name='item-embedding-mlp')(item_input)
    item_vec_mlp = Flatten(name='flatten-item-mlp')(item_embedding_mlp)
    item_vec_mlp = Flatten(name='flatten-item-mlp')(item_embedding_mlp)

    user_embedding_mlp = Embedding(num_users, embed_size,
                                   name='user-embedding-mlp')(user_input)
    user_vec_mlp = Flatten(name='flatten-user-mlp')(user_embedding_mlp)

    # MF embeddings для пользователей и элементов
    item_embedding_mf = Embedding(num_items, embed_size,
                                   name='item-embedding-mf')(item_input)
    item_vec_mf = Flatten(name='flatten-item-mf')(item_embedding_mf)

    user_embedding_mf = Embedding(num_users, embed_size,
                                  name='user-embedding-mf')(user_input)
    user_vec_mf = Flatten(name='flatten-user-mf')(user_embedding_mf)

    # MLP
    concat = concatenate([item_vec_mlp, user_vec_mlp], axis=-1, name='concat')
    concat_dropout = Dropout(drop_out_prob)(concat)

    fc_1 = Dense(100, name='fc-1', activation='relu')(concat_dropout)
    fc_1_bn = BatchNormalization(name='batch-norm-1')(fc_1)
    fc_1_dropout = Dropout(drop_out_prob)(fc_1_bn)

    fc_2 = Dense(50, name='fc-2', activation='relu')(fc_1_dropout)
    fc_2_bn = BatchNormalization(name='batch-norm-2')(fc_2)
    fc_2_dropout = Dropout(drop_out_prob)(fc_2_bn)

    # Предикт
    pred_mlp = Dense(10, name='pred-mlp', activation='relu')(fc_2_dropout)
    pred_mf = dot([item_vec_mf, user_vec_mf], axes=1, normalize=False)

    combine_mlp_mf = concatenate([pred_mf, pred_mlp], axis=-1, name='pred_mf')

    # Финализация модели
    result = Dense(1, name='result', activation='relu')(combine_mlp_mf)

    model = Model([user_input,item_input], result)
    model.compile(optimizer=Adam(learning_rate=0.01), loss='mean_squared_error')

    return model

In [49]:
model = NCF_model()
# визуализируем модель
#SVG(model_to_dot(model, dpi=50, show_shapes=True).create(prog='dot', format='svg'))

In [50]:
model.summary()


Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 item-input (InputLayer)     [(None, 1)]                  0         []                            
                                                                                                  
 user-input (InputLayer)     [(None, 1)]                  0         []                            
                                                                                                  
 item-embedding-mlp (Embedd  (None, 1, 10)                52750     ['item-input[0][0]']          
 ing)                                                                                             
                                                                                                  
 user-embedding-mlp (Embedd  (None, 1, 10)                2281900   ['user-input[0][0]']    

In [93]:

# обучение NCF
model = NCF_model(embed_size=20)
history = model.fit([train['user_encoded'], train['item_encoded']], train['target'], epochs=3, use_multiprocessing=True)



Epoch 1/3
Epoch 2/3
Epoch 3/3


In [3]:
pd.Series(history.history['loss']).plot(logy=True)
plt.xlabel("Epoch")
plt.ylabel("Train Error")
plt.show()

In [97]:
y_hat = np.round(model.predict([test['user_encoded'], test['item_encoded']]), decimals=2)
y_true = test['target']



In [98]:
# предикт
d = {'prediction': y_hat.tolist(), 'true_value': y_true.values.tolist()}
test_pred = pd.DataFrame(d)
test_pred.head(5)


Unnamed: 0,prediction,true_value
0,[1.0],1
1,[1.0],1
2,[1.0],1
3,[1.0],1
4,[1.0],1


In [85]:
test_pred['prediction'][0][0]
pred = []
true_val = []
for i in range(len(test_pred['prediction'])):
    pred.append(int(test_pred['prediction'][i][0]))
    true_val.append(test_pred['true_value'][i])      

In [86]:
min(pred)

1

#### Вычисляем precision и recall

In [1]:

precision_at_k = precision_score(true_val, pred, average='samples')
recall_at_k = recall_score(true_val, pred, average='samples')

In [55]:
# сделаем расчет Топ - N
def NCF_recommendation(rec_model, client_id, top_k = 10):
    client_encoded = user2user_encoded[client_id]
    item_watched = sample_df[sample_df['user_id'] == client_id]['item_id'].values

    item_poll_encoded = []
    for item in item_ids:
        if not np.isin(item, item_watched):
            item_poll_encoded.append(item2item_encoded[item])

    d = {'user_encoded': [client_encoded] * len(item_poll_encoded), 'item_encoded' : item_poll_encoded}
    client_df = pd.DataFrame(d)

    ratings = rec_model.predict([client_df['user_encoded'], client_df['item_encoded']])

    top_ratings_idx = ratings.flatten().argsort()[-top_k:][::-1]
    top_ratings = ratings[top_ratings_idx].flatten()
    recommend_item_id = [item_encoded2item.get(item_poll_encoded[x]) for x in top_ratings_idx]

    top_item_rec = pd.DataFrame({'item_id': recommend_item_id, 'prediction': top_ratings})

    return top_item_rec

In [56]:
client_id = int(np.random.choice(user_ids, 1))
#print(f'recommendation for client:', client_id)
NCF_recommendation(model, client_id, top_k=15)

 33/165 [=====>........................] - ETA: 0s

  client_id = int(np.random.choice(user_ids, 1))




Unnamed: 0,item_id,prediction
0,1377,1.0
1,5485,1.0
2,2316,1.0
3,2030,1.0
4,6988,1.0
5,4175,1.0
6,2261,1.0
7,2892,1.0
8,600,1.0
9,897,1.0


### Расчет метрик

In [57]:
test_idx = list(sorted(set(test.user_id)))
# Теперь создаем новый DataFrame, содержащий только покупки пользователей из теста
test_buys = test[test['user_id'].isin(test_idx)]
grouped_df = test_buys.groupby('user_id')

# Добавляем новую колонку, которая содержит список купленных предметов для каждого пользователя
#fact = grouped_df.apply(lambda x: x['item_id'].tolist()).reset_index(name='list_items')

In [61]:
#fact
df

Unnamed: 0,user_id,list_items,target
0,3,"[363, 3835, 41, 11, 165, 149, 611, 11, 11, 477...",1
1,16,"[3637, 104, 1834, 105, 104, 104, 133, 133, 133...",1
2,29,"[1109, 2171, 157, 451, 1248, 1248, 451, 999, 3...",1
3,34,"[1939, 1069, 1069, 667, 5346, 871, 838, 787, 3...",1
4,35,"[812, 159, 203, 213, 5116, 1594, 184, 217, 99,...",1
...,...,...,...
228185,1057241,"[5136, 191, 1076, 347, 357, 344, 150, 52, 41, ...",1
228186,1057242,"[2567, 1571, 717, 2576, 363, 212, 594, 358, 36...",1
228187,1057245,"[358, 1853, 4943, 447, 50, 1352, 593, 4943, 12...",1
228188,1057256,"[342, 4753, 342, 541, 4378, 105, 336, 440, 317...",1


In [23]:
def hit_rate_at_k(test_idx, fact, k=10):
    result = []
    for i in range(len(fact['list_items'])):
        test_preds = NCF_recommendation(model, test_idx[i], top_k=15)

        final_bought_list = np.array(fact['list_items'][i])
        recommended_list = np.array(list(test_preds['item_id']))
        flags = np.isin(final_bought_list, recommended_list[:k] )
        hit_rate = int(flags.sum() > 0)
        result.append(hit_rate)
    return statistics.mean(result)

In [24]:
hit_rate = hit_rate_at_k(test_idx, fact, k=10)



In [27]:
hit_rate

0.3475691344720346

In [26]:
def precision_at_k(test_idx, fact, k=10):
    result = []
    for i in range(len(fact['list_items'])):
        test_preds = NCF_recommendation(model, test_idx[i], top_k=15)
        final_bought_list = np.array(fact['list_items'][i])
        recommended_list = np.array(list(test_preds['item_id']))

        final_bought_list = final_bought_list
        recommended_list = recommended_list[:k]

        flags = np.isin(recommended_list, final_bought_list)
        precision = flags.sum() / len(recommended_list)
        result.append(precision)
    return statistics.mean(result)


In [27]:
precision = precision_at_k(test_idx, fact, k=10)



In [29]:
precision

0.11260238844914451

In [None]:
def recall_at_k(test_idx, fact, k=10):
    result = []
    for i in range(len(fact['list_items'])):
        test_preds = NCF_recommendation(model, test_idx[i], top_k=15)
        final_bought_list = np.array(fact['list_items'][i])
        recommended_list = np.array(list(test_preds['item_id']))

        final_bought_list = final_bought_list
        recommended_list = recommended_list[:k]

        flags = np.isin(final_bought_list, recommended_list)
        recall = flags.sum() / len(final_bought_list)
        result.append(recall)
    return statistics.mean(result)

In [None]:
recall = recall_at_k(test_idx, fact, k=10)



In [31]:
recall

0.18151467477926245

In [32]:
def f_measure(precision,recall):
    f = 2  *  (precision  *  recall) / (precision + recall)
    return f

In [33]:
f = f_measure(precision, recall)

In [34]:
f

0.1389853801364937

In [41]:
# Precision, recall на встроенных методах в sklearn
hit_rate = accuracy_score(fact, test_preds)

precision_at_k = precision_score(fact, test_preds, average='samples')

recall_at_k = recall_score(fact, test_preds, average='samples')

In [43]:
hit_rate

0.1875681344260871

In [47]:
precision_at_k

0.09160231044874433

In [45]:
recall_at_k

0.07151366412726218

In [48]:
f_measure(precision_at_k,recall_at_k)

0.08032097260541753

### Получившиеся метрики

| Алгоритм | hit_rate | precision | recall | f_measure |
| --- | --- | --- | --- | --- |
| MostPopular | 0.1143 | 0.0125 | 0.0315 | 0.0255 |
| NCF | 0.1876 | 0.0916 | 0.0715 | 0.0803 |