In [1]:
!pip -q install rectools==0.2.0 # for Colab

In [2]:
import pandas as pd
import numpy as np
import scipy as sp
import requests
import dill
import zipfile as zf
from typing import Dict
from tqdm.auto import tqdm
from scipy.stats import mode
from collections import Counter
from implicit.nearest_neighbours import CosineRecommender, TFIDFRecommender
from implicit.nearest_neighbours import ItemItemRecommender

from rectools import Columns
from rectools.dataset import Dataset
from rectools.model_selection import TimeRangeSplit
from rectools.metrics import Precision, Recall, MeanInvUserFreq, Serendipity, calc_metrics, MAP

from userknn import UserKnn

# 1. Загрузка датасета и краткий обзор датасета KION

In [3]:
# download dataset by chunks
url = "https://storage.yandexcloud.net/itmo-recsys-public-data/kion_train.zip"

req = requests.get(url, stream=True)

with open('kion_train.zip', "wb") as fd:
    total_size_in_bytes = int(req.headers.get('Content-Length', 0))
    progress_bar = tqdm(desc='kion dataset download', total=total_size_in_bytes, unit='iB', unit_scale=True)
    for chunk in req.iter_content(chunk_size=2 ** 20):
        progress_bar.update(len(chunk))
        fd.write(chunk)

kion dataset download:   0%|          | 0.00/78.8M [00:00<?, ?iB/s]

In [4]:
files = zf.ZipFile('kion_train.zip','r')
files.extractall()
files.close()

## 1.1.EDA

In [5]:
interactions = pd.read_csv('kion_train/interactions.csv', parse_dates=["last_watch_dt"])
users = pd.read_csv('kion_train/users.csv')
items = pd.read_csv('kion_train/items.csv')

In [6]:
# rename columns
interactions.rename(columns={'last_watch_dt': Columns.Datetime,
                            'total_dur': Columns.Weight}, 
                    inplace=True) 

---
**Таблица `interactions`**

In [7]:
interactions.head()

Unnamed: 0,user_id,item_id,datetime,weight,watched_pct
0,176549,9506,2021-05-11,4250,72.0
1,699317,1659,2021-05-29,8317,100.0
2,656683,7107,2021-05-09,10,0.0
3,864613,7638,2021-07-05,14483,100.0
4,964868,9506,2021-04-30,6725,100.0


In [8]:
interactions.shape

(5476251, 5)

In [9]:
max_date = interactions['datetime'].max()
min_date = interactions['datetime'].min()

print(f"min date in interactions: {min_date}")
print(f"max date in interactions: {max_date}")

min date in interactions: 2021-03-13 00:00:00
max date in interactions: 2021-08-22 00:00:00


In [10]:
interactions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5476251 entries, 0 to 5476250
Data columns (total 5 columns):
 #   Column       Dtype         
---  ------       -----         
 0   user_id      int64         
 1   item_id      int64         
 2   datetime     datetime64[ns]
 3   weight       int64         
 4   watched_pct  float64       
dtypes: datetime64[ns](1), float64(1), int64(3)
memory usage: 208.9 MB


---
**Таблица `users`**

In [11]:
users.head()

Unnamed: 0,user_id,age,income,sex,kids_flg
0,973171,age_25_34,income_60_90,М,1
1,962099,age_18_24,income_20_40,М,0
2,1047345,age_45_54,income_40_60,Ж,0
3,721985,age_45_54,income_20_40,Ж,0
4,704055,age_35_44,income_60_90,Ж,0


In [12]:
print(f"Users dataframe shape {users.shape}")
print(f"Unique users: {users['user_id'].nunique()}")

Users dataframe shape (840197, 5)
Unique users: 840197


---
**Table `items`**

In [13]:
items.head(2)

Unnamed: 0,item_id,content_type,title,title_orig,release_year,genres,countries,for_kids,age_rating,studios,directors,actors,description,keywords
0,10711,film,Поговори с ней,Hable con ella,2002.0,"драмы, зарубежные, детективы, мелодрамы",Испания,,16.0,,Педро Альмодовар,"Адольфо Фернандес, Ана Фернандес, Дарио Гранди...",Мелодрама легендарного Педро Альмодовара «Пого...,"Поговори, ней, 2002, Испания, друзья, любовь, ..."
1,2508,film,Голые перцы,Search Party,2014.0,"зарубежные, приключения, комедии",США,,16.0,,Скот Армстронг,"Адам Палли, Брайан Хаски, Дж.Б. Смув, Джейсон ...",Уморительная современная комедия на популярную...,"Голые, перцы, 2014, США, друзья, свадьбы, прео..."


In [14]:
print(f"Items dataframe shape {items.shape}")
print(f"Unique item_id: {items['item_id'].nunique()}")

Items dataframe shape (15963, 14)
Unique item_id: 15963


!!! Для учебных расчетов возьмем датасет в 50к записей юзеров, чтобы не ждать долго расчетов. Соответственно нужно преобразовать таблицы взаимодействий и айтемов

In [15]:
users = users[:50000].reset_index(drop=True)
interactions = interactions[interactions['user_id'].isin(users['user_id'])].reset_index(drop=True)
items = items[items['item_id'].isin(interactions['item_id'])].reset_index(drop=True)

Проверка

In [16]:
print(users.shape)
print(interactions.shape)
print(items.shape)

(50000, 5)
(263107, 5)
(7825, 14)


# 2. Моя модель kNN с добавлением микса популярных айтемов

## 2.1 Добавление рекомендованных item до N

Воспользуемся моделью Ирины, только будем добавлять недостоющие количество айтемов к уже имеющемуся списку айтемов. Воспользуемся самой банальной идеей и проверим ее, а именно будем добавлять всем юзерам недостающие значения из списка самых популярных айтемов за последние 14 дней. Повторяющиеся дубли будем удалять, и заменять их рандомным айтемом из того же списка.

Функция для формирования списка айтемов за последние 14 дней

In [17]:
def popoular_number_of_items_days(
    df: pd.DataFrame, k: int = 10, days: int = 14, all_time: bool = False
) -> np.array:
    """
    Return a np.array of top@k most popular items for last N days
    """
    if all_time is True:
        recommendations = df.loc[:, "item_id"].value_counts().head(k).index.values
    else:
        min_date = df["datetime"].max().normalize() - pd.DateOffset(days)
        recommendations = (
            df.loc[df["datetime"] > min_date, "item_id"]
            .value_counts()
            .head(k)
            .index.values
        )
    return list(recommendations)

In [18]:
list_pop_items_14d = popoular_number_of_items_days(interactions, days=14)
list_pop_items_14d

[9728, 15297, 10440, 13865, 12192, 341, 3734, 7793, 4151, 14488]

Функция для добавления необходимого количества айтемов к списку айтемов юзера после модели. Возвращает список уникальных айтемов.

In [19]:
def full_reco_items_list(arr_reco_after_model: np.array, pop_array: np.array, number: int) -> list:
    """
    Add number of items from pop_array to arr_reco_after_model.
    Return unique array of 10 items
    """

    CONST_K = 10

    size_of_array = np.unique(arr_reco_after_model).size
    if size_of_array == CONST_K:
      return arr_reco_after_model

    # all duplicates will be deleting and adding some items from pop_array
    def del_repeat_items(full_arr_mix_pop: np.array, k: int = CONST_K) -> np.array:
        """
        Delete all duplicates items in array
        """
        
        size_of_array = np.unique(full_arr_mix_pop).size
        if size_of_array == CONST_K:
            return full_arr_mix_pop
        else:
            # delete duplicates and save the order of items
            full_arr_mix_pop = full_arr_mix_pop[np.sort(np.unique(full_arr_mix_pop, return_index=True)[1])]
            # add new items from pop_array
            i = k - size_of_array
            full_arr_mix_pop = np.concatenate(
                (full_arr_mix_pop, np.random.choice(pop_array, i, replace=False))
            )
            return del_repeat_items(full_arr_mix_pop)

    full_arr_mix_pop = np.array([])
    full_arr_mix_pop = np.concatenate(
        (
            arr_reco_after_model,
            pop_array[:number]
        )
    )
    full_arr_mix_pop = del_repeat_items(full_arr_mix_pop)

    return list(full_arr_mix_pop)

Проверка

In [20]:
tepm_list = np.array([15297,2,3,4,5,9728])
tepm_list

array([15297,     2,     3,     4,     5,  9728])

In [21]:
list_mix_pop = full_reco_items_list(tepm_list, list_pop_items_14d, 4)
list_mix_pop

[15297, 2, 3, 4, 5, 9728, 10440, 13865, 341, 4151]

---

Теперь можно взять работу Ирины, доработать и сравнивать модели

In [22]:
# train test split 
# test = last 1 week 

n_folds = 1
unit = "W"
n_units = 1
periods = n_folds + 1
freq = str('1')+unit

last_date = interactions[Columns.Datetime].max().normalize()
start_date = last_date - pd.Timedelta(n_folds * n_units + 1, unit=unit)  
print(f"Start date and last date of the test fold: {start_date, last_date}")
    
date_range = pd.date_range(start=start_date, periods=periods, freq=freq, tz=last_date.tz)
print(f"Test fold borders: {date_range.values.astype('datetime64[D]')}")

# generator of folds
cv = TimeRangeSplit(
    date_range=date_range,
    filter_already_seen=True,
    filter_cold_items=True,
    filter_cold_users=True,
)
print(f"Real number of folds: {cv.get_n_splits(interactions)}")

Start date and last date of the test fold: (Timestamp('2021-08-08 00:00:00'), Timestamp('2021-08-22 00:00:00'))
Test fold borders: ['2021-08-08' '2021-08-15']
Real number of folds: 1


In [23]:
# we have just 1 test fold - no need to iterate over fold
(train_ids, test_ids, fold_info) = cv.split(interactions, collect_fold_stats=True).__next__()

In [24]:
train_ids

array([     0,      1,      4, ..., 263104, 263105, 263106])

In [25]:
test_ids

array([    28,     51,     53, ..., 263068, 263092, 263095])

In [26]:
train = interactions.iloc[train_ids]
test = interactions.iloc[test_ids]

In [27]:
users_inv_mapping = dict(enumerate(train['user_id'].unique()))
users_mapping = {v: k for k, v in users_inv_mapping.items()}

In [28]:
items_inv_mapping = dict(enumerate(train['item_id'].unique()))
items_mapping = {v: k for k, v in items_inv_mapping.items()}

In [29]:
print(f"users_mapping amount: {len(users_mapping)}")
print(f"items_mapping amount: {len(items_mapping)}")

users_mapping amount: 39079
items_mapping amount: 7469


## 2.2 Get sparse matrix 

In [30]:
def get_coo_matrix(df, 
                   user_col='user_id', 
                   item_col='item_id', 
                   weight_col=None, 
                   users_mapping=None, 
                   items_mapping=None):
    if weight_col:
        weights = df[weight_col].astype(np.float32)
    else:
        weights = np.ones(len(df), dtype=np.float32)

    interaction_matrix = sp.sparse.coo_matrix((
        weights, 
        (
            df[user_col].map(users_mapping.get), 
            df[item_col].map(items_mapping.get)
        )
    ))
    return interaction_matrix

In [31]:
interaction_matrix = get_coo_matrix(train, users_mapping=users_mapping, items_mapping=items_mapping)

In [32]:
interaction_matrix

<39079x7469 sparse matrix of type '<class 'numpy.float32'>'
	with 221881 stored elements in COOrdinate format>

## 2.3. Fit simple ItemKNN model

In [33]:
userknn = CosineRecommender(K=30)
userknn.fit(interaction_matrix)

  0%|          | 0/39079 [00:00<?, ?it/s]

In [34]:
# save model
with open('userknn.dill', 'wb') as f:
    dill.dump(userknn, f)

In [35]:
with open('userknn.dill', 'rb') as f:
    userknn = dill.load(f)

In [36]:
userknn.similar_items(1)

[(1, 1.0),
 (19402, 0.30316953129541613),
 (7319, 0.28194193721730043),
 (16531, 0.280056016805602),
 (28590, 0.280056016805602),
 (352, 0.27500954910846337),
 (9335, 0.2711630722733202),
 (13502, 0.2592814894208657),
 (16092, 0.2592814894208657),
 (15451, 0.2585438449975096)]

## 2.4. Recommend to get user neighbours

In [37]:
def generate_implicit_recs_mapper(model, N, users_mapping, users_inv_mapping):
    def _recs_mapper(user):
        user_id = users_mapping[user]
        recs = model.similar_items(user_id, N=N)
        return [users_inv_mapping[user] for user, _ in recs], [sim for _, sim in recs]
    return _recs_mapper

In [38]:
mapper = generate_implicit_recs_mapper(
    userknn, 
    N=30,
    users_mapping=users_mapping,
    users_inv_mapping=users_inv_mapping
)

In [39]:
recs = pd.DataFrame({
    'user_id': test['user_id'].unique()
})

recs['similar_user_id'], recs['similarity'] = zip(*recs['user_id'].map(mapper))
recs.head()

Unnamed: 0,user_id,similar_user_id,similarity
0,328865,"[328865, 944648, 53684, 550149, 376937, 106840...","[1.0000000000000002, 0.3333333333333333, 0.333..."
1,841380,"[841380, 758250, 650136, 631822, 800582, 55922...","[1.0000000000000002, 0.5773502691896258, 0.577..."
2,57475,"[57475, 118191, 607961, 612914, 104949, 891588...","[0.9999999999999997, 0.28347335475692037, 0.28..."
3,109887,"[109887, 883098, 461468, 93243, 59768, 23802, ...","[1.0000000000000007, 0.48038446141526137, 0.43..."
4,518017,"[518017, 912406, 116959, 908784, 267165, 48678...","[0.9999999999999998, 0.7071067811865475, 0.707..."


In [40]:
# explode lists to get vertical representation
recs = recs.set_index('user_id').apply(pd.Series.explode).reset_index()

In [41]:
recs.head(30 + 5)

Unnamed: 0,user_id,similar_user_id,similarity
0,328865,328865,1.0
1,328865,944648,0.333333
2,328865,53684,0.333333
3,328865,550149,0.333333
4,328865,376937,0.333333
5,328865,1068400,0.333333
6,328865,6310,0.333333
7,328865,1043404,0.333333
8,328865,550510,0.333333
9,328865,621272,0.333333


In [42]:
# delete recommendations of itself 
recs = recs[~(recs['user_id'] == recs['similar_user_id'])] # изменил условие, это более точное, чем которое было

In [43]:
recs.shape

(140133, 3)

In [44]:
recs.head()

Unnamed: 0,user_id,similar_user_id,similarity
1,328865,944648,0.333333
2,328865,53684,0.333333
3,328865,550149,0.333333
4,328865,376937,0.333333
5,328865,1068400,0.333333


## 2.5. Join watched items of neighbour users to get item recommendations

In [45]:
watched = train.groupby('user_id').agg({'item_id': list})
watched.head()

Unnamed: 0_level_0,item_id
user_id,Unnamed: 1_level_1
20,[15297]
53,"[1562, 16107, 13865, 1650, 7571, 4151, 15297, ..."
191,"[10353, 10732, 14341, 5569, 11756, 4878, 13202..."
199,"[15297, 10440]"
216,[2657]


In [46]:
# join watched items
recs = recs.merge(watched, left_on=['similar_user_id'], right_on=['user_id'], how='left')
recs = recs.explode('item_id')

In [47]:
recs.head()

Unnamed: 0,user_id,similar_user_id,similarity,item_id
0,328865,944648,0.333333,3076
0,328865,944648,0.333333,9728
1,328865,53684,0.333333,9728
1,328865,53684,0.333333,13018
2,328865,550149,0.333333,4685


In [48]:
# drop duplicates pairs user_id-item_id 
# keep with the largest similiarity
recs = recs.sort_values(['user_id', 'similarity'], ascending=False)
recs.head()

Unnamed: 0,user_id,similar_user_id,similarity,item_id
84795,1097329,795625,1.0,9728
84795,1097329,795625,1.0,7571
84796,1097329,966082,1.0,7571
84796,1097329,966082,1.0,9728
84797,1097329,797903,1.0,9728


In [49]:
recs = recs.drop_duplicates(['user_id', 'item_id'], keep='first')

In [50]:
recs.shape

(209546, 4)

## 2.6. Make `rank` from similarity



In [51]:
cnt = Counter(train['item_id'].values)

In [52]:
# convert counter to dataframe
idf = pd.DataFrame.from_dict(cnt, orient='index', columns=['doc_freq']).reset_index()
idf.head()

Unnamed: 0,index,doc_freq
0,9506,148
1,3190,92
2,4740,1272
3,9980,34
4,6979,99


In [53]:
# num of documents = num of recommendation list = dataframe shape
n = train.shape[0]
idf['idf'] = idf['doc_freq'].apply(lambda x: np.log((1 + n) / (1 + x) + 1))

In [54]:
idf.head()

Unnamed: 0,index,doc_freq,idf
0,9506,148,7.306626
1,3190,92,7.777721
2,4740,1272,5.16649
3,9980,34,8.754711
4,6979,99,7.705181


In [55]:
# join idf 
recs = recs.merge(
    idf[["index", "idf"]], left_on="item_id", right_on="index", how="left"
).drop(["index"], axis=1)

recs.head()

Unnamed: 0,user_id,similar_user_id,similarity,item_id,idf
0,1097329,795625,1.0,9728,3.761552
1,1097329,795625,1.0,7571,5.259798
2,1097329,203058,0.816497,4436,5.599437
3,1097329,972644,0.816497,13414,7.97951
4,1097329,312587,0.816497,4880,4.542809


In [56]:
recs['rank_idf'] = recs['similarity'] * recs['idf']

In [57]:
recs.head()

Unnamed: 0,user_id,similar_user_id,similarity,item_id,idf,rank_idf
0,1097329,795625,1.0,9728,3.761552,3.761552
1,1097329,795625,1.0,7571,5.259798,5.259798
2,1097329,203058,0.816497,4436,5.599437,4.571921
3,1097329,972644,0.816497,13414,7.97951,6.515243
4,1097329,312587,0.816497,4880,4.542809,3.709188


In [58]:
# make order by rank_idf 
recs = recs.sort_values(['user_id', 'rank_idf'], ascending=False)

In [59]:
# make rank
recs['rank'] = recs.groupby('user_id').cumcount() + 1 

In [60]:
recs.head()

Unnamed: 0,user_id,similar_user_id,similarity,item_id,idf,rank_idf,rank
5,1097329,936863,0.816497,16175,10.923625,8.919102,1
8,1097329,800180,0.816497,10881,8.32116,6.794199,2
3,1097329,972644,0.816497,13414,7.97951,6.515243,3
6,1097329,980633,0.816497,2720,6.694369,5.465929,4
1,1097329,795625,1.0,7571,5.259798,5.259798,5


## 2.7. Добавление количества рекомендаций

Теперь применим нашу функцию к каждому `user_id` у которых количество рекомендаций меньше 10, предварительно удалим все строки, где количество рекомендаций для одного пользователя больше 10

In [61]:
recs = recs[recs['rank'] <= 10]

Создадим табличку для хранения всех рекомендаций для всех пользователей

In [62]:
final_reco = (recs
                  .groupby('user_id')
                  .agg({'item_id': list})
                  )

In [63]:
final_reco.head()

Unnamed: 0_level_0,item_id
user_id,Unnamed: 1_level_1
53,"[2134, 15034, 3017, 3768, 5732, 10569, 10281, ..."
382,"[565, 2956, 13218, 12386, 10605, 5411, 1112, 8..."
631,[7417]
657,"[389, 14648, 11237]"
849,"[8617, 4223, 4436, 9728, 10440]"


In [64]:
tqdm.pandas(desc='My bar!')

In [65]:
# очень долгое выполнение около 5 часов для полного датасета
final_reco.loc[:, 'item_id'] = (final_reco.loc[:, 'item_id']
                                .progress_apply(lambda x: full_reco_items_list(np.array(x), np.array(list_pop_items_14d), (10 - len(x)))))

My bar!:   0%|          | 0/4855 [00:00<?, ?it/s]

---

Теперь создадим табличку для метрик

In [66]:
my_reco = final_reco.explode('item_id')
my_reco['rank'] = my_reco.groupby('user_id').cumcount() + 1 
my_reco = my_reco.reset_index()

Метрики из работы Ирины

In [67]:
# calculate several classic (precision@k and recall@k) and "beyond accuracy" metrics
metrics = {
    "prec@10": Precision(k=10),
    "recall@10": Recall(k=10),
    "novelty": MeanInvUserFreq(k=10),
    "serendipity": Serendipity(k=10),
    "MAP@10": MAP(k=10)
}

catalog = train['item_id'].unique()
    
metric_values = calc_metrics(
            metrics,
            reco=recs,
            interactions=test,
            prev_interactions=train,
            catalog=catalog
        )

In [68]:
metric_values

{'prec@10': 0.004325437693099897,
 'recall@10': 0.02027725615426846,
 'MAP@10': 0.003872647308319393,
 'novelty': 7.930466482372508,
 'serendipity': 5.8183505899220365e-05}

Метрика с добавлением рекомендаций до 10

In [69]:
metric_values_my = calc_metrics(
            metrics,
            reco=my_reco,
            interactions=test,
            prev_interactions=train,
            catalog=catalog
        )
metric_values_my

{'prec@10': 0.015262615859938208,
 'recall@10': 0.08366694724657261,
 'MAP@10': 0.01806539526476222,
 'novelty': 7.962690685371174,
 'serendipity': 5.9985668190622105e-05}

Как видно наши показатели лучше

## 2.7. Сборка в один класс

Нужно собрать все в один класс, для того чтобы использовать в нашем сервисе. Воспользуемся файлом userknn.py Ирины и дополним его

In [70]:
class my_UserKnn:
    """Class for fit-perdict UserKNN model
    based on ItemKNN model from implicit.nearest_neighbours
    """

    def __init__(self, model: ItemItemRecommender, N_users: int = 50):
        self.N_users = N_users
        self.model = model
        self.is_fitted = False

    def get_mappings(self, train):
        self.users_inv_mapping = dict(enumerate(train["user_id"].unique()))
        self.users_mapping = {v: k for k, v in self.users_inv_mapping.items()}

        self.items_inv_mapping = dict(enumerate(train["item_id"].unique()))
        self.items_mapping = {v: k for k, v in self.items_inv_mapping.items()}

    def get_matrix(
        self,
        df: pd.DataFrame,
        user_col: str = "user_id",
        item_col: str = "item_id",
        weight_col: str = None,
        users_mapping: Dict[int, int] = None,
        items_mapping: Dict[int, int] = None,
    ):

        if weight_col:
            weights = df[weight_col].astype(np.float32)
        else:
            weights = np.ones(len(df), dtype=np.float32)

        interaction_matrix = sp.sparse.coo_matrix(
            (
                weights,
                (
                    df[user_col].map(self.users_mapping.get),
                    df[item_col].map(self.items_mapping.get),
                ),
            )
        )

        self.watched = df.groupby(user_col).agg({item_col: list})
        return interaction_matrix

    def idf(self, n: int, x: float):
        return np.log((1 + n) / (1 + x) + 1)

    def _count_item_idf(self, df: pd.DataFrame):
        item_cnt = Counter(df["item_id"].values)
        item_idf = pd.DataFrame.from_dict(
            item_cnt, orient="index", columns=["doc_freq"]
        ).reset_index()
        item_idf["idf"] = item_idf["doc_freq"].apply(lambda x: self.idf(self.n, x))
        self.item_idf = item_idf

    def fit(self, train: pd.DataFrame):
        self.user_knn = self.model
        self.get_mappings(train)
        self.weights_matrix = self.get_matrix(
            train, users_mapping=self.users_mapping, items_mapping=self.items_mapping
        )

        self.n = train.shape[0]
        self._count_item_idf(train)

        self.user_knn.fit(self.weights_matrix)
        self.is_fitted = True

    def _generate_recs_mapper(
        self,
        model: ItemItemRecommender,
        user_mapping: Dict[int, int],
        user_inv_mapping: Dict[int, int],
        N: int,
    ):
        def _recs_mapper(user):
            user_id = user_mapping[user]
            recs = model.similar_items(user_id, N=N)
            return [user_inv_mapping[user] for user, _ in recs], [
                sim for _, sim in recs
            ]

        return _recs_mapper

    def predict(self, test: pd.DataFrame, N_recs: int = 10):

        if not self.is_fitted:
            raise ValueError("Please call fit before predict")

        mapper = self._generate_recs_mapper(
            model=self.user_knn,
            user_mapping=self.users_mapping,
            user_inv_mapping=self.users_inv_mapping,
            N=self.N_users,
        )

        recs = pd.DataFrame({"user_id": test["user_id"].unique()})
        recs["sim_user_id"], recs["sim"] = zip(*recs["user_id"].map(mapper))

        recs = recs.set_index("user_id").apply(pd.Series.explode).reset_index()

        recs = (
            recs[~(recs['user_id'] == recs['sim_user_id'])]
            .merge(
                self.watched, left_on=["sim_user_id"], right_on=["user_id"], how="left"
            )
            .explode("item_id")
            .sort_values(["user_id", "sim"], ascending=False)
            .drop_duplicates(["user_id", "item_id"], keep="first")
            .merge(self.item_idf, left_on="item_id", right_on="index", how="left")
        )
        recs["score"] = recs["sim"] * recs["idf"]
        recs = recs.sort_values(["user_id", "score"], ascending=False)
        recs["rank"] = recs.groupby("user_id").cumcount() + 1
        recs = recs[recs['rank'] <= N_recs]

        final_reco = (recs
                  .groupby('user_id')
                  .agg({'item_id': list})
                  )

        final_reco["item_id"] = final_reco.loc[:, "item_id"].apply(
            lambda x: full_reco_items_list(np.array(x), np.array(list_pop_items_14d), (10 - len(x)))
        )
        # используется только в сервисе или для проверки
        # reco_for_user = final_reco.loc[test['user_id'].unique(), "item_id"] 
        
        # преобразование таблицы для метрик, только для колаба
        my_reco = final_reco.explode('item_id')
        my_reco['rank'] = my_reco.groupby('user_id').cumcount() + 1 
        my_reco = my_reco.reset_index()        
        
        #return list(reco_for_user.iloc[0]) # для сервиса
        return my_reco # только для колаба

# 3. CV для моделей: userkNN

Сравнение implicit `CosineRecommender` и `TFIDFRecommender`

Поменяем количество фолдов, потому что 8 недель - это два месяца и из 6 месяцев, и отдавать два на тест - это слишком много на мой взгляд. Лучше поставить 5 или 6 недель

In [71]:
# setting for cv 
n_folds = 5
unit = "W"
n_units = 1

last_date = interactions[Columns.Datetime].max().normalize()
start_date = last_date - pd.Timedelta(n_folds * n_units + 1, unit=unit)  
print(f"Start date and last date of the test fold: {start_date, last_date}")

Start date and last date of the test fold: (Timestamp('2021-07-11 00:00:00'), Timestamp('2021-08-22 00:00:00'))


## 3.1 Test fold borders

In [72]:
periods = n_folds + 1
freq = f"{n_units}{unit}"
print(
    f"start_date: {start_date}\n"
    f"last_date: {last_date}\n"
    f"periods: {periods}\n"
    f"freq: {freq}\n"
)
    
date_range = pd.date_range(start=start_date, periods=periods, freq=freq, tz=last_date.tz)
print(f"Test fold borders: {date_range.values.astype('datetime64[D]')}")

# generator of folds
cv = TimeRangeSplit(
    date_range=date_range,
    filter_already_seen=True,
    filter_cold_items=True,
    filter_cold_users=True,
)
print(f"Real number of folds: {cv.get_n_splits(interactions)}")

start_date: 2021-07-11 00:00:00
last_date: 2021-08-22 00:00:00
periods: 6
freq: 1W

Test fold borders: ['2021-07-11' '2021-07-18' '2021-07-25' '2021-08-01' '2021-08-08'
 '2021-08-15']
Real number of folds: 5


Добавим наш доработанный класс и сравним его с метриками Ирины

In [73]:
# calculate several classic (precision@k and recall@k) and "beyond accuracy" metrics
metrics = {
    "prec@10": Precision(k=10),
    "recall@10": Recall(k=10),
    "novelty": MeanInvUserFreq(k=10),
    "serendipity": Serendipity(k=10),
    "MAP@10": MAP(k=10)
}

# few simple models to compare
models = {
    "cosine_itemknn": CosineRecommender(),
    "tfidf_itemknn": TFIDFRecommender(),
    "my_cosine_itemknn": CosineRecommender(),
    "my_tfidf_itemknn": TFIDFRecommender()
}


## 3.2. Model training by fold

In [74]:
%%time

results = []

fold_iterator = cv.split(interactions, collect_fold_stats=True)

for i_fold, (train_ids, test_ids, fold_info) in enumerate(fold_iterator):
    print(f"\n==================== Fold {i_fold}")
    print(fold_info)

    df_train = interactions.iloc[train_ids].copy()
    df_test = interactions.iloc[test_ids][Columns.UserItem].copy()

    catalog = df_train[Columns.Item].unique()
    
    for model_name, model in models.items():
        if model_name[:3] == 'my_':
            userknn_model = my_UserKnn(model=model, N_users=50)
        else:
            userknn_model = UserKnn(model=model, N_users=50)

        userknn_model.fit(df_train)
        recos = userknn_model.predict(df_test)
    
        metric_values = calc_metrics(
            metrics,
            reco=recos,
            interactions=df_test,
            prev_interactions=df_train,
            catalog=catalog,
        )
    
        fold = {"fold": i_fold, "model": model_name}
        fold.update(metric_values)
        results.append(fold)


{'Start date': Timestamp('2021-07-11 00:00:00', freq='W-SUN'), 'End date': Timestamp('2021-07-18 00:00:00', freq='W-SUN'), 'Train': 154058, 'Train users': 29872, 'Train items': 6764, 'Test': 10369, 'Test users': 4069, 'Test items': 2161}


  0%|          | 0/29872 [00:00<?, ?it/s]

  0%|          | 0/29872 [00:00<?, ?it/s]

  0%|          | 0/29872 [00:00<?, ?it/s]

  0%|          | 0/29872 [00:00<?, ?it/s]


{'Start date': Timestamp('2021-07-18 00:00:00', freq='W-SUN'), 'End date': Timestamp('2021-07-25 00:00:00', freq='W-SUN'), 'Train': 169161, 'Train users': 31982, 'Train items': 6968, 'Test': 11257, 'Test users': 4190, 'Test items': 2342}


  0%|          | 0/31982 [00:00<?, ?it/s]

  0%|          | 0/31982 [00:00<?, ?it/s]

  0%|          | 0/31982 [00:00<?, ?it/s]

  0%|          | 0/31982 [00:00<?, ?it/s]


{'Start date': Timestamp('2021-07-25 00:00:00', freq='W-SUN'), 'End date': Timestamp('2021-08-01 00:00:00', freq='W-SUN'), 'Train': 185626, 'Train users': 34257, 'Train items': 7124, 'Test': 12258, 'Test users': 4592, 'Test items': 2368}


  0%|          | 0/34257 [00:00<?, ?it/s]

  0%|          | 0/34257 [00:00<?, ?it/s]

  0%|          | 0/34257 [00:00<?, ?it/s]

  0%|          | 0/34257 [00:00<?, ?it/s]


{'Start date': Timestamp('2021-08-01 00:00:00', freq='W-SUN'), 'End date': Timestamp('2021-08-08 00:00:00', freq='W-SUN'), 'Train': 203620, 'Train users': 36692, 'Train items': 7248, 'Test': 12410, 'Test users': 4719, 'Test items': 2308}


  0%|          | 0/36692 [00:00<?, ?it/s]

  0%|          | 0/36692 [00:00<?, ?it/s]

  0%|          | 0/36692 [00:00<?, ?it/s]

  0%|          | 0/36692 [00:00<?, ?it/s]


{'Start date': Timestamp('2021-08-08 00:00:00', freq='W-SUN'), 'End date': Timestamp('2021-08-15 00:00:00', freq='W-SUN'), 'Train': 221881, 'Train users': 39079, 'Train items': 7469, 'Test': 13046, 'Test users': 4855, 'Test items': 2431}


  0%|          | 0/39079 [00:00<?, ?it/s]

  0%|          | 0/39079 [00:00<?, ?it/s]

  0%|          | 0/39079 [00:00<?, ?it/s]

  0%|          | 0/39079 [00:00<?, ?it/s]

CPU times: user 1min 47s, sys: 1.47 s, total: 1min 49s
Wall time: 1min 37s


# 4. Metrics



## 4.1. Metric description 

In [75]:
df_metrics = pd.DataFrame(results)
df_metrics

Unnamed: 0,fold,model,prec@10,recall@10,MAP@10,novelty,serendipity
0,0,cosine_itemknn,0.004276,0.024102,0.004496,7.924878,3.4e-05
1,0,tfidf_itemknn,0.006169,0.035247,0.007083,8.131562,3.8e-05
2,0,my_cosine_itemknn,0.019268,0.115144,0.025044,7.591518,6.2e-05
3,0,my_tfidf_itemknn,0.016245,0.098049,0.021914,8.075609,5.7e-05
4,1,cosine_itemknn,0.003675,0.018405,0.003577,8.023029,3e-05
5,1,tfidf_itemknn,0.006301,0.034967,0.006606,8.135482,4.1e-05
6,1,my_cosine_itemknn,0.019069,0.107507,0.02408,7.566709,4.3e-05
7,1,my_tfidf_itemknn,0.015871,0.091772,0.020252,8.046625,4.6e-05
8,2,cosine_itemknn,0.003637,0.018849,0.00305,8.051317,3.5e-05
9,2,tfidf_itemknn,0.005945,0.030246,0.006234,8.226832,4.6e-05


## 4.2. Metrics mean by fold

In [76]:
df_metrics.groupby('model').mean()[metrics.keys()]

Unnamed: 0_level_0,prec@10,recall@10,novelty,serendipity,MAP@10
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
cosine_itemknn,0.003758,0.019716,8.042191,3.5e-05,0.003569
my_cosine_itemknn,0.01832,0.103645,7.536814,5.8e-05,0.022841
my_tfidf_itemknn,0.015657,0.088873,8.028589,6.1e-05,0.019802
tfidf_itemknn,0.00616,0.032223,8.175461,4.7e-05,0.006379


Как видно из таблицы, добавление к рекомендованным айтемам айтемов до 10 из микса популярных айтемов дает ощутимый прирост почти по всем метрикам, и самое главное при этом чуть-чуть ухудшилась метрика новизны