In [1]:
!pip -q install rectools==0.2.0 # for Colab

In [2]:
import pandas as pd
import numpy as np
import scipy as sp
import requests
import dill
import zipfile as zf
from typing import Dict
from tqdm.auto import tqdm
from scipy.stats import mode
from collections import Counter
from implicit.nearest_neighbours import CosineRecommender, TFIDFRecommender
from implicit.nearest_neighbours import ItemItemRecommender

from rectools import Columns
from rectools.dataset import Dataset
from rectools.model_selection import TimeRangeSplit
from rectools.metrics import Precision, Recall, MeanInvUserFreq, Serendipity, calc_metrics

from userknn import UserKnn

# 1. Загрузка датасета и краткий обзор датасета KION

In [3]:
# download dataset by chunks
url = "https://storage.yandexcloud.net/itmo-recsys-public-data/kion_train.zip"

req = requests.get(url, stream=True)

with open('kion_train.zip', "wb") as fd:
    total_size_in_bytes = int(req.headers.get('Content-Length', 0))
    progress_bar = tqdm(desc='kion dataset download', total=total_size_in_bytes, unit='iB', unit_scale=True)
    for chunk in req.iter_content(chunk_size=2 ** 20):
        progress_bar.update(len(chunk))
        fd.write(chunk)

kion dataset download:   0%|          | 0.00/78.8M [00:00<?, ?iB/s]

In [4]:
files = zf.ZipFile('kion_train.zip','r')
files.extractall()
files.close()

## 1.1.EDA

In [5]:
interactions = pd.read_csv('kion_train/interactions.csv', parse_dates=["last_watch_dt"])
users = pd.read_csv('kion_train/users.csv')
items = pd.read_csv('kion_train/items.csv')

In [6]:
# rename columns
interactions.rename(columns={'last_watch_dt': Columns.Datetime,
                            'total_dur': Columns.Weight}, 
                    inplace=True) 

---
**Таблица `interactions`**

In [7]:
interactions.head()

Unnamed: 0,user_id,item_id,datetime,weight,watched_pct
0,176549,9506,2021-05-11,4250,72.0
1,699317,1659,2021-05-29,8317,100.0
2,656683,7107,2021-05-09,10,0.0
3,864613,7638,2021-07-05,14483,100.0
4,964868,9506,2021-04-30,6725,100.0


In [8]:
interactions.shape

(5476251, 5)

In [9]:
max_date = interactions['datetime'].max()
min_date = interactions['datetime'].min()

print(f"min date in interactions: {min_date}")
print(f"max date in interactions: {max_date}")

min date in interactions: 2021-03-13 00:00:00
max date in interactions: 2021-08-22 00:00:00


In [10]:
interactions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5476251 entries, 0 to 5476250
Data columns (total 5 columns):
 #   Column       Dtype         
---  ------       -----         
 0   user_id      int64         
 1   item_id      int64         
 2   datetime     datetime64[ns]
 3   weight       int64         
 4   watched_pct  float64       
dtypes: datetime64[ns](1), float64(1), int64(3)
memory usage: 208.9 MB


---
**Таблица `users`**

In [11]:
users.head()

Unnamed: 0,user_id,age,income,sex,kids_flg
0,973171,age_25_34,income_60_90,М,1
1,962099,age_18_24,income_20_40,М,0
2,1047345,age_45_54,income_40_60,Ж,0
3,721985,age_45_54,income_20_40,Ж,0
4,704055,age_35_44,income_60_90,Ж,0


In [12]:
print(f"Users dataframe shape {users.shape}")
print(f"Unique users: {users['user_id'].nunique()}")

Users dataframe shape (840197, 5)
Unique users: 840197


---
**Table `items`**

In [13]:
items.head(2)

Unnamed: 0,item_id,content_type,title,title_orig,release_year,genres,countries,for_kids,age_rating,studios,directors,actors,description,keywords
0,10711,film,Поговори с ней,Hable con ella,2002.0,"драмы, зарубежные, детективы, мелодрамы",Испания,,16.0,,Педро Альмодовар,"Адольфо Фернандес, Ана Фернандес, Дарио Гранди...",Мелодрама легендарного Педро Альмодовара «Пого...,"Поговори, ней, 2002, Испания, друзья, любовь, ..."
1,2508,film,Голые перцы,Search Party,2014.0,"зарубежные, приключения, комедии",США,,16.0,,Скот Армстронг,"Адам Палли, Брайан Хаски, Дж.Б. Смув, Джейсон ...",Уморительная современная комедия на популярную...,"Голые, перцы, 2014, США, друзья, свадьбы, прео..."


In [14]:
print(f"Items dataframe shape {items.shape}")
print(f"Unique item_id: {items['item_id'].nunique()}")

Items dataframe shape (15963, 14)
Unique item_id: 15963


!!! Для учебных расчетов возьмем датасет в 10к записей юзеров, чтобы не ждать долго расчетов. Соответственно нужно преобразовать таблицы взаимодействий и айтемов

In [15]:
users = users[:10000].reset_index(drop=True)
interactions = interactions[interactions['user_id'].isin(users['user_id'])].reset_index(drop=True)
items = items[items['item_id'].isin(interactions['item_id'])].reset_index(drop=True)

Проверка

In [16]:
print(users.shape)
print(interactions.shape)
print(items.shape)

(10000, 5)
(54114, 5)
(4910, 14)


# 2. Моя модель kNN с добавлением микса популярных айтемов

##2.1 Добавление рекомендованных item до N

Воспользуемся моделью Ирины, только будем добавлять недостоющие количество айтемов. Одна из идей - это добавлять в разной пропорции недостающие айтемы при рекомендациях из трех списков популярных айтемов. К примеру нужно добавить 7 айтемов после kNN модели:
* 50% будет добавляться семплами из списка самых популярных за прошедший 1 месяц
* 30% семплы из списка популярных за прошедшие 3 месяца
* 20% семплы из списка популярных за все время

Итого получится:
* 4шт из поп30дней
* 2шт из поп90дней
* 1шт из поп_алл_тайм

Все повторы айтемов будут дропаться и заменяться другими рандомными значениями из списка поп_алл_тайм

Напишем функцию, которая будет возвращать рандомные значения из этих списков, согласно логике выше. На вход она будет принимать только необходимое количество для выдачи результатов.

In [17]:
def mix_popular_items(df_inter: pd.DataFrame, number: int) -> list:
    """
    Return list consisting of input number(max=10) mix of popular items.
    near 50% of items - from list top 10 last 30 days
    near 30% of items - from list top 10 last 90 days
    near 20% of items - from list top 10 for all time
    """
    df = df_inter.copy()

    # np.array top@k pop for different periods
    def popoular_number_of_items_days(
        df: pd.DataFrame, k: int = 10, days: int = 14, all_time: bool = False
    ) -> np.array:
        """
        Return a np.array of top@k most popular items for last N days
        """
        recommendations = []

        if all_time == True:
            recommendations = df.loc[:, "item_id"].value_counts().head(k).index.values
        else:
            min_date = df["datetime"].max().normalize() - pd.DateOffset(days)
            recommendations = (
                df.loc[df["datetime"] > min_date, "item_id"]
                .value_counts()
                .head(k)
                .index.values
            )
        return recommendations

    # all duplicates will be deleting and adding some items from pop_all_time
    def del_repeat_items(arr_mix_pop: np.array) -> np.array:
        """
        Delete all duplicates items in array
        """
        if len(set(arr_mix_pop)) == number:
            return arr_mix_pop
        else:
            i = number - len(set(arr_mix_pop))
            arr_mix_pop = np.concatenate(
                (arr_mix_pop, np.random.choice(pop_all_time, i, replace=False))
            )
            return del_repeat_items(arr_mix_pop)

    arr_mix_pop = np.array([])
    pop_30d = popoular_number_of_items_days(df, k=10, days=30)
    pop_90d = popoular_number_of_items_days(df, k=10, days=90)
    pop_all_time = popoular_number_of_items_days(df, k=10, all_time=True)

    if number == 1:
        i, j, k = 1, 0, 0
    elif number == 2:
        i, j, k = 1, 1, 0
    elif number == 3:
        i, j, k = 2, 1, 0
    elif number == 4:
        i, j, k = 2, 1, 1
    elif number == 5:
        i, j, k = 3, 1, 1
    elif number == 6:
        i, j, k = 3, 2, 1
    elif number == 7:
        i, j, k = 4, 2, 1
    elif number == 8:
        i, j, k = 4, 2, 2
    elif number == 9:
        i, j, k = 5, 2, 2
    elif number == 10:
        i, j, k = 5, 3, 2
    else:
      arr_mix_pop = np.array([])
      return list(arr_mix_pop)

    arr_mix_pop = np.concatenate(
        (
            np.random.choice(pop_30d, i, replace=False),
            np.random.choice(pop_90d, j, replace=False),
            np.random.choice(pop_all_time, k, replace=False),
        )
    )

    arr_mix_pop = del_repeat_items(arr_mix_pop)

    return list(set(arr_mix_pop))


In [18]:
list_mix_pop = mix_popular_items(interactions, 4)

In [19]:
list_mix_pop

[12192, 7793, 142, 2657]

---

Теперь можно взять работу Ирины, доработать и сравнивать модели

In [20]:
# train test split 
# test = last 1 week 

n_folds = 1
unit = "W"
n_units = 1
periods = n_folds + 1
freq = str('1')+unit

last_date = interactions[Columns.Datetime].max().normalize()
start_date = last_date - pd.Timedelta(n_folds * n_units + 1, unit=unit)  
print(f"Start date and last date of the test fold: {start_date, last_date}")
    
date_range = pd.date_range(start=start_date, periods=periods, freq=freq, tz=last_date.tz)
print(f"Test fold borders: {date_range.values.astype('datetime64[D]')}")

# generator of folds
cv = TimeRangeSplit(
    date_range=date_range,
    filter_already_seen=True,
    filter_cold_items=True,
    filter_cold_users=True,
)
print(f"Real number of folds: {cv.get_n_splits(interactions)}")

Start date and last date of the test fold: (Timestamp('2021-08-08 00:00:00'), Timestamp('2021-08-22 00:00:00'))
Test fold borders: ['2021-08-08' '2021-08-15']
Real number of folds: 1


In [21]:
# we have just 1 test fold - no need to iterate over fold
(train_ids, test_ids, fold_info) = cv.split(interactions, collect_fold_stats=True).__next__()

In [22]:
train_ids

array([    0,     1,     2, ..., 54109, 54110, 54113])

In [23]:
test_ids

array([   23,    43,    50, ..., 53974, 54015, 54080])

In [24]:
train = interactions.iloc[train_ids]
test = interactions.iloc[test_ids]

In [25]:
users_inv_mapping = dict(enumerate(train['user_id'].unique()))
users_mapping = {v: k for k, v in users_inv_mapping.items()}

In [26]:
items_inv_mapping = dict(enumerate(train['item_id'].unique()))
items_mapping = {v: k for k, v in items_inv_mapping.items()}

In [27]:
print(f"users_mapping amount: {len(users_mapping)}")
print(f"items_mapping amount: {len(items_mapping)}")

users_mapping amount: 7784
items_mapping amount: 4610


## 2.2 Get sparse matrix 

In [28]:
def get_coo_matrix(df, 
                   user_col='user_id', 
                   item_col='item_id', 
                   weight_col=None, 
                   users_mapping=None, 
                   items_mapping=None):
    if weight_col:
        weights = df[weight_col].astype(np.float32)
    else:
        weights = np.ones(len(df), dtype=np.float32)

    interaction_matrix = sp.sparse.coo_matrix((
        weights, 
        (
            df[user_col].map(users_mapping.get), 
            df[item_col].map(items_mapping.get)
        )
    ))
    return interaction_matrix

In [29]:
interaction_matrix = get_coo_matrix(train, users_mapping=users_mapping, items_mapping=items_mapping)

In [30]:
interaction_matrix

<7784x4610 sparse matrix of type '<class 'numpy.float32'>'
	with 45686 stored elements in COOrdinate format>

## 2.3. Fit simple ItemKNN model

In [31]:
userknn = CosineRecommender(K=30)
userknn.fit(interaction_matrix)

  0%|          | 0/7784 [00:00<?, ?it/s]

In [32]:
# save model
with open('userknn.dill', 'wb') as f:
    dill.dump(userknn, f)

In [33]:
with open('userknn.dill', 'rb') as f:
    userknn = dill.load(f)

In [34]:
userknn.similar_items(1)

[(1, 0.9999999999999997),
 (1396, 0.545544725589981),
 (1482, 0.5345224838248487),
 (2489, 0.5345224838248487),
 (3079, 0.5345224838248487),
 (3090, 0.5345224838248487),
 (3575, 0.5345224838248487),
 (3925, 0.5345224838248487),
 (4652, 0.5345224838248487),
 (4802, 0.5345224838248487)]

## 2.4. Recommend to get user neighbours

In [35]:
def generate_implicit_recs_mapper(model, N, users_mapping, users_inv_mapping):
    def _recs_mapper(user):
        user_id = users_mapping[user]
        recs = model.similar_items(user_id, N=N)
        return [users_inv_mapping[user] for user, _ in recs], [sim for _, sim in recs]
    return _recs_mapper

In [36]:
mapper = generate_implicit_recs_mapper(
    userknn, 
    N=30,
    users_mapping=users_mapping,
    users_inv_mapping=users_inv_mapping
)

In [37]:
recs = pd.DataFrame({
    'user_id': test['user_id'].unique()
})

recs['similar_user_id'], recs['similarity'] = zip(*recs['user_id'].map(mapper))
recs.head()

Unnamed: 0,user_id,similar_user_id,similarity
0,484565,"[484565, 31029, 1047195, 579268, 582706, 99690...","[0.9999999999999991, 0.15191090506255, 0.15041..."
1,712247,"[712247, 445137, 210533, 1076735, 366288, 5408...","[0.9999999999999998, 0.7071067811865475, 0.707..."
2,1040288,"[1040288, 841157, 634040, 380396, 1080759, 768...","[1.0000000000000002, 0.29277002188455997, 0.23..."
3,375980,"[375980, 470198, 604365, 413030, 776549, 32102...","[0.9999999999999984, 0.2795084971874737, 0.276..."
4,639580,"[639580, 505245, 871518, 414849, 997088, 22524...","[0.9999999999999989, 0.32935478783704736, 0.31..."


In [38]:
# explode lists to get vertical representation
recs = recs.set_index('user_id').apply(pd.Series.explode).reset_index()

In [39]:
recs.head(30 + 5)

Unnamed: 0,user_id,similar_user_id,similarity
0,484565,484565,1.0
1,484565,31029,0.151911
2,484565,1047195,0.150414
3,484565,579268,0.148842
4,484565,582706,0.14825
5,484565,996907,0.145951
6,484565,827676,0.144707
7,484565,784005,0.144115
8,484565,922198,0.143223
9,484565,620712,0.138675


In [40]:
# delete recommendations of itself 
recs = recs[~(recs['user_id'] == recs['similar_user_id'])] # изменил условие, это более точное, чем которое было

In [41]:
recs.shape

(27137, 3)

In [42]:
recs.head()

Unnamed: 0,user_id,similar_user_id,similarity
1,484565,31029,0.151911
2,484565,1047195,0.150414
3,484565,579268,0.148842
4,484565,582706,0.14825
5,484565,996907,0.145951


## 2.5. Join watched items of neighbour users to get item recommendations

In [43]:
watched = train.groupby('user_id').agg({'item_id': list})
watched.head()

Unnamed: 0_level_0,item_id
user_id,Unnamed: 1_level_1
649,"[14705, 15297, 4151, 1833, 4716, 5239, 13058, ..."
718,[10440]
760,"[13865, 10432, 10878, 9728, 7107, 4880, 7556, ..."
914,"[2722, 8741, 11756]"
971,[5434]


In [44]:
# join watched items
recs = recs.merge(watched, left_on=['similar_user_id'], right_on=['user_id'], how='left')
recs = recs.explode('item_id')

In [45]:
recs.head()

Unnamed: 0,user_id,similar_user_id,similarity,item_id
0,484565,31029,0.151911,3553
0,484565,31029,0.151911,15983
0,484565,31029,0.151911,1554
1,484565,1047195,0.150414,5287
1,484565,1047195,0.150414,12686


In [46]:
# drop duplicates pairs user_id-item_id 
# keep with the largest similiarity
recs = recs.sort_values(['user_id', 'similarity'], ascending=False)
recs.head()

Unnamed: 0,user_id,similar_user_id,similarity,item_id
6247,1097119,1089539,0.57735,13865
6248,1097119,836554,0.57735,13865
6249,1097119,777859,0.57735,13865
6250,1097119,683984,0.57735,13865
6251,1097119,406036,0.57735,13865


In [47]:
recs = recs.drop_duplicates(['user_id', 'item_id'], keep='first')

In [48]:
recs.shape

(57502, 4)

## 2.6. Make `rank` from similarity



In [49]:
cnt = Counter(train['item_id'].values)

In [50]:
# convert counter to dataframe
idf = pd.DataFrame.from_dict(cnt, orient='index', columns=['doc_freq']).reset_index()
idf.head()

Unnamed: 0,index,doc_freq
0,3190,28
1,10219,38
2,3500,1
3,9506,24
4,10876,13


In [51]:
# num of documents = num of recommendation list = dataframe shape
n = train.shape[0]
idf['idf'] = idf['doc_freq'].apply(lambda x: np.log((1 + n) / (1 + x) + 1))

In [52]:
idf.head()

Unnamed: 0,index,doc_freq,idf
0,3190,28,7.362908
1,10219,38,7.066861
2,3500,1,10.036466
3,9506,24,7.51124
4,10876,13,8.090818


In [53]:
# join idf 
recs = recs.merge(
    idf[["index", "idf"]], left_on="item_id", right_on="index", how="left"
).drop(["index"], axis=1)

recs.head()

Unnamed: 0,user_id,similar_user_id,similarity,item_id,idf
0,1097119,1089539,0.57735,13865,3.792957
1,1096426,247511,0.333333,11863,6.087452
2,1096426,659514,0.333333,5434,6.620029
3,1096426,520431,0.333333,884,7.95733
4,1096426,44365,0.333333,5693,6.18833


In [54]:
recs['rank_idf'] = recs['similarity'] * recs['idf']

In [55]:
recs.head()

Unnamed: 0,user_id,similar_user_id,similarity,item_id,idf,rank_idf
0,1097119,1089539,0.57735,13865,3.792957,2.189865
1,1096426,247511,0.333333,11863,6.087452,2.029151
2,1096426,659514,0.333333,5434,6.620029,2.206676
3,1096426,520431,0.333333,884,7.95733,2.652443
4,1096426,44365,0.333333,5693,6.18833,2.062777


In [56]:
# make order by rank_idf 
recs = recs.sort_values(['user_id', 'rank_idf'], ascending=False)

In [57]:
# make rank
recs['rank'] = recs.groupby('user_id').cumcount() + 1 

In [58]:
recs.head()

Unnamed: 0,user_id,similar_user_id,similarity,item_id,idf,rank_idf,rank
0,1097119,1089539,0.57735,13865,3.792957,2.189865,1
3,1096426,520431,0.333333,884,7.95733,2.652443,1
7,1096426,83742,0.333333,6684,7.329028,2.443009,2
2,1096426,659514,0.333333,5434,6.620029,2.206676,3
4,1096426,44365,0.333333,5693,6.18833,2.062777,4


## 2.7. Добавление количества рекомендаций

Теперь применим нашу функцию к каждому `user_id` у которых количество рекомендаций меньше 10, предварительно удалим все строки, где количество рекомендаций для одного пользователя больше 10

In [59]:
recs = recs[recs['rank'] <= 10]

In [60]:
recs_temp = recs.copy()

Создадим табличку для хранения всех рекомендаций для всех пользователей

In [61]:
final_reco = (recs_temp
                  .groupby('user_id')
                  .agg({'item_id': list})
                  )

In [62]:
final_reco.head()

Unnamed: 0_level_0,item_id
user_id,Unnamed: 1_level_1
971,"[5434, 8314, 7793, 3996, 16428, 10972, 14879, ..."
3072,"[1131, 11312, 2220, 8447, 1132, 6809]"
5526,"[2616, 12026, 3773, 15399, 6382, 16228, 142, 4..."
5694,"[6809, 4880]"
6188,"[10636, 13414, 10878, 12659, 366, 4495, 7793, ..."


In [63]:
tqdm.pandas(desc='My bar!')

In [64]:
# очень долгое выполнение около 5 часов для полного датасета
final_reco.loc[:, 'item_id'] = final_reco.loc[:, 'item_id'].progress_apply(lambda x: x + mix_popular_items(train, 10 - len(x)))

My bar!:   0%|          | 0/965 [00:00<?, ?it/s]

In [65]:
final_reco.head()

Unnamed: 0_level_0,item_id
user_id,Unnamed: 1_level_1
971,"[5434, 8314, 7793, 3996, 16428, 10972, 14879, ..."
3072,"[1131, 11312, 2220, 8447, 1132, 6809, 12192, 4..."
5526,"[2616, 12026, 3773, 15399, 6382, 16228, 142, 4..."
5694,"[6809, 4880, 2657, 15297, 13865, 142, 4880, 78..."
6188,"[10636, 13414, 10878, 12659, 366, 4495, 7793, ..."


In [66]:
# Сохраним табличку для использования в сервисе
if final_reco.shape[0] != 965:
  final_reco.to_pickle("final_reco.pickle")

На самом деле тут есть несколько вариантов использования нашей функции по добавлению айтемов из микса популярных айтемов:
* вызывать функцию для каждого юзера, как сделали выше, учитывая, что при каждом вызове функции список с айтемами формируется рандомно, то можно говорить о некой "индивидуальности" для каждого юзера
* вызвать функцию 9 раз и сформировать 9 списков-констант из микса популярных айтемов, и использовать каждый из них в соответствующих ситуациях. Тем самым для двух юзеров, которым необходимо добавить по 5 айтемов, будет использован один и тот же список для добавления. На мой взгляд в этом случае теряется "индивидуальный" подход, и, возможно, значение метрик ухудшится.
* усложнить функцию и для каждых 10к юзеров(к примеру) формировать новые списки для добавления айтемов.

Я решил остановится на 1 варианте, так как считаю, что это будет более хороший вариант, но и самый затраный по времени, а сервис придется перевести в "оффлайн". Зато в лидерборде, скорее всего займем 1 место =)

---

Теперь создадим табличку для метрик

In [67]:
my_reco = final_reco.explode('item_id')
my_reco['rank'] = my_reco.groupby('user_id').cumcount() + 1 
my_reco = my_reco.reset_index()

Метрики из работы Ирины

In [68]:
# calculate several classic (precision@k and recall@k) and "beyond accuracy" metrics
metrics = {
    "prec@10": Precision(k=10),
    "recall@10": Recall(k=10),
    "novelty": MeanInvUserFreq(k=10),
    "serendipity": Serendipity(k=10),
}

catalog = train['item_id'].unique()
    
metric_values = calc_metrics(
            metrics,
            reco=recs,
            interactions=test,
            prev_interactions=train,
            catalog=catalog
        )

In [69]:
metric_values

{'prec@10': 0.003212435233160622,
 'recall@10': 0.0139284032030146,
 'novelty': 7.8631252608643,
 'serendipity': 5.069756314515962e-05}

Метрика с добавлением рекомендаций до 10

In [70]:
metric_values_my = calc_metrics(
            metrics,
            reco=my_reco,
            interactions=test,
            prev_interactions=train,
            catalog=catalog
        )
metric_values_my

{'prec@10': 0.009637305699481865,
 'recall@10': 0.05146036935735198,
 'novelty': 7.7485997360495675,
 'serendipity': 4.725028941364236e-05}

Как видно наши показатели лучше

## 2.7. Сборка в один класс

Нужно собрать все в один класс, для того чтобы использовать в нашем сервисе. Воспользуемся файлом userknn.py Ирины и дополним его

In [71]:
class my_UserKnn:
    """Class for fit-perdict UserKNN model
    based on ItemKNN model from implicit.nearest_neighbours
    """

    def __init__(self, model: ItemItemRecommender, N_users: int = 50):
        self.N_users = N_users
        self.model = model
        self.is_fitted = False

    def get_mappings(self, train):
        self.users_inv_mapping = dict(enumerate(train["user_id"].unique()))
        self.users_mapping = {v: k for k, v in self.users_inv_mapping.items()}

        self.items_inv_mapping = dict(enumerate(train["item_id"].unique()))
        self.items_mapping = {v: k for k, v in self.items_inv_mapping.items()}

    def get_matrix(
        self,
        df: pd.DataFrame,
        user_col: str = "user_id",
        item_col: str = "item_id",
        weight_col: str = None,
        users_mapping: Dict[int, int] = None,
        items_mapping: Dict[int, int] = None,
    ):

        if weight_col:
            weights = df[weight_col].astype(np.float32)
        else:
            weights = np.ones(len(df), dtype=np.float32)

        interaction_matrix = sp.sparse.coo_matrix(
            (
                weights,
                (
                    df[user_col].map(self.users_mapping.get),
                    df[item_col].map(self.items_mapping.get),
                ),
            )
        )

        self.watched = df.groupby(user_col).agg({item_col: list})
        return interaction_matrix

    def idf(self, n: int, x: float):
        return np.log((1 + n) / (1 + x) + 1)

    def _count_item_idf(self, df: pd.DataFrame):
        item_cnt = Counter(df["item_id"].values)
        item_idf = pd.DataFrame.from_dict(
            item_cnt, orient="index", columns=["doc_freq"]
        ).reset_index()
        item_idf["idf"] = item_idf["doc_freq"].apply(lambda x: self.idf(self.n, x))
        self.item_idf = item_idf

    def fit(self, train: pd.DataFrame):
        self.user_knn = self.model
        self.get_mappings(train)
        self.weights_matrix = self.get_matrix(
            train, users_mapping=self.users_mapping, items_mapping=self.items_mapping
        )

        self.n = train.shape[0]
        self._count_item_idf(train)

        self.user_knn.fit(self.weights_matrix)
        self.is_fitted = True

    def _generate_recs_mapper(
        self,
        model: ItemItemRecommender,
        user_mapping: Dict[int, int],
        user_inv_mapping: Dict[int, int],
        N: int,
    ):
        def _recs_mapper(user):
            user_id = user_mapping[user]
            recs = model.similar_items(user_id, N=N)
            return [user_inv_mapping[user] for user, _ in recs], [
                sim for _, sim in recs
            ]

        return _recs_mapper

    def predict(self, test: pd.DataFrame, N_recs: int = 10):

        if not self.is_fitted:
            raise ValueError("Please call fit before predict")

        mapper = self._generate_recs_mapper(
            model=self.user_knn,
            user_mapping=self.users_mapping,
            user_inv_mapping=self.users_inv_mapping,
            N=self.N_users,
        )

        recs = pd.DataFrame({"user_id": test["user_id"].unique()})
        recs["sim_user_id"], recs["sim"] = zip(*recs["user_id"].map(mapper))
        recs = recs.set_index("user_id").apply(pd.Series.explode).reset_index()

        recs = (
            recs[~(recs["sim"] >= 1)]
            .merge(
                self.watched, left_on=["sim_user_id"], right_on=["user_id"], how="left"
            )
            .explode("item_id")
            .sort_values(["user_id", "sim"], ascending=False)
            .drop_duplicates(["user_id", "item_id"], keep="first")
            .merge(self.item_idf, left_on="item_id", right_on="index", how="left")
        )

        recs["score"] = recs["sim"] * recs["idf"]
        recs = recs.sort_values(["user_id", "score"], ascending=False)
        recs["rank"] = recs.groupby("user_id").cumcount() + 1
        recs = recs[recs['rank'] <= 10]

        final_reco = (recs_temp
                  .groupby('user_id')
                  .agg({'item_id': list})
                  )
        # очень долгое выполнение около 5 часов для полного датасета
        final_reco.loc[:, 'item_id'] = (final_reco
                                        .loc[:, 'item_id']
                                        .apply(lambda x: x + \
                                               mix_popular_items(train, 10 - len(x)))
                                        )
        
        # преобразование таблицы для метрик
        my_reco = final_reco.explode('item_id')
        my_reco['rank'] = my_reco.groupby('user_id').cumcount() + 1 
        my_reco = my_reco.reset_index()        
        
        return my_reco

# 3. CV для моделей: userkNN

Сравнение implicit `CosineRecommender` и `TFIDFRecommender`

Поменяем количество фолдов, потому что 8 недель - это два месяца и из 6 месяцев отдавать два на тест - это слишком много на мой взгляд. Лучше поставить 5 или 6 недель

In [72]:
# setting for cv 
n_folds = 5
unit = "W"
n_units = 1

last_date = interactions[Columns.Datetime].max().normalize()
start_date = last_date - pd.Timedelta(n_folds * n_units + 1, unit=unit)  
print(f"Start date and last date of the test fold: {start_date, last_date}")

Start date and last date of the test fold: (Timestamp('2021-07-11 00:00:00'), Timestamp('2021-08-22 00:00:00'))


## 3.1 Test fold borders

In [73]:
periods = n_folds + 1
freq = f"{n_units}{unit}"
print(
    f"start_date: {start_date}\n"
    f"last_date: {last_date}\n"
    f"periods: {periods}\n"
    f"freq: {freq}\n"
)
    
date_range = pd.date_range(start=start_date, periods=periods, freq=freq, tz=last_date.tz)
print(f"Test fold borders: {date_range.values.astype('datetime64[D]')}")

# generator of folds
cv = TimeRangeSplit(
    date_range=date_range,
    filter_already_seen=True,
    filter_cold_items=True,
    filter_cold_users=True,
)
print(f"Real number of folds: {cv.get_n_splits(interactions)}")

start_date: 2021-07-11 00:00:00
last_date: 2021-08-22 00:00:00
periods: 6
freq: 1W

Test fold borders: ['2021-07-11' '2021-07-18' '2021-07-25' '2021-08-01' '2021-08-08'
 '2021-08-15']
Real number of folds: 5


In [74]:
# calculate several classic (precision@k and recall@k) and "beyond accuracy" metrics
metrics = {
    "prec@10": Precision(k=10),
    "recall@10": Recall(k=10),
    "novelty": MeanInvUserFreq(k=10),
    "serendipity": Serendipity(k=10),
}

# few simple models to compare
models = {
    "cosine_itemknn": CosineRecommender(),
    "tfidf_itemknn": TFIDFRecommender(),
    "my_cosine_itemknn": CosineRecommender(),
    "my_tfidf_itemknn": TFIDFRecommender(),
}


## 3.2. Model training by fold

In [75]:
%%time

results = []

fold_iterator = cv.split(interactions, collect_fold_stats=True)

for i_fold, (train_ids, test_ids, fold_info) in enumerate(fold_iterator):
    print(f"\n==================== Fold {i_fold}")
    print(fold_info)

    df_train = interactions.iloc[train_ids].copy()
    df_test = interactions.iloc[test_ids][Columns.UserItem].copy()

    catalog = df_train[Columns.Item].unique()
    
    for model_name, model in models.items():
        if model_name[:3] == 'my_':
            userknn_model = my_UserKnn(model=model, N_users=50)
        else:
            userknn_model = UserKnn(model=model, N_users=50)
        
        userknn_model.fit(df_train)
    
        recos = userknn_model.predict(df_train, df_test)
    
        metric_values = calc_metrics(
            metrics,
            reco=recos,
            interactions=df_test,
            prev_interactions=df_train,
            catalog=catalog,
        )
    
        fold = {"fold": i_fold, "model": model_name}
        fold.update(metric_values)
        results.append(fold)


{'Start date': Timestamp('2021-07-11 00:00:00', freq='W-SUN'), 'End date': Timestamp('2021-07-18 00:00:00', freq='W-SUN'), 'Train': 32152, 'Train users': 6019, 'Train items': 4008, 'Test': 2009, 'Test users': 827, 'Test items': 868}


  0%|          | 0/6019 [00:00<?, ?it/s]

  0%|          | 0/6019 [00:00<?, ?it/s]

  0%|          | 0/6019 [00:00<?, ?it/s]

  0%|          | 0/6019 [00:00<?, ?it/s]


{'Start date': Timestamp('2021-07-18 00:00:00', freq='W-SUN'), 'End date': Timestamp('2021-07-25 00:00:00', freq='W-SUN'), 'Train': 35120, 'Train users': 6418, 'Train items': 4177, 'Test': 2234, 'Test users': 825, 'Test items': 978}


  0%|          | 0/6418 [00:00<?, ?it/s]

  0%|          | 0/6418 [00:00<?, ?it/s]

  0%|          | 0/6418 [00:00<?, ?it/s]

  0%|          | 0/6418 [00:00<?, ?it/s]


{'Start date': Timestamp('2021-07-25 00:00:00', freq='W-SUN'), 'End date': Timestamp('2021-08-01 00:00:00', freq='W-SUN'), 'Train': 38476, 'Train users': 6837, 'Train items': 4328, 'Test': 2358, 'Test users': 893, 'Test items': 976}


  0%|          | 0/6837 [00:00<?, ?it/s]

  0%|          | 0/6837 [00:00<?, ?it/s]

  0%|          | 0/6837 [00:00<?, ?it/s]

  0%|          | 0/6837 [00:00<?, ?it/s]


{'Start date': Timestamp('2021-08-01 00:00:00', freq='W-SUN'), 'End date': Timestamp('2021-08-08 00:00:00', freq='W-SUN'), 'Train': 42100, 'Train users': 7335, 'Train items': 4444, 'Test': 2458, 'Test users': 952, 'Test items': 999}


  0%|          | 0/7335 [00:00<?, ?it/s]

  0%|          | 0/7335 [00:00<?, ?it/s]

  0%|          | 0/7335 [00:00<?, ?it/s]

  0%|          | 0/7335 [00:00<?, ?it/s]


{'Start date': Timestamp('2021-08-08 00:00:00', freq='W-SUN'), 'End date': Timestamp('2021-08-15 00:00:00', freq='W-SUN'), 'Train': 45686, 'Train users': 7784, 'Train items': 4610, 'Test': 2512, 'Test users': 965, 'Test items': 1069}


  0%|          | 0/7784 [00:00<?, ?it/s]

  0%|          | 0/7784 [00:00<?, ?it/s]

  0%|          | 0/7784 [00:00<?, ?it/s]

  0%|          | 0/7784 [00:00<?, ?it/s]

CPU times: user 1min 20s, sys: 1.29 s, total: 1min 21s
Wall time: 1min 21s


# 4. Metrics



## 4.1. Metric description 

In [76]:
df_metrics = pd.DataFrame(results)
df_metrics

Unnamed: 0,fold,model,prec@10,recall@10,novelty,serendipity
0,0,cosine_itemknn,0.004474,0.023481,7.804875,7e-06
1,0,tfidf_itemknn,0.005683,0.027392,7.808798,7e-06
2,0,my_cosine_itemknn,0.013301,0.064182,8.247248,0.000199
3,0,my_tfidf_itemknn,0.01318,0.064182,8.247702,0.000199
4,1,cosine_itemknn,0.004,0.021404,7.837846,5e-06
5,1,tfidf_itemknn,0.006424,0.033254,7.861392,6e-06
6,1,my_cosine_itemknn,0.019394,0.081292,8.076013,0.000276
7,1,my_tfidf_itemknn,0.019636,0.081494,8.072751,0.000276
8,2,cosine_itemknn,0.003807,0.022496,7.802561,3e-06
9,2,tfidf_itemknn,0.006383,0.039588,7.828361,4e-06


## 4.2. Metrics mean by fold

In [77]:
df_metrics.groupby('model').mean()[metrics.keys()]

Unnamed: 0_level_0,prec@10,recall@10,novelty,serendipity
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
cosine_itemknn,0.003584,0.019586,7.8151,5e-06
my_cosine_itemknn,0.021093,0.089292,7.971998,0.000287
my_tfidf_itemknn,0.021159,0.090383,7.971807,0.000287
tfidf_itemknn,0.005762,0.030667,7.828616,6e-06


Как видно из таблицы, добавление к рекомендованным айтемам айтемов до 10 из микса популярных айтемов дает ощутимый прирост почти по всем метрикам, и самое главное при этом не ухудшилась метрика новизны