In [None]:
# import sys
# !{sys.executable} -m pip install rectools==0.2.0

In [21]:
import os
import pickle
import random
import warnings

import numpy as np
import pandas as pd
from implicit.nearest_neighbours import (
    BM25Recommender,
    CosineRecommender,
    TFIDFRecommender,
)
from rectools import Columns
from rectools.dataset import Dataset
from rectools.metrics import (
    MAP,
    MeanInvUserFreq,
    Precision,
    Recall,
    Serendipity,
    calc_metrics,
)
from rectools.model_selection import TimeRangeSplit
from rectools.models import ImplicitItemKNNWrapperModel
from rectools.models.popular import PopularModel 

# from service.api.models_zoo import UserKNN

warnings.filterwarnings("ignore")

In [4]:
# Rendering: never truncate the column list, allow up to 200 characters per cell.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_colwidth", 200)

# One seed for every source of randomness configured in this notebook.
seed = 42
os.environ["PYTHONHASHSEED"] = str(seed)
random.seed(seed)
np.random.seed(seed)

In [None]:
# download and unpack the dataset archive (uncomment to run)
# !wget https://storage.yandexcloud.net/itmo-recsys-public-data/kion_train.zip -O ../data/data_original.zip
# !unzip ../data/data_original.zip -d ../data

--2022-12-04 17:08:24--  https://storage.yandexcloud.net/itmo-recsys-public-data/kion_train.zip
Resolving storage.yandexcloud.net (storage.yandexcloud.net)... 213.180.193.243
Connecting to storage.yandexcloud.net (storage.yandexcloud.net)|213.180.193.243|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 78795385 (75M) [application/zip]
Saving to: ‘../data/data_original.zip’


2022-12-04 17:08:33 (8.64 MB/s) - ‘../data/data_original.zip’ saved [78795385/78795385]

Archive:  ../data/data_original.zip
   creating: ../data/kion_train/
  inflating: ../data/kion_train/interactions.csv  
  inflating: ../data/__MACOSX/kion_train/._interactions.csv  
  inflating: ../data/kion_train/users.csv  
  inflating: ../data/__MACOSX/kion_train/._users.csv  
  inflating: ../data/kion_train/items.csv  
  inflating: ../data/__MACOSX/kion_train/._items.csv  


In [None]:
# Read the three raw KION tables from the unpacked archive.
interactions = pd.read_csv('../data/kion_train/interactions.csv')
users = pd.read_csv('../data/kion_train/users.csv')
items = pd.read_csv('../data/kion_train/items.csv')

# Switch to the RecTools column names, then parse the watch date into a real
# datetime column so it can be used for the time-based split.
interactions = interactions.rename(
    columns={
        'last_watch_dt': Columns.Datetime,
        'total_dur': Columns.Weight,
    }
)
interactions[Columns.Datetime] = pd.to_datetime(interactions[Columns.Datetime])

## Train test split

In [10]:
# Time-based train/test split: n_folds test windows, each n_units * unit long.
# NOTE(review): the original comment said "test = last 1 week", but the recorded
# output shows fold borders 2021-08-08 / 2021-08-15 while last_date is 2021-08-22,
# i.e. the fold is the week before the final one — confirm this is intended.
n_folds = 1
unit = "W"
n_units = 1
periods = n_folds + 1

# Midnight of the day of the most recent interaction.
last_date = interactions[Columns.Datetime].max().normalize()
start_date = last_date - pd.Timedelta(n_folds * n_units + 1, unit=unit)  # pd.Timedelta gives the length of the span to step back from last_date
print(f"Start date and last date of the test fold: {start_date, last_date}")

# Fold borders: `periods` dates starting at start_date with frequency `unit`.
date_range = pd.date_range(start=start_date, periods=periods, freq=unit, tz=last_date.tz)
print(f"Test fold borders: {date_range.values.astype('datetime64[D]')}")

# Fold generator; the filter_* flags presumably drop already-seen pairs and
# cold users/items from the test part — see the RecTools docs to confirm.
cv = TimeRangeSplit(
    date_range=date_range,
    filter_already_seen=True,
    filter_cold_items=True,
    filter_cold_users=True,
)
print(f"Real number of folds: {cv.get_n_splits(interactions)}")

Start date and last date of the test fold: (Timestamp('2021-08-08 00:00:00'), Timestamp('2021-08-22 00:00:00'))
Test fold borders: ['2021-08-08' '2021-08-15']
Real number of folds: 1


In [11]:
# Pull the first fold out of the split iterator (n_folds is 1, so it is the only one).
# next() is the idiomatic way to advance an iterator instead of calling __next__ directly.
train_ids, test_ids, fold_info = next(cv.split(interactions, collect_fold_stats=True))

In [12]:
# Materialise both parts of the fold as standalone frames with a fresh 0..n-1 index.
train, test = (
    interactions.loc[fold_ids].reset_index(drop=True)
    for fold_ids in (train_ids, test_ids)
)

In [13]:
# Quick look at the first rows of the train part (columns after the rename).
train.head()

Unnamed: 0,user_id,item_id,datetime,weight,watched_pct
0,176549,9506,2021-05-11,4250,72.0
1,699317,1659,2021-05-29,8317,100.0
2,656683,7107,2021-05-09,10,0.0
3,864613,7638,2021-07-05,14483,100.0
4,964868,9506,2021-04-30,6725,100.0


In [14]:
# Wrap the interaction frames into RecTools Dataset objects.
# Despite the `_df` suffix these are Dataset instances, not DataFrames.
train_df = Dataset.construct(train)
test_df = Dataset.construct(test)

In [15]:
N = 10 # Number of recommendations requested per user (passed as k to recommend())

In [16]:
# Item kNN built on implicit's TFIDFRecommender, wrapped for RecTools.
# Author's note on neighbourhood size: K = 10, 25, 50.
tfidf_knn = TFIDFRecommender(K=10)
model_tfidf = ImplicitItemKNNWrapperModel(tfidf_knn)
model_tfidf.fit(train_df)

# Top-N items for every user present in train; filter_viewed=True asks the
# model to leave out items the user has already interacted with.
train_users = train[Columns.User].unique()
recos_tfidf = model_tfidf.recommend(
    users=train_users,
    dataset=train_df,
    k=N,
    filter_viewed=True,
)

In [17]:
# Item kNN built on implicit's BM25Recommender, wrapped for RecTools.
# Author's notes: K = 10, 25, 50; changing the K1 and b coefficients does not
# make much of a difference.
bm25_knn = BM25Recommender(K=10, K1=2)
model_bm25 = ImplicitItemKNNWrapperModel(bm25_knn)
model_bm25.fit(train_df)

# Top-N items for every user present in train; filter_viewed=True asks the
# model to leave out items the user has already interacted with.
train_users = train[Columns.User].unique()
recos_bm25 = model_bm25.recommend(
    users=train_users,
    dataset=train_df,
    k=N,
    filter_viewed=True,
)

In [18]:
# Item kNN built on implicit's CosineRecommender, wrapped for RecTools.
# Author's note on neighbourhood size: K = 10, 25, 50.
cosine_knn = CosineRecommender(K=10)
model_cossim = ImplicitItemKNNWrapperModel(cosine_knn)
model_cossim.fit(train_df)

# Top-N items for every user present in train; filter_viewed=True asks the
# model to leave out items the user has already interacted with.
train_users = train[Columns.User].unique()
recos_cossim = model_cossim.recommend(
    users=train_users,
    dataset=train_df,
    k=N,
    filter_viewed=True,
)

In [19]:
# Metric set used to compare the models. Every metric is cut at N, the same
# depth the recommend() calls use, so changing N keeps the cut-offs and the
# metric names in sync instead of silently evaluating at a stale k=10.
metrics = {
    f"mAP@{N}": MAP(k=N),
    f"prec@{N}": Precision(k=N),
    f"recall@{N}": Recall(k=N),
    "novelty": MeanInvUserFreq(k=N),
    "serendipity": Serendipity(k=N),
}

# Item catalog handed to calc_metrics: every item that occurs in train.
catalog = train[Columns.Item].unique()

In [20]:
# Evaluate the three kNN variants on the held-out fold with one shared call
# signature; results are unpacked in the same order as the reco frames.
metric_values_tfidf, metric_values_bm25, metric_values_cossim = (
    calc_metrics(
        metrics,
        reco=reco,
        interactions=test,
        prev_interactions=train,
        catalog=catalog,
    )
    for reco in (recos_tfidf, recos_bm25, recos_cossim)
)

print('KNN с поиском похожих юзеров через tf-idf выдает:\n', metric_values_tfidf)
print('KNN с поиском похожих юзеров через BM25 выдает:\n', metric_values_bm25)
print('KNN с поиском похожих юзеров через Cos sim выдает:\n', metric_values_cossim)

KNN с поиском похожих юзеров через tf-idf выдает:
 {'prec@10': 0.03379975093888197, 'recall@10': 0.16785348935939565, 'mAP@10': 0.0780738513664305, 'novelty': 7.445808587669819, 'serendipity': 2.450980496715003e-05}
KNN с поиском похожих юзеров через BM25 выдает:
 {'prec@10': 0.03937715109380976, 'recall@10': 0.19876239246077565, 'mAP@10': 0.0956302459933854, 'novelty': 4.052741401774423, 'serendipity': 8.128819126883973e-06}
KNN с поиском похожих юзеров через Cos sim выдает:
 {'prec@10': 0.022914603414294542, 'recall@10': 0.1191115419628571, 'mAP@10': 0.05829229137022718, 'novelty': 9.589126024970618, 'serendipity': 1.4627631415421597e-05}


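По метрикам качества ранжирования (precision, recall, MAP) лучше всего себя показал BM25 (ожидаемо), хуже всего — обычное косинусное расстояние. При этом по novelty лидирует косинусное расстояние, а по serendipity — tf-idf.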

Обучим популярное.

In [23]:
# Popularity baseline configured with popularity='n_users'.
# The trailing semicolon suppresses the cell's output.
pop = PopularModel(popularity='n_users')
pop.fit(train_df);

In [26]:
# Popular items for every user present in train, used later as a fallback.
# NOTE(review): filter_viewed=False here, unlike the kNN calls, so the fallback
# may contain items the user has already watched — confirm this is intended.
train_users = train[Columns.User].unique()
recopop = pop.recommend(
    users=train_users,
    dataset=train_df,
    k=N,
    filter_viewed=False,
)

Для пользователей, которым BM25 выдал меньше 10 рекомендаций (или не выдал вообще), будем добавлять популярное.

In [None]:
# Blend BM25 with the popularity fallback. BM25 rows are concatenated first, so
# they win de-duplication and occupy the top slots; popular items only fill the
# positions that remain, up to N per user.
# ignore_index=True gives the combined frame unique row labels.
recoms = pd.concat([recos_bm25, recopop], ignore_index=True)
recoms = recoms.drop_duplicates(keep='first', subset=['user_id', 'item_id'])
# 1-based position of each row within its user, in concatenation order
# (replaces ranking the constant group key with method='first').
recoms['rank'] = recoms.groupby('user_id').cumcount() + 1
recoms = recoms[recoms['rank'] <= N]
recoms = recoms[['user_id', 'item_id']]
recoms.to_csv('BM25pop.csv.gz', index=False, compression='gzip')

Другой вариант: объединим предсказания 3-х видов KNN. Объединяем в другом порядке - другой вариант ранжирования.

In [None]:
# Blend the three kNN variants. Concatenation order (BM25, cosine, TF-IDF) is
# the priority order: earlier frames win de-duplication and take the top slots.
# ignore_index=True gives the combined frame unique row labels.
recoms = pd.concat([recos_bm25, recos_cossim, recos_tfidf], ignore_index=True)
recoms = recoms.drop_duplicates(keep='first', subset=['user_id', 'item_id'])
# 1-based position of each row within its user, in concatenation order
# (replaces ranking the constant group key with method='first').
recoms['rank'] = recoms.groupby('user_id').cumcount() + 1
recoms = recoms[recoms['rank'] <= N]
recoms = recoms[['user_id', 'item_id']]
recoms.to_csv('BM25TFCOS.csv.gz', index=False, compression='gzip')

# Final Model

In [None]:
# NOTE(review): `UserKNN` is not imported in the visible cells — its only import
# (`from service.api.models_zoo import UserKNN`) is commented out in the imports
# cell — so on a fresh kernel this line raises NameError. Restore the import
# (and make the `service` package importable) before running.
model = UserKNN(dist_model=BM25Recommender(K=10, K1=2), n_neighbors=10)

In [None]:
# Fit on the raw train interactions DataFrame (not the RecTools Dataset wrapper).
model.fit(train)

  0%|          | 0/842129 [00:00<?, ?it/s]

In [None]:
# Smoke test: recommendations for a single user.
# NOTE(review): the recorded output starts with item 1659, which the train
# sample shown earlier lists as already watched by user 699317 — check whether
# viewed items should be filtered out of the final model's predictions.
model.predict(user_id=699317)

[1659, 2365, 8727, 5533, 12988, 9506, 5005, 15171, 3474, 11985]

In [None]:
# Serialise the fitted model object to disk.
with open('../data/knn_bm25.pickle', 'wb') as model_file:
    pickle.dump(model, model_file)
