In [None]:
# import sys
# !{sys.executable} -m pip install rectools==0.2.0

In [3]:
import os
import pickle
import random
import warnings

import numpy as np
import pandas as pd
from implicit.nearest_neighbours import (
    BM25Recommender,
    CosineRecommender,
    TFIDFRecommender,
)
from rectools import Columns
from rectools.dataset import Dataset
from rectools.metrics import (
    MAP,
    MeanInvUserFreq,
    Precision,
    Recall,
    Serendipity,
    calc_metrics,
)
from rectools.model_selection import TimeRangeSplit
from rectools.models import ImplicitItemKNNWrapperModel
from rectools.models.popular import PopularModel

from service.api.models_zoo import UserKNN

warnings.filterwarnings("ignore")

In [4]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 200)

seed = 42
random.seed(seed)
np.random.seed(seed)
os.environ['PYTHONHASHSEED'] = str(seed)

In [None]:
# download dataset by chunks
# !wget https://storage.yandexcloud.net/itmo-recsys-public-data/kion_train.zip -O ../data/data_original.zip
# !unzip ../data/data_original.zip -d ../data

--2022-12-04 17:08:24--  https://storage.yandexcloud.net/itmo-recsys-public-data/kion_train.zip
Resolving storage.yandexcloud.net (storage.yandexcloud.net)... 213.180.193.243
Connecting to storage.yandexcloud.net (storage.yandexcloud.net)|213.180.193.243|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 78795385 (75M) [application/zip]
Saving to: ‘../data/data_original.zip’


2022-12-04 17:08:33 (8.64 MB/s) - ‘../data/data_original.zip’ saved [78795385/78795385]

Archive:  ../data/data_original.zip
   creating: ../data/kion_train/
  inflating: ../data/kion_train/interactions.csv  
  inflating: ../data/__MACOSX/kion_train/._interactions.csv  
  inflating: ../data/kion_train/users.csv  
  inflating: ../data/__MACOSX/kion_train/._users.csv  
  inflating: ../data/kion_train/items.csv  
  inflating: ../data/__MACOSX/kion_train/._items.csv  


In [None]:
interactions = pd.read_csv('../data/kion_train/interactions.csv')
users = pd.read_csv('../data/kion_train/users.csv')
items = pd.read_csv('../data/kion_train/items.csv')

# rename columns, convert timestamp
interactions.rename(columns={'last_watch_dt': Columns.Datetime,
                            'total_dur': Columns.Weight},
                    inplace=True)

interactions['datetime'] = pd.to_datetime(interactions['datetime'])

## Train test split

In [11]:
# train test split
# test = last 1 week
n_folds = 1
unit = "W"
n_units = 1
periods = n_folds + 1

last_date = interactions[Columns.Datetime].max().normalize()
start_date = last_date - pd.Timedelta(n_folds * n_units + 1, unit=unit)  # TimeDelta возвращает длительность промежутка между датами
print(f"Start date and last date of the test fold: {start_date, last_date}")

date_range = pd.date_range(start=start_date, periods=periods, freq=unit, tz=last_date.tz)
print(f"Test fold borders: {date_range.values.astype('datetime64[D]')}")

# generator of folds
cv = TimeRangeSplit(
    date_range=date_range,
    filter_already_seen=True,
    filter_cold_items=True,
    filter_cold_users=True,
)
print(f"Real number of folds: {cv.get_n_splits(interactions)}")

Start date and last date of the test fold: (Timestamp('2021-08-08 00:00:00'), Timestamp('2021-08-22 00:00:00'))
Test fold borders: ['2021-08-08' '2021-08-15']
Real number of folds: 1


In [12]:
(train_ids, test_ids, fold_info) = cv.split(interactions, collect_fold_stats=True).__next__()

In [13]:
train = interactions.loc[train_ids].reset_index(drop=True)
test = interactions.loc[test_ids].reset_index(drop=True)

In [14]:
train.head()

Unnamed: 0,user_id,item_id,datetime,weight,watched_pct
0,176549,9506,2021-05-11,4250,72.0
1,699317,1659,2021-05-29,8317,100.0
2,656683,7107,2021-05-09,10,0.0
3,864613,7638,2021-07-05,14483,100.0
4,964868,9506,2021-04-30,6725,100.0


In [15]:
# Create dataset
train_df = Dataset.construct(
    train,
)

In [None]:
metrics = {
    "mAP@10": MAP(k=10),
    "prec@10": Precision(k=10),
    "recall@10": Recall(k=10),
    "novelty": MeanInvUserFreq(k=10),
    "serendipity": Serendipity(k=10),
}

catalog = train['item_id'].unique()

In [16]:
N = 10 # Количество рекомендаций

In [18]:
neighbours = [5, 10, 20] # Обучим модели на разном количестве соседей

# Метрики будем складывать в список, чтобы потом поместить их в таблицу
tfidf = []
bm25 = []
cossim = []


for i in neighbours:

  # Fit model
  model_tfidf = ImplicitItemKNNWrapperModel(TFIDFRecommender(K=i))
  model_tfidf.fit(train_df)

  # Make recommendations
  recos_tfidf = model_tfidf.recommend(
      users=train[Columns.User].unique(),
      dataset=train_df,
      k=N,
      filter_viewed=True,
  )

  # Fit model
  model_bm25 = ImplicitItemKNNWrapperModel(BM25Recommender(K=i, K1=2)) # Изменение коэффициентов K1 и b особой роли не играет
  model_bm25.fit(train_df)

  # Make recommendations
  recos_bm25 = model_bm25.recommend(
      users=train[Columns.User].unique(),
      dataset=train_df,
      k=N,
      filter_viewed=True,
  )

  # Fit model
  model_cossim = ImplicitItemKNNWrapperModel(CosineRecommender(K=i)) 
  model_cossim.fit(train_df)

  # Make recommendations
  recos_cossim = model_cossim.recommend(
      users=train[Columns.User].unique(),
      dataset=train_df,
      k=N,
      filter_viewed=True,
  )

  metric_values_tfidf = calc_metrics(
    metrics,
    reco=recos_tfidf,
    interactions=test,
    prev_interactions=train,
    catalog=catalog
  )

  metric_values_bm25 = calc_metrics(
      metrics,
      reco=recos_bm25,
      interactions=test,
      prev_interactions=train,
      catalog=catalog
  )

  metric_values_cossim = calc_metrics(
      metrics,
      reco=recos_cossim,
      interactions=test,
      prev_interactions=train,
      catalog=catalog
  )

  tfidf.append(metric_values_tfidf)
  bm25.append(metric_values_bm25)
  cossim.append(metric_values_cossim)


In [36]:
dftfidf = pd.DataFrame(tfidf, index=['tfidf (k = 5)', 'tfidf (k = 5)', 'tfidf (k = 5)'])
dfbm25 = pd.DataFrame(bm25, index=['bm25 (k = 5)', 'bm25 (k = 10)', 'bm25 (k = 20)'])
dfcossim = pd.DataFrame(cossim, index=['cossim (k = 5)', 'cossim (k = 10)', 'cossim (k = 20)'])

In [39]:
metricstable = pd.concat([dftfidf, dfbm25, dfcossim])
metricstable

Unnamed: 0,prec@10,recall@10,mAP@10,novelty,serendipity
tfidf (k = 5),0.02736,0.133782,0.0702,7.974207,2.9e-05
tfidf (k = 5),0.0338,0.167853,0.078074,7.445809,2.5e-05
tfidf (k = 5),0.034653,0.171268,0.079425,7.221953,2.2e-05
bm25 (k = 5),0.032157,0.158591,0.088671,3.840779,1.7e-05
bm25 (k = 10),0.039377,0.198762,0.09563,4.052741,8e-06
bm25 (k = 20),0.039099,0.199296,0.095693,4.030999,6e-06
cossim (k = 5),0.018192,0.095095,0.051952,10.001041,1.5e-05
cossim (k = 10),0.022915,0.119112,0.058292,9.589126,1.5e-05
cossim (k = 20),0.025375,0.131688,0.060869,9.272995,1.5e-05


По метрикам лучше всего себя показал BM25 (ожидаемо) и хуже всего обычное косинусное расстояние.

Добьем лучшую модель популярными фильмами

In [41]:
pop = PopularModel(popularity='n_users')
pop.fit(train_df);

In [42]:
recopop = pop.recommend(
    users=train[Columns.User].unique(),
    dataset=train_df,
    k=N,
    filter_viewed=False
)

In [53]:
recoms = pd.concat([recos_bm25, recopop])
recoms = recoms.drop_duplicates(keep='first', subset=['user_id', 'item_id'])
recoms['rank'] = recoms.groupby('user_id')['user_id'].rank(method='first')
recoms = recoms[recoms['rank'] <= 10]

In [54]:
metric_values_bm25pop = calc_metrics(
    metrics,
    reco=recoms,
    interactions=test,
    prev_interactions=train,
    catalog=catalog
  )

In [55]:
bm25pop = pd.Series(metric_values_bm25pop) # Подсчет метрик для BM25(k=20) + Popular

prec@10        0.039099
recall@10      0.199296
mAP@10         0.095693
novelty        4.030830
serendipity    0.000006
dtype: float64

In [None]:
recoms.to_csv('BM25pop.csv.gz', index=False, compression='gzip')

Другой вариант: объединим предсказания 3-х видов KNN. Объединяем в другом порядке - другой вариант ранжирования.

In [60]:
recoms = pd.concat([recos_bm25, recos_cossim, recos_tfidf])
recoms = recoms.drop_duplicates(keep='first', subset=['user_id', 'item_id'])
recoms['rank'] = recoms.groupby('user_id')['user_id'].rank(method='first')
recoms = recoms[recoms['rank'] <= 10]


In [61]:
metric_values_blend = calc_metrics(
    metrics,
    reco=recoms,
    interactions=test,
    prev_interactions=train,
    catalog=catalog
  )

In [63]:
blend = pd.Series(metric_values_blend) # Подсчет метрик для 3-х моделей (BM25, TF-IDF, CosSim), k=20
blend

prec@10        0.039099
recall@10      0.199296
mAP@10         0.095693
novelty        4.030999
serendipity    0.000006
dtype: float64

In [None]:
recoms.to_csv('BM25TFCOS.csv.gz', index=False, compression='gzip')

# Final Model

In [None]:
model = UserKNN(dist_model=BM25Recommender(K=10, K1=2), n_neighbors=10)

In [None]:
model.fit(train)

  0%|          | 0/842129 [00:00<?, ?it/s]

In [None]:
model.predict(user_id=699317)

[1659, 2365, 8727, 5533, 12988, 9506, 5005, 15171, 3474, 11985]

In [None]:
with open('../data/knn_bm25.pickle', 'wb') as f:
      pickle.dump(model, f)
