In [None]:
# import sys
# !{sys.executable} -m pip install rectools==0.2.0

In [1]:
import os
import pickle
import random
import warnings

import numpy as np
import pandas as pd
from implicit.nearest_neighbours import (
    BM25Recommender,
    CosineRecommender,
    TFIDFRecommender,
)
from rectools import Columns
from rectools.dataset import Dataset
from rectools.metrics import (
    MAP,
    MeanInvUserFreq,
    Precision,
    Recall,
    Serendipity,
    calc_metrics,
)
from rectools.model_selection import TimeRangeSplit
from rectools.models import ImplicitItemKNNWrapperModel

from service.api.models_zoo import UserKNN

# Ignore every warning category for the rest of the session.
# NOTE(review): presumably done to keep cell output tidy, but it also hides
# deprecation/numerical warnings from the libraries used here - consider
# narrowing the filter to specific categories or modules.
warnings.filterwarnings("ignore")



In [2]:
# Notebook-wide configuration: reproducibility seed and pandas display options.
seed = 42

# Render all columns and allow up to 200 characters per cell when showing frames.
pd.options.display.max_columns = None
pd.options.display.max_colwidth = 200

# Seed NumPy's legacy global generator and Python's `random` module.
np.random.seed(seed)
random.seed(seed)

# NOTE: the interpreter reads PYTHONHASHSEED only at startup, so this assignment
# does not change string hashing in the already-running kernel; it is only
# inherited by processes started from here.
os.environ['PYTHONHASHSEED'] = str(seed)

In [3]:
# Download and unpack the KION dataset archive (uncomment the commands below to run them)
# !wget https://storage.yandexcloud.net/itmo-recsys-public-data/kion_train.zip -O ../data/data_original.zip
# !unzip ../data/data_original.zip -d ../data

--2022-12-04 17:08:24--  https://storage.yandexcloud.net/itmo-recsys-public-data/kion_train.zip
Resolving storage.yandexcloud.net (storage.yandexcloud.net)... 213.180.193.243
Connecting to storage.yandexcloud.net (storage.yandexcloud.net)|213.180.193.243|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 78795385 (75M) [application/zip]
Saving to: ‘../data/data_original.zip’


2022-12-04 17:08:33 (8.64 MB/s) - ‘../data/data_original.zip’ saved [78795385/78795385]

Archive:  ../data/data_original.zip
   creating: ../data/kion_train/
  inflating: ../data/kion_train/interactions.csv  
  inflating: ../data/__MACOSX/kion_train/._interactions.csv  
  inflating: ../data/kion_train/users.csv  
  inflating: ../data/__MACOSX/kion_train/._users.csv  
  inflating: ../data/kion_train/items.csv  
  inflating: ../data/__MACOSX/kion_train/._items.csv  


In [13]:
# Load the three raw KION tables.
users = pd.read_csv('../data/kion_train/users.csv')
items = pd.read_csv('../data/kion_train/items.csv')

# Interactions: rename the raw columns to the rectools column names and parse
# the watch date. Built as one chain (no in-place mutation), so re-running the
# cell always starts from the file on disk.
interactions = (
    pd.read_csv('../data/kion_train/interactions.csv')
    .rename(columns={'last_watch_dt': Columns.Datetime,
                     'total_dur': Columns.Weight})
    # Refer to the column through Columns.Datetime (the name it was just
    # renamed to) instead of repeating the literal string.
    .assign(**{Columns.Datetime: lambda df: pd.to_datetime(df[Columns.Datetime])})
)

## Train test split

In [14]:
# Time-based train/test split: one fold, one week long.
# NOTE(review): per the recorded output the fold borders are 2021-08-08 and
# 2021-08-15 while the last date in the data is 2021-08-22, so the most recent
# week appears to fall outside the test fold - confirm this is intended.
n_folds = 1
unit = "W"
n_units = 1
periods = n_folds + 1
# Fold length as a pandas frequency string; includes n_units so that changing
# it actually changes the fold size (previously only `unit` was used).
freq = f"{n_units}{unit}"

last_date = interactions[Columns.Datetime].max().normalize()
# Timedelta is the length of the interval to step back from the last date
start_date = last_date - pd.Timedelta(n_folds * n_units + 1, unit=unit)
print(f"Start date and last date of the test fold: {start_date, last_date}")

date_range = pd.date_range(start=start_date, periods=periods, freq=freq, tz=last_date.tz)
print(f"Test fold borders: {date_range.values.astype('datetime64[D]')}")

# generator of folds
cv = TimeRangeSplit(
    date_range=date_range,
    filter_already_seen=True,
    filter_cold_items=True,
    filter_cold_users=True,
)
print(f"Real number of folds: {cv.get_n_splits(interactions)}")

Start date and last date of the test fold: (Timestamp('2021-08-08 00:00:00'), Timestamp('2021-08-22 00:00:00'))
Test fold borders: ['2021-08-08' '2021-08-15']
Real number of folds: 1


In [15]:
# Take the first fold from the splitter (the recorded run reports exactly one).
# Use the built-in next() rather than calling the __next__ dunder directly.
train_ids, test_ids, fold_info = next(cv.split(interactions, collect_fold_stats=True))

In [16]:
# Materialise both splits as standalone frames, each with a fresh 0..n-1 index.
train, test = (
    interactions.loc[row_ids].reset_index(drop=True)
    for row_ids in (train_ids, test_ids)
)

In [17]:
# prepare user and item features
age_feature = (
    users[["user_id", "age"]]
    .rename({'user_id': 'id', 'age': 'value'}, axis=1)
)
age_feature["feature"] = "age"

age_feature_train = age_feature[age_feature['id'].isin(train['user_id'])]
age_feature_test = age_feature[age_feature['id'].isin(test['user_id'])]


genres_feature = (
    items[["item_id", "genres"]]
    .rename({'item_id': 'id', 'genres': 'value'}, axis=1)
)

genres_feature["value"] = genres_feature["value"].str.split(",")
genres_feature = genres_feature.explode("value")
genres_feature["feature"] = "genres"

genres_feature_train = genres_feature[genres_feature['id'].isin(train['item_id'])]
genres_feature_test = genres_feature[genres_feature['id'].isin(test['item_id'])]

In [30]:
# Create dataset
train_df = Dataset.construct(
    train,
    user_features_df=None, # age_feature_train
    item_features_df=None, # genres_feature_train
    # cat_item_features=['genres'],
    # cat_user_features=['age']

)
test_df = Dataset.construct(
    test,
    user_features_df=None, # age_feature_test
    item_features_df=None,
    # cat_item_features=['genres'],
    # cat_user_features=['age']

)

In [21]:
# Fit model
model_tfidf = ImplicitItemKNNWrapperModel(TFIDFRecommender(K=10))
model_tfidf.fit(train_df)

# Make recommendations
recos_tfidf = model_tfidf.recommend(
    users=train_df[Columns.User].unique(),
    dataset=train_df,
    k=10,
    filter_viewed=True,
)

TypeError: 'Dataset' object is not subscriptable

In [28]:
# Preview the training interactions (long frames are shown truncated)
train

Unnamed: 0,user_id,item_id,datetime,weight,watched_pct
0,176549,9506,2021-05-11,4250,72.0
1,699317,1659,2021-05-29,8317,100.0
2,656683,7107,2021-05-09,10,0.0
3,864613,7638,2021-07-05,14483,100.0
4,964868,9506,2021-04-30,6725,100.0
...,...,...,...,...,...
4587703,497899,9629,2021-05-29,45,1.0
4587704,438585,7829,2021-08-02,6804,100.0
4587705,786732,4880,2021-05-12,753,0.0
4587706,546862,9673,2021-04-13,2308,49.0


In [31]:
# Recommend for the users of the test split.
# The recorded run of this cell failed with KeyError('Some indices not exists'),
# which presumably means some requested ids were unknown to the dataset passed
# to recommend(). Restrict the request to users that also occur in `train`,
# the frame `train_df` is built from, so every id can be resolved.
warm_test_users = np.intersect1d(
    test[Columns.User].unique(),
    train[Columns.User].unique(),
).tolist()

recos_tfidf = model_tfidf.recommend(
    users=warm_test_users,
    dataset=train_df,
    k=10,
    filter_viewed=True,
)

KeyError: 'Some indices not exists'

In [12]:
# Fit model
model_bm25 = ImplicitItemKNNWrapperModel(BM25Recommender(K=10, K1=2))
model_bm25.fit(train_df)

# Make recommendations
recos_bm25 = model_bm25.recommend(
    users=train[Columns.User].unique(),
    dataset=train_df,
    k=10,
    filter_viewed=True,
)

KeyError: 'Some indices not exists'

In [None]:
# Fit model
model_cossim = ImplicitItemKNNWrapperModel(CosineRecommender(K=10))
model_cossim.fit(train_df)

# Make recommendations
recos_cossim = model_cossim.recommend(
    users=train[Columns.User].unique(),
    dataset=train_df,
    k=10,
    filter_viewed=True,
)

In [None]:
# Every distinct item id seen in the training interactions
catalog = train['item_id'].unique()

# Metrics to report, all computed at cut-off 10; keys are the report labels
metrics = dict([
    ("mAP@10", MAP(k=10)),
    ("prec@10", Precision(k=10)),
    ("recall@10", Recall(k=10)),
    ("novelty", MeanInvUserFreq(k=10)),
    ("serendipity", Serendipity(k=10)),
])

In [None]:
# Score the three recommendation sets with the same metrics and the same
# reference data: test interactions as ground truth, train as history.
metric_values_tfidf, metric_values_bm25, metric_values_cossim = (
    calc_metrics(
        metrics,
        reco=model_recos,
        interactions=test,
        prev_interactions=train,
        catalog=catalog,
    )
    for model_recos in (recos_tfidf, recos_bm25, recos_cossim)
)

# NOTE(review): the labels below say "similar users", while the models are
# ImplicitItemKNNWrapperModel instances - confirm the wording.
print('KNN с поиском похожих юзеров через tf-idf выдает:\n', metric_values_tfidf)
print('KNN с поиском похожих юзеров через BM25 выдает:\n', metric_values_bm25)
print('KNN с поиском похожих юзеров через Cos sim выдает:\n', metric_values_cossim)

По метрикам лучше всего себя показал BM25 (ожидаемо), а хуже всего — обычное косинусное сходство.

In [18]:
# Export the BM25 recommendations as (user_id, item_id) pairs, top 10 per user.
# The trimmed frame gets its own name: overwriting `recos_bm25` dropped the
# 'rank' column, so re-running this cell (or the metrics cell) would fail.
recos_bm25_export = recos_bm25.loc[recos_bm25['rank'] <= 10, ['user_id', 'item_id']]
recos_bm25_export.to_csv('../data/KNNBM25withAddFeatures.csv.gz', index=False, compression='gzip')

# Final Model

In [14]:
# Final model: the project's UserKNN built on a BM25 recommender (K=10, K1=2)
# and configured with 10 neighbours.
model = UserKNN(
    dist_model=BM25Recommender(K=10, K1=2),
    n_neighbors=10,
)

In [15]:
# Fit the final UserKNN on the training interactions frame
# (note: the raw DataFrame `train`, not the rectools Dataset `train_df`).
model.fit(train)

  0%|          | 0/842129 [00:00<?, ?it/s]

In [16]:
# Spot-check: recommendations for one user that is present in `train`.
# NOTE(review): the recorded output contains item 1659, which the train preview
# shows this user has already watched - presumably predict() does not filter
# viewed items; confirm whether that is intended.
model.predict(user_id=699317)

[1659, 2365, 8727, 5533, 12988, 9506, 5005, 15171, 3474, 11985]

In [17]:
# Persist the fitted model to disk.
# Unpickling needs the UserKNN class to be importable from the same module path
# (service.api.models_zoo) in the loading process.
with open('../data/knn_bm25.pickle', 'wb') as model_file:
    model_file.write(pickle.dumps(model))
