In [1]:
!pip install rectools==0.2.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting rectools==0.2.0
  Downloading RecTools-0.2.0-py3-none-any.whl (86 kB)
[K     |████████████████████████████████| 86 kB 2.4 MB/s 
Collecting implicit==0.4.4
  Downloading implicit-0.4.4.tar.gz (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 44.2 MB/s 
Collecting Markdown<3.3,>=3.2
  Downloading Markdown-3.2.2-py3-none-any.whl (88 kB)
[K     |████████████████████████████████| 88 kB 9.6 MB/s 
Collecting attrs<22.0.0,>=19.1.0
  Downloading attrs-21.4.0-py2.py3-none-any.whl (60 kB)
[K     |████████████████████████████████| 60 kB 8.3 MB/s 
[?25hCollecting lightfm<2.0,>=1.16
  Downloading lightfm-1.16.tar.gz (310 kB)
[K     |████████████████████████████████| 310 kB 61.7 MB/s 
Collecting nmslib<3.0.0,>=2.0.4
  Downloading nmslib-2.1.1-cp37-cp37m-manylinux2010_x86_64.whl (13.5 MB)
[K     |████████████████████████████████| 13.5 MB 47.9 MB/s 
Collecting pybind11<2.6.2
  Do

In [34]:
import os
import random
import pandas as pd
import numpy as np
import scipy as sp
import requests
from tqdm.auto import tqdm
from scipy.stats import mode 
from pprint import pprint
import warnings



from implicit.nearest_neighbours import (
    TFIDFRecommender, BM25Recommender,
    CosineRecommender, ItemItemRecommender
)
from rectools import Columns
from rectools.dataset import Dataset
from rectools.models import ImplicitItemKNNWrapperModel
from rectools.model_selection import TimeRangeSplit
from rectools.metrics import Precision, Recall, MeanInvUserFreq, Serendipity, calc_metrics, MAP


# Display settings: show every column and up to 200 characters per cell.
pd.set_option('display.max_colwidth', 200)
pd.set_option('display.max_columns', None)

# Suppress every warning for the rest of the session.
warnings.filterwarnings("ignore")

# Seed the stdlib and NumPy global generators with one shared value.
seed = 42
np.random.seed(seed)
random.seed(seed)
# PYTHONHASHSEED is read at interpreter start-up, so setting it here only
# affects processes launched from this kernel afterwards.
os.environ['PYTHONHASHSEED'] = str(seed)

In [5]:
# Download the dataset archive in 1 MiB chunks with a progress bar.
url = "https://storage.yandexcloud.net/itmo-recsys-public-data/kion_train.zip"

# The context managers release the HTTP connection, the output file and the
# progress bar even when the transfer fails midway.
with requests.get(url, stream=True, timeout=60) as req:
    # Fail loudly on 4xx/5xx instead of saving an error page as a .zip file.
    req.raise_for_status()
    # When the server sends no Content-Length, pass None so that tqdm shows an
    # open-ended counter instead of a bar with total=0.
    total_size_in_bytes = int(req.headers.get('Content-Length', 0)) or None
    with open('kion_train.zip', "wb") as fd, tqdm(
        desc='kion dataset download',
        total=total_size_in_bytes,
        unit='iB',
        unit_scale=True,
    ) as progress_bar:
        for chunk in req.iter_content(chunk_size=2 ** 20):
            fd.write(chunk)
            progress_bar.update(len(chunk))

kion dataset download:   0%|          | 0.00/78.8M [00:00<?, ?iB/s]

In [6]:
!unzip kion_train.zip

interactions = pd.read_csv('kion_train/interactions.csv')
users = pd.read_csv('kion_train/users.csv')
items = pd.read_csv('kion_train/items.csv')


# rename columns, convert timestamp
interactions.rename(columns={'last_watch_dt': Columns.Datetime,
                            'total_dur': Columns.Weight}, 
                    inplace=True) 

interactions['datetime'] = pd.to_datetime(interactions['datetime'])

Archive:  kion_train.zip
   creating: kion_train/
  inflating: kion_train/interactions.csv  
  inflating: __MACOSX/kion_train/._interactions.csv  
  inflating: kion_train/users.csv    
  inflating: __MACOSX/kion_train/._users.csv  
  inflating: kion_train/items.csv    
  inflating: __MACOSX/kion_train/._items.csv  


## Train test split

In [7]:
# Time-based train/test split with a single test fold of one week.
n_folds = 1
unit = "W"
n_units = 1
periods = n_folds + 1
# Fold length as a pandas frequency string. It includes n_units so that the
# fold borders stay consistent with start_date when n_units != 1
# (for n_units == 1 this is equivalent to the plain unit).
freq = f"{n_units}{unit}"

last_date = interactions[Columns.Datetime].max().normalize()
# Timedelta is the duration between two dates: step back from the last date
# by the span of all folds plus one extra unit.
start_date = last_date - pd.Timedelta(n_folds * n_units + 1, unit=unit)
# NOTE(review): per the recorded output the fold borders are
# 2021-08-08..2021-08-15 while last_date is 2021-08-22, so the final week
# appears to fall outside the test fold and the label printed below does not
# describe the actual fold end - confirm this is intended.
print(f"Start date and last date of the test fold: {start_date, last_date}")

date_range = pd.date_range(start=start_date, periods=periods, freq=freq, tz=last_date.tz)
print(f"Test fold borders: {date_range.values.astype('datetime64[D]')}")

# Fold generator: filters already-seen pairs and cold users/items
# (see the three filter_* flags).
cv = TimeRangeSplit(
    date_range=date_range,
    filter_already_seen=True,
    filter_cold_items=True,
    filter_cold_users=True,
)
print(f"Real number of folds: {cv.get_n_splits(interactions)}")

Start date and last date of the test fold: (Timestamp('2021-08-08 00:00:00'), Timestamp('2021-08-22 00:00:00'))
Test fold borders: ['2021-08-08' '2021-08-15']
Real number of folds: 1


In [8]:
# Take the first fold produced by the splitter; with collect_fold_stats=True
# each yielded item is a (train ids, test ids, fold info) triple.
train_ids, test_ids, fold_info = next(cv.split(interactions, collect_fold_stats=True))

In [9]:
# Materialise the train and test interaction frames for the fold.
# Positional indexing is used because the splitter is expected to yield row
# positions (see the RecTools TimeRangeSplit docs); the result is identical to
# a label lookup as long as `interactions` has a default RangeIndex, and it
# stays correct if the frame is filtered or re-indexed beforehand.
train = interactions.iloc[train_ids].reset_index(drop=True)
test = interactions.iloc[test_ids].reset_index(drop=True)

In [80]:
# Prepare user and item features in the long (id, value, feature) layout.

# User feature: age, one row per user.
age_feature = (
    users[["user_id", "age"]]
    .rename(columns={'user_id': 'id', 'age': 'value'})
    .assign(feature="age")
)

# Keep only the users that occur in the corresponding interactions frame.
age_feature_train = age_feature[age_feature['id'].isin(train['user_id'])]
age_feature_test = age_feature[age_feature['id'].isin(test['user_id'])]

# Item feature: genres, one row per (item, genre) pair.
# The comma-separated list is split and exploded; each token is then stripped
# so that a genre written after ", " does not become a separate category that
# differs from the same genre only by a leading space.
genres_feature = (
    items[["item_id", "genres"]]
    .rename(columns={'item_id': 'id', 'genres': 'value'})
    .assign(value=lambda df: df["value"].str.split(","))
    .explode("value")
    .assign(value=lambda df: df["value"].str.strip(), feature="genres")
)

# Keep only the items that occur in the corresponding interactions frame.
genres_feature_train = genres_feature[genres_feature['id'].isin(train['item_id'])]
genres_feature_test = genres_feature[genres_feature['id'].isin(test['item_id'])]

In [85]:
# Build RecTools datasets from the interactions plus the age / genres features.
# NOTE(review): the KNN models fitted below presumably consume only the
# interaction matrix, so these features may not influence the
# recommendations - verify against the RecTools / implicit docs.
categorical_features = {
    "cat_item_features": ['genres'],
    "cat_user_features": ['age'],
}

train_df = Dataset.construct(
    train,
    user_features_df=age_feature_train,
    item_features_df=genres_feature_train,
    **categorical_features,
)
test_df = Dataset.construct(
    test,
    user_features_df=age_feature_test,
    item_features_df=genres_feature_test,
    **categorical_features,
)

In [102]:
# Fit an item KNN model backed by implicit's TF-IDF recommender (K=10).
tfidf_recommender = TFIDFRecommender(K=10)
model_tfidf = ImplicitItemKNNWrapperModel(tfidf_recommender)
model_tfidf.fit(train_df)

# Recommend 10 items, excluding already viewed ones, to every user in train.
train_user_ids = train[Columns.User].unique()
recos_tfidf = model_tfidf.recommend(
    users=train_user_ids,
    dataset=train_df,
    k=10,
    filter_viewed=True,
)

In [103]:
# Fit an item KNN model backed by implicit's BM25 recommender (K=10, K1=2).
bm25_recommender = BM25Recommender(K=10, K1=2)
model_bm25 = ImplicitItemKNNWrapperModel(bm25_recommender)
model_bm25.fit(train_df)

# Recommend 10 items, excluding already viewed ones, to every user in train.
train_user_ids = train[Columns.User].unique()
recos_bm25 = model_bm25.recommend(
    users=train_user_ids,
    dataset=train_df,
    k=10,
    filter_viewed=True,
)

In [104]:
# Fit an item KNN model backed by implicit's cosine recommender (K=10).
cosine_recommender = CosineRecommender(K=10)
model_cossim = ImplicitItemKNNWrapperModel(cosine_recommender)
model_cossim.fit(train_df)

# Recommend 10 items, excluding already viewed ones, to every user in train.
train_user_ids = train[Columns.User].unique()
recos_cossim = model_cossim.recommend(
    users=train_user_ids,
    dataset=train_df,
    k=10,
    filter_viewed=True,
)

In [105]:
# Metrics to report, all computed on the top-10 recommendations.
top_k = 10
metrics = {
    "mAP@10": MAP(k=top_k),
    "prec@10": Precision(k=top_k),
    "recall@10": Recall(k=top_k),
    "novelty": MeanInvUserFreq(k=top_k),
    "serendipity": Serendipity(k=top_k),
}

# Catalog passed to calc_metrics: every distinct item present in train.
catalog = train['item_id'].unique()

In [106]:
# Evaluate the three recommendation lists with identical settings: test
# interactions as ground truth, train as history, train items as the catalog.
metric_values_tfidf, metric_values_bm25, metric_values_cossim = (
    calc_metrics(
        metrics,
        reco=reco,
        interactions=test,
        prev_interactions=train,
        catalog=catalog,
    )
    for reco in (recos_tfidf, recos_bm25, recos_cossim)
)

# NOTE(review): the labels below say "similar users", while the models come
# from ImplicitItemKNNWrapperModel - confirm the wording.
print('KNN с поиском похожих юзеров через tf-idf выдает:\n', metric_values_tfidf)
print('KNN с поиском похожих юзеров через BM25 выдает:\n', metric_values_bm25)
print('KNN с поиском похожих юзеров через Cos sim выдает:\n', metric_values_cossim)

KNN с поиском похожих юзеров через tf-idf выдает:
 {'prec@10': 0.03379975093888197, 'recall@10': 0.16785348935939565, 'mAP@10': 0.0780738513664305, 'novelty': 7.445808587669819, 'serendipity': 2.450980496715003e-05}
KNN с поиском похожих юзеров через BM25 выдает:
 {'prec@10': 0.03937715109380976, 'recall@10': 0.19876239246077565, 'mAP@10': 0.0956302459933854, 'novelty': 4.052741401774423, 'serendipity': 8.128819126883973e-06}
KNN с поиском похожих юзеров через Cos sim выдает:
 {'prec@10': 0.022914603414294542, 'recall@10': 0.1191115419628571, 'mAP@10': 0.05829229137022718, 'novelty': 9.589126024970618, 'serendipity': 1.4627631415421597e-05}


In [93]:
# Disabled experiment (kept for reference, not executed): blend the BM25,
# TF-IDF and cosine recommendation lists, drop duplicate (user, item) pairs,
# re-rank per user, keep the first 10 rows per user and write a submission file.
# recoms = pd.concat([recos_bm25, recos_tfidf, recos_cossim])
# recoms = recoms.drop_duplicates(keep='first', subset=['user_id', 'item_id'])
# recoms['rank'] = recoms.groupby('user_id')['user_id'].rank(method='first')
# recoms = recoms[recoms['rank'] <= 10]
# recoms = recoms[['user_id', 'item_id']]
# recoms.to_csv('BlendingKNNwithAddFeatures.csv.gz', index=False, compression='gzip')

In [95]:
# Export the top-10 BM25 recommendations as a gzip-compressed CSV submission.
# The selection is stored under its own name instead of overwriting
# recos_bm25: the original overwrite dropped the 'rank' column, so running
# this cell a second time raised a KeyError on recos_bm25['rank'].
submission_bm25 = recos_bm25.loc[recos_bm25['rank'] <= 10, ['user_id', 'item_id']]
submission_bm25.to_csv('KNNBM25withAddFeatures.csv.gz', index=False, compression='gzip')

In [114]:
!pip install -U --no-cache-dir gdown --pre

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
