In [None]:
# Install rectools pinned to 0.2.0 so that re-runs use the same library release.
!pip install rectools==0.2.0

In [None]:
import os
import random
import pandas as pd
import numpy as np
import scipy as sp
import requests
from tqdm.auto import tqdm
from scipy.stats import mode 
from pprint import pprint
import warnings
import dill


from implicit.nearest_neighbours import (
    TFIDFRecommender, BM25Recommender,
    CosineRecommender, ItemItemRecommender
)
from rectools.models.popular import PopularModel 
from rectools import Columns
from rectools.dataset import Dataset
from rectools.models import ImplicitItemKNNWrapperModel
from rectools.model_selection import TimeRangeSplit
from rectools.metrics import Precision, Recall, MeanInvUserFreq, Serendipity, calc_metrics, MAP


# Reproducibility: a single seed shared by the stdlib and NumPy global RNGs.
seed = 42
os.environ['PYTHONHASHSEED'] = str(seed)  # NOTE(review): presumably only affects interpreters started later - confirm
random.seed(seed)
np.random.seed(seed)

# Display: show every column and allow up to 200 characters per cell.
pd.options.display.max_columns = None
pd.options.display.max_colwidth = 200

# Silence every warning raised from here on (this also hides pandas warnings).
warnings.filterwarnings("ignore")

In [None]:
# Download the KION dataset archive in 1 MiB chunks, showing a progress bar.
url = "https://storage.yandexcloud.net/itmo-recsys-public-data/kion_train.zip"

# The context managers release the HTTP connection, close the file and finish
# the progress bar even when the transfer is interrupted.
with requests.get(url, stream=True) as req:
    # Fail fast on a 4xx/5xx answer instead of saving an error page as the zip file.
    req.raise_for_status()
    # Content-Length may be absent; 0 gives the progress bar a zero total.
    total_size_in_bytes = int(req.headers.get('Content-Length', 0))
    with open('kion_train.zip', "wb") as fd, tqdm(
        desc='kion dataset download', total=total_size_in_bytes, unit='iB', unit_scale=True
    ) as progress_bar:
        for chunk in req.iter_content(chunk_size=2 ** 20):
            progress_bar.update(len(chunk))
            fd.write(chunk)

kion dataset download:   0%|          | 0.00/78.8M [00:00<?, ?iB/s]

In [None]:
!unzip kion_train.zip

Archive:  kion_train.zip
   creating: kion_train/
  inflating: kion_train/interactions.csv  
  inflating: __MACOSX/kion_train/._interactions.csv  
  inflating: kion_train/users.csv    
  inflating: __MACOSX/kion_train/._users.csv  
  inflating: kion_train/items.csv    
  inflating: __MACOSX/kion_train/._items.csv  


In [None]:
# Read the interactions table and map its raw column names onto the rectools
# column constants (watch date -> Columns.Datetime, total duration -> Columns.Weight).
interactions = pd.read_csv('kion_train/interactions.csv').rename(
    columns={'last_watch_dt': Columns.Datetime, 'total_dur': Columns.Weight}
)
# Parse the renamed date column into pandas datetimes.
interactions['datetime'] = pd.to_datetime(interactions['datetime'])

# User and item metadata tables are loaded as-is.
users = pd.read_csv('kion_train/users.csv')
items = pd.read_csv('kion_train/items.csv')

In [None]:
# Cut-off timestamp: the latest interaction date (normalized to midnight) minus 7 days.
begin = interactions["datetime"].max().normalize() - pd.DateOffset(days=7)

In [None]:
# Number of interaction rows currently held in `interactions`.
len(interactions)

5476251

In [None]:
# Display the interactions frame as the cell output.
interactions

Unnamed: 0,user_id,item_id,datetime,weight,watched_pct
0,176549,9506,2021-05-11,4250,72.0
1,699317,1659,2021-05-29,8317,100.0
2,656683,7107,2021-05-09,10,0.0
3,864613,7638,2021-07-05,14483,100.0
4,964868,9506,2021-04-30,6725,100.0
...,...,...,...,...,...
5476246,648596,12225,2021-08-13,76,0.0
5476247,546862,9673,2021-04-13,2308,49.0
5476248,697262,15297,2021-08-20,18307,63.0
5476249,384202,16197,2021-04-19,6203,100.0


In [None]:
# Keep only rows strictly later than `begin`. This rebinds `interactions`,
# so every later cell works on the filtered frame.
interactions = interactions[interactions['datetime'] > begin]

In [None]:
# Number of interaction rows currently held in `interactions`.
len(interactions)

424436

## Popular


In [None]:
# Two-column user frame (user_id, sex). .copy() makes it an independent DataFrame,
# so adding a column to it afterwards does not write into a slice of `users`.
sex_feature = users[["user_id", "sex"]].copy()

In [None]:
# Constant column holding the feature name for every row.
sex_feature["feature"] = "sex"

In [None]:
# Rename the three columns positionally to id / value / feature
# (presumably the flat feature layout rectools expects - confirm against its docs).
sex_feature.columns = ["id", "value", "feature"]

In [None]:
# Keep only the users that occur in `interactions`.
sex_feature = sex_feature[sex_feature['id'].isin(interactions['user_id'])]

In [None]:
# Display the user feature frame as the cell output.
sex_feature

In [None]:
# Two-column item frame (item_id, content_type). .copy() makes it an independent DataFrame,
# so adding a column to it afterwards does not write into a slice of `items`.
type_feature = items[["item_id", "content_type"]].copy()

In [None]:
# Constant column holding the feature name for every row.
type_feature["feature"] = "type"

In [None]:
# Rename the three columns positionally to id / value / feature
# (presumably the flat feature layout rectools expects - confirm against its docs).
type_feature.columns = ['id', 'value', 'feature']

In [None]:
# Keep only the items that occur in `interactions`.
type_feature = type_feature[type_feature['id'].isin(interactions['item_id'])]

In [None]:
# Build the rectools Dataset from the interactions only.
# The user/item feature arguments are currently commented out, so no feature
# frame is passed to the dataset.
dataset = Dataset.construct(
    interactions_df=interactions,
    #user_features_df=sex_feature,
    # item_features_df=type_feature,
    # cat_item_features=['type']
    #cat_user_features = ['sex']
)

In [None]:
# Rebind `begin` to a 30-day cut-off computed from the current `interactions` frame.
# NOTE(review): no later cell visible here reads `begin` - confirm whether this is still needed.
begin = interactions["datetime"].max().normalize() - pd.DateOffset(days=30)

In [None]:
# Popularity baseline configured with popularity='n_users'
# (presumably ranks items by the number of users who interacted - confirm in rectools docs).
pop = PopularModel(popularity='n_users')
pop.fit(dataset);  # trailing ';' suppresses the cell output

In [None]:
# Popular-model recommendations, later used to pad the BM25 lists.
recopop = pop.recommend(
    users=interactions[Columns.User].unique(), # Recommend for every distinct user present in `interactions`
    dataset=dataset, # Dataset the recommended items are taken from
    k=10, # Number of recommendations per user
    filter_viewed=False  # True would throw away some (already viewed, per the parameter name) items for each user
)

## BM25 item-kNN model (fit on the whole dataset; no train/test split is performed in this section)

In [None]:
# Fit model
# Item-to-item kNN wrapper around implicit's BM25Recommender; K, K1 and B are passed
# straight through (presumably K = neighbours kept, K1/B = BM25 weighting parameters - confirm).
model_bm25 = ImplicitItemKNNWrapperModel(BM25Recommender(K=10, K1=3, B=0.5))
model_bm25.fit(dataset)

<rectools.models.implicit_knn.ImplicitItemKNNWrapperModel at 0x7fe0d88eb910>

In [None]:
# Serialize the fitted BM25 model to the working directory.
# `dill` is already imported in the notebook's import cell, so it is not re-imported here.
with open('bm25all.dill', 'wb') as f:
    dill.dump(model_bm25, f)

In [None]:
# Reload the BM25 model from the local file written by the save cell, so the notebook
# runs top-to-bottom without Google Drive having to be mounted first. The former path
# (/content/drive/MyDrive/RecSys/data/bm25.dill) named a different artifact than the
# one this notebook saves.
# NOTE: dill.load can execute arbitrary code - only load files you created yourself.
with open('bm25all.dill', 'rb') as f:
    model_bm25 = dill.load(f)

In [None]:
# Make recommendations
# Up to k=10 BM25 recommendations for every distinct user in `interactions`;
# filter_viewed=True (presumably drops items the user already interacted with - confirm).
recos_bm25 = model_bm25.recommend(
    users=interactions[Columns.User].unique(),
    dataset=dataset,
    k=10,
    filter_viewed=True,
)

In [None]:
# Scratch log of scores noted for different BM25 settings. The metric itself is not
# computed in the visible cells - TODO record which metric and data split these refer to.
0.09564093307946361 # b=0.5, k1=3
0.11178085829577415 # n_units = 4, b=0.75, k1=2
0.11186397166708617 # n_units = 4, b=0.75, k1=1.2

In [None]:
# Stack BM25 recommendations on top of the popular ones and, for every
# (user, item) pair, keep the first occurrence - BM25 rows win over popular rows.
recoms = pd.concat([recos_bm25, recopop]).drop_duplicates(
    subset=['user_id', 'item_id'], keep='first'
)
# 1-based position of each row within its user's list, in order of appearance.
recoms['rank'] = recoms.groupby('user_id').cumcount() + 1
# Trim every user to 10 rows and keep only the columns that are exported.
recoms = recoms[recoms['rank'] <= 10][['user_id', 'item_id']]
recoms.to_csv('BM25popweek.csv.gz', index=False, compression='gzip')

In [None]:
# Export the pure BM25 recommendations: rows with rank <= 10, reduced to the
# (user_id, item_id) pair. Note that this rebinds `recos_bm25` to the reduced frame.
recos_bm25 = recos_bm25[['user_id', 'item_id']][recos_bm25['rank'] <= 10]
recos_bm25.to_csv('KNNBM25all.csv.gz', compression='gzip', index=False)

In [None]:
# Mount Google Drive (Colab only) so files under /content/drive become accessible.
# Any cell that reads from or writes to /content/drive must be executed after this one.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive
