<a href="https://colab.research.google.com/github/Sergey-Kit/RecoServiceTemplate/blob/hw_4/itmo_recsys_dz_4_ANN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Обучение и валидация на датасете KION

In [None]:
!pip install -r https://raw.githubusercontent.com/Sergey-Kit/RecoServiceTemplate/hw_4/notebooks/requirements.txt

In [4]:
import os

In [5]:
os.environ["OPENBLAS_NUM_THREADS"] = "1"

In [None]:
from pprint import pprint

import numpy as np
import pandas as pd
import dill
from scipy import sparse

from tqdm.auto import tqdm

from implicit.cpu.als import AlternatingLeastSquares
from rectools.models.implicit_als import ImplicitALSWrapperModel

import nmslib

import rectools
from rectools import Columns
from rectools.dataset import Interactions, Dataset
from rectools.metrics import Precision, Recall, calc_metrics
from rectools.metrics import NDCG, MAP, MeanInvUserFreq, Serendipity
from rectools.model_selection import TimeRangeSplitter
from rectools.tools.ann import UserToItemAnnRecommender

RANDOM_STATE = 32

### Load data

In [None]:
!wget -q https://github.com/irsafilo/KION_DATASET/raw/f69775be31fa5779907cf0a92ddedb70037fb5ae/data_original.zip -O data_KION.zip
!unzip -o data_KION.zip
!rm data_KION.zip

In [8]:
items = pd.read_csv('data_original/items.csv')

interactions = pd.read_csv("data_original/interactions.csv",
                           parse_dates=['last_watch_dt'])
interactions.rename(
    columns={
        'last_watch_dt': Columns.Datetime,
    },
    inplace=True)
interactions[Columns.Weight] = np.where(interactions['watched_pct'] > 10, 3, 1)
interactions = Interactions(interactions)

users = pd.read_csv('data_original/users.csv')

### Preparing

In [9]:
users.fillna('Unknown', inplace=True)

In [10]:
# Производится Leave-time-out разбиение
max_date = interactions.df['datetime'].max()

train = interactions.df[
    (interactions.df['datetime'] < max_date - pd.Timedelta(days=7))]
train.drop(train.query("total_dur < 300").index,
           inplace=True) # удаляем короткие айтемы
train_users = train[Columns.User].unique()

test = interactions.df[
    interactions.df['datetime'] >= max_date - pd.Timedelta(days=7)]
test_users = test[Columns.User].unique()

hot_test = test[test['user_id'].isin(train['user_id'].unique())]
hot_test_users = hot_test[Columns.User].unique()

cold_test = test[~test['user_id'].isin(train['user_id'].unique())]
cold_test_users = cold_test[Columns.User].unique()

catalog = train[Columns.Item].unique()

print(f"train: {train.shape}")
print(f"hot test: {hot_test.shape}")
print(f"cold test: {cold_test.shape}")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train.drop(train.query("total_dur < 300").index,


train: (3832711, 6)
hot test: (333026, 6)
cold test: (157956, 6)


In [11]:
users = users.loc[users[Columns.User].isin(train[Columns.User])].copy()

#### User features

In [12]:
# Выбрали фичи юзеров : Возраст, Доход, Пол
user_features_names = ["age", "income", "sex"]
cat_user_features_names = [feature for feature in user_features_names if users[feature].dtype == object]
user_features_frames = []
for feature in user_features_names:
    feature_frame = users.reindex(columns=["user_id", feature])
    feature_frame.columns = ["id", "value"]
    feature_frame["feature"] = feature
    user_features_frames.append(feature_frame)
user_features = pd.concat(user_features_frames)

In [None]:
user_features.head()

In [14]:
print('Number user features:', users[user_features_names].nunique().sum())

Number user features: 17


#### Item features

In [15]:
items = items.loc[items[Columns.Item].isin(train[Columns.Item])].copy()

In [16]:
YEAR_FROM = 1990
STEP_SIZE = 5
bins = [year for year in range(YEAR_FROM, int(items['release_year'].max()) + STEP_SIZE, STEP_SIZE)]
bins = [int(items['release_year'].min())] + bins
items['year_bin'] = pd.cut(items['release_year'],
                           bins=bins, include_lowest=True)
items['year_bin'] = items['year_bin'].astype('str')

In [17]:
item_features_names = []

items["genre"] = items["genres"].str.lower().str.\
  replace(", ", ",", regex=False).str.split(",")
genre_feature = items[["item_id", "genre"]].explode("genre")
genre_feature.columns = ["id", "value"]
genre_feature["feature"] = "genre"
item_features_names += ["genre"]
item_features = genre_feature

content_feature = items.reindex(columns=[Columns.Item, "content_type"])
content_feature.columns = ["id", "value"]
content_feature["feature"] = "content_type"
item_features_names += ["content_type"]

year_feature = items.reindex(columns=[Columns.Item, "year_bin"])
year_feature.columns = ["id", "value"]
year_feature["feature"] = "year_bin"
item_features_names += ["year_bin"]

In [18]:
cat_item_features_names = [feature for feature in item_features_names if items[feature].dtype == object]

In [19]:
item_features = pd.concat((genre_feature, content_feature, year_feature))

In [20]:
print('Number item features:', item_features.value.nunique())

Number item features: 105


In [21]:
dataset_train = Dataset.construct(
    interactions_df=train,
    user_features_df=user_features,
    cat_user_features=user_features_names,
    item_features_df=item_features,
    cat_item_features=cat_item_features_names
)

### Metrics

In [22]:
metrics = {
    "prec@1": Precision(k=1),
    "prec@5": Precision(k=5),
    "prec@10": Precision(k=10),
    "recall@1": Recall(k=1),
    "recall@5": Recall(k=5),
    "recall@10": Recall(k=10),
    "MAP@1": MAP(k=1),
    "MAP@5": MAP(k=5),
    "MAP@10": MAP(k=10),
    "NDCG@1": NDCG(k=1),
    "NDCG@5": NDCG(k=5),
    "NDCG@10": NDCG(k=10),
    "novelty@1": MeanInvUserFreq(k=1),
    "novelty@5": MeanInvUserFreq(k=5),
    "novelty@10": MeanInvUserFreq(k=10),
    "serendipity@1": Serendipity(k=1),
    "serendipity@5": Serendipity(k=5),
    "serendipity@10": Serendipity(k=10),
}

### Model Training

In [23]:
K_RECOS = 10
N_FACTORS = 128
REG = 0.5
ALPHA = 10
NUM_THREADS = 16
ITERATIONS = 10

In [30]:
model = AlternatingLeastSquares(factors=N_FACTORS,
                              regularization=REG,
                              alpha=ALPHA,
                              num_threads=NUM_THREADS,
                              random_state=RANDOM_STATE,
                              iterations=ITERATIONS)

In [31]:
model_wrapper = ImplicitALSWrapperModel(model=model,
                                      verbose=1,
                                      fit_features_together=True)

In [32]:
%%time
model_wrapper.fit(dataset_train)

  0%|          | 0/10 [00:00<?, ?it/s]

CPU times: user 6min 34s, sys: 6.41 s, total: 6min 41s
Wall time: 4min 25s


<rectools.models.implicit_als.ImplicitALSWrapperModel at 0x7e0e52376a10>

In [33]:
# with open("als_wrapper.dill", "wb") as f:
#     dill.dump(model_wrapper, f)

In [32]:
with open("als_wrapper.dill", "rb") as f:
    model_wrapper = dill.load(f)

In [None]:
%%time
recos_als_train = model_wrapper.recommend(users=train_users,
                                        dataset=dataset_train,
                                        k=10,
                                        filter_viewed=False)

In [42]:
als_predict = (recos_als_train.groupby(["user_id"])).\
  agg({"item_id": lambda x: x.tolist()}).\
  to_dict()["item_id"]

In [44]:
# with open("als_predict_offline.dill", "wb") as f:
#     dill.dump(als_predict, f)

In [30]:
with open("als_predict_offline.dill", "rb") as f:
    als_predict = dill.load(f)

In [35]:
%%time
recos_als_hot_test = model_wrapper.recommend(users=hot_test_users,
                                           dataset=dataset_train,
                                           k=10,
                                           filter_viewed=True)

CPU times: user 57 s, sys: 9.6 s, total: 1min 6s
Wall time: 42.2 s


### Metric calculation

In [None]:
metric_values_train = calc_metrics(
    metrics,
    reco=recos_als_train,
    interactions=train,
    prev_interactions=train,
    catalog=catalog,
)

In [37]:
pd.DataFrame(metric_values_train, index=['AlternatingLeastSquares'])

Unnamed: 0,prec@1,recall@1,prec@5,recall@5,prec@10,recall@10,NDCG@1,NDCG@5,NDCG@10,MAP@1,MAP@5,MAP@10,novelty@1,novelty@5,novelty@10,serendipity@1,serendipity@5,serendipity@10
AlternatingLeastSquares,0.606373,0.320049,0.25961,0.495294,0.175575,0.587255,0.606373,0.326624,0.244567,0.320049,0.423255,0.451658,3.580388,4.099956,4.782352,0.001451,0.000816,0.000676


In [38]:
metric_values_hot_test = calc_metrics(
    metrics,
    reco=recos_als_hot_test,
    interactions=hot_test,
    prev_interactions=train,
    catalog=catalog,
)

In [39]:
pd.DataFrame(metric_values_hot_test, index=['AlternatingLeastSquares'])

Unnamed: 0,prec@1,recall@1,prec@5,recall@5,prec@10,recall@10,NDCG@1,NDCG@5,NDCG@10,MAP@1,MAP@5,MAP@10,novelty@1,novelty@5,novelty@10,serendipity@1,serendipity@5,serendipity@10
AlternatingLeastSquares,0.084176,0.042648,0.04966,0.117898,0.035179,0.161811,0.084176,0.057019,0.04433,0.042648,0.071558,0.078446,3.487526,4.250914,4.988936,3.7e-05,4.1e-05,4.6e-05


### Воспользоваться методом приближенного поиска соседей для выдачи рекомендаций (3 балла)

In [36]:
user_vectors, item_vectors = model_wrapper.get_vectors()

In [37]:
user_id_map = dataset_train.user_id_map
item_id_map = dataset_train.item_id_map

In [38]:
# with open("user_id_map.dill", "wb") as f:
#     dill.dump(dataset_train.user_id_map, f)

In [39]:
# with open("item_id_map.dill", "wb") as f:
#     dill.dump(dataset_train.item_id_map, f)

In [40]:
index_init_params = {"method": "hnsw",
                     "space": "negdotprod",
                     "data_type": nmslib.DataType.DENSE_VECTOR}

In [41]:
ann = UserToItemAnnRecommender(user_vectors=user_vectors,
                               item_vectors=item_vectors,
                               user_id_map=user_id_map,
                               item_id_map=item_id_map,
                               index_init_params= index_init_params)

In [42]:
%%time
ann.fit()

CPU times: user 5.24 s, sys: 55.5 ms, total: 5.3 s
Wall time: 2.83 s


<rectools.tools.ann.UserToItemAnnRecommender at 0x7940650c11e0>

In [43]:
# ann.index.saveIndex(filename='ann_index.dill')

In [44]:
ann.index.loadIndex('ann_index.dill')

In [50]:
ann.get_item_list_for_user(962099, top_n=10).tolist()

[12995, 1449, 9728, 10878, 10994, 849, 8379, 2237, 3682, 12981]

In [51]:
%%time
model_wrapper.recommend(users=[962099], dataset=dataset_train, k=10, filter_viewed=False).item_id.values

CPU times: user 269 ms, sys: 237 ms, total: 507 ms
Wall time: 496 ms


array([12995,  1449,  9728, 10878, 10994,   849, 14361,  8379,  2237,
        3682])

In [47]:
%%time
recos_hot_ann = pd.DataFrame([hot_test_users,
                              ann.get_item_list_for_user_batch(user_ids=hot_test_users,
                                                               top_n=K_RECOS)]).T
recos_hot_ann.columns = [Columns.User, Columns.Item]
recos_hot_ann = recos_hot_ann.explode(Columns.Item)
recos_hot_ann['score'] = 0
recos_hot_ann['rank'] = recos_hot_ann.groupby('user_id').cumcount() + 1
recos_hot_ann['score'] = K_RECOS - recos_hot_ann['rank']

CPU times: user 1min 53s, sys: 967 ms, total: 1min 54s
Wall time: 1min 57s


In [48]:
metric_values_hot_ann = calc_metrics(
    metrics,
    reco=recos_hot_ann,
    interactions=hot_test,
    prev_interactions=train,
    catalog=catalog,
)

In [49]:
pd.DataFrame(metric_values_hot_ann, index=['AlternatingLeastSquares + ANN'])

Unnamed: 0,prec@1,recall@1,prec@5,recall@5,prec@10,recall@10,NDCG@1,NDCG@5,NDCG@10,MAP@1,MAP@5,MAP@10,novelty@1,novelty@5,novelty@10,serendipity@1,serendipity@5,serendipity@10
AlternatingLeastSquares + ANN,0.035414,0.016748,0.038634,0.094214,0.029448,0.137391,0.035414,0.038866,0.032401,0.016748,0.044475,0.051093,3.694268,4.14407,4.779194,1.7e-05,2.7e-05,3.2e-05
