<a href="https://colab.research.google.com/github/Sergey-Kit/RecoServiceTemplate/blob/hw_3_fix_2/itmo_recsys_dz_3_kNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Обучение и валидация на датасете KION

In [None]:
!pip install -r https://raw.githubusercontent.com/Sergey-Kit/RecoServiceTemplate/hw_3/notebooks/requirements.txt

In [15]:
from pprint import pprint

import numpy as np
import pandas as pd

from tqdm.auto import tqdm

from rectools import Columns
from rectools.dataset import Interactions, Dataset
from rectools.metrics import Precision, Recall, calc_metrics
from rectools.metrics import NDCG, MAP, MeanInvUserFreq, Serendipity
from rectools.models import PopularModel
from rectools.model_selection import TimeRangeSplitter

from implicit.nearest_neighbours import CosineRecommender
import dill
import random

RANDOM_STATE = 32

In [4]:
from userknn import UserKnn

## Load data

In [None]:
# Download the KION dataset archive (URL pinned to a specific commit),
# unpack it in the current directory (-o: overwrite existing files without
# prompting) and delete the archive afterwards.
!wget -q https://github.com/irsafilo/KION_DATASET/raw/f69775be31fa5779907cf0a92ddedb70037fb5ae/data_original.zip -O data_KION.zip
!unzip -o data_KION.zip
!rm data_KION.zip

In [6]:
# Column names assigned to items.csv; the file's own header row is skipped
# and replaced by these names.
item_columns = [
    Columns.Item,
    "content_type",
    "title",
    "title_orig",
    "release_year",
    "genres",
    "countries",
    "for_kids",
    "age_rating",
    "studios",
    "directors",
    "actors",
    "description",
    "keywords",
]

# Item attributes kept for the rest of the notebook.
selected_columns = [Columns.Item, "title", "genres", "countries", "age_rating"]

items = pd.read_csv(
    "data_original/items.csv",
    header=None,
    skiprows=[0],
    names=item_columns,
)[selected_columns]

# Column names assigned to interactions.csv (its own header row is skipped).
interaction_columns = [
    Columns.User,
    Columns.Item,
    Columns.Datetime,
    Columns.Weight,
    "pcnt",
]

interactions_raw = pd.read_csv(
    "data_original/interactions.csv",
    header=None,
    skiprows=[0],
    names=interaction_columns,
)

# Parse the timestamp column in place.
# NOTE(review): the literal "datetime" is assumed to equal Columns.Datetime — confirm.
interactions_raw["datetime"] = pd.to_datetime(interactions_raw["datetime"])

# rectools wrapper around the raw interactions frame.
interactions = Interactions(interactions_raw)

In [7]:
# Column names assigned to users.csv (its own header row is skipped).
user_columns = [Columns.User, "age", "income", "sex", "kids_flg"]

users = pd.read_csv(
    "data_original/users.csv",
    header=None,
    skiprows=[0],
    names=user_columns,
)

## Train models

### Preparing

In [10]:
# Metric families to evaluate, keyed by the prefix used in the metric name.
metric_classes = {
    "prec": Precision,
    "recall": Recall,
    "MAP": MAP,
    "NDCG": NDCG,
    "novelty": MeanInvUserFreq,
    "serendipity": Serendipity,
}

# One metric instance per (family, cutoff), named "<prefix>@<k>".
# Iteration order reproduces the key order prec@1, prec@5, prec@10, recall@1, ...
metrics = {
    f"{prefix}@{k}": metric_cls(k=k)
    for prefix, metric_cls in metric_classes.items()
    for k in (1, 5, 10)
}

# Number of recommendations per user.
K_RECOS = 10

In [9]:
# Time-based split: the last 7 days of interactions form the test set,
# everything strictly before that is the training set.
max_date = interactions_raw['datetime'].max()
split_date = max_date - pd.Timedelta(days=7)

train = interactions_raw[interactions_raw['datetime'] < split_date]
test = interactions_raw[interactions_raw['datetime'] >= split_date]

# Items seen in training; passed to calc_metrics as the catalog.
catalog = train[Columns.Item].unique()

# Keep only warm users in the test set, i.e. users that also appear in train.
test_hot = test[test['user_id'].isin(train['user_id'].unique())]

print(f"train: {train.shape}")
print(f"test: {test.shape}")
# Label fixed: this line reports the warm-user subset, not the full test set.
print(f"test_hot: {test_hot.shape}")

train: (4985269, 5)
test: (490982, 5)
test: (349088, 5)


In [11]:
# Build a rectools Dataset from the training interactions only; no user or
# item features are supplied.
dataset = Dataset.construct(
    interactions_df=train,
    user_features_df=None,
    item_features_df=None
)

### UserKNN

In [12]:
# Cosine nearest-neighbours recommender from `implicit`, constructed with K=30.
recommender = CosineRecommender(K=30)

In [13]:
# Wrap the implicit recommender in the project-local UserKnn (from userknn.py).
userknn_model = UserKnn(recommender)

In [None]:
%%time
# Fit on the training interactions DataFrame (not on the rectools Dataset).
userknn_model.fit(train)

In [None]:
# with open('userknn.dill', 'wb') as f:
#     dill.dump(userknn_model, f)

In [16]:
# Reuse a previously saved model when it is available; on a fresh run (no
# file yet) persist the model fitted above so that Restart & Run All works.
# NOTE: dill.load can execute arbitrary code — only load files you trust.
try:
    with open("userknn.dill", "rb") as f:
        userknn_model = dill.load(f)
except FileNotFoundError:
    with open("userknn.dill", "wb") as f:
        dill.dump(userknn_model, f)

In [17]:
%%time
recos = userknn_model.predict(test_hot, N_recs=10)

CPU times: user 1min 40s, sys: 8.14 s, total: 1min 48s
Wall time: 1min 49s


In [18]:
# Pre-compute offline recommendations for a random subset of train users.
# A dedicated RNG seeded with RANDOM_STATE makes the subset (and therefore
# the dumped artifact) reproducible across runs.
rng = random.Random(RANDOM_STATE)
train_user_ids = sorted(train["user_id"].unique())

# Capped at 100000 users: per the original note, a larger sample gets the
# notebook killed. min() guards against a population smaller than the cap.
part_train_users = rng.sample(train_user_ids, min(100000, len(train_user_ids)))

offline_recos_df = userknn_model.predict(pd.DataFrame(part_train_users, columns=["user_id"]), 20)

# Mapping user_id -> list of recommended item_ids, in the order returned by predict.
userknn_predect = (
    offline_recos_df.groupby("user_id")["item_id"]
    .agg(lambda item_ids: item_ids.tolist())
    .to_dict()
)

In [19]:
# Persist the user_id -> item_ids mapping built above with dill.
with open("userknn_offline.dill", "wb") as f:
    dill.dump(userknn_predect, f)

### Calc metrics

In [None]:
# Evaluate the userKNN recommendations on warm test users. `train` is passed
# as previous interactions and `catalog` holds the items seen in training.
hot_metric_values = calc_metrics(
    metrics,
    reco=recos,
    interactions=test_hot,
    prev_interactions=train,
    catalog=catalog,
)

In [None]:
# Show the metric values as a one-row table labelled with the model name.
pd.DataFrame(hot_metric_values, index=['userKNN'])

Unnamed: 0,prec@1,recall@1,prec@5,recall@5,prec@10,recall@10,NDCG@1,NDCG@5,NDCG@10,MAP@1,MAP@5,MAP@10,novelty@1,novelty@5,novelty@10,serendipity@1,serendipity@5,serendipity@10
userKNN,0.000755,0.000305,0.002413,0.005438,0.003897,0.017671,0.000755,0.001995,0.003176,0.000305,0.001736,0.003353,10.008312,8.646891,7.947655,4.7e-05,6.6e-05,6.8e-05


## Cold user recommendations (3 балла)

In [None]:
# Popularity baseline fitted on the training Dataset; used as the fallback
# source of recommendations in the cells that follow.
popular = PopularModel()
popular.fit(dataset)

<rectools.models.popular.PopularModel at 0x7e86a1ddd270>

In [None]:
# Only the global top-10 is needed downstream (the next cell keeps the first
# 10 rows), and with filter_viewed=False the popular list does not depend on
# the user. Requesting recommendations for a single user therefore avoids
# building a frame for every train user.
popular_recs = popular.recommend(
    users=dataset.user_id_map.external_ids[:1],
    dataset=dataset,
    k=10,
    filter_viewed=False  # True would remove already-viewed items from the recommendations
)

In [None]:
# Keep the item ids of the first 10 rows as a plain list.
# NOTE(review): this rebinds popular_recs from a DataFrame to a list, so the
# cell cannot be re-run without re-running the recommend cell first.
popular_recs = popular_recs['item_id'].iloc[:10].tolist()

In [None]:
# with open("popular_answer.dill", "wb") as f:
#     dill.dump(popular_recs, f)

In [None]:
# with open("popular_answer.dill", "rb") as f:
#     popular_recs = dill.load(f)

In [None]:
def predict_rec(user_id, k_recs=10):
    """Return up to ``k_recs`` recommended item ids for ``user_id``.

    Users present in ``userknn_model.users_mapping`` get userKNN
    recommendations. If the model yields fewer than ``k_recs`` items, the
    list is topped up with items from ``popular_recs`` that are not already
    in it. Any other user receives the popular list.

    Args:
        user_id: external user id to recommend for.
        k_recs: required number of recommendations (default 10).

    Returns:
        A new list of item ids of length ``k_recs``; it can be shorter only
        when ``popular_recs`` does not contain enough distinct items.
    """
    if user_id in userknn_model.users_mapping:
        print('userKNN')
        answer = userknn_model.eval(user_id).item_id.to_list()[:k_recs]
        if len(answer) < k_recs:
            # Pad with popular items, skipping the ones userKNN already returned.
            seen = set(answer)
            answer += [item for item in popular_recs if item not in seen][:k_recs - len(answer)]
    else:
        print('popular')
        # Slice to return a copy, so callers cannot mutate the shared popular list.
        answer = popular_recs[:k_recs]
    return answer

In [None]:
# Inspect the users table to pick example user ids for the checks below.
users

Unnamed: 0,user_id,age,income,sex,kids_flg
0,973171,age_25_34,income_60_90,М,1
1,962099,age_18_24,income_20_40,М,0
2,1047345,age_45_54,income_40_60,Ж,0
3,721985,age_45_54,income_20_40,Ж,0
4,704055,age_35_44,income_60_90,Ж,0
...,...,...,...,...,...
840192,339025,age_65_inf,income_0_20,Ж,0
840193,983617,age_18_24,income_20_40,Ж,1
840194,251008,,,,0
840195,590706,,,Ж,0


In [None]:
# Example ids taken from the users table: one that falls through to the
# popular fallback and one that is served by userKNN (see the calls below).
cold_user_id = 983617
hot_user_id = 962099

In [None]:
# User unknown to userKNN: expected to take the 'popular' branch.
predict_rec(cold_user_id)

popular


[10440, 15297, 9728, 13865, 4151, 3734, 2657, 4880, 142, 6809]

In [None]:
# User known to userKNN: expected to take the 'userKNN' branch.
predict_rec(hot_user_id)

userKNN


[3282, 3787, 2074, 3568, 2998, 1215, 2420, 3657, 6590, 6404]

### Required number of recommendations (3 балла)

Задача: сделать количество рекомендаций равным N, а не меньше N.

Решение: Добавим рекомендаций от модели Popular

In [None]:
# Number of users for whom userKNN produced fewer than 10 recommendations.
recs_per_user = recos.groupby('user_id')['item_id'].count()
int((recs_per_user < 10).sum())

50092

In [None]:
# Example user whose userKNN list comes out shorter than required, and the
# required number of recommendations.
bad_user_id = 704055
k_recs = 10

In [None]:
# Raw userKNN recommendations for the example user, as a list of item ids.
answer = userknn_model.eval(bad_user_id).item_id.to_list()

In [None]:
# Show the list and its length to confirm it is shorter than k_recs.
answer, len(answer)

([7043, 2276, 2061, 10533, 9855, 15037, 4179, 2583], 8)

In [None]:
# Pad the userKNN list up to k_recs with popular items it does not already
# contain. The slice binds to the comprehension, so only the padding is cut.
already_recommended = set(answer)

# Clamp at 0: a negative slice bound (answer longer than k_recs) would
# otherwise append almost the whole popular list instead of nothing.
n_missing = max(k_recs - len(answer), 0)

new_answer = answer + [
    item for item in popular_recs if item not in already_recommended
][:n_missing]

In [None]:
# Show the padded list and its length to confirm it now has k_recs items.
new_answer, len(new_answer)

([7043, 2276, 2061, 10533, 9855, 15037, 4179, 2583, 10440, 15297], 10)