<a href="https://colab.research.google.com/github/Sergey-Kit/RecoServiceTemplate/blob/hw_2/itmo_recsys_dz_2_v2_ipynb%22.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Обучение и валидация на датасете KION

In [None]:
!pip install -r https://raw.githubusercontent.com/Sergey-Kit/RecoServiceTemplate/hw_2/notebooks/requirements.txt

In [2]:
from pprint import pprint

import numpy as np
import pandas as pd

from tqdm.auto import tqdm

from rectools import Columns
from rectools.dataset import Interactions, Dataset
from rectools.metrics import Precision, Recall, calc_metrics
from rectools.metrics import NDCG, MAP, MeanInvUserFreq, Serendipity
from rectools.models import RandomModel, PopularModel
from rectools.model_selection import TimeRangeSplitter

# Seed passed as `random_state` to RandomModel in the model registry below.
RANDOM_STATE = 32



## Load data

In [3]:
# Download the KION archive from a URL pinned to a specific commit, unpack it
# (-o overwrites files left by a previous run) and delete the archive afterwards.
!wget -q https://github.com/irsafilo/KION_DATASET/raw/f69775be31fa5779907cf0a92ddedb70037fb5ae/data_original.zip -O data_KION.zip
!unzip -o data_KION.zip
!rm data_KION.zip

Archive:  data_KION.zip
   creating: data_original/
  inflating: data_original/interactions.csv  
  inflating: __MACOSX/data_original/._interactions.csv  
  inflating: data_original/users.csv  
  inflating: __MACOSX/data_original/._users.csv  
  inflating: data_original/items.csv  
  inflating: __MACOSX/data_original/._items.csv  


In [5]:
# Names assigned to the columns of items.csv; the file's own header row is
# skipped on read and replaced by these.
item_column_names = [
    Columns.Item,
    'content_type',
    'title',
    'title_orig',
    'release_year',
    'genres',
    'countries',
    'for_kids',
    'age_rating',
    'studios',
    'directors',
    'actors',
    'description',
    'keywords',
]

# Item attributes kept for the rest of the notebook.
selected_columns = [Columns.Item, 'title', 'genres', 'countries', 'age_rating']

items = pd.read_csv(
    "data_original/items.csv",
    names=item_column_names,
    header=None,
    skiprows=[0],
)[selected_columns]

# Names assigned to the columns of interactions.csv (header row skipped as above).
interaction_column_names = [
    Columns.User,
    Columns.Item,
    Columns.Datetime,
    Columns.Weight,
    'pcnt',
]

interactions_raw = pd.read_csv(
    "data_original/interactions.csv",
    names=interaction_column_names,
    header=None,
    skiprows=[0],
)
# Timestamps are read as strings; convert them before wrapping the frame.
interactions_raw[Columns.Datetime] = pd.to_datetime(interactions_raw[Columns.Datetime])
interactions = Interactions(interactions_raw)

## CV

In [7]:
# Number of test folds; also used as the progress-bar total in the training loop.
n_splits = 3

# Splitter configuration: each test fold spans "7D".
splitter_kwargs = dict(
    test_size="7D",
    n_splits=n_splits,
    filter_already_seen=True,
    filter_cold_items=True,
    filter_cold_users=True,
)
cv = TimeRangeSplitter(**splitter_kwargs)

# Display the (start, end) borders of every test fold.
cv.get_test_fold_borders(interactions)

[(Timestamp('2021-08-02 00:00:00', freq='7D'),
  Timestamp('2021-08-09 00:00:00', freq='7D')),
 (Timestamp('2021-08-09 00:00:00', freq='7D'),
  Timestamp('2021-08-16 00:00:00', freq='7D')),
 (Timestamp('2021-08-16 00:00:00', freq='7D'),
  Timestamp('2021-08-23 00:00:00', freq='7D'))]

## Train models

In [8]:
# Recommenders compared in cross-validation, keyed by the name used in the report.
models = dict(
    random=RandomModel(random_state=RANDOM_STATE),
    popular=PopularModel(),
)

# Every metric family is evaluated at the same cutoffs; keys look like "<name>@<k>".
metric_cutoffs = (1, 5, 10)
metric_families = {
    "prec": Precision,
    "recall": Recall,
    "MAP": MAP,
    "NDCG": NDCG,
    "novelty": MeanInvUserFreq,
    "serendipity": Serendipity,
}
metrics = {
    f"{family}@{cutoff}": metric_cls(k=cutoff)
    for family, metric_cls in metric_families.items()
    for cutoff in metric_cutoffs
}

# Number of recommendations requested per user.
K_RECOS = 10

In [9]:
def train_pipeline(interactions, models, metrics, cv, n_splits, k_recos,
                   output_path="outer/metrics.csv"):
  """Cross-validate every model and report mean/std of each metric per model.

  For each fold produced by ``cv.split`` every model is fitted on the train
  part, asked for ``k_recos`` recommendations for the users present in the
  test part, and scored with ``calc_metrics``. Per-fold scores are then
  aggregated per model (mean and std over folds), written to ``output_path``
  as CSV and printed.

  Parameters
  ----------
  interactions
      Object exposing the interactions table as ``.df``.
  models : dict
      Mapping ``model name -> model`` with ``fit`` and ``recommend`` methods.
  metrics : dict
      Mapping ``metric name -> metric`` passed to ``calc_metrics``.
  cv
      Splitter providing ``split(interactions, collect_fold_stats=True)``.
  n_splits : int
      Expected number of folds; used only as the progress-bar total.
  k_recos : int
      Number of recommendations requested per user.
  output_path : str, default "outer/metrics.csv"
      Destination of the aggregated CSV. Its parent directory is created
      when missing, so the function does not depend on another cell having
      created it.

  Returns
  -------
  None
  """
  import os  # local import: the notebook's import cell does not provide it

  results = []
  fold_iterator = cv.split(interactions, collect_fold_stats=True)

  for train_ids, test_ids, fold_info in tqdm(fold_iterator, total=n_splits):
    print(f"\n==================== Fold {fold_info['i_split']}")
    pprint(fold_info)

    df_train = interactions.df.iloc[train_ids]
    dataset = Dataset.construct(df_train)

    df_test = interactions.df.iloc[test_ids][Columns.UserItem]
    test_users = np.unique(df_test[Columns.User])

    # Items seen in the train part of this fold; passed to calc_metrics as `catalog`.
    catalog = df_train[Columns.Item].unique()

    for model_name, model in models.items():
      # The same model object is re-fitted on every fold.
      model.fit(dataset)
      recos = model.recommend(
          users=test_users,
          dataset=dataset,
          k=k_recos,
          filter_viewed=True,
          )
      metric_values = calc_metrics(
          metrics,
          reco=recos,
          interactions=df_test,
          prev_interactions=df_train,
          catalog=catalog,
          )
      res = {"fold": fold_info["i_split"], "model": model_name}
      res.update(metric_values)
      results.append(res)

  # One row per model, columns are (metric, mean/std) aggregated over folds.
  pivot_results = (
      pd.DataFrame(results)
      .drop(columns="fold")
      .groupby(["model"], sort=False)
      .agg(["mean", "std"])
  )

  # Make sure the target directory exists before writing the report.
  output_dir = os.path.dirname(output_path)
  if output_dir:
    os.makedirs(output_dir, exist_ok=True)
  pivot_results.to_csv(output_path)
  print(pivot_results)

In [10]:
%%time
!mkdir outer
train_pipeline(interactions, models, metrics, cv, n_splits, K_RECOS)

  0%|          | 0/3 [00:00<?, ?it/s]


{'end': Timestamp('2021-08-09 00:00:00', freq='7D'),
 'i_split': 0,
 'start': Timestamp('2021-08-02 00:00:00', freq='7D'),
 'test': 263681,
 'test_items': 6602,
 'test_users': 98184,
 'train': 4266013,
 'train_items': 15237,
 'train_users': 797423}

{'end': Timestamp('2021-08-16 00:00:00', freq='7D'),
 'i_split': 1,
 'start': Timestamp('2021-08-09 00:00:00', freq='7D'),
 'test': 279422,
 'test_items': 6698,
 'test_users': 103511,
 'train': 4649162,
 'train_items': 15415,
 'train_users': 850489}

{'end': Timestamp('2021-08-23 00:00:00', freq='7D'),
 'i_split': 2,
 'start': Timestamp('2021-08-16 00:00:00', freq='7D'),
 'test': 298878,
 'test_items': 6679,
 'test_users': 110076,
 'train': 5051815,
 'train_items': 15577,
 'train_users': 906071}
           prec@1            recall@1              prec@5            recall@5  \
             mean       std      mean       std      mean       std      mean   
model                                                                           
rando

![cv.png](attachment:cv.png)

## Visual analysis

In [22]:
def visualization_validation(trained_model, interactions_raw, user_ids, items,
                             k=10, output_path='combined_table.csv'):
  """Build a table of watched vs. recommended items for the given users.

  The model produces ``k`` recommendations per user (already-viewed items
  filtered out). Both the users' interaction rows and their recommendations
  are enriched with item attributes from ``items`` and with ``count_view``,
  the number of interaction rows per item in ``interactions_raw``. The two
  parts are stacked, tagged in the ``source`` column as ``'user_viewed'`` or
  ``'user_recos'`` and sorted by user, source and rank.

  Parameters
  ----------
  trained_model
      Fitted model exposing ``recommend``.
  interactions_raw : pandas.DataFrame
      Interaction log; the columns ``user_id``, ``item_id``, ``datetime`` and
      ``weight`` are used here.
  user_ids : list
      Users to inspect.
  items : pandas.DataFrame
      Item attributes; must contain ``item_id``, ``title`` and ``genres``.
  k : int, default 10
      Number of recommendations per user.
  output_path : str or None, default 'combined_table.csv'
      Where the table is saved as CSV; pass ``None`` to skip saving.

  Returns
  -------
  pandas.DataFrame
      The combined table.
  """
  dataset = Dataset.construct(interactions_raw)
  recos = trained_model.recommend(
    users=user_ids,
    dataset=dataset,
    k=k,
    filter_viewed=True,
    )

  # Interaction rows per item as a plain two-column frame (item_id, count_view).
  interactions_count = (
    interactions_raw.groupby('item_id')['user_id']
    .count()
    .rename('count_view')
    .reset_index()
  )

  # Restrict to the requested users first so the counts are joined onto a
  # handful of rows instead of the whole interaction log.
  user_viewed = (
    interactions_raw.query("user_id == @user_ids")
    .merge(interactions_count, how='left', on='item_id')
    .merge(items, on="item_id")
    .assign(source='user_viewed')
  )
  user_recos = (
    recos.merge(interactions_count, how='left', on='item_id')
    .query("user_id == @user_ids")
    .merge(items, on="item_id")
    .assign(source='user_recos')
    # The model score shares the 'weight' column with the viewed rows.
    .rename(columns={'score': 'weight'})
  )

  columns_to_keep = ['source', 'rank', 'user_id', 'item_id', 'datetime', 'weight', 'title', 'genres', 'count_view']
  combined_table = (
    pd.concat([user_viewed, user_recos])
    .sort_values(['user_id', 'source', 'rank'])
    .loc[:, columns_to_keep]
  )

  if output_path is not None:
    combined_table.to_csv(output_path)
  return combined_table

In [18]:
# Fit the popularity model on the complete interaction log (no train/test split here).
dataset = Dataset.construct(interactions_raw)
model = PopularModel()
model.fit(dataset)

<rectools.models.popular.PopularModel at 0x7e1c05e07eb0>

In [23]:
# Users whose viewing history and recommendations are inspected side by side.
user_ids = [666262, 672861, 955527]
combined_table = visualization_validation(model, interactions_raw, user_ids, items)
combined_table

Unnamed: 0,source,rank,user_id,item_id,datetime,weight,title,genres,count_view
0,user_recos,1.0,666262,10440,NaT,202457.0,Хрустальный,"триллеры, детективы",202457
3,user_recos,2.0,666262,15297,NaT,193123.0,Клиника счастья,"драмы, мелодрамы",193123
6,user_recos,3.0,666262,9728,NaT,132865.0,Гнев человеческий,"боевики, триллеры",132865
9,user_recos,4.0,666262,13865,NaT,122119.0,Девятаев,"драмы, военные, приключения",122119
12,user_recos,5.0,666262,4151,NaT,91167.0,Секреты семейной жизни,комедии,91167
15,user_recos,6.0,666262,3734,NaT,74803.0,Прабабушка легкого поведения,комедии,74803
18,user_recos,7.0,666262,2657,NaT,68581.0,Подслушано,"драмы, триллеры",68581
21,user_recos,8.0,666262,4880,NaT,55043.0,Афера,комедии,55043
24,user_recos,9.0,666262,142,NaT,45367.0,Маша,"драмы, триллеры",45367
27,user_recos,10.0,666262,6809,NaT,40372.0,Дуров,документальное,40372
