# Вебинар 6. Двухуровневые модели рекомендаций


In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix

# Матричная факторизация
from implicit import als

# Модель второго уровня
from lightgbm import LGBMClassifier

import os, sys
module_path = os.path.abspath(os.path.join(os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)

# Написанные нами функции
from src.metrics import precision_at_k, recall_at_k
from src.utils import prefilter_items
from src.recommenders import MainRecommender

In [3]:
data = pd.read_csv('../data/retail_train.csv')
item_features = pd.read_csv('../data/product.csv')
user_features = pd.read_csv('../data/hh_demographic.csv')

# column processing
item_features.columns = [col.lower() for col in item_features.columns]
user_features.columns = [col.lower() for col in user_features.columns]

item_features.rename(columns={'product_id': 'item_id'}, inplace=True)
user_features.rename(columns={'household_key': 'user_id'}, inplace=True)


# Важна схема обучения и валидации!
# -- давние покупки -- | -- 6 недель -- | -- 3 недель -- 
# подобрать размер 2-ого датасета (6 недель) --> learning curve (зависимость метрики recall@k от размера датасета)
val_lvl_1_size_weeks = 6
val_lvl_2_size_weeks = 3

data_train_lvl_1 = data[data['week_no'] < data['week_no'].max() - (val_lvl_1_size_weeks + val_lvl_2_size_weeks)]
data_val_lvl_1 = data[(data['week_no'] >= data['week_no'].max() - (val_lvl_1_size_weeks + val_lvl_2_size_weeks)) &
                      (data['week_no'] < data['week_no'].max() - (val_lvl_2_size_weeks))]

data_train_lvl_2 = data_val_lvl_1.copy()  # Для наглядности. Далее мы добавим изменения, и они будут отличаться
data_val_lvl_2 = data[data['week_no'] >= data['week_no'].max() - val_lvl_2_size_weeks]

data_train_lvl_1.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


In [4]:
n_items_before = data_train_lvl_1['item_id'].nunique()

data_train_lvl_1 = prefilter_items(data_train_lvl_1, item_features=item_features, take_n_popular=5000)

n_items_after = data_train_lvl_1['item_id'].nunique()
print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

Decreased # items from 83685 to 5001


In [5]:
recommender = MainRecommender(data_train_lvl_1)



In [None]:
recommender.get_als_recommendations(1, N=5)

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Input In [7], in <cell line: 1>()
----> 1 recommender.get_als_recommendations(1, N=5)

File ~\Programming\GeekBrains_repo\Recommendation_systems\src\recommenders.py:152, in MainRecommender.get_als_recommendations(self, user, N)
    151 def get_als_recommendations(self, user, N=5):
--> 152     return self._get_recommendations(model=self.model, user=user, N=N)

File ~\Programming\GeekBrains_repo\Recommendation_systems\src\recommenders.py:139, in MainRecommender._get_recommendations(self, model, user, N)
    136 def _get_recommendations(self, model, user, N=5):
    138     recs = [self.id_to_itemid[rec[0]] for rec in 
--> 139                 model.recommend(userid=self.userid_to_id[user], 
    140                                 user_items=csr_matrix(self.user_item_matrix),   # на вход user-item matrix
    141                                 N=N, 
    142                                 filter_already_liked_items=False, 
    143                                 filter_items=[self.itemid_to_id[999_999]], 
    144                                 recalculate_user=True)]
    146     recs = self._extend_with_top_popular(recs, N=N)
    148     return recs

File C:\ProgramData\Anaconda3\lib\site-packages\implicit\cpu\matrix_factorization_base.py:49, in MatrixFactorizationBase.recommend(self, userid, user_items, N, filter_already_liked_items, filter_items, recalculate_user, items)
     47     user_count = 1 if np.isscalar(userid) else len(userid)
     48     if user_items.shape[0] != user_count:
---> 49         raise ValueError("user_items must contain 1 row for every user in userids")
     51 user = self._user_factor(userid, user_items, recalculate_user)
     53 item_factors = self.item_factors

ValueError: user_items must contain 1 row for every user in userids

### Задание 1

A) Попробуйте различные варианты генерации кандидатов. Какие из них дают наибольший recall@k ?
- Пока пробуем отобрать 50 кандидатов (k=50)
- Качество измеряем на data_val_lvl_1: следующие 6 недель после трейна

Дают ли own recommendtions + top-popular лучший recall?  

B)* Как зависит recall@k от k? Постройте для одной схемы генерации кандидатов эту зависимость для k = {20, 50, 100, 200, 500}  
C)* Исходя из прошлого вопроса, как вы думаете, какое значение k является наиболее разумным?


In [8]:
result_lvl_1 = data_val_lvl_1.groupby('user_id')['item_id'].unique().reset_index()
result_lvl_1.columns=['user_id', 'actual']
result_lvl_1.head

<bound method NDFrame.head of       user_id                                             actual
0           1  [853529, 865456, 867607, 872137, 874905, 87524...
1           2  [15830248, 838136, 839656, 861272, 866211, 870...
2           4  [883932, 970760, 1035676, 1055863, 1097610, 67...
3           6  [1024306, 1102949, 6548453, 835394, 940804, 96...
4           7  [836281, 843306, 845294, 914190, 920456, 93886...
...       ...                                                ...
2149     2496  [831509, 867188, 1013623, 1048851, 5592734, 16...
2150     2497  [820291, 824759, 838797, 859010, 859075, 86077...
2151     2498  [865511, 962991, 1076374, 1102358, 5564901, 15...
2152     2499  [861282, 921744, 1050968, 13842089, 828837, 86...
2153     2500  [856455, 902192, 903476, 931672, 936634, 95170...

[2154 rows x 2 columns]>

In [9]:
result_lvl_1['als'] = result_lvl_1['user_id'].apply(lambda x: recommender.get_als_recommendations(x, N=50))
result_lvl_1['own'] = result_lvl_1['user_id'].apply(lambda x: recommender.get_own_recommendations(x, N=50))
result_lvl_1['similar_items'] = result_lvl_1['user_id'].apply(lambda x:recommender.get_similar_items_recommendation(x, N=50))
result_lvl_1['similar_users'] = result_lvl_1['user_id'].apply(lambda x: recommender.get_similar_users_recommendation(x, N=50))

ValueError: user_items must contain 1 row for every user in userids

In [None]:
result_lvl_1.apply(lambda row: recall_at_k(row['als'], row['item_id'], k=5), axis=1).mean()

In [None]:
result_lvl_1.apply(lambda row: recall_at_k(row['own'], row['item_id'], k=5), axis=1).mean()

In [None]:
result_lvl_1.apply(lambda row: recall_at_k(row['similar_items'], row['item_id'], k=5), axis=1).mean()

In [None]:
result_lvl_1.apply(lambda row: recall_at_k(row['similar_users'], row['item_id'], k=5), axis=1).mean()

### Задание 2.

Обучите модель 2-ого уровня, при этом:
    - Добавьте минимум по 2 фичи для юзера, товара и пары юзер-товар
    - Измерьте отдельно precision@5 модели 1-ого уровня и двухуровневой модели на data_val_lvl_2
    - Вырос ли precision@5, map@5, ndcg@5 при использовании двухуровневой модели?

In [15]:
new_item_features = item_features.merge(data, on='item_id', how='left')
new_item_features.head()

Unnamed: 0,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,quantity_of_sales,weekly_quantity_of_sales,price,...,basket_id,day,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB,3,0.031579,3.49,...,29046618323,157,1,3.49,3313,0.0,2213,23,0.0,0.0
1,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB,3,0.031579,3.49,...,30707611686,247,1,3.49,3266,0.0,1211,36,0.0,0.0
2,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB,3,0.031579,3.49,...,33046710871,410,4,13.96,3191,0.0,1139,59,0.0,0.0
3,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,,1,0.010526,0.99,...,30760265177,250,1,0.99,3235,0.0,936,36,0.0,0.0
4,26093,69,PASTRY,Private,BREAD,BREAD:ITALIAN/FRENCH,,1,0.010526,1.59,...,33783848749,458,1,1.59,33904,0.0,2034,66,0.0,0.0


In [19]:
# Количество продаж продуктов в неделю
item_quantity = new_item_features.groupby(['item_id'])['quantity'].count().reset_index()
item_quantity.rename(columns={'quantity': 'quantity_of_sales'}, inplace=True)
item_quantity['weekly_quantity_of_sales'] = item_quantity['quantity_of_sales'] / new_item_features['week_no'].nunique()
item_quantity.head()

Unnamed: 0,item_id,quantity_of_sales,weekly_quantity_of_sales
0,25671,3,0.031579
1,26081,1,0.010526
2,26093,1,0.010526
3,26190,1,0.010526
4,26355,1,0.010526


In [20]:
# Средняя цена продуктов
price = new_item_features.groupby('item_id')['sales_value'].sum() / new_item_features.groupby('item_id')['quantity'].sum()
price = price.groupby('item_id').mean().reset_index()
price.columns = ['item_id', 'price']
price['price'].fillna(0, inplace= True)
price.head()

Unnamed: 0,item_id,price
0,25671,3.49
1,26081,0.99
2,26093,1.59
3,26190,1.54
4,26355,0.99


In [21]:
# Добавим полученные признаки в датасет item_features
item_features = item_features.merge(item_quantity, on='item_id')
item_features = item_features.merge(price, on='item_id')
item_features = item_features[item_features['weekly_quantity_of_sales'] > 0]

In [22]:
item_features.head()

Unnamed: 0,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,quantity_of_sales_x,weekly_quantity_of_sales_x,price_x,quantity_of_sales_y,weekly_quantity_of_sales_y,price_y,quantity_of_sales,weekly_quantity_of_sales,price
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB,3,0.031579,3.49,3,0.031579,3.49,3,0.031579,3.49
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,,1,0.010526,0.99,1,0.010526,0.99,1,0.010526,0.99
2,26093,69,PASTRY,Private,BREAD,BREAD:ITALIAN/FRENCH,,1,0.010526,1.59,1,0.010526,1.59,1,0.010526,1.59
3,26190,69,GROCERY,Private,FRUIT - SHELF STABLE,APPLE SAUCE,50 OZ,1,0.010526,1.54,1,0.010526,1.54,1,0.010526,1.54
4,26355,69,GROCERY,Private,COOKIES/CONES,SPECIALTY COOKIES,14 OZ,1,0.010526,0.99,1,0.010526,0.99,1,0.010526,0.99
