# Вебинар 6. Двухуровневые модели рекомендаций


Код для src, utils, metrics вы можете скачать из [этого](https://github.com/geangohn/recsys-tutorial) github репозитория

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix

# Матричная факторизация
from implicit import als

# Модель второго уровня
from lightgbm import LGBMClassifier

import os, sys
module_path = os.path.abspath(os.path.join(os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)

# Написанные нами функции
from src.metrics import precision_at_k, recall_at_k
from src.utils import prefilter_items
from src.recommenders import MainRecommender

In [2]:
# Transactions plus the item-level and household-level side tables.
data = pd.read_csv('./retail_train.csv')

# Lower-case the side-table headers and rename their key columns so they
# line up with the `item_id` / `user_id` columns used by the transactions.
item_features = (
    pd.read_csv('./product.csv')
    .rename(columns=str.lower)
    .rename(columns={'product_id': 'item_id'})
)
user_features = (
    pd.read_csv('./hh_demographic.csv')
    .rename(columns=str.lower)
    .rename(columns={'household_key': 'user_id'})
)


# The train/validation scheme matters!
# -- older purchases -- | -- 6 weeks -- | -- 3 weeks --
# The size of the middle (6-week) window can be tuned with a learning curve
# (recall@k as a function of the window size).
val_lvl_1_size_weeks = 6
val_lvl_2_size_weeks = 3

# Week boundaries, counted back from the last week present in the data.
week_no = data['week_no']
lvl_2_val_start = week_no.max() - val_lvl_2_size_weeks
lvl_1_val_start = lvl_2_val_start - val_lvl_1_size_weeks

data_train_lvl_1 = data[week_no < lvl_1_val_start]
data_val_lvl_1 = data[(week_no >= lvl_1_val_start) & (week_no < lvl_2_val_start)]

# Kept as a separate copy for clarity: the level-2 training set starts out
# equal to the level-1 validation window and is modified later on.
data_train_lvl_2 = data_val_lvl_1.copy()
data_val_lvl_2 = data[week_no >= lvl_2_val_start]

data_train_lvl_1.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


In [None]:
# Distinct items before filtering, for the report printed below.
n_items_before = data_train_lvl_1['item_id'].nunique()

# NOTE(review): called with take_n_popular=5000, yet the recorded run reports
# 5001 distinct ids afterwards - presumably one placeholder id is added by
# prefilter_items; confirm in src.utils.
data_train_lvl_1 = prefilter_items(
    data_train_lvl_1,
    item_features=item_features,
    take_n_popular=5000,
)

n_items_after = data_train_lvl_1['item_id'].nunique()
report = 'Decreased # items from {} to {}'
print(report.format(n_items_before, n_items_after))

Decreased # items from 83685 to 5001


In [None]:
# Build the first-level recommender from the (prefiltered) level-1 training
# transactions. Presumably the constructor also fits its internal models -
# the recorded output shows two progress bars; see src.recommenders.
recommender = MainRecommender(data_train_lvl_1)



HBox(children=(FloatProgress(value=0.0, max=15.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5001.0), HTML(value='')))




In [None]:
# Spot check: 5 recommendations for user 2375 from the ALS-based method.
recommender.get_als_recommendations(2375, N=5)

[899624, 1106523, 1044078, 871756, 844179]

In [None]:
# Spot check: 5 "own" recommendations for the same user (presumably items
# taken from the user's own purchase history - confirm in src.recommenders).
recommender.get_own_recommendations(2375, N=5)

[948640, 918046, 847962, 907099, 873980]

In [None]:
# Spot check: 5 similar-item recommendations for the same user.
# NOTE(review): the recorded output contains item 1044078 twice, so this
# method does not appear to de-duplicate its result.
recommender.get_similar_items_recommendation(2375, N=5)

[1046545, 1044078, 1044078, 1078652, 1018809]

In [None]:
# Spot check: 5 similar-user recommendations for the same user.
recommender.get_similar_users_recommendation(2375, N=5)

[1101502, 979674, 10457044, 974265, 959455]

### Задание 1

A) Попробуйте различные варианты генерации кандидатов. Какие из них дают наибольший recall@k ?
- Пока пробуем отобрать 50 кандидатов (k=50)
- Качество измеряем на data_val_lvl_1: следующие 6 недель после трейна

Дают ли own recommendations + top-popular лучший recall?  

B)* Как зависит recall@k от k? Постройте для одной схемы генерации кандидатов эту зависимость для k = {20, 50, 100, 200, 500}  
C)* Исходя из прошлого вопроса, как вы думаете, какое значение k является наиболее разумным?


In [None]:
# One row per user of the level-1 validation window; `actual` holds the
# array of distinct items that user bought in that window.
result_lvl_1 = (
    data_val_lvl_1
    .groupby('user_id')['item_id']
    .unique()
    .rename('actual')
    .reset_index()
)
result_lvl_1.head(2)

In [None]:
def _als_candidates(user_id, n=50):
    """Return `n` ALS candidates for one user, with a fallback on IndexError."""
    try:
        return recommender.get_als_recommendations(user_id, N=n)
    except IndexError:
        # Same fallback as the loop version: start from an empty list and
        # let the recommender fill it through _extend_with_top_popular.
        return recommender._extend_with_top_popular([], N=n)


# One candidate list per validation user, stored as an object column.
result_lvl_1['ALS_recs'] = result_lvl_1['user_id'].apply(_als_candidates)
result_lvl_1.head(2)

In [None]:
# Mean recall@50 of the ALS candidates over all level-1 validation users.
als_recall_per_user = [
    recall_at_k(recs, bought, k=50)
    for recs, bought in zip(result_lvl_1['ALS_recs'], result_lvl_1['actual'])
]
ALS_recall = pd.Series(als_recall_per_user).mean()
ALS_recall

In [None]:
# Collect 50 "own" candidates per validation user; when the recommender
# raises ValueError for a user, fall back to whatever
# _extend_with_top_popular returns for an empty list.
own_candidates = []
for user_id in result_lvl_1['user_id']:
    try:
        user_recs = recommender.get_own_recommendations(user_id, N=50)
    except ValueError:
        user_recs = recommender._extend_with_top_popular([], N=50)
    own_candidates.append(user_recs)

result_lvl_1['own_recs'] = pd.Series(own_candidates, index=result_lvl_1.index, dtype=object)
result_lvl_1.head(2)

In [None]:
# Mean recall@50 of the "own" candidates over all level-1 validation users.
own_recall_per_user = [
    recall_at_k(recs, bought, k=50)
    for recs, bought in zip(result_lvl_1['own_recs'], result_lvl_1['actual'])
]
own_recall = pd.Series(own_recall_per_user).mean()
own_recall

In [None]:
%%time
# 50 similar-item candidates per validation user (no fallback here: any
# exception raised by the recommender propagates).
result_lvl_1['similar_items_recs'] = result_lvl_1['user_id'].apply(
    lambda user_id: recommender.get_similar_items_recommendation(user_id, N=50)
)

result_lvl_1.head(2)

In [None]:
# Mean recall@50 of the similar-item candidates.
similar_items_recall_per_user = [
    recall_at_k(recs, bought, k=50)
    for recs, bought in zip(result_lvl_1['similar_items_recs'], result_lvl_1['actual'])
]
similar_items_recall = pd.Series(similar_items_recall_per_user).mean()
similar_items_recall

In [None]:
%%time

def _similar_users_candidates(user_id, n=50):
    """Return `n` similar-user candidates, with a fallback on lookup errors."""
    try:
        return recommender.get_similar_users_recommendation(user_id, N=n)
    except (IndexError, ValueError):
        # Both exception types were handled identically: start from an empty
        # list and let the recommender fill it.
        return recommender._extend_with_top_popular([], N=n)


result_lvl_1['similar_users_recs'] = result_lvl_1['user_id'].map(_similar_users_candidates)
result_lvl_1.head(2)

In [None]:
# Mean recall@50 of the similar-user candidates.
similar_users_recall_per_user = [
    recall_at_k(recs, bought, k=50)
    for recs, bought in zip(result_lvl_1['similar_users_recs'], result_lvl_1['actual'])
]
similar_users_recall = pd.Series(similar_users_recall_per_user).mean()
similar_users_recall

### B)* Как зависит recall@k от k? Постройте для одной схемы генерации кандидатов эту зависимость для k = {20, 50, 100, 200, 500}

In [None]:
# recall@k of the similar-item candidates for several candidate-list sizes.
k_list = [20, 50, 100, 200, 500]
k_recall = []

for k in k_list:
    recs_column = f'similar_items_recs_{k}'

    # Candidate lists of size k are kept in their own column.
    result_lvl_1[recs_column] = result_lvl_1['user_id'].map(
        lambda user_id: recommender.get_similar_items_recommendation(user_id, N=k)
    )

    recall_per_user = [
        recall_at_k(recs, bought, k=k)
        for recs, bought in zip(result_lvl_1[recs_column], result_lvl_1['actual'])
    ]
    k_recall.append(pd.Series(recall_per_user).mean())

plt.plot(k_list, k_recall)

In [None]:
def _own_recs_or_popular(user_id, n):
    """Return `n` own recommendations for a user.

    When the recommender raises ValueError for the user, the list is built
    by _extend_with_top_popular starting from an empty list.
    """
    try:
        return recommender.get_own_recommendations(user_id, N=n)
    except ValueError:
        # Fix: the fallback used to be seeded with the user's 50-item
        # `own_recs` entry from an earlier cell instead of an empty list, so
        # the result was not a top-`n` list (and, if the helper extends its
        # argument in place, the stored `own_recs` entry was modified too).
        return recommender._extend_with_top_popular([], N=n)


# recall@k of the "own" candidates for every k in k_list.
k_recall = []

for el in k_list:
    recs_column = f'own_recs_{el}'
    result_lvl_1[recs_column] = result_lvl_1['user_id'].map(
        lambda user_id, n=el: _own_recs_or_popular(user_id, n)
    )

    k_recall_el = result_lvl_1.apply(
        lambda row: recall_at_k(row[recs_column], row['actual'], k=el), axis=1
    ).mean()
    k_recall.append(k_recall_el)

plt.plot(k_list, k_recall)

### Задание 2.

Обучите модель 2-ого уровня, при этом:
    - Добавьте минимум по 2 фичи для юзера, товара и пары юзер-товар
    - Измерьте отдельно precision@5 модели 1-ого уровня и двухуровневой модели на data_val_lvl_2
    - Вырос ли precision@5 при использовании двухуровневой модели?

In [None]:
# Users of the level-2 training window.
users_lvl_2 = pd.DataFrame(data_train_lvl_2['user_id'].unique())
users_lvl_2.columns = ['user_id']

# Keep only users that were also present in the level-1 training data.
# `.copy()` so the column assignments below act on an independent frame
# rather than on a filtered view.
train_users = data_train_lvl_1['user_id'].unique()
users_lvl_2 = users_lvl_2[users_lvl_2['user_id'].isin(train_users)].copy()

# 50 first-level candidates per user.
users_lvl_2['candidates'] = users_lvl_2['user_id'].apply(lambda x: recommender.get_own_recommendations(x, N=50))

# Unroll the candidate lists into one (user_id, item_id) row per candidate.
s = users_lvl_2.apply(lambda x: pd.Series(x['candidates']), axis=1).stack().reset_index(level=1, drop=True)
s.name = 'item_id'

users_lvl_2 = users_lvl_2.drop('candidates', axis=1).join(s)
users_lvl_2['drop'] = 1  # helper flag; removed from targets_lvl_2 below

# Positive examples: (user, item) pairs bought in the level-2 training window.
# Fix: de-duplicate the pairs. The transaction table has one row per purchase
# line, so a pair bought several times would otherwise repeat the candidate
# row in the left merge and inflate the share of positives.
targets_lvl_2 = data_train_lvl_2[['user_id', 'item_id']].drop_duplicates()
targets_lvl_2['target'] = 1

targets_lvl_2 = users_lvl_2.merge(targets_lvl_2, on=['user_id', 'item_id'], how='left')

# Candidates that were not bought become negatives. Assign the result back
# instead of calling fillna(inplace=True) on a column selection.
targets_lvl_2['target'] = targets_lvl_2['target'].fillna(0)
targets_lvl_2.drop('drop', axis=1, inplace=True)

In [None]:
# Share of candidate rows with target == 1, i.e. the class balance of the
# level-2 training set.
targets_lvl_2['target'].mean()

In [None]:
# Attach the item-level and then the user-level side features; left joins
# keep every candidate row even when a side table has no match.
targets_lvl_2 = (
    targets_lvl_2
    .merge(item_features, on='item_id', how='left')
    .merge(user_features, on='user_id', how='left')
)

targets_lvl_2.shape

In [None]:

# User feature: mean cheque = total spend / number of distinct baskets.
#
# Fix (leakage): aggregate only over weeks that precede the level-2
# validation window. Using all of `data` lets purchases from the weeks the
# model is later evaluated on flow into the feature.
pre_val_data = data[data['week_no'] < data['week_no'].max() - val_lvl_2_size_weeks]

users_sales = pre_val_data.groupby('user_id').agg({
    'sales_value': 'sum',
    'basket_id': 'nunique',
}).reset_index()
users_sales['mean_cheque'] = users_sales['sales_value'] / users_sales['basket_id']
users_sales = users_sales[['user_id', 'mean_cheque']]
users_sales.head(2)

In [None]:
# Add the per-user `mean_cheque` column to every candidate row.
targets_lvl_2 = pd.merge(targets_lvl_2, users_sales, how='left', on='user_id')
targets_lvl_2.head(2)

In [None]:

# Distinct departments among the candidate items, in order of first
# appearance. Building the list through set() made its order hash-dependent,
# and kept a NaN entry that can never satisfy `department == n` in the
# per-department loops that iterate over this list.
departments = targets_lvl_2['department'].dropna().unique().tolist()
departments

In [None]:
%%time
# User x department feature: the mean unit price (sales_value / quantity) the
# user paid for candidate items of the row's department.
#
# Changes versus the loop version:
#   * one groupby + merge replaces a boolean-mask `.loc` assignment per
#     (department, user) pair;
#   * the column is float from the start instead of an int column that is
#     later overwritten with floats;
#   * only weeks before the level-2 validation window are aggregated, so the
#     feature does not leak purchases from the evaluation period.
pre_val_data = data[data['week_no'] < data['week_no'].max() - val_lvl_2_size_weeks]

# Department of every candidate item, taken from the candidate table itself.
item_department = targets_lvl_2[['item_id', 'department']].drop_duplicates()

dep_sales = (
    pre_val_data[['user_id', 'item_id', 'sales_value', 'quantity']]
    .merge(item_department, on='item_id', how='inner')
    .groupby(['user_id', 'department'])[['sales_value', 'quantity']]  # NaN departments are dropped by groupby
    .sum()
    .reset_index()
)
dep_sales['mean_department_price'] = dep_sales['sales_value'] / dep_sales['quantity']

targets_lvl_2 = (
    targets_lvl_2
    .drop(columns='mean_department_price', errors='ignore')  # safe when the cell is re-run
    .merge(
        dep_sales[['user_id', 'department', 'mean_department_price']],
        on=['user_id', 'department'],
        how='left',
    )
)

# Rows without any matching purchase keep the previous default of 0.
# (A 0/0 ratio, which used to stay NaN, also becomes 0 here.)
targets_lvl_2['mean_department_price'] = targets_lvl_2['mean_department_price'].fillna(0.0)

targets_lvl_2.head(2)

In [None]:

# Item feature: units sold per week in which the item was sold at all.
#
# Fix (leakage): aggregate only over weeks that precede the level-2
# validation window instead of over all of `data`.
pre_val_data = data[data['week_no'] < data['week_no'].max() - val_lvl_2_size_weeks]

week_sales = pre_val_data.groupby('item_id').agg({
    'quantity': 'sum',
    'week_no': 'nunique',
}).reset_index()
week_sales['item_id_week_sales'] = week_sales['quantity'] / week_sales['week_no']
week_sales = week_sales[['item_id', 'item_id_week_sales']]
week_sales.head()

In [None]:
# Add the per-item `item_id_week_sales` column to every candidate row.
targets_lvl_2 = pd.merge(targets_lvl_2, week_sales, how='left', on='item_id')
targets_lvl_2.head(2)

In [None]:

# Item feature: mean unit price = total sales value / total quantity.
#
# Fix (leakage): aggregate only over weeks that precede the level-2
# validation window instead of over all of `data`.
pre_val_data = data[data['week_no'] < data['week_no'].max() - val_lvl_2_size_weeks]

mean_price = pre_val_data.groupby('item_id').agg({
    'sales_value': 'sum',
    'quantity': 'sum',
}).reset_index()

mean_price['mean_price'] = mean_price['sales_value'] / mean_price['quantity']

mean_price = mean_price[['item_id', 'mean_price']]

mean_price.head(2)

In [None]:
# Add the per-item `mean_price` column to every candidate row.
targets_lvl_2 = pd.merge(targets_lvl_2, mean_price, how='left', on='item_id')
targets_lvl_2.head(2)

In [None]:

# User x department feature: the user's overall mean unit price minus the
# mean unit price of the candidate items of the row's department.
#
# Changes versus the loop version:
#   * the per-user aggregate does not depend on the department, so it is
#     computed once instead of once per department;
#   * two Series.map lookups replace a boolean-mask `.loc` assignment per
#     (department, user) pair;
#   * the column is float from the start;
#   * only weeks before the level-2 validation window are aggregated, so the
#     feature does not leak purchases from the evaluation period.
# NOTE(review): the old local names (`dep_user_sales`, `mean_dep_user_price`)
# suggest the user price may have been meant to be per department; the
# computed quantity was the user's overall price and that is kept - confirm.
pre_val_data = data[data['week_no'] < data['week_no'].max() - val_lvl_2_size_weeks]

# Overall mean unit price per user.
user_totals = pre_val_data.groupby('user_id')[['sales_value', 'quantity']].sum()
user_mean_price = user_totals['sales_value'] / user_totals['quantity']

# Mean unit price per department, over the candidate items of that department.
item_department = targets_lvl_2[['item_id', 'department']].drop_duplicates()
dep_totals = (
    pre_val_data[['item_id', 'sales_value', 'quantity']]
    .merge(item_department, on='item_id', how='inner')
    .groupby('department')[['sales_value', 'quantity']]
    .sum()
)
dep_mean_price = dep_totals['sales_value'] / dep_totals['quantity']

delta = targets_lvl_2['user_id'].map(user_mean_price) - targets_lvl_2['department'].map(dep_mean_price)

# Rows with an unknown user or department keep the previous default of 0.
# (A NaN ratio, which used to stay NaN, also becomes 0 here.)
targets_lvl_2['delta_dep_user_price'] = delta.fillna(0.0)

targets_lvl_2.head(2)

In [None]:

# User x department feature: the user's mean weekly purchased quantity
# divided by the mean weekly quantity sold of the department's candidate items.
#
# Changes versus the loop version:
#   * the per-user aggregate does not depend on the department, so it is
#     computed once instead of once per department;
#   * two Series.map lookups replace a boolean-mask `.loc` assignment per
#     (department, user) pair;
#   * the column is float from the start;
#   * only weeks before the level-2 validation window are aggregated, so the
#     feature does not leak purchases from the evaluation period.
# NOTE(review): the old local name `dep_user_week_sales` suggests a
# per-department user aggregate may have been intended; the computed quantity
# was the user's overall weekly volume and that is kept - confirm.
pre_val_data = data[data['week_no'] < data['week_no'].max() - val_lvl_2_size_weeks]

# Units per active week, per user.
user_weekly = pre_val_data.groupby('user_id').agg({
    'quantity': 'sum',
    'week_no': 'nunique',
})
user_week_sales = user_weekly['quantity'] / user_weekly['week_no']

# Units per active week, per department (candidate items only).
item_department = targets_lvl_2[['item_id', 'department']].drop_duplicates()
dep_weekly = (
    pre_val_data[['item_id', 'quantity', 'week_no']]
    .merge(item_department, on='item_id', how='inner')
    .groupby('department')
    .agg({'quantity': 'sum', 'week_no': 'nunique'})
)
dep_week_sales = dep_weekly['quantity'] / dep_weekly['week_no']

ratio = targets_lvl_2['user_id'].map(user_week_sales) / targets_lvl_2['department'].map(dep_week_sales)

# Rows with an unknown user or department keep the previous default of 0.
# (A NaN ratio, which used to stay NaN, also becomes 0 here.)
targets_lvl_2['rel_week_sales'] = ratio.fillna(0.0)

targets_lvl_2.head(2)

### 2 уровень

In [None]:
# Label as a one-column frame; every other column of the candidate table is
# used as a feature (this includes user_id and item_id themselves).
y_train = targets_lvl_2.loc[:, ['target']]
X_train = targets_lvl_2.drop(columns=['target'])

In [None]:
# Columns at positions 2..14 are treated as categorical.
# NOTE(review): the selection is positional, so it silently changes if the
# merge order or the side-table schemas change - confirm the printed list.
cat_feats = list(X_train.columns[2:15])
X_train = X_train.astype({name: 'category' for name in cat_feats})

cat_feats

In [None]:
%%time

# Second-level model: binary classifier "will this user buy this candidate?".
lgb = LGBMClassifier(objective='binary', max_depth=7, categorical_column=cat_feats)

# Pass the label as a 1-D array; a one-column frame makes the sklearn-style
# wrapper warn about a column-vector y.
lgb.fit(X_train, y_train.values.ravel())

# Fix: keep the probability of the positive class instead of the hard 0/1
# label from predict(). Candidates are ranked per user by this score, and
# hard labels cannot order candidates that share the same label.
train_preds = lgb.predict_proba(X_train)[:, 1]
train_preds[:5]

In [None]:
# Store the model output next to each (user, item) candidate row; the rows
# of X_train and targets_lvl_2 are in the same order.
targets_lvl_2['preds'] = train_preds

In [None]:

# Top-5 items per user according to the second-level model.
#
# Fix: rank the candidates by `preds` before taking the first five. Without
# the sort, head(5) returned the first five candidates in their original
# order, so the model output was never used. The stable sort keeps the
# original candidate order among equal scores, and duplicate (user, item)
# rows are removed so that they cannot take up several of the five slots.
ranked_candidates = (
    targets_lvl_2
    .sort_values('preds', ascending=False, kind='mergesort')
    .drop_duplicates(subset=['user_id', 'item_id'])
)
lgb_recs = (
    ranked_candidates
    .groupby('user_id')
    .head(5)
    .groupby('user_id')['item_id']
    .unique()
    .reset_index()
)
lgb_recs.head()

In [None]:


# One row per user of the level-2 validation window; `actual` holds the
# array of distinct items that user bought in that window.
result_lvl_2 = (
    data_val_lvl_2
    .groupby('user_id')['item_id']
    .unique()
    .rename('actual')
    .reset_index()
)
result_lvl_2.head()

In [None]:
# Attach the model's recommendations; users without any stay NaN in
# `lgb_recs` because of the left join.
result_lvl_2 = (
    result_lvl_2
    .merge(lgb_recs, on='user_id', how='left')
    .rename(columns={'item_id': 'lgb_recs'})
)
result_lvl_2.head()

In [None]:

# First-level baseline: precision@5 of the "own" recommendations, measured on
# the level-2 validation window.
#
# Fix: the baseline used to be precision@50 against the level-1 validation
# purchases, which is not comparable with the two-level model's precision@5
# on data_val_lvl_2. It is now scored with the same k, against the same
# `actual` lists, and only for users that also received second-level
# recommendations.
baseline_lvl_2 = (
    result_lvl_2.loc[result_lvl_2['lgb_recs'].notna(), ['user_id', 'actual']]
    .merge(result_lvl_1[['user_id', 'own_recs']], on='user_id', how='inner')
)

own_precision_val_1 = baseline_lvl_2.apply(
    lambda row: precision_at_k(row['own_recs'], row['actual'], k=5), axis=1
).mean()
own_precision_val_1


In [None]:

# precision@5 of the two-level model on the level-2 validation window,
# averaged over the users that received recommendations.
has_lgb_recs = result_lvl_2['lgb_recs'].notna()
pd.Series([
    precision_at_k(recs, bought, k=5)
    for recs, bought in zip(
        result_lvl_2.loc[has_lgb_recs, 'lgb_recs'],
        result_lvl_2.loc[has_lgb_recs, 'actual'],
    )
]).mean()

### Финальный проект

Мы уже прошли всю необходимую теорию для финального проекта. Проект осуществляется на данных из вебинара (данные считаны в начале ДЗ).
Рекомендуем вам **начать делать проект сразу после этого домашнего задания**
- Целевая метрика - precision@5. Порог для уcпешной сдачи проекта precision@5 > 25%
- Будет public тестовый датасет, на котором вы сможете измерять метрику
- Также будет private тестовый датасет для измерения финального качества
- НЕ обязательно, но крайне желательно использовать двухуровневые рекомендательные системы в проекте
- Вы сдаете код проекта в виде github репозитория и csv файл с рекомендациями 