# Вебинар 6. Двухуровневые модели рекомендаций


Код для src, utils, metrics вы можете скачать из [этого](https://github.com/geangohn/recsys-tutorial) github репозитория

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix

# Матричная факторизация
from implicit import als

# Модель второго уровня
from lightgbm import LGBMClassifier

import os, sys
module_path = os.path.abspath(os.path.join(os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)

# Написанные нами функции
from src.metrics import precision_at_k, recall_at_k
from src.utils import prefilter_items
from src.recommenders import MainRecommender

In [2]:
data = pd.read_csv('../raw_data/retail_train.csv')
item_features = pd.read_csv('../raw_data/product.csv')
user_features = pd.read_csv('../raw_data/hh_demographic.csv')

# column processing
item_features.columns = [col.lower() for col in item_features.columns]
user_features.columns = [col.lower() for col in user_features.columns]

item_features.rename(columns={'product_id': 'item_id'}, inplace=True)
user_features.rename(columns={'household_key': 'user_id'}, inplace=True)


# Важна схема обучения и валидации!
# -- давние покупки -- | -- 6 недель -- | -- 3 недель -- 
# подобрать размер 2-ого датасета (6 недель) --> learning curve (зависимость метрики recall@k от размера датасета)
val_lvl_1_size_weeks = 6
val_lvl_2_size_weeks = 3

data_train_lvl_1 = data[data['week_no'] < data['week_no'].max() - (val_lvl_1_size_weeks + val_lvl_2_size_weeks)]
data_val_lvl_1 = data[(data['week_no'] >= data['week_no'].max() - (val_lvl_1_size_weeks + val_lvl_2_size_weeks)) &
                      (data['week_no'] < data['week_no'].max() - (val_lvl_2_size_weeks))]

data_train_lvl_2 = data_val_lvl_1.copy()  # Для наглядности. Далее мы добавим изменения, и они будут отличаться
data_val_lvl_2 = data[data['week_no'] >= data['week_no'].max() - val_lvl_2_size_weeks]

data_train_lvl_1.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


In [3]:
n_items_before = data_train_lvl_1['item_id'].nunique()

data_train_lvl_1 = prefilter_items(data_train_lvl_1, item_features=item_features, take_n_popular=5000)

n_items_after = data_train_lvl_1['item_id'].nunique()
print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

Decreased # items from 83685 to 5001


In [4]:
recommender = MainRecommender(data_train_lvl_1)



HBox(children=(FloatProgress(value=0.0, max=15.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5001.0), HTML(value='')))




In [5]:
#recommender.get_als_recommendations(2375, N=200)

In [6]:
#recommender.get_own_recommendations(2375, N=200)

In [7]:
#recommender.get_similar_items_recommendation(2375, N=200)

In [8]:
#recommender.get_similar_users_recommendation(2375, N=200)

### Задание 1

A) Попробуйте различные варианты генерации кандидатов. Какие из них дают наибольший recall@k ?
- Пока пробуем отобрать 200 кандидатов (k=200)
- Качество измеряем на data_val_lvl_1: следующие 6 недель после трейна

Дают ли own recommendtions + top-popular лучший recall?  

B)* Как зависит recall@k от k? Постройте для одной схемы генерации кандидатов эту зависимость для k = {20, 50, 100, 200, 500}  
C)* Исходя из прошлого вопроса, как вы думаете, какое значение k является наиболее разумным?


Пользователи, которых не было в тренировочном датасете

In [9]:
user_id_val_lvl_1 = data_val_lvl_1['user_id'].unique()
user_id_train_lvl_1 = data_train_lvl_1['user_id'].unique()
diff = set(user_id_val_lvl_1) - set(user_id_train_lvl_1)
print(f'Список новых пользователей {diff}')
#data_val_lvl_1 = data_val_lvl_1[~data_val_lvl_1['user_id'].isin(diff)]

Список новых пользователей {296, 1813, 1984}


In [10]:
result_lvl_1 = data_val_lvl_1.groupby('user_id')['item_id'].unique().reset_index()
result_lvl_1.columns=['user_id', 'actual']
result_lvl_1.head(2)

Unnamed: 0,user_id,actual
0,1,"[853529, 865456, 867607, 872137, 874905, 87524..."
1,2,"[15830248, 838136, 839656, 861272, 866211, 870..."


In [11]:
result_lvl_1['get_als_recommendations'] = result_lvl_1.apply(lambda row: recommender.get_als_recommendations(row['user_id'], N=200), axis=1)
result_lvl_1['get_own_recommendations'] = result_lvl_1.apply(lambda row: recommender.get_own_recommendations(row['user_id'], N=200), axis=1)
result_lvl_1['get_similar_items_recommendation'] = result_lvl_1.apply(lambda row: recommender.get_similar_items_recommendation(row['user_id'], N=200), axis=1)
#result_lvl_1['get_similar_users_recommendation'] = result_lvl_1.apply(lambda row: recommender.get_similar_users_recommendation(row['user_id'], N=200), axis=1)
result_lvl_1['get_top200'] = result_lvl_1.apply(lambda row: recommender._extend_with_top_popular([], N=200), axis=1)

In [12]:
columns = ['get_als_recommendations', 'get_own_recommendations', #'get_similar_users_recommendation',
           'get_similar_items_recommendation', 'get_top200']
for column in columns:
    recall_at_value = result_lvl_1.apply(lambda row: recall_at_k(row[column], row['actual'], 200), axis=1).mean()
    print(f'{column} recall={recall_at_value}')

get_als_recommendations recall=0.0979014374297892
get_own_recommendations recall=0.1352815146098268
get_similar_items_recommendation recall=0.08492360963715664
get_top200 recall=0.08940276042445733


In [13]:
for n_candidates in [20, 50, 100, 150, 200, 500, 1000]:
    column = f'N_{n_candidates}'
    result_lvl_1[column] = result_lvl_1.apply(lambda row: recommender.get_als_recommendations(row['user_id'], N=n_candidates), axis=1)
    recall_at_value = result_lvl_1.apply(lambda row: recall_at_k(row[column], row['actual'], n_candidates), axis=1).mean()
    print(f'N_{n_candidates} recall={recall_at_value}')

N_20 recall=0.031014234324158504
N_50 recall=0.04893839301023467
N_100 recall=0.06946451591431317
N_150 recall=0.08551511185381631
N_200 recall=0.0979014374297892
N_500 recall=0.14781752137213783
N_1000 recall=0.1939263835548486


Чем больше список кандидатов, тем выше шанс предсказания и выше recall.
Количество кандидатов надо выбирать исходя из технических возможностей (скорость предсказания и других ограничений) и точности предсказания двухуровневой модели.

### Задание 2.

Обучите модель 2-ого уровня, при этом:
    - Добавьте минимум по 2 фичи для юзера, товара и пары юзер-товар
    - Измерьте отдельно precision@5 модели 1-ого уровня и двухуровневой модели на data_val_lvl_2
    - Вырос ли precision@5 при использовании двухуровневой модели?

In [14]:
def get_data(data_train_lvl_2):
    users_lvl_2 = pd.DataFrame(data_train_lvl_2['user_id'].unique())
    users_lvl_2.columns = ['user_id']
    users_lvl_2['candidates'] = users_lvl_2['user_id'].apply(lambda x: recommender.get_als_recommendations(x, N=200))

    s = users_lvl_2.apply(lambda x: pd.Series(x['candidates']), axis=1).stack().reset_index(level=1, drop=True)
    s.name = 'item_id'

    users_lvl_2 = users_lvl_2.drop('candidates', axis=1).join(s)
    users_lvl_2['drop'] = 1  # фиктивная пересенная

    targets_lvl_2 = data_train_lvl_2[['user_id', 'item_id']].copy()
    targets_lvl_2['target'] = 1  # тут только покупки 

    targets_lvl_2 = users_lvl_2.merge(targets_lvl_2, on=['user_id', 'item_id'], how='left')

    targets_lvl_2['target'].fillna(0, inplace= True)
    targets_lvl_2.drop('drop', axis=1, inplace=True)
    return targets_lvl_2

In [15]:
targets_lvl_2 = get_data(data_train_lvl_2)
targets_lvl_2.head()

Unnamed: 0,user_id,item_id,target
0,2070,899624,1.0
1,2070,1029743,0.0
2,2070,5569471,0.0
3,2070,917033,0.0
4,2070,896085,0.0


In [16]:
#Средний чек
def average_bill(data_train):    
    sum_baskets = data_train.groupby(['user_id', 'basket_id'], as_index=False)['total_sales'].sum()   
    sum_baskets = sum_baskets.drop('basket_id', axis=1)
    return  sum_baskets.groupby('user_id').mean().rename(columns={'total_sales':'average_bill'})

#Кол-во покупок в каждой категории
def quantity_by_department(data_train, item_features):  
    df = data_train.merge(item_features, on=['item_id'], how='left')
    return   df.groupby(['user_id', 'department'], as_index=False)['total_sales'].count().rename(columns={'total_sales':'quantity_by_department'})  

In [17]:
#Кол-во покупок в неделю 
def quantity_item_by_week(data_train):    
    quantity_week = data_train.groupby(['item_id', 'week_no'], as_index=False)['quantity'].sum()   
    quantity_week = quantity_week.drop('week_no', axis=1)
    return  quantity_week.groupby('item_id').mean().rename(columns={'quantity':'quantity_item_by_week'})

#Средняя цена товара в категории
def mean_price_in_department(data_train, item_features):  
    df = data_train.merge(item_features, on=['item_id'], how='left')
    df = df.groupby(['item_id', 'department'], as_index=False)['sales_value'].sum()
    df = df.drop('item_id', axis=1)
    return   df.groupby('department', as_index=False).mean().rename(columns={'sales_value':'mean_price_in_department'})

In [18]:
#(Средняя сумма покупки 1 товара в каждой категории (берем категорию item_id)) - (Цена item_id)
def mean_diff_prices(data_train, item_features):  
    data_mean_price_in_department = mean_price_in_department(data_train, item_features)
    df = data_train.merge(item_features, on=['item_id'], how='left')
    df = df.merge(data_mean_price_in_department, on=['department'], how='left')
    df['diff_prices'] = df['mean_price_in_department'] - df['sales_value']
    return   df.groupby(['item_id', 'user_id'], as_index=False)['diff_prices'].mean().rename(columns={'diff_prices':'mean_diff_prices'})

#(Кол-во покупок юзером конкретной категории в неделю) / (Среднее кол-во покупок всеми юзерами конкретной категории в неделю)
def factor_quantity(data_train, item_features):  
    df = data_train.merge(item_features, on=['item_id'], how='left')
    
    data_mean_quintity_dw = df.groupby(['department',  'week_no'], as_index=False)['quantity'].mean().rename(columns={'quantity':'mean_dw_quantity'})
    data_mean_quintity_dwu = df.groupby(['department',  'week_no', 'user_id'], as_index=False)['quantity'].sum().rename(columns={'quantity':'dwu_quantity'})
        
    df = df.merge(data_mean_quintity_dw, on=['department',  'week_no'], how='left')
    df = df.merge(data_mean_quintity_dwu, on=['department',  'week_no', 'user_id'], how='left')
    df['factor_quantity'] = df['dwu_quantity'] / df['mean_dw_quantity']
    return   df.groupby(['item_id', 'user_id'], as_index=False)['factor_quantity'].mean()

In [19]:
data_train_lvl_2['total_sales'] = data_train_lvl_2['sales_value'] * data_train_lvl_2['quantity']
data_average_bill = average_bill(data_train_lvl_2)
data_quantity_by_department = quantity_by_department(data_train_lvl_2, item_features)
data_quantity_item_by_week = quantity_item_by_week(data_train_lvl_2)
data_mean_price_in_department = mean_price_in_department(data_train_lvl_2, item_features)
data_mean_diff_prices = mean_diff_prices(data_train_lvl_2, item_features)
data_factor_quantity = factor_quantity(data_train_lvl_2, item_features)

In [20]:
def prepare(data, data_average_bill, item_features,
           data_quantity_by_department, data_quantity_item_by_week,
           data_mean_price_in_department, data_mean_diff_prices, data_factor_quantity):
    data = data.merge(data_average_bill, on=['user_id'], how='left')
    data = data.merge(item_features[['item_id', 'department']], on=['item_id'], how='left')
    data = data.merge(data_quantity_by_department, on=['user_id', 'department'], how='left')
    data = data.merge(data_quantity_item_by_week, on=['item_id'], how='left')
    data = data.merge(data_mean_price_in_department, on=['department'], how='left')
    data = data.merge(data_mean_diff_prices, on=['item_id', 'user_id'], how='left')
    data = data.merge(data_factor_quantity , on=['item_id', 'user_id'], how='left')
    data_mean_diff_prices_by_item = data_mean_diff_prices.groupby(['item_id'], as_index=False)['mean_diff_prices'].mean().rename(columns={'mean_diff_prices':'mm_diff_prices'})
    data = data.merge(data_mean_diff_prices_by_item, on=['item_id'], how='left')
    data.loc[data['mean_diff_prices'].isna(), 'mean_diff_prices'] = data.loc[data['mean_diff_prices'].isna(), 'mm_diff_prices']
    data.drop('mm_diff_prices', axis=1, inplace=True)
    data.fillna(0, inplace=True)
    return data

In [21]:
targets_lvl_2 = prepare(targets_lvl_2, data_average_bill, item_features,
           data_quantity_by_department, data_quantity_item_by_week,
           data_mean_price_in_department, data_mean_diff_prices, data_factor_quantity)
targets_lvl_2.head()

Unnamed: 0,user_id,item_id,target,average_bill,department,quantity_by_department,quantity_item_by_week,mean_price_in_department,mean_diff_prices,factor_quantity
0,2070,899624,1.0,7352.295814,PRODUCE,10.0,51.666667,36.299171,33.609171,4.010251
1,2070,1029743,0.0,7352.295814,GROCERY,139.0,211.833333,18.79459,16.076562,0.0
2,2070,5569471,0.0,7352.295814,GROCERY,139.0,14.0,18.79459,14.442608,0.0
3,2070,917033,0.0,7352.295814,GROCERY,139.0,4.2,18.79459,14.596923,0.0
4,2070,896085,0.0,7352.295814,GROCERY,139.0,9.166667,18.79459,16.445006,0.0


In [22]:
X_train = targets_lvl_2.drop('target', axis=1)
y_train = targets_lvl_2[['target']]

In [23]:
cat_feats=['department']
X_train[cat_feats] = X_train[cat_feats].astype('category')

In [24]:
%%time

lgb = LGBMClassifier(objective='binary', max_depth=7, categorical_column=cat_feats)
lgb.fit(X_train, y_train)

train_preds = lgb.predict(X_train)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Wall time: 4.47 s


In [25]:
targets_lvl_2['lgb_scores'] = train_preds
targets_lvl_2.head()

Unnamed: 0,user_id,item_id,target,average_bill,department,quantity_by_department,quantity_item_by_week,mean_price_in_department,mean_diff_prices,factor_quantity,lgb_scores
0,2070,899624,1.0,7352.295814,PRODUCE,10.0,51.666667,36.299171,33.609171,4.010251,1.0
1,2070,1029743,0.0,7352.295814,GROCERY,139.0,211.833333,18.79459,16.076562,0.0,0.0
2,2070,5569471,0.0,7352.295814,GROCERY,139.0,14.0,18.79459,14.442608,0.0,0.0
3,2070,917033,0.0,7352.295814,GROCERY,139.0,4.2,18.79459,14.596923,0.0,0.0
4,2070,896085,0.0,7352.295814,GROCERY,139.0,9.166667,18.79459,16.445006,0.0,0.0


In [26]:
lgb_ranked = targets_lvl_2.groupby('user_id')[['item_id', 'lgb_scores']].apply(
    lambda x: x.sort_values('lgb_scores', ascending=False)[:5]['item_id'].tolist()).to_dict()

In [27]:
bought_list = targets_lvl_2[targets_lvl_2['target']==1]
bought_list = bought_list.groupby('user_id')[['item_id', 'target']].apply(
    lambda x: x.sort_values('target', ascending=False)['item_id'].tolist()).to_dict()

In [28]:
scores = []
for user_id, recommended_item_ids in lgb_ranked.items():
    if user_id in bought_list:
        ap = precision_at_k(recommended_item_ids, bought_list[user_id])
        scores.append(ap)
    else:
        scores.append(0)
model_score = np.mean(scores)
print(model_score)

0.8574744661095636


In [29]:
targets_val_lvl_2 = get_data(data_val_lvl_2)
targets_val_lvl_2 = prepare(targets_val_lvl_2, data_average_bill, item_features,
           data_quantity_by_department, data_quantity_item_by_week,
           data_mean_price_in_department, data_mean_diff_prices, data_factor_quantity)
targets_val_lvl_2.head()

Unnamed: 0,user_id,item_id,target,average_bill,department,quantity_by_department,quantity_item_by_week,mean_price_in_department,mean_diff_prices,factor_quantity
0,338,835300,0.0,22.3475,MEAT,0.0,3.2,54.282992,49.917992,0.0
1,338,1084036,0.0,22.3475,MEAT,0.0,2.6,54.282992,52.384104,0.0
2,338,896613,0.0,22.3475,MEAT,0.0,8.333333,54.282992,49.232492,0.0
3,338,1068292,0.0,22.3475,MEAT,0.0,5.166667,54.282992,50.657992,0.0
4,338,1026118,1.0,22.3475,PRODUCE,12.0,19.333333,36.299171,34.769171,8.822552


In [30]:
X_val = targets_val_lvl_2.drop('target', axis=1)
y_val = targets_val_lvl_2[['target']]
X_val[cat_feats] = X_val[cat_feats].astype('category')

In [31]:
val_preds = lgb.predict(X_val)
targets_val_lvl_2['lgb_scores'] = val_preds
targets_val_lvl_2.head()

Unnamed: 0,user_id,item_id,target,average_bill,department,quantity_by_department,quantity_item_by_week,mean_price_in_department,mean_diff_prices,factor_quantity,lgb_scores
0,338,835300,0.0,22.3475,MEAT,0.0,3.2,54.282992,49.917992,0.0,0.0
1,338,1084036,0.0,22.3475,MEAT,0.0,2.6,54.282992,52.384104,0.0,0.0
2,338,896613,0.0,22.3475,MEAT,0.0,8.333333,54.282992,49.232492,0.0,0.0
3,338,1068292,0.0,22.3475,MEAT,0.0,5.166667,54.282992,50.657992,0.0,0.0
4,338,1026118,1.0,22.3475,PRODUCE,12.0,19.333333,36.299171,34.769171,8.822552,1.0


In [32]:
lgb_ranked = targets_val_lvl_2.groupby('user_id')[['item_id', 'lgb_scores']].apply(
    lambda x: x.sort_values('lgb_scores', ascending=False)[:5]['item_id'].tolist()).to_dict()
bought_list = targets_val_lvl_2[targets_val_lvl_2['target']==1]
bought_list = bought_list.groupby('user_id')[['item_id', 'target']].apply(
    lambda x: x.sort_values('target', ascending=False)['item_id'].tolist()).to_dict()

In [33]:
scores_lvl2 = []
scores_lvl1 = []
for user_id, recommended_item_ids in lgb_ranked.items():
    if user_id in bought_list:
        ap = precision_at_k(recommended_item_ids, bought_list[user_id])
        scores_lvl2.append(ap)
        ap = precision_at_k(recommender.get_als_recommendations(user_id, N=5), bought_list[user_id])
        scores_lvl1.append(ap)
    else:
        scores_lvl2.append(0)
        scores_lvl1.append(0)
model_score_lvl1 = np.mean(scores_lvl1)
model_score_lvl2 = np.mean(scores_lvl2)
print(f'precision_at_k для модели 1 уровня={model_score_lvl1} и для модели 2 уровня={model_score_lvl2}')

precision_at_k для модели 1 уровня=0.1426052889324192 и для модели 2 уровня=0.30333006856023503
