# Проект рекомендательной системы для ритейла

In [111]:
# !pip install --upgrade lightgbm

Collecting lightgbm
  Downloading lightgbm-2.3.1-py2.py3-none-manylinux1_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 2.1 MB/s eta 0:00:01
Installing collected packages: lightgbm
Successfully installed lightgbm-2.3.1


In [45]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [149]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

np.random.seed = 42

# Модель второго уровня
import lightgbm as lgb

# Функции из 1-ого вебинара
import os, sys

module_path = os.path.abspath(os.path.join(os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)


In [47]:
%autoreload


from src.metrics import money_precision_at_k, money_recall_at_k, precision_at_k, recall_at_k
from src.utils import prefilter_items, postfilter_items
import src.recommenders as rcm

**Train-test datasets**

In [48]:
data = pd.read_csv('../data/retail_train_sample.csv')

data.columns = [col.lower() for col in data.columns]
data.rename(columns={'household_key': 'user_id',
                    'product_id': 'item_id'},
           inplace=True)

# подсчет цены 
data['price'] = data['sales_value'] / (np.maximum(data['quantity'], 1))

max_svl = data['sales_value'].max()
min_svl = data['sales_value'].min()
delta = max_svl - min_svl

std_svl = data['sales_value'].std()
mean_svl = data['sales_value'].mean()
print('std_svl ', std_svl, mean_svl)


data['log_q'] = np.log(data['sales_value']+1)
data['norm_svalue'] = (data['sales_value'] - min_svl)*1.0/delta
data['std_svalue'] = (data['sales_value'] - mean_svl)*1.0/std_svl
# выборосим не нужные колонки
data = data[data.columns[1:]]

print(data.shape)


test_size_weeks = 6

data_train = data[data['week_no'] < data['week_no'].max() - test_size_weeks]
data_val = data[data['week_no'] >= data['week_no'].max() - test_size_weeks]

print(data_train.shape)
data_train.head(2)

std_svl  4.342602083127377 3.092901482999706
(227849, 16)
(207776, 16)


Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc,price,log_q,norm_svalue,std_svalue
0,1078,35573861879,524,1082185,1,0.56,375,0.0,1440,76,0.0,0.0,0.56,0.444686,0.000886,-0.583268
1,324,29170411703,165,7168774,2,6.98,367,0.0,1115,24,0.0,0.0,3.49,2.076938,0.011048,0.895108


**Проверочный датасет**

In [49]:
d_test = pd.read_csv('../data/retail_test1.csv')
d_test['price'] = d_test['sales_value'] / (np.maximum(d_test['quantity'], 1))
d_test.head(3)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc,price
0,1340,41652823310,664,912987,1,8.49,446,0.0,52,96,0.0,0.0,8.49
1,588,41652838477,664,1024426,1,6.29,388,0.0,8,96,0.0,0.0,6.29
2,2070,41652857291,664,995242,5,9.1,311,-0.6,46,96,0.0,0.0,1.82


**Продуктовый датасет**

In [50]:
item_features = pd.read_csv('../data/product.csv')
item_features.columns = [col.lower() for col in item_features.columns]
item_features.rename(columns={'product_id': 'item_id'}, inplace=True)

item_features.head(2)

Unnamed: 0,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,


**Датасет цен на items**

In [51]:
items_price = data.groupby('item_id')['price'].mean().reset_index()
items_price.columns = ['item_id', 'price']
items_price.head(2)

Unnamed: 0,item_id,price
0,26081,0.99
1,26540,0.97


In [52]:
# количество подкатегорий товаров
item_features.commodity_desc.nunique()

308

In [53]:
def calc_model(data_train, data_test, items_price, item_features=None, take_n_popular=2600, **kwargs):
    
    print(kwargs)
    
    N = kwargs.get('N_rec', 5)
    print('filter')
    data = prefilter_items(data_train, take_n_popular, item_features)
    print('recommender')
    recommender = rcm.MainRecommender(data,
                                      kwargs.get('weighting', True), 
                                      kwargs.get('n_factors', 100), 
                                      kwargs.get('regularization', 0.001), 
                                      kwargs.get('iterations', 15), 
                                      num_threads=0 
                                     )
    
#     print(recommender.itemid_to_id)
        
    # датасет для просчета метрик
    dd = data_test.groupby('user_id')
    result = dd['item_id'].unique().reset_index()
    result.columns=['user_id', 'actual']
    # добавим цены на реальные покупки
    # result['actual_price'] = dd['price'].unique().reset_index()['price']
    
    print('result shape =', result.shape)
    
    # разбиваем на новых и старых пользователей
    new_users = np.array(list(set(result['user_id'].unique())-set(data['user_id'].unique())))
    r_new_usr = result.loc[result['user_id'].isin(new_users)]
    r_als = result.loc[~result['user_id'].isin(new_users)]
    
    # получаем рекомендации для новых пользователей
    top_n = recommender.overall_top_purchases[:N]
    sim_top_n = [recommender._get_similar_item(item) for item in top_n]
    r_new_usr['r_top'] = r_new_usr['user_id'].apply(lambda x: top_n)
    r_new_usr['recomendation'] = r_new_usr['r_top'] 
    r_new_usr['r_own'] = r_new_usr['user_id'].apply(lambda x: sim_top_n)

    # получаем рекомендации для старых пользователей
    r_als['recomendation'] = r_als['user_id'].apply(lambda x: recommender.get_als_recommendations(x, N))
    r_als['r_own'] = r_als['user_id'].apply(lambda x: recommender.get_own_recommendations(x, N))
#     r_als['r_sim_i'] = r_als['user_id'].apply(lambda x: recommender.get_similar_items_recommendation(x, N))
#     r_als['r_sim_u'] = r_als['user_id'].apply(lambda x: recommender.get_similar_users_recommendation(x, N))
    r_als['r_top'] = r_als['user_id'].apply(lambda x: recommender.overall_top_purchases[:N])
    
    result = r_als.append(r_new_usr)
    print('result shape =', result.shape)
    
    
    result['r_all'] = result['recomendation']+result['r_own']+result['r_top']
    result['r_all'] = result['r_all'].apply(lambda x: list(set(x)))
    print('r_all = ', len(result['r_all'].iloc[0]))
    
    
    # заполняем значение средних цен для рекомендаций
    def get_price(row):
        items = pd.Series(row['recomendation'], name='item_id')
        print(items)
        res = items_price.join(items,how='right', lsuffix='_l')
        return res
    
#     print(get_price(result.iloc[0]))
        
    result['recomendation_price'] = result['recomendation'].apply(lambda x: [items_price.loc[items_price['item_id']==i, 'price'].iloc[0] for i in x])
    
#     print('recomendation_price', result['recomendation_price'].head(2))
#     result['r_all_price'] = result['r_all'].apply(lambda x: [items_price.loc[items_price['item_id']==i, 'price'].iloc[0] for i in x])
#     print('r_all_price', len(result['r_all_price'].iloc[0]))
    
    p_at_k = result.apply(lambda row: precision_at_k(row['recomendation'], row['actual'], N), axis=1).mean()
    mp_at_k = result.apply(lambda row: money_precision_at_k(row['recomendation'], row['actual'], row['recomendation_price'], N), axis=1).mean()
    mp_at_o = result.apply(lambda row: money_precision_at_k(row['r_own'], row['actual'], row['recomendation_price'], N), axis=1).mean()
#     mp_at_i = r_als.apply(lambda row: money_precision_at_k(row['r_sim_i'], row['actual'], row['recomendation_price'], N), axis=1).mean()
#     mp_at_u = r_als.apply(lambda row: money_precision_at_k(row['r_sim_u'], row['actual'], row['recomendation_price'], N), axis=1).mean()
    mp_at_t = result.apply(lambda row: money_precision_at_k(row['r_top'], row['actual'], row['recomendation_price'], N), axis=1).mean()
    
#     mp_at_a = result.apply(lambda row: money_precision_at_k(row['r_all'], row['actual'], row['r_all_price'], len(row['r_all'])), axis=1).mean()
    
    r_at_a = result.apply(lambda row: recall_at_k(row['r_all'], row['actual'], len(row['r_all'])), axis=1).mean()
    r_at_k = result.apply(lambda row: recall_at_k(row['recomendation'], row['actual'], N), axis=1).mean()
    
      
    print()
    print('***'*25)
    print('precision_at_k = ', p_at_k)
    print('money_precision_at_k = ', mp_at_k)
    print('money_precision_at_o = ', mp_at_o)
#     print('money_precision_at_i = ', mp_at_i)
#     print('money_precision_at_u = ', mp_at_u)
    print('money_precision_at_t = ', mp_at_t)
#     print('money_precision_at_a = ', mp_at_a)
    
    print()
    print('recall_at_k = ', r_at_k)
    print('recall_at_a = ', r_at_a)
    print()
    #  print('money_recall_at_k = ', mr_at_k)
    
    return mp_at_k, result
#     return r_at_k


In [54]:
# # %%time
# log_list = []

# step = 1
# start = 1
# stop = 11

# for i in range(start, stop, step):
#     print(i)
#     fi = float(i)/1000
#     res = calc_model(data_train, 
#                      data_val, 
#                      items_price, 
#                      item_features, 
#                      take_n_popular=7650, 
#                      n_factors=250, 
#                      iterations = 126, 
#                      regularization=fi, 
#                      N_rec = 5,
#                     )
#     log_list.append(res)
    
# _idx = np.argmax(log_list)
# res = start + _idx*step

# print()
# print(max(log_list))
# print(log_list[_idx])
# print(res)


In [61]:
%%time
m_at_k_val, res_vaL = calc_model(data_train, data_val, items_price, item_features, take_n_popular=7650, weighting=True, n_factors=250, iterations = 126, N_rec = 200)

{'weighting': True, 'n_factors': 250, 'iterations': 126, 'N_rec': 200}
filter
recommender
start fit
LOG = 005
fit_own_recommender


HBox(children=(FloatProgress(value=0.0, max=7651.0), HTML(value='')))


result shape = (1979, 2)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  r_new_usr['r_top'] = r_new_usr['user_id'].apply(lambda x: top_n)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  r_new_usr['recomendation'] = r_new_usr['r_top']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  r_new_usr['r_own'] = r_new_usr['user_id'].apply(lambda x: sim_top_n)
A value is trying to be

result shape = (1979, 5)
r_all =  414

***************************************************************************
precision_at_k =  0.00601566447700859
money_precision_at_k =  0.005481134659672155
money_precision_at_o =  0.006485572084927123
money_precision_at_t =  0.0065384660618601756

recall_at_k =  0.10214179151935945
recall_at_a =  0.20154050684028094

CPU times: user 5min 8s, sys: 9.88 s, total: 5min 18s
Wall time: 4min 46s


In [62]:
# %%time
# m_at_k_test, res_test = calc_model(data_train, d_test, items_price, item_features, take_n_popular=7650, weighting=True, n_factors=250, iterations = 126, N_rec = 200)

In [66]:
res_val = res_vaL

In [67]:
res_val.columns

Index(['user_id', 'actual', 'recomendation', 'r_own', 'r_top', 'r_all',
       'recomendation_price'],
      dtype='object')

#### Подготовка 2 шага

In [70]:
# реальные покупки пользователей
result_lvl_1 = res_val.loc[:, ['user_id', 'actual']]
result_lvl_1.head(2)                               

Unnamed: 0,user_id,actual
0,1,"[1098248, 1017299, 1035805, 829563, 1077430, 9..."
1,2,"[980666, 1096261, 13876377, 901062, 13190188, ..."


In [92]:
# рекомендованные кандидаты
users_lvl_2 = res_val.loc[:, ['user_id', 'recomendation']]
users_lvl_2.rename(columns={'recomendation': 'candidates'}, inplace=True)
users_lvl_2.head(2)

Unnamed: 0,user_id,candidates
0,1,"[995242, 1081177, 1082185, 1004906, 845078, 66..."
1,2,"[951590, 899624, 952163, 909714, 940947, 10205..."


In [94]:
%%time

# разворачиваем кандидатов в один общий список
s = users_lvl_2.apply(lambda x: pd.Series(x['candidates']), axis=1)
s= s.stack().reset_index(level=1, drop=True)
s.name = 'item_id'
print(s.shape)

# и джойним по индексу, предварительно дропнув исходных кандидатов
users_lvl_2 = users_lvl_2.drop('candidates', axis=1).join(s)
users_lvl_2['drop'] = 1  # фиктивная пересенная

users_lvl_2.shape

(395800,)
CPU times: user 1.36 s, sys: 18 ms, total: 1.38 s
Wall time: 1.32 s


(395800, 3)

In [98]:
# джойним кандидатов и реальные покупки и проставляем флаг таргета (угадали(1) или нет(0))

targets_lvl_2 = data_val[['user_id', 'item_id']].copy()
targets_lvl_2['target'] = 1  # тут только покупки 

targets_lvl_2 = users_lvl_2.merge(targets_lvl_2, on=['user_id', 'item_id'], how='left')

targets_lvl_2['target'].fillna(0, inplace= True)
targets_lvl_2.drop('drop', axis=1, inplace=True)

In [99]:
targets_lvl_2.head(2)

Unnamed: 0,user_id,item_id,target
0,1,995242,1.0
1,1,1081177,0.0


In [100]:
targets_lvl_2['user_id'].nunique()

1979

In [212]:
targets_lvl_2['target'].value_counts()

0.0    393419
1.0      2597
Name: target, dtype: int64

#### Фичи для 2 шага

In [103]:
item_features.head(2)

Unnamed: 0,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,


In [107]:
user_features = pd.read_csv('../data/hh_demographic.csv')
user_features.columns = [col.lower() for col in user_features.columns]
user_features.rename(columns={'household_key': 'user_id'}, inplace=True)
user_features.head(2)

Unnamed: 0,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,user_id
0,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,1
1,45-54,A,50-74K,Homeowner,2 Adults No Kids,2,None/Unknown,7


In [108]:
targets_lvl_2 = targets_lvl_2.merge(item_features, on='item_id', how='left')
targets_lvl_2 = targets_lvl_2.merge(user_features, on='user_id', how='left')

targets_lvl_2.head(2)

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc
0,1,995242,1.0,69,GROCERY,Private,FLUID MILK PRODUCTS,FLUID MILK WHITE ONLY,,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown
1,1,1081177,0.0,2,PRODUCE,National,TOMATOES,TOMATOES VINE RIPE BULK,15 LB,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown


**Фичи user_id:**
    - Средний чек
    - Средняя сумма покупки 1 товара в каждой категории (55 руб для категории молоко, 230 руб для категории мясо, ...)
    - Кол-во покупок в каждой категории
    - Частотность покупок раз/месяц
    - Долю покупок в выходные
    - Долю покупок утром/днем/вечером
    - ...

**Фичи item_id**:
    - Кол-во покупок в неделю (молоко Домик в деревне --> 172 раза/неделю)
    - Среднее кол-во покупок 1 товара в категории в неделю (категория молоко --> 56 раз/неделю)
    - (Кол-во покупок в неделю) / (Среднее кол-во покупок 1 товара в категории в неделю)
    - Цена (Можно посчитать из retil_train.csv) --> выручку за период / продажи за период (молоко Домик в деревне --> 72 руб)
    - Средняя цена товара в категории (категория молоко --> 56 руб)
    - Цена / Средняя цена товара в категории
    
**Фичи пары user_id - item_id**
    - (Средняя сумма покупки 1 товара в каждой категории (берем категорию item_id)) - (Цена item_id). {Средняя стоимость покупки юзером Вася в категории молоко = 46 руб} - {молоко Домик в деревне --> 72 руб}
    - (Кол-во покупок юзером конкретной категории в неделю) - (Среднее кол-во покупок всеми юзерами конкретной категории в неделю). {юзер Вася покупает молоко в среднем 3.3 раза/неделю} - {Среднестатистический юзер покупает молоко 1.5 неделю}
    - (Кол-во покупок юзером конкретной категории в неделю) / (Среднее кол-во покупок всеми юзерами конкретной категории в неделю)

In [185]:
X_train = targets_lvl_2.drop('target', axis=1)
y_train = targets_lvl_2['target']

cat_feats = X_train.columns[2:].tolist()
X_train[cat_feats] = X_train[cat_feats].astype('category')

cat_feats

['manufacturer',
 'department',
 'brand',
 'commodity_desc',
 'sub_commodity_desc',
 'curr_size_of_product',
 'age_desc',
 'marital_status_code',
 'income_desc',
 'homeowner_desc',
 'hh_comp_desc',
 'household_size_desc',
 'kid_category_desc']

In [186]:
# y_train = np.array(y_train).reshape(-1, 1).shape

In [187]:
train_data = lgb.Dataset(X_train, label=X_train.columns, categorical_feature=cat_feats)
train_data

<lightgbm.basic.Dataset at 0x7f593b3eb8e0>

In [237]:
%%time

model = lgb.LGBMClassifier(
    objective='binary', # lambdarank binary
#     max_depth=7, 
    n_estimators=750,
    categorical_column=cat_feats,
)


CPU times: user 37 µs, sys: 2 µs, total: 39 µs
Wall time: 44.8 µs


In [238]:
model.fit(X_train, y_train)



LGBMClassifier(boosting_type='gbdt',
               categorical_column=['manufacturer', 'department', 'brand',
                                   'commodity_desc', 'sub_commodity_desc',
                                   'curr_size_of_product', 'age_desc',
                                   'marital_status_code', 'income_desc',
                                   'homeowner_desc', 'hh_comp_desc',
                                   'household_size_desc', 'kid_category_desc'],
               class_weight=None, colsample_bytree=1.0, importance_type='split',
               learning_rate=0.1, max_depth=-1, min_child_samples=20,
               min_child_weight=0.001, min_split_gain=0.0, n_estimators=750,
               n_jobs=-1, num_leaves=31, objective='binary', random_state=None,
               reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0,
               subsample_for_bin=200000, subsample_freq=0)

In [241]:
train_preds = model.predict(X_train, raw_score=True)

In [242]:
train_preds.shape

(396016,)

In [243]:
Pred_data = X_train[['user_id','item_id']].copy()
Pred_data['prediction'] = train_preds

In [244]:
Pred_data.head(5)

Unnamed: 0,user_id,item_id,prediction
0,1,995242,0.633475
1,1,1081177,-8.645428
2,1,1082185,0.642637
3,1,1004906,-4.788213
4,1,845078,-10.690178


In [245]:
True_pred = Pred_data[Pred_data['prediction']>0]

In [246]:
True_pred.shape

(1348, 3)