# Проект рекомендательной системы для ритейла

In [1]:
%load_ext autoreload
%autoreload 2

In [268]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Функции из 1-ого вебинара
import os, sys

module_path = os.path.abspath(os.path.join(os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)
   


In [269]:
%autoreload
from src.metrics import money_precision_at_k, money_recall_at_k, precision_at_k
from src.utils import prefilter_items, postfilter_items
import src.recommenders as rcm

**Train-test datasets**

In [293]:
data = pd.read_csv('../data/retail_train_sample.csv')

data.columns = [col.lower() for col in data.columns]
data.rename(columns={'household_key': 'user_id',
                    'product_id': 'item_id'},
           inplace=True)

# подсчет цены 
data['price'] = data['sales_value'] / (np.maximum(data['quantity'], 1))
# выборосим не нужные колонки
data = data[data.columns[1:]]

print(data.shape)


test_size_weeks = 6

data_train = data[data['week_no'] < data['week_no'].max() - test_size_weeks]
data_val = data[data['week_no'] >= data['week_no'].max() - test_size_weeks]

print(data_train.shape)
data_train.head(2)

(227849, 13)
(207776, 13)


Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc,price
0,1078,35573861879,524,1082185,1,0.56,375,0.0,1440,76,0.0,0.0,0.56
1,324,29170411703,165,7168774,2,6.98,367,0.0,1115,24,0.0,0.0,3.49


**Проверочный датасет**

In [309]:
d_test = pd.read_csv('../data/retail_test1.csv')
d_test['price'] = d_test['sales_value'] / (np.maximum(d_test['quantity'], 1))
d_test.head(3)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc,price
0,1340,41652823310,664,912987,1,8.49,446,0.0,52,96,0.0,0.0,8.49
1,588,41652838477,664,1024426,1,6.29,388,0.0,8,96,0.0,0.0,6.29
2,2070,41652857291,664,995242,5,9.1,311,-0.6,46,96,0.0,0.0,1.82


**Продуктовый датасет**

In [310]:
item_features = pd.read_csv('../data/product.csv')
item_features.columns = [col.lower() for col in item_features.columns]
item_features.rename(columns={'product_id': 'item_id'}, inplace=True)

item_features.head(2)

Unnamed: 0,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,


In [296]:
# item_features.loc[item_features['commodity_desc']=='NO COMMODITY DESCRIPTION','commodity_desc'] = item_features['department']

In [311]:
# количество подкатегорий товаров
item_features.commodity_desc.nunique()

308

### Готовим модель

In [312]:
%%time
data = prefilter_items(data_train)

CPU times: user 214 ms, sys: 3.04 ms, total: 217 ms
Wall time: 216 ms


In [313]:
%%time
recommender = rcm.MainRecommender(data)

HBox(children=(FloatProgress(value=0.0, max=15.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5001.0), HTML(value='')))


CPU times: user 10.9 s, sys: 3.92 s, total: 14.8 s
Wall time: 6.73 s


In [314]:
# recommender.overall_top_purchases[:10]


In [301]:
# recommender.get_als_recommendations()

In [315]:
data_val.head(5)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc,price
48,1633,40618357715,593,953476,1,0.5,32004,-0.07,1813,85,0.0,0.0,0.5
54,1268,41260142401,635,1027102,1,1.19,31742,-0.1,1752,91,0.0,0.0,1.19
58,1240,40841021630,611,900358,2,2.67,375,-1.11,1153,88,0.0,0.0,1.335
60,386,41259157348,633,1058686,1,1.0,410,0.0,2111,91,0.0,0.0,1.0
64,2107,40788501083,607,916122,2,14.68,450,-2.94,1251,87,0.0,0.0,7.34


In [491]:
dd = d_test.groupby('user_id')
result = dd['item_id'].unique().reset_index()
result.columns=['user_id', 'actual']
# добавим цены на реальные покупки
result['actual_price'] = dd['price'].unique().reset_index()['price']

In [492]:
result['recomendation'] = None
result.head()

Unnamed: 0,user_id,actual,actual_price,recomendation
0,1,"[880007, 883616, 931136, 938004, 940947, 94726...","[2.99, 3.99, 1.5, 1.29, 1.0, 2.79, 1.14, 3.14,...",
1,2,"[820165, 820291, 826784, 826835, 829009, 85784...","[0.25, 1.99, 1.89, 2.69, 2.99, 0.56, 3.24, 0.1...",
2,3,"[827683, 908531, 989069, 1071377, 1080155, 109...","[1.99, 1.2, 2.5, 0.6, 3.0, 0.2, 0.59, 1.79, 1.5]",
3,6,"[956902, 960791, 1037863, 1119051, 1137688, 84...","[1.88, 1.34, 1.29, 0.49, 1.09, 2.49, 1.0, 3.38...",
4,7,"[847270, 855557, 859987, 863407, 895454, 90663...","[0.79, 1.67, 3.19, 4.5, 3.69, 2.79, 3.0, 0.44,...",


In [503]:
new_users = np.array(list(set(result['user_id'].unique())-set(data['user_id'].unique())))
new_users.size

21

In [504]:
r_new_usr = result[result['user_id'].isin(new_users)]
r_als = result.loc[~result['user_id'].isin(new_users)]

In [505]:
%%time
r_als['recomendation'] = r_als['user_id'].apply(lambda x: recommender.get_als_recommendations(x, N=5))

CPU times: user 15.3 s, sys: 9.4 s, total: 24.7 s
Wall time: 6.22 s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [506]:
r_als.head(2)

Unnamed: 0,user_id,actual,actual_price,recomendation
0,1,"[880007, 883616, 931136, 938004, 940947, 94726...","[2.99, 3.99, 1.5, 1.29, 1.0, 2.79, 1.14, 3.14,...","[1087895, 856942, 933913, 1082269, 9655212]"
1,2,"[820165, 820291, 826784, 826835, 829009, 85784...","[0.25, 1.99, 1.89, 2.69, 2.99, 0.56, 3.24, 0.1...","[945199, 871722, 1042910, 7410321, 12427353]"


In [507]:
rec_price_df = data.groupby('item_id')['price'].mean().reset_index()
rec_price_df.columns = ['item_id', 'price']
rec_price_df.head(2)

Unnamed: 0,item_id,price
0,42346,2.58
1,43020,2.7


In [508]:
# r_als['recomendation_price'] = list([rec_price_df[rec_price_df['item_id']==item] for item in r_als['recomendation']]

# [expr for val in collection for innerVal in val if condition]
# [rec_price_df[rec_price_df['item_id'] == i]['price'] for item in r_als['recomendation'] for i in item]
# rec_price_df.loc[rec_price_df['item_id']==item,'price']

In [509]:
%%time
# заполняем значение средних цен для рекомендаций
r_als['recomendation_price'] = r_als['recomendation'].apply(lambda x: [rec_price_df.loc[rec_price_df['item_id']==i, 'price'].iloc[0] for i in x])

CPU times: user 5.66 s, sys: 0 ns, total: 5.66 s
Wall time: 5.66 s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [510]:
r_als.head(3)

Unnamed: 0,user_id,actual,actual_price,recomendation,recomendation_price
0,1,"[880007, 883616, 931136, 938004, 940947, 94726...","[2.99, 3.99, 1.5, 1.29, 1.0, 2.79, 1.14, 3.14,...","[1087895, 856942, 933913, 1082269, 9655212]","[2.632857142857143, 2.8096666666666668, 3.9250..."
1,2,"[820165, 820291, 826784, 826835, 829009, 85784...","[0.25, 1.99, 1.89, 2.69, 2.99, 0.56, 3.24, 0.1...","[945199, 871722, 1042910, 7410321, 12427353]","[2.5, 2.9618181818181815, 2.698333333333333, 2..."
2,3,"[827683, 908531, 989069, 1071377, 1080155, 109...","[1.99, 1.2, 2.5, 0.6, 3.0, 0.2, 0.59, 1.79, 1.5]","[1089025, 958233, 985893, 907631, 845774]","[3.0453846153846156, 2.667692307692308, 2.09, ..."


## Расчет метрик

In [513]:
r_als.apply(lambda row: precision_at_k(row['recomendation'], row['actual']), axis=1).mean()

0.06105150214592275

In [514]:
# money_precision_at_k(recommended_list, bought_list, prices_recommended, k=5)
r_als.apply(lambda row: money_precision_at_k(row['recomendation'], row['actual'], row['recomendation_price']), axis=1).mean()

0.060390332920255085