In [1]:
import pandas as pd
import numpy as np

from scipy.sparse import csr_matrix

from implicit.als import AlternatingLeastSquares
from implicit.nearest_neighbours import bm25_weight, tfidf_weight

from src.metrics import recall_at_k, money_precision_at_k
from src.recommenders import MainRecommender
from src.utils import prefilter_items, postfilter_items, DataTransformer

In [2]:
data = pd.read_csv('data/retail_train.csv')
item_features = pd.read_csv('data/product.csv')
user_features = pd.read_csv('data/hh_demographic.csv')

item_features.columns = [col.lower() for col in item_features.columns]
user_features.columns = [col.lower() for col in user_features.columns]

item_features.rename(columns={'product_id': 'item_id'}, inplace=True)
user_features.rename(columns={'household_key': 'user_id'}, inplace=True)

val_lvl_1_size_weeks = 6
val_lvl_2_size_weeks = 3

data_train_lvl_1 = data[data['week_no'] < data['week_no'].max() - (val_lvl_1_size_weeks + val_lvl_2_size_weeks)]
data_val_lvl_1 = data[(data['week_no'] >= data['week_no'].max() - (val_lvl_1_size_weeks + val_lvl_2_size_weeks)) &
                      (data['week_no'] < data['week_no'].max() - (val_lvl_2_size_weeks))]

data_train_lvl_2 = data_val_lvl_1.copy()  # Для наглядности. Далее мы добавим изменения, и они будут отличаться
data_val_lvl_2 = data[data['week_no'] >= data['week_no'].max() - val_lvl_2_size_weeks]

In [3]:
prices = data.groupby('item_id')[['sales_value', 'quantity']].sum().reset_index()
prices['price'] = prices['sales_value']/prices['quantity']
prices.replace(np.inf, 0, inplace=True)
prices = dict(zip(prices['item_id'], prices['price']))

In [9]:
items_sub_comm = dict(zip(item_features['item_id'], item_features['sub_commodity_desc']))

In [10]:
data_train_lvl_1_filter = prefilter_items(data_train_lvl_1, 6000)

In [13]:
als_rec = MainRecommender(data_train_lvl_1_filter, weighting=None, n_factors=100, regularization=0.01, iterations=100, num_threads=8)



HBox(children=(FloatProgress(value=0.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=6001.0), HTML(value='')))




In [14]:
result_lvl_1 = data_val_lvl_1.groupby('user_id')['item_id'].unique().reset_index()
result_lvl_1.columns=['user_id', 'actual']

In [15]:
result_lvl_1['als_recs'] = result_lvl_1['user_id'].apply(lambda x: als_rec.get_main_model_recommendations(x, N=6000))

In [16]:
result_lvl_1 = postfilter_items(result_lvl_1, 'als_recs', items_sub_comm, prices, N=200)

In [17]:
recall_at_200_als = result_lvl_1.apply(lambda row: recall_at_k(row['postfilter_als_recs'], row['actual'], k=200), axis=1).mean()
recall_at_200_als

0.13790611389256

In [18]:
result_lvl_1.rename(columns={'postfilter_als_recs': 'recommendations'}, inplace=True)

In [19]:
trans = DataTransformer()
data_train_lvl_2 = trans.fit_transform(result_lvl_1, data_val_lvl_1, item_features, user_features, with_targets=True)

In [21]:
data_train_lvl_2.head()

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,...,mean_purchases,mean_item_purchases_per_sub_comm_desc,mean_item_purchases_per_comm_desc,price,mean_price_in_sub_comm_desc,purchases_department_diff,price/mean_price_sub_comm_desc,mean_sum_sub_comm_desc-price,mean_sum_comm_desc-price,mean_sum_department-price
0,1,5569374,0.0,1208,GROCERY,National,SOFT DRINKS,SOFT DRINKS 12/18&15PK CAN CAR,12 OZ,65+,...,3.833333,368.333333,1539.5,4.1055,3.24761,-24557.066667,1.264161,-4.1055,-0.5155,-1.836844
1,1,885290,0.0,910,GROCERY,National,BAKED BREAD/BUNS/ROLLS,MAINSTREAM WHEAT/MULTIGRAIN BR,20 OZ,65+,...,12.5,194.5,1053.166667,1.962464,1.316061,-24557.066667,1.491165,-0.272464,0.465314,0.306192
2,1,1100972,0.0,586,GROCERY,National,CRACKERS/MISC BKD FD,BUTTER SPRAY CRACKER (RITZ/CLU,16 OZ,65+,...,8.833333,83.666667,385.0,2.092,2.155,-24557.066667,0.970766,-2.092,0.193,0.176656
3,1,965766,0.0,317,GROCERY,National,CHEESE,IWS SINGLE CHEESE,12 OZ,65+,...,37.333333,139.833333,1157.0,1.744991,2.010113,-24557.066667,0.868106,0.145009,1.00358,0.523665
4,1,940947,1.0,2082,MEAT-PCKGD,National,HEAT/SERVE,ENTREES,24 OZ,65+,...,21.0,50.0,86.5,2.715891,3.206647,-1407.166667,0.846957,-0.005891,-0.005891,0.385776


In [22]:
from catboost import CatBoostClassifier

In [24]:
def get_recs_from_lvl_2_model(data, predictions):
    data_with_preds = data.copy()
    data_with_preds['predictions'] = predictions
    user_id_list = []
    recs = []
    sorted_data = data_with_preds.sort_values(['user_id', 'predictions'], ascending=False)
    for user_id in data_with_preds['user_id'].unique():
        user_id_list.append(user_id)
        recs.append(sorted_data[sorted_data['user_id']==user_id]['item_id'].tolist())
        
    result = pd.DataFrame({'user_id':user_id_list, 'catboost_recs': recs})
    return result

In [25]:
from sklearn.model_selection import GridSearchCV

In [26]:
y = data_train_lvl_2['target']
X = data_train_lvl_2.drop('target', axis=1)

In [27]:
data_train_lvl_2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 437953 entries, 0 to 437952
Data columns (total 37 columns):
 #   Column                                 Non-Null Count   Dtype  
---  ------                                 --------------   -----  
 0   user_id                                437953 non-null  int64  
 1   item_id                                437953 non-null  int64  
 2   target                                 437953 non-null  float64
 3   manufacturer                           437953 non-null  int64  
 4   department                             437953 non-null  object 
 5   brand                                  437953 non-null  object 
 6   commodity_desc                         437953 non-null  object 
 7   sub_commodity_desc                     437953 non-null  object 
 8   curr_size_of_product                   437953 non-null  object 
 9   age_desc                               437953 non-null  object 
 10  marital_status_code                    437953 non-null  

In [28]:
cat_features = ['department', 'brand', 'commodity_desc', 'sub_commodity_desc', 'curr_size_of_product',
                'age_desc', 'marital_status_code', 'income_desc', 'homeowner_desc', 'hh_comp_desc', 'household_size_desc', 
                'kid_category_desc']

In [32]:
class_1_weight = len(y[y==0])/len(y[y==1])

In [50]:
param_grid = {'n_estimators': [100, 300, 500], 'max_depth': [None, 5, 7, 9]}

In [54]:
estimator = CatBoostClassifier(n_estimators=300, max_depth=7, class_weights=[1, class_1_weight], cat_features=cat_features)

In [55]:
estimator.fit(X, y)

Learning rate set to 0.417161
0:	learn: 0.2312480	total: 441ms	remaining: 2m 11s
1:	learn: 0.1550495	total: 748ms	remaining: 1m 51s
2:	learn: 0.1250624	total: 1.08s	remaining: 1m 47s
3:	learn: 0.1138271	total: 1.38s	remaining: 1m 42s
4:	learn: 0.1074210	total: 1.66s	remaining: 1m 37s
5:	learn: 0.1045224	total: 1.96s	remaining: 1m 35s
6:	learn: 0.1017963	total: 2.27s	remaining: 1m 34s
7:	learn: 0.0998140	total: 2.56s	remaining: 1m 33s
8:	learn: 0.0986940	total: 2.85s	remaining: 1m 32s
9:	learn: 0.0972053	total: 3.13s	remaining: 1m 30s
10:	learn: 0.0958768	total: 3.43s	remaining: 1m 30s
11:	learn: 0.0955027	total: 3.72s	remaining: 1m 29s
12:	learn: 0.0946691	total: 4.02s	remaining: 1m 28s
13:	learn: 0.0940645	total: 4.33s	remaining: 1m 28s
14:	learn: 0.0932882	total: 4.62s	remaining: 1m 27s
15:	learn: 0.0923783	total: 4.92s	remaining: 1m 27s
16:	learn: 0.0918869	total: 5.22s	remaining: 1m 26s
17:	learn: 0.0909534	total: 5.52s	remaining: 1m 26s
18:	learn: 0.0905134	total: 5.8s	remaining: 

160:	learn: 0.0667046	total: 46.8s	remaining: 40.4s
161:	learn: 0.0665407	total: 47.1s	remaining: 40.1s
162:	learn: 0.0664728	total: 47.4s	remaining: 39.8s
163:	learn: 0.0664728	total: 47.7s	remaining: 39.5s
164:	learn: 0.0663388	total: 47.9s	remaining: 39.2s
165:	learn: 0.0663054	total: 48.2s	remaining: 38.9s
166:	learn: 0.0661702	total: 48.5s	remaining: 38.6s
167:	learn: 0.0660907	total: 48.8s	remaining: 38.4s
168:	learn: 0.0658913	total: 49.1s	remaining: 38.1s
169:	learn: 0.0658379	total: 49.4s	remaining: 37.8s
170:	learn: 0.0657166	total: 49.7s	remaining: 37.5s
171:	learn: 0.0655973	total: 50s	remaining: 37.2s
172:	learn: 0.0655557	total: 50.3s	remaining: 36.9s
173:	learn: 0.0654103	total: 50.6s	remaining: 36.6s
174:	learn: 0.0653094	total: 50.9s	remaining: 36.3s
175:	learn: 0.0651471	total: 51.1s	remaining: 36s
176:	learn: 0.0651143	total: 51.4s	remaining: 35.7s
177:	learn: 0.0649111	total: 51.7s	remaining: 35.4s
178:	learn: 0.0648425	total: 52s	remaining: 35.1s
179:	learn: 0.0648

<catboost.core.CatBoostClassifier at 0x23b5e4979d0>

In [56]:
result_lvl_2 = data_val_lvl_2.groupby('user_id')['item_id'].unique().reset_index()
result_lvl_2.columns=['user_id', 'actual']

In [57]:
result_lvl_2['als_recs'] = result_lvl_2['user_id'].apply(lambda x: als_rec.get_main_model_recommendations(x, N=6000))

In [58]:
result_lvl_2 = postfilter_items(result_lvl_2, 'als_recs', items_sub_comm, prices, N=200)

In [59]:
result_lvl_2.rename(columns={'postfilter_als_recs': 'recommendations'}, inplace=True)

In [60]:
data_test_lvl_2 = trans.fit_transform(result_lvl_2, data_val_lvl_2, item_features, user_features, with_targets=True)

In [61]:
y_test = data_test_lvl_2['target']
X_test = data_test_lvl_2.drop('target', axis=1)

In [62]:
preds = estimator.predict_proba(X_test)

In [63]:
from sklearn.metrics import roc_auc_score

In [64]:
roc_auc_score(y_test, preds[:, 1])

0.9908528479284291

In [65]:
catboost_recs = get_recs_from_lvl_2_model(X_test, preds[:, 1])

In [68]:
result_lvl_2 = result_lvl_2.merge(catboost_recs, on='user_id', how='left')

In [70]:
result_lvl_2 = postfilter_items(result_lvl_2, 'catboost_recs', items_sub_comm, prices, N=5)

In [71]:
result_lvl_2.head()

Unnamed: 0,user_id,actual,als_recs,recommendations,catboost_recs,postfilter_catboost_recs
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[5569374, 885290, 1100972, 965766, 1082212, 94...","[5569374, 885290, 1100972, 965766, 940947, 100...","[979707, 979707, 1005186, 1005186, 856942, 856...","[979707, 1058997, 1100972, 1002787, 9524291]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[940947, 951590, 910032, 12810393, 1133018, 11...","[940947, 951590, 910032, 12810393, 1133018, 55...","[1053690, 13842214, 841220, 8276172, 1092937, ...","[1053690, 841220, 8276172, 1042544, 874972]"
2,6,"[920308, 926804, 946489, 1006718, 1017061, 107...","[834826, 1000753, 831628, 5585510, 1051516, 85...","[834826, 1000753, 831628, 5585510, 1051516, 85...","[1024306, 1037863, 845208, 5580166, 1119051, 9...","[1024306, 960613, 871611, 834826, 1094781]"
3,7,"[840386, 889774, 898068, 909714, 929067, 95347...","[985999, 1126899, 1122358, 916122, 12810393, 5...","[985999, 1126899, 1122358, 916122, 12810393, 5...","[866211, 1003188, 1003188, 1122358, 1122358, 1...","[866211, 839346, 886703, 974156, 9245512]"
4,8,"[835098, 872137, 910439, 924610, 992977, 10412...","[823704, 933067, 965766, 1005186, 999270, 8441...","[823704, 933067, 965766, 1005186, 999270, 8441...","[1005186, 901062, 901062, 901062, 845208, 8721...","[1005186, 1101010, 869728, 871061, 9859217]"


In [72]:
result_lvl_2['recommend_prices'] = result_lvl_2['postfilter_catboost_recs'].apply(lambda x: [prices[item] for item in x])

In [74]:
result_lvl_2.head()

Unnamed: 0,user_id,actual,als_recs,recommendations,catboost_recs,postfilter_catboost_recs,recommend_prices
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[5569374, 885290, 1100972, 965766, 1082212, 94...","[5569374, 885290, 1100972, 965766, 940947, 100...","[979707, 979707, 1005186, 1005186, 856942, 856...","[979707, 1058997, 1100972, 1002787, 9524291]","[1.3072035323801539, 1.243247116392868, 2.4205..."
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[940947, 951590, 910032, 12810393, 1133018, 11...","[940947, 951590, 910032, 12810393, 1133018, 55...","[1053690, 13842214, 841220, 8276172, 1092937, ...","[1053690, 841220, 8276172, 1042544, 874972]","[1.1769917012448297, 2.8747356828193804, 2.732..."
2,6,"[920308, 926804, 946489, 1006718, 1017061, 107...","[834826, 1000753, 831628, 5585510, 1051516, 85...","[834826, 1000753, 831628, 5585510, 1051516, 85...","[1024306, 1037863, 845208, 5580166, 1119051, 9...","[1024306, 960613, 871611, 834826, 1094781]","[1.92120746073298, 2.90814569536424, 3.5676767..."
3,7,"[840386, 889774, 898068, 909714, 929067, 95347...","[985999, 1126899, 1122358, 916122, 12810393, 5...","[985999, 1126899, 1122358, 916122, 12810393, 5...","[866211, 1003188, 1003188, 1122358, 1122358, 1...","[866211, 839346, 886703, 974156, 9245512]","[3.3717496229260933, 1.7782242990654193, 1.123..."
4,8,"[835098, 872137, 910439, 924610, 992977, 10412...","[823704, 933067, 965766, 1005186, 999270, 8441...","[823704, 933067, 965766, 1005186, 999270, 8441...","[1005186, 901062, 901062, 901062, 845208, 8721...","[1005186, 1101010, 869728, 871061, 9859217]","[2.4483079769736764, 2.7163636363636345, 1.796..."


In [75]:
money_precision_5 = result_lvl_2.apply(lambda row: money_precision_at_k(row['postfilter_catboost_recs'], 
                                                                        row['actual'], row['recommend_prices']), axis=1).mean()

In [76]:
money_precision_5

0.22067770995198296

In [78]:
import pickle

In [79]:
with open('Saved_Models/model_lvl_1.pickle', 'wb') as f:
    pickle.dump(als_rec, f)

In [80]:
with open('Saved_Models/model_lvl_2.pickle', 'wb') as f:
    pickle.dump(estimator, f)