# Import libs

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from scipy.sparse import csr_matrix

from implicit import als

from lightgbm import LGBMClassifier

import os, sys
module_path = os.path.abspath(os.path.join(os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)

#homebrew
from metrics import precision_at_k, recall_at_k
from utils import prefilter_items
from recommenders import MainRecommender

## Read data

In [2]:
data = pd.read_csv('retail_train.csv')
item_features = pd.read_csv('product.csv')
user_features = pd.read_csv('hh_demographic.csv')

# Set global const

In [4]:
ITEM_COL = 'item_id'
USER_COL = 'user_id'
ACTUAL_COL = 'actual'

# N = Neighbors
N_PREDICT = 50 

# Process features dataset

In [5]:
# column processing
item_features.columns = [col.lower() for col in item_features.columns]
user_features.columns = [col.lower() for col in user_features.columns]

item_features.rename(columns={'product_id': ITEM_COL}, inplace=True)
user_features.rename(columns={'household_key': USER_COL }, inplace=True)

# Split dataset for train, eval, test

In [6]:
VAL_MATCHER_WEEKS = 6
VAL_RANKER_WEEKS = 3

In [7]:
# train data
data_train_matcher = data[data['week_no'] < data['week_no'].max() - (VAL_MATCHER_WEEKS + VAL_RANKER_WEEKS)]

# val data
data_val_matcher = data[(data['week_no'] >= data['week_no'].max() - (VAL_MATCHER_WEEKS + VAL_RANKER_WEEKS)) &
                      (data['week_no'] < data['week_no'].max() - (VAL_RANKER_WEEKS))]

# ranking train data
data_train_ranker = data_val_matcher.copy()

# ranking match test data
data_val_ranker = data[data['week_no'] >= data['week_no'].max() - VAL_RANKER_WEEKS]

In [8]:
df_join_train_matcher = pd.concat([data_train_matcher, data_val_matcher])

# Prefilter items

In [13]:
n_items_before = data_train_matcher['item_id'].nunique()

data_train_matcher = prefilter_items(data_train_matcher, item_features=item_features, take_n_popular=5000)

n_items_after = data_train_matcher['item_id'].nunique()
print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['price'] = data['sales_value'] / (np.maximum(data['quantity'], 1))


Decreased # items from 83685 to 5001


# Make cold-start to warm-start

In [14]:
# finding common users
common_users = list(set(data_train_matcher.user_id.values)&(set(data_val_matcher.user_id.values))&set(data_val_ranker.user_id.values))

# keeping common users
data_train_matcher = data_train_matcher[data_train_matcher.user_id.isin(common_users)]
data_val_matcher = data_val_matcher[data_val_matcher.user_id.isin(common_users)]
data_train_ranker = data_train_ranker[data_train_ranker.user_id.isin(common_users)]
data_val_ranker = data_val_ranker[data_val_ranker.user_id.isin(common_users)]

print_stats_data(data_train_matcher,'train_matcher')
print_stats_data(data_val_matcher,'val_matcher')
print_stats_data(data_train_ranker,'train_ranker')
print_stats_data(data_val_ranker,'val_ranker')

train_matcher
Shape: (784420, 13) Users: 1915 Items: 4999
val_matcher
Shape: (163261, 12) Users: 1915 Items: 27118
train_ranker
Shape: (163261, 12) Users: 1915 Items: 27118
val_ranker
Shape: (115989, 12) Users: 1915 Items: 24042


# Init/train recommender

In [15]:
recommender = MainRecommender(data_train_matcher)



  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/4999 [00:00<?, ?it/s]

# Eval recall of matching

In [21]:
result_eval_matcher = data_val_matcher.groupby(USER_COL)[ITEM_COL].unique().reset_index()
result_eval_matcher.columns=[USER_COL, ACTUAL_COL]
result_eval_matcher.head(2)

Unnamed: 0,user_id,actual
0,1,"[853529, 865456, 867607, 872137, 874905, 87524..."
1,6,"[1024306, 1102949, 6548453, 835394, 940804, 96..."


In [22]:
%%time
result_eval_matcher['own_rec'] = result_eval_matcher[USER_COL].apply(lambda x: recommender.get_own_recommendations(x, N=N_PREDICT))
result_eval_matcher['sim_item_rec'] = result_eval_matcher[USER_COL].apply(lambda x: recommender.get_similar_items_recommendation(x, N=N_PREDICT))
result_eval_matcher['als_rec'] = result_eval_matcher[USER_COL].apply(lambda x: recommender.get_als_recommendations(x, N=N_PREDICT))

Wall time: 22.9 s


In [24]:
def evalRecall(df_result, target_col_name, recommend_model):
    result_col_name = 'result'
    df_result[result_col_name] = df_result[target_col_name].apply(lambda x: recommend_model(x, N=25))
    return df_result.apply(lambda row: recall_at_k(row[result_col_name], row[ACTUAL_COL], k=N_PREDICT), axis=1).mean()

In [26]:
def calc_recall(df_data, top_k):
    for col_name in df_data.columns[2:]:
        yield col_name, df_data.apply(lambda row: recall_at_k(row[col_name], row[ACTUAL_COL], k=top_k), axis=1).mean()

In [27]:
def calc_precision(df_data, top_k):
    for col_name in df_data.columns[2:]:
        yield col_name, df_data.apply(lambda row: precision_at_k(row[col_name], row[ACTUAL_COL], k=top_k), axis=1).mean()

### Precision@5 of matching

In [30]:
TOPK_PRECISION = 5

In [31]:
sorted(calc_precision(result_eval_matcher, TOPK_PRECISION), key=lambda x: x[1],reverse=True)

[('own_rec', 0.18872062663185182),
 ('als_rec', 0.12793733681462036),
 ('sim_item_rec', 0.06527415143603181)]

# Ranking part

In [33]:
df_match_candidates = pd.DataFrame(data_train_ranker[USER_COL].unique())
df_match_candidates.columns = [USER_COL]

In [34]:
df_match_candidates['candidates'] = df_match_candidates[USER_COL].apply(lambda x: recommender.get_own_recommendations(x, N=N_PREDICT))

In [36]:
df_items = df_match_candidates.apply(lambda x: pd.Series(x['candidates']), axis=1).stack().reset_index(level=1, drop=True)
df_items.name = 'item_id'

In [37]:
df_match_candidates = df_match_candidates.drop('candidates', axis=1).join(df_items)

### Check warm start

In [39]:
print_stats_data(df_match_candidates, 'match_candidates')

match_candidates
Shape: (95750, 2) Users: 1915 Items: 4437


In [177]:
df_ranker_train = data_train_ranker[[USER_COL, ITEM_COL]].copy()
df_ranker_train['target'] = 1

df_ranker_train = df_match_candidates.merge(df_ranker_train, on=[USER_COL, ITEM_COL], how='left')

df_ranker_train['target'].fillna(0, inplace= True)

In [180]:
df_ranker_train['target'].mean()

0.11119830179378062

## Feature preparation

### Descriptive features

In [181]:
item_features.head(2)

Unnamed: 0,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,


In [182]:
user_features.head(2)

Unnamed: 0,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,user_id
0,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,1
1,45-54,A,50-74K,Homeowner,2 Adults No Kids,2,None/Unknown,7


In [183]:
df_ranker_train = df_ranker_train.merge(item_features, on='item_id', how='left')
df_ranker_train = df_ranker_train.merge(user_features, on='user_id', how='left')

df_ranker_train.head(2)

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc
0,2070,1105426,0.0,69,DELI,Private,SANDWICHES,SANDWICHES - (COLD),,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown
1,2070,1097350,0.0,2468,GROCERY,National,DOMESTIC WINE,VALUE GLASS WINE,4 LTR,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown


In [184]:
data_train_features = data_train_matcher.merge(item_features[['item_id','department']], on='item_id', how='left')

In [185]:
%%time
# Mean purchase

for user_id in data_train_features['user_id'].unique():
    data_train_features.loc[data_train_features['user_id']==user_id, 'mean_purchase_sum']=data_train_features.loc[
        data_train_features['user_id']==user_id].groupby(['day'])['sales_value'].sum().mean()

Wall time: 13.2 s


In [186]:
%%time
# Num of purchases in each category

for user_id in data_train_matcher['user_id'].unique():
    purchase_dict = dict(data_train_features.loc[data_train_features['user_id']==user_id].groupby(['department'])['quantity'].sum())
    for item_dept in list(purchase_dict.keys()):
        data_train_features.loc[(data_train_features['user_id']==user_id) & (data_train_features['department']==item_dept), 'purch_by_dept'] = purchase_dict[item_dept]

Wall time: 8min 17s


In [187]:
%%time
# Num of purchases per week

for item_id in data_train_features['item_id'].unique():
    data_train_features.loc[data_train_features['item_id']==item_id, 'mean_purchase_per_week']=data_train_features.loc[
        data_train_features['item_id']==item_id].groupby(['week_no'])['quantity'].sum().mean()

Wall time: 35.5 s


In [188]:
%%time
# Num of purchases in one category per week

for department in data_train_features['department'].unique():
    mean_sum = data_train_features.loc[data_train_features['department']==department].groupby(['week_no'])['quantity'].sum().mean()
    num_of_items = data_train_features.loc[data_train_features['department']==department, 'item_id'].nunique()
    if num_of_items > 0:
        data_train_features.loc[data_train_features['department']==department, 'mean_of_items']= mean_sum / num_of_items

Wall time: 1.41 s


In [189]:
%%time
# Mean num of purchases in one category per user per week

for department in data_train_features['department'].unique():
    mean_sum = data_train_features.loc[data_train_features['department']==department].groupby(['week_no'])['quantity'].sum().mean()
    num_of_users = data_train_features.loc[data_train_features['department']==department, 'user_id'].nunique()
    if num_of_users > 0:
        data_train_features.loc[data_train_features['department']==department, 'mean_of_users']= mean_sum / num_of_users

Wall time: 1.43 s


In [190]:
%%time
# (Num of purchases in one category per week by user) / (Mean num of purchases in one category per user per week)

data_train_features['dept_mean'] = data_train_features['purch_by_dept']/data_train_features['mean_of_users']

Wall time: 4 ms


In [191]:
data_train_added_features = data_train_features[
    ['user_id','item_id','mean_purchase_sum', 
     'purch_by_dept','mean_purchase_per_week',
     'mean_of_items','mean_of_users','dept_mean']]

In [192]:
data_train_added_features

Unnamed: 0,user_id,item_id,mean_purchase_sum,purch_by_dept,mean_purchase_per_week,mean_of_items,mean_of_users,dept_mean
0,2375,1085983,27.839038,99.0,3.973333,1.273458,2.161061,45.810836
1,1364,999999,13.881122,,3234.305882,,,
2,1364,999999,13.881122,,3234.305882,,,
3,1364,999999,13.881122,,3234.305882,,,
4,1364,937406,13.881122,21.0,1.815789,1.547085,0.383733,54.725587
...,...,...,...,...,...,...,...,...
784415,856,999999,20.200818,,3234.305882,,,
784416,856,937454,20.200818,238.0,1.552632,1.273458,2.161061,110.131102
784417,856,999999,20.200818,,3234.305882,,,
784418,856,1132814,20.200818,238.0,1.622222,1.273458,2.161061,110.131102


### Behavioural features

In [193]:
df_join_train_matcher.head()

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0
2,2375,26984851472,1,1036325,1,0.99,364,-0.3,1631,1,0.0,0.0
3,2375,26984851472,1,1082185,1,1.21,364,0.0,1631,1,0.0,0.0
4,2375,26984851472,1,8160430,1,1.5,364,-0.39,1631,1,0.0,0.0


In [194]:
df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=ITEM_COL).agg('sales_value').sum().rename('total_item_sales_value'), how='left',on=ITEM_COL)

df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=ITEM_COL).agg('quantity').sum().rename('total_quantity_value'), how='left',on=ITEM_COL)

df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=ITEM_COL).agg(USER_COL).count().rename('item_freq'), how='left',on=ITEM_COL)

df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=USER_COL).agg(USER_COL).count().rename('user_freq'), how='left',on=USER_COL)

df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=USER_COL).agg('sales_value').sum().rename('total_user_sales_value'), how='left',on=USER_COL)

df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=ITEM_COL).agg('quantity').sum().rename('item_quantity_per_week')/df_join_train_matcher.week_no.nunique(), how='left',on=ITEM_COL)

df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=USER_COL).agg('quantity').sum().rename('user_quantity_per_week')/df_join_train_matcher.week_no.nunique(), how='left',on=USER_COL)


df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=ITEM_COL).agg('quantity').sum().rename('item_quantity_per_basket')/df_join_train_matcher.basket_id.nunique(), how='left',on=ITEM_COL)

df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=USER_COL).agg('quantity').sum().rename('user_quantity_per_baskter')/df_join_train_matcher.basket_id.nunique(), how='left',on=USER_COL)


df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=ITEM_COL).agg(USER_COL).count().rename('item_freq_per_basket')/df_join_train_matcher.basket_id.nunique(), how='left',on=ITEM_COL)

df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=USER_COL).agg(USER_COL).count().rename('user_freq_per_basket')/df_join_train_matcher.basket_id.nunique(), how='left',on=USER_COL)


In [196]:
df_ranker_train = df_ranker_train.merge(data_train_added_features, on=[USER_COL, ITEM_COL], how='left')

In [197]:
df_ranker_train.drop_duplicates(subset = ['user_id', 'item_id'], inplace=True)
df_ranker_train.fillna('', inplace= True)

In [199]:
X_train = df_ranker_train.drop('target', axis=1)
y_train = df_ranker_train[['target']]

In [200]:
cat_feats = X_train.columns[2:15].tolist()
X_train[cat_feats] = X_train[cat_feats].astype('category')

cat_feats

['manufacturer',
 'department',
 'brand',
 'commodity_desc',
 'sub_commodity_desc',
 'curr_size_of_product',
 'age_desc',
 'marital_status_code',
 'income_desc',
 'homeowner_desc',
 'hh_comp_desc',
 'household_size_desc',
 'kid_category_desc']

## Train ranking

In [162]:
from catboost import CatBoostClassifier

In [205]:
#lgb = LGBMClassifier(objective='binary',
#                     max_depth=10,
#                     n_estimators=500,
#                     learning_rate=0.05,
#                     categorical_column=cat_feats)
#
#lgb.fit(X_train, y_train)
#
#train_preds = lgb.predict_proba(X_train)

cbc = CatBoostClassifier(n_estimators = 500,
                        depth=10,
                        learning_rate=0.05,
                        loss_function='Logloss',
                        cat_features = cat_feats,
                        verbose=True)

cbc.fit(X_train, y_train)

train_preds = cbc.predict_proba(X_train)

0:	learn: 0.6286501	total: 358ms	remaining: 2m 58s
1:	learn: 0.5774330	total: 533ms	remaining: 2m 12s
2:	learn: 0.5335738	total: 724ms	remaining: 1m 59s
3:	learn: 0.4976081	total: 871ms	remaining: 1m 48s
4:	learn: 0.4660109	total: 961ms	remaining: 1m 35s
5:	learn: 0.4397717	total: 985ms	remaining: 1m 21s
6:	learn: 0.4134231	total: 1.09s	remaining: 1m 16s
7:	learn: 0.3919124	total: 1.27s	remaining: 1m 18s
8:	learn: 0.3761585	total: 1.28s	remaining: 1m 9s
9:	learn: 0.3598835	total: 1.37s	remaining: 1m 7s
10:	learn: 0.3447920	total: 1.54s	remaining: 1m 8s
11:	learn: 0.3325154	total: 1.63s	remaining: 1m 6s
12:	learn: 0.3212430	total: 1.79s	remaining: 1m 7s
13:	learn: 0.3117255	total: 1.92s	remaining: 1m 6s
14:	learn: 0.3048763	total: 1.94s	remaining: 1m 2s
15:	learn: 0.2977193	total: 1.96s	remaining: 59.3s
16:	learn: 0.2921028	total: 2.01s	remaining: 57.2s
17:	learn: 0.2856237	total: 2.18s	remaining: 58.5s
18:	learn: 0.2815560	total: 2.2s	remaining: 55.6s
19:	learn: 0.2773764	total: 2.26s	

162:	learn: 0.2106805	total: 27.6s	remaining: 57s
163:	learn: 0.2105554	total: 27.8s	remaining: 56.9s
164:	learn: 0.2104291	total: 27.9s	remaining: 56.7s
165:	learn: 0.2103514	total: 28.1s	remaining: 56.6s
166:	learn: 0.2101221	total: 28.3s	remaining: 56.4s
167:	learn: 0.2099471	total: 28.5s	remaining: 56.3s
168:	learn: 0.2098586	total: 28.7s	remaining: 56.2s
169:	learn: 0.2096588	total: 28.9s	remaining: 56s
170:	learn: 0.2095839	total: 29.1s	remaining: 55.9s
171:	learn: 0.2095604	total: 29.3s	remaining: 55.8s
172:	learn: 0.2094432	total: 29.5s	remaining: 55.7s
173:	learn: 0.2093563	total: 29.7s	remaining: 55.6s
174:	learn: 0.2092831	total: 29.9s	remaining: 55.5s
175:	learn: 0.2092828	total: 29.9s	remaining: 55s
176:	learn: 0.2091872	total: 30.1s	remaining: 54.8s
177:	learn: 0.2090989	total: 30.2s	remaining: 54.7s
178:	learn: 0.2089893	total: 30.4s	remaining: 54.6s
179:	learn: 0.2088249	total: 30.6s	remaining: 54.4s
180:	learn: 0.2087760	total: 30.8s	remaining: 54.3s
181:	learn: 0.2087

321:	learn: 0.1878301	total: 57.1s	remaining: 31.6s
322:	learn: 0.1877616	total: 57.3s	remaining: 31.4s
323:	learn: 0.1875198	total: 57.5s	remaining: 31.3s
324:	learn: 0.1873543	total: 57.7s	remaining: 31.1s
325:	learn: 0.1871554	total: 57.9s	remaining: 30.9s
326:	learn: 0.1871063	total: 58.1s	remaining: 30.8s
327:	learn: 0.1869133	total: 58.3s	remaining: 30.6s
328:	learn: 0.1868460	total: 58.5s	remaining: 30.4s
329:	learn: 0.1867275	total: 58.7s	remaining: 30.3s
330:	learn: 0.1865882	total: 59s	remaining: 30.1s
331:	learn: 0.1863870	total: 59.2s	remaining: 30s
332:	learn: 0.1861793	total: 59.4s	remaining: 29.8s
333:	learn: 0.1860188	total: 59.6s	remaining: 29.6s
334:	learn: 0.1858473	total: 59.8s	remaining: 29.4s
335:	learn: 0.1857341	total: 60s	remaining: 29.3s
336:	learn: 0.1856624	total: 1m	remaining: 29.1s
337:	learn: 0.1854729	total: 1m	remaining: 28.9s
338:	learn: 0.1853416	total: 1m	remaining: 28.7s
339:	learn: 0.1851534	total: 1m	remaining: 28.6s
340:	learn: 0.1850659	total: 1

478:	learn: 0.1690822	total: 1m 28s	remaining: 3.88s
479:	learn: 0.1690248	total: 1m 28s	remaining: 3.69s
480:	learn: 0.1689253	total: 1m 28s	remaining: 3.51s
481:	learn: 0.1687784	total: 1m 28s	remaining: 3.32s
482:	learn: 0.1686252	total: 1m 29s	remaining: 3.14s
483:	learn: 0.1685603	total: 1m 29s	remaining: 2.95s
484:	learn: 0.1684474	total: 1m 29s	remaining: 2.77s
485:	learn: 0.1682823	total: 1m 29s	remaining: 2.59s
486:	learn: 0.1680741	total: 1m 29s	remaining: 2.4s
487:	learn: 0.1679022	total: 1m 30s	remaining: 2.22s
488:	learn: 0.1678162	total: 1m 30s	remaining: 2.03s
489:	learn: 0.1677596	total: 1m 30s	remaining: 1.85s
490:	learn: 0.1676731	total: 1m 30s	remaining: 1.66s
491:	learn: 0.1676013	total: 1m 30s	remaining: 1.48s
492:	learn: 0.1674951	total: 1m 31s	remaining: 1.29s
493:	learn: 0.1673444	total: 1m 31s	remaining: 1.11s
494:	learn: 0.1672894	total: 1m 31s	remaining: 924ms
495:	learn: 0.1671547	total: 1m 31s	remaining: 740ms
496:	learn: 0.1670024	total: 1m 31s	remaining: 

In [206]:
df_ranker_predict = df_ranker_train.copy()

In [207]:
df_ranker_predict['proba_item_purchase'] = train_preds[:,1]

In [208]:
df_ranker_predict

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,...,user_quantity_per_baskter,item_freq_per_basket,user_freq_per_basket,mean_purchase_sum,purch_by_dept,mean_purchase_per_week,mean_of_items,mean_of_users,dept_mean,proba_item_purchase
0,2070,1105426,0.0,69,DELI,Private,SANDWICHES,SANDWICHES - (COLD),,45-54,...,0.452137,0.000404,0.008140,28.909231,14.0,1.980392,1.879319,0.218927,63.94814,0.057511
7,2070,1097350,0.0,2468,GROCERY,National,DOMESTIC WINE,VALUE GLASS WINE,4 LTR,45-54,...,0.452137,0.000208,0.008140,,,,,,,0.002148
8,2070,879194,0.0,69,DRUG GM,Private,DIAPERS & DISPOSABLES,BABY DIAPERS,14 CT,45-54,...,0.452137,0.000188,0.008140,28.909231,33.0,1.636364,1.064347,0.232614,141.866039,0.084969
10,2070,948640,0.0,1213,DRUG GM,National,ORAL HYGIENE PRODUCTS,WHITENING SYSTEMS,3 OZ,45-54,...,0.452137,0.000179,0.008140,,,,,,,0.001569
11,2070,928263,0.0,69,DRUG GM,Private,DIAPERS & DISPOSABLES,BABY DIAPERS,13 CT,45-54,...,0.452137,0.000216,0.008140,28.909231,33.0,1.92,1.064347,0.232614,141.866039,0.128288
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
282770,1745,948832,0.0,1719,MEAT-PCKGD,National,HOT DOGS,BETTER FOR YOU,1 LB,45-54,...,0.004902,0.001028,0.003658,14.266381,75.0,3.323077,1.547085,0.383733,195.448524,0.133353
282772,1745,903454,0.0,1216,MEAT-PCKGD,National,FROZEN MEAT,OTHER - FULLY COOKED,32 OZ,45-54,...,0.004902,0.000196,0.003658,14.266381,75.0,2.190476,1.547085,0.383733,195.448524,0.038067
282773,1745,9419888,0.0,759,GROCERY,National,YOGURT,YOGURT MULTI-PACKS,48 OZ,45-54,...,0.004902,0.000355,0.003658,14.266381,157.0,1.54,1.273458,2.161061,72.649508,0.066815
282774,1745,880469,0.0,544,GROCERY,National,BAG SNACKS,CORN CHIPS,10 OZ,45-54,...,0.004902,0.000701,0.003658,14.266381,157.0,1.964286,1.273458,2.161061,72.649508,0.076862


# Evaluation on test dataset

In [209]:
result_eval_ranker = data_val_ranker.groupby(USER_COL)[ITEM_COL].unique().reset_index()
result_eval_ranker.columns=[USER_COL, ACTUAL_COL]
result_eval_ranker.head(2)

Unnamed: 0,user_id,actual
0,1,"[821867, 834484, 856942, 865456, 889248, 90795..."
1,6,"[920308, 926804, 946489, 1006718, 1017061, 107..."


## Eval matching on test dataset

In [210]:
%%time
result_eval_ranker['own_rec'] = result_eval_ranker[USER_COL].apply(lambda x: recommender.get_own_recommendations(x, N=N_PREDICT))

Wall time: 5.57 s


In [211]:
# померяем precision только модели матчинга, чтобы понимать влияение ранжирования на метрики

sorted(calc_precision(result_eval_ranker, TOPK_PRECISION), key=lambda x: x[1], reverse=True)

[('own_rec', 0.1462140992167092)]

## Eval re-ranked matched result on test dataset
    

In [212]:
def rerank(user_id):
    return df_ranker_predict[df_ranker_predict[USER_COL]==user_id].sort_values('proba_item_purchase', ascending=False).head(5).item_id.tolist()

In [213]:
result_eval_ranker['reranked_own_rec'] = result_eval_ranker[USER_COL].apply(lambda user_id: rerank(user_id))

In [214]:
print(*sorted(calc_precision(result_eval_ranker, TOPK_PRECISION), key=lambda x: x[1], reverse=True), sep='\n')

('reranked_own_rec', 0.18715404699738714)
('own_rec', 0.1462140992167092)


In [None]:
precision@5 >= 0.25

In [215]:
df_test = pd.read_csv('retail_test1.csv')
df_transactions = pd.read_csv('retail_train.csv')

In [216]:
df_test.head()

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,1340,41652823310,664,912987,1,8.49,446,0.0,52,96,0.0,0.0
1,588,41652838477,664,1024426,1,6.29,388,0.0,8,96,0.0,0.0
2,2070,41652857291,664,995242,5,9.1,311,-0.6,46,96,0.0,0.0
3,1602,41665647035,664,827939,1,7.99,334,0.0,1741,96,0.0,0.0
4,1602,41665647035,664,927712,1,0.59,334,-0.4,1741,96,0.0,0.0


In [217]:
result_test = df_test.groupby(USER_COL)[ITEM_COL].unique().reset_index()
result_test.columns=[USER_COL, ACTUAL_COL]
result_test.head(2)

Unnamed: 0,user_id,actual
0,1,"[880007, 883616, 931136, 938004, 940947, 94726..."
1,2,"[820165, 820291, 826784, 826835, 829009, 85784..."


In [218]:
result_test = result_test[result_test.user_id.isin(common_users)]

In [219]:
%%time
result_test['own_rec'] = result_test[USER_COL].apply(lambda x: recommender.get_own_recommendations(x, N=N_PREDICT))
result_test['reranked_own_rec'] = result_test[USER_COL].apply(lambda user_id: rerank(user_id))

print(*sorted(calc_precision(result_test, TOPK_PRECISION), key=lambda x: x[1], reverse=True), sep='\n')

('reranked_own_rec', 0.1601924233313279)
('own_rec', 0.12928442573661988)
Wall time: 5.92 s
