In [1]:
import pandas as pd
import numpy as np

from scipy.sparse import csr_matrix

from implicit.als import AlternatingLeastSquares
from implicit.nearest_neighbours import bm25_weight, tfidf_weight

from src.metrics import recall_at_k, money_precision_at_k

In [2]:
# Raw inputs: transactions, product catalogue and household demographics.
data = pd.read_csv('data/retail_train.csv')
item_features = pd.read_csv('data/product.csv')
user_features = pd.read_csv('data/hh_demographic.csv')

# Lower-case the column names and align the key columns with the transactions table.
item_features.columns = item_features.columns.str.lower()
user_features.columns = user_features.columns.str.lower()
item_features = item_features.rename(columns={'product_id': 'item_id'})
user_features = user_features.rename(columns={'household_key': 'user_id'})

val_lvl_1_size_weeks = 6
val_lvl_2_size_weeks = 3

# Time-based split: everything before `lvl_1_first_week` trains the first-level
# model, [lvl_1_first_week, lvl_2_first_week) validates it, the rest validates level 2.
last_week_no = data['week_no'].max()
lvl_1_first_week = last_week_no - (val_lvl_1_size_weeks + val_lvl_2_size_weeks)
lvl_2_first_week = last_week_no - (val_lvl_2_size_weeks)

data_train_lvl_1 = data[data['week_no'] < lvl_1_first_week]
data_val_lvl_1 = data[(data['week_no'] >= lvl_1_first_week) & (data['week_no'] < lvl_2_first_week)]

# Kept as a separate copy for clarity: later changes make it diverge from data_val_lvl_1.
data_train_lvl_2 = data_val_lvl_1.copy()
data_val_lvl_2 = data[data['week_no'] >= lvl_2_first_week]

In [3]:
from src.recommenders import MainRecommender
from src.utils import prefilter_items, postfilter_items

In [4]:
data_train_lvl_1 = prefilter_items(data_train_lvl_1)

In [5]:
recommender = MainRecommender(data_train_lvl_1, n_factors=30, iterations=30, num_threads=8)



HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5001.0), HTML(value='')))




In [6]:
# One row per user with the array of distinct items they bought in the
# first-level validation window (the ground truth for the metrics below).
result_lvl_1 = (
    data_val_lvl_1
    .groupby('user_id')['item_id']
    .unique()
    .rename('actual')
    .reset_index()
)

In [7]:
result_lvl_1['als_recs'] = result_lvl_1['user_id'].apply(lambda x: recommender.get_main_model_recommendations(x, N=200))

In [8]:
# Average unit price per item over the training window: total revenue / total quantity.
prices = (
    data_train_lvl_1
    .groupby('item_id')[['sales_value', 'quantity']]
    .sum()
    .reset_index()
    .assign(price=lambda totals: totals['sales_value'] / totals['quantity'])
)

In [9]:
prices.head()

Unnamed: 0,item_id,sales_value,quantity,price
0,818981,298.86,104,2.873654
1,819063,517.49,320,1.617156
2,819112,148.3,90,1.647778
3,819255,1143.93,504,2.269702
4,819304,507.14,312,1.625449


In [10]:
prices.drop(['sales_value', 'quantity'], axis=1, inplace=True)

In [11]:
prices = dict(zip(prices['item_id'], prices['price']))

In [12]:
result_lvl_1.head()

Unnamed: 0,user_id,actual,als_recs
0,1,"[853529, 865456, 867607, 872137, 874905, 87524...","[13007846, 12949855, 1082212, 1015386, 834382,..."
1,2,"[15830248, 838136, 839656, 861272, 866211, 870...","[1021324, 9419422, 909338, 9419961, 945998, 90..."
2,4,"[883932, 970760, 1035676, 1055863, 1097610, 67...","[997796, 1037417, 1077745, 1115098, 6514160, 8..."
3,6,"[1024306, 1102949, 6548453, 835394, 940804, 96...","[823862, 1042616, 950935, 969941, 9553397, 105..."
4,7,"[836281, 843306, 845294, 914190, 920456, 93886...","[1020404, 835285, 913144, 5981267, 879280, 938..."


In [13]:
def get_prices(items):
    """Return the price of every item id in `items`, in the same order.

    Prices are read from the notebook-level `prices` mapping (item_id -> price).
    An id that is absent from that mapping raises KeyError.
    """
    looked_up = []
    for item_id in items:
        looked_up.append(prices[item_id])
    return looked_up

In [14]:
result_lvl_1['prices_recommend'] = result_lvl_1.apply(lambda row: get_prices(row['als_recs']), axis=1)

In [15]:
money_precision_5_als = result_lvl_1.apply(lambda row: money_precision_at_k(row['als_recs'], row['actual'], row['prices_recommend']), axis=1).mean()
money_precision_5_als

0.03683910319457695

In [16]:
# Train-period unit price per item joined with the catalogue attributes;
# the id 999999 is excluded from the table.
item_features_with_prices = (
    data_train_lvl_1
    .groupby('item_id')[['sales_value', 'quantity']]
    .sum()
    .reset_index()
    .assign(price=lambda totals: totals['sales_value'] / totals['quantity'])
    .drop(['sales_value', 'quantity'], axis=1)
    .merge(item_features, on='item_id', how='left')
    .loc[lambda frame: frame['item_id'] != 999999]
)

In [17]:
item_features_with_prices.loc[item_features_with_prices['price']>7]['item_id'].nunique()

125

In [18]:
result_lvl_1['als_recs'] = result_lvl_1['user_id'].apply(lambda x: recommender.get_main_model_recommendations(x, N=4876))

In [19]:
result_lvl_1 = postfilter_items(result_lvl_1, 'als_recs', item_features_with_prices)

In [20]:
result_lvl_1.head()

Unnamed: 0,user_id,actual,als_recs,prices_recommend,postfilter_als_recs
0,1,"[853529, 865456, 867607, 872137, 874905, 87524...","[13007846, 12949855, 1082212, 1015386, 834382,...","[2.1810317460317448, 3.0885897435897456, 2.017...","[13007846, 12949855, 1066095, 1100972, 1036249]"
1,2,"[15830248, 838136, 839656, 861272, 866211, 870...","[1021324, 9419422, 909338, 9419961, 945998, 90...","[7.5499999999999705, 5.4060000000000015, 1.214...","[1021324, 9419422, 909338, 9419961, 945998]"
2,4,"[883932, 970760, 1035676, 1055863, 1097610, 67...","[997796, 1037417, 1077745, 1115098, 6514160, 8...","[3.641684587813626, 2.895266666666666, 3.23040...","[997796, 1037417, 1082627, 925178, 988736]"
3,6,"[1024306, 1102949, 6548453, 835394, 940804, 96...","[823862, 1042616, 950935, 969941, 9553397, 105...","[4.359743589743593, 2.945144230769237, 2.98254...","[823862, 1042616, 9655679, 898958, 919766]"
4,7,"[836281, 843306, 845294, 914190, 920456, 93886...","[1020404, 835285, 913144, 5981267, 879280, 938...","[4.0226086956521785, 1.8193023255813947, 4.230...","[1020404, 835285, 995151, 853643, 1085846]"


In [21]:
result_lvl_1['prices_recommend_postfilter'] = result_lvl_1.apply(lambda row: get_prices(row['postfilter_als_recs']), axis=1)

In [22]:
money_precision_5_als_postfilter = result_lvl_1.apply(lambda row: money_precision_at_k(row['postfilter_als_recs'], row['actual'],
                                                                                       row['prices_recommend_postfilter']), 
                                                                                       axis=1).mean()
money_precision_5_als_postfilter

0.02814370436069936

In [23]:
recall_at_200_als = result_lvl_1.apply(lambda row: recall_at_k(row['als_recs'], row['actual'], k=200), axis=1).mean()
recall_at_200_als

0.12364460849093736

In [24]:
from src.searcher import GridSearch

In [25]:
data_train_lvl_1 = data[data['week_no'] < data['week_no'].max() - (val_lvl_1_size_weeks + val_lvl_2_size_weeks)]

In [26]:
# Unit price per item computed over the whole transaction history
# (total revenue / total quantity), keeping only the id and the price.
prices = (
    data
    .groupby('item_id')[['sales_value', 'quantity']]
    .sum()
    .reset_index()
    .assign(price=lambda totals: totals['sales_value'] / totals['quantity'])
    .drop(['sales_value', 'quantity'], axis=1)
)
# Right join: keep every priced item, attaching catalogue attributes where they exist.
item_features_with_prices = item_features.merge(prices, on='item_id', how='right')

In [27]:
item_features_with_prices.dropna(inplace=True)

In [28]:
top_n_list = [6000, 8000, 10000]
weighting_list = ['bm25', 'tfidf', None]
param_grid = {'n_factors': [25, 50, 75], 'regularization': [0.001, 0.01], 'iterations': [20, 50, 100], 'num_threads': [8]}

In [29]:
#search = GridSearch('MainRecommender', top_n_list, weighting_list, param_grid, recall_at_k)

In [30]:
#best_score, best_params = search.fit(data_train_lvl_1, data_val_lvl_1, item_features_with_prices)

In [31]:
best_score, best_params = (0.015779534928617823,
 [6000,
  None,
  {'n_factors': 75,
   'regularization': 0.01,
   'iterations': 100,
   'num_threads': 8}])

In [32]:
prefilter_data_train = prefilter_items(data_train_lvl_1, 6000)

In [33]:
als_rec = MainRecommender(prefilter_data_train, weighting=None, n_factors=75, regularization=0.01, iterations=100, num_threads=8)

HBox(children=(FloatProgress(value=0.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=6001.0), HTML(value='')))




In [34]:
result_lvl_1.head()

Unnamed: 0,user_id,actual,als_recs,prices_recommend,postfilter_als_recs,prices_recommend_postfilter
0,1,"[853529, 865456, 867607, 872137, 874905, 87524...","[13007846, 12949855, 1082212, 1015386, 834382,...","[2.1810317460317448, 3.0885897435897456, 2.017...","[13007846, 12949855, 1066095, 1100972, 1036249]","[2.1810317460317448, 3.0885897435897456, 7.667..."
1,2,"[15830248, 838136, 839656, 861272, 866211, 870...","[1021324, 9419422, 909338, 9419961, 945998, 90...","[7.5499999999999705, 5.4060000000000015, 1.214...","[1021324, 9419422, 909338, 9419961, 945998]","[7.5499999999999705, 5.4060000000000015, 1.214..."
2,4,"[883932, 970760, 1035676, 1055863, 1097610, 67...","[997796, 1037417, 1077745, 1115098, 6514160, 8...","[3.641684587813626, 2.895266666666666, 3.23040...","[997796, 1037417, 1082627, 925178, 988736]","[3.641684587813626, 2.895266666666666, 7.67382..."
3,6,"[1024306, 1102949, 6548453, 835394, 940804, 96...","[823862, 1042616, 950935, 969941, 9553397, 105...","[4.359743589743593, 2.945144230769237, 2.98254...","[823862, 1042616, 9655679, 898958, 919766]","[4.359743589743593, 2.945144230769237, 8.33727..."
4,7,"[836281, 843306, 845294, 914190, 920456, 93886...","[1020404, 835285, 913144, 5981267, 879280, 938...","[4.0226086956521785, 1.8193023255813947, 4.230...","[1020404, 835285, 995151, 853643, 1085846]","[4.0226086956521785, 1.8193023255813947, 9.980..."


In [35]:
#%%time
#result_lvl_1['als_recs'] = result_lvl_1['user_id'].apply(lambda x: als_rec.get_main_model_recommendations(x, N=6000))

In [36]:
#result_lvl_1 = postfilter_items(result_lvl_1, 'als_recs', item_features_with_prices, N=200)
#recall_at_200_als = result_lvl_1.apply(lambda row: recall_at_k(row['postfilter_als_recs'], row['actual'], k=200), axis=1).mean()
#recall_at_200_als  #0.13577141284317992

In [37]:
from src.recommenders import BPRRecommender

In [38]:
#bpr_rec = BPRRecommender(prefilter_data_train, weighting=None, n_factors=75, regularization=0.01, iterations=100, num_threads=8)

HBox(children=(FloatProgress(value=0.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=6001.0), HTML(value='')))




In [39]:
#%%time
#result_lvl_1['bpr_recs'] = result_lvl_1['user_id'].apply(lambda x: bpr_rec.get_main_model_recommendations(x, N=6000))

Wall time: 8min 7s


In [41]:
#result_lvl_1 = postfilter_items(result_lvl_1, 'bpr_recs', item_features_with_prices, N=200)
#recall_at_200_bpr = result_lvl_1.apply(lambda row: recall_at_k(row['postfilter_bpr_recs'], row['actual'], k=200), axis=1).mean()
#recall_at_200_bpr  #0.09491826956640702

0.09491826956640702

In [44]:
top_n_list = [6000]
weighting_list = [None]
param_grid = {'n_factors': [75, 100], 'regularization': [0.005, 0.01], 'iterations': [100, 150, 200], 'num_threads': [8]}

In [49]:
als_rec_1 = MainRecommender(prefilter_data_train, weighting=None, n_factors=100, regularization=0.01, iterations=100, num_threads=8)

HBox(children=(FloatProgress(value=0.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=6001.0), HTML(value='')))




In [50]:
%%time
result_lvl_1['als_recs'] = result_lvl_1['user_id'].apply(lambda x: als_rec_1.get_main_model_recommendations(x, N=6000))

Wall time: 8min 41s


In [51]:
result_lvl_1 = postfilter_items(result_lvl_1, 'als_recs', item_features_with_prices, N=200)
recall_at_200_als = result_lvl_1.apply(lambda row: recall_at_k(row['postfilter_als_recs'], row['actual'], k=200), axis=1).mean()
recall_at_200_als

0.13983039637329248

In [52]:
item_features.head()

Unnamed: 0,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,
2,26093,69,PASTRY,Private,BREAD,BREAD:ITALIAN/FRENCH,
3,26190,69,GROCERY,Private,FRUIT - SHELF STABLE,APPLE SAUCE,50 OZ
4,26355,69,GROCERY,Private,COOKIES/CONES,SPECIALTY COOKIES,14 OZ


In [53]:
item_features['department'].nunique()

44

In [54]:
data.head()

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0
2,2375,26984851472,1,1036325,1,0.99,364,-0.3,1631,1,0.0,0.0
3,2375,26984851472,1,1082185,1,1.21,364,0.0,1631,1,0.0,0.0
4,2375,26984851472,1,8160430,1,1.5,364,-0.39,1631,1,0.0,0.0


In [60]:
item_features['department'].unique()

array(['GROCERY', 'MISC. TRANS.', 'PASTRY', 'DRUG GM', 'MEAT-PCKGD',
       'SEAFOOD-PCKGD', 'PRODUCE', 'NUTRITION', 'DELI', 'COSMETICS',
       'MEAT', 'FLORAL', 'TRAVEL & LEISUR', 'SEAFOOD', 'MISC SALES TRAN',
       'SALAD BAR', 'KIOSK-GAS', 'ELECT &PLUMBING', 'GRO BAKERY',
       'GM MERCH EXP', 'FROZEN GROCERY', 'COUP/STR & MFG', 'SPIRITS',
       'GARDEN CENTER', 'TOYS', 'CHARITABLE CONT', 'RESTAURANT', 'RX',
       'PROD-WHS SALES', 'MEAT-WHSE', 'DAIRY DELI', 'CHEF SHOPPE', 'HBC',
       'DELI/SNACK BAR', 'PORK', 'AUTOMOTIVE', 'VIDEO RENTAL', ' ',
       'CNTRL/STORE SUP', 'HOUSEWARES', 'POSTAL CENTER', 'PHOTO', 'VIDEO',
       'PHARMACY SUPPLY'], dtype=object)

In [64]:
item_user_categories = data[['user_id', 'item_id', 'quantity']].merge(item_features[['item_id', 'department']], on='item_id', how='left')
item_user_categories['department'].unique()

array(['PRODUCE', 'GROCERY', 'DRUG GM', 'MEAT', 'MEAT-PCKGD', 'DELI',
       'SEAFOOD-PCKGD', ' ', 'PASTRY', 'NUTRITION', 'VIDEO RENTAL',
       'MISC SALES TRAN', 'FLORAL', 'SEAFOOD', 'SALAD BAR', 'AUTOMOTIVE',
       'SPIRITS', 'COSMETICS', 'MISC. TRANS.', 'GARDEN CENTER',
       'CHEF SHOPPE', 'TRAVEL & LEISUR', 'COUP/STR & MFG', 'KIOSK-GAS',
       'FROZEN GROCERY', 'RESTAURANT', 'HOUSEWARES', 'PORK',
       'POSTAL CENTER', 'GM MERCH EXP', 'CNTRL/STORE SUP',
       'PROD-WHS SALES', 'DAIRY DELI', 'HBC', 'CHARITABLE CONT', 'RX',
       'TOYS', 'PHOTO', 'DELI/SNACK BAR', 'GRO BAKERY', 'PHARMACY SUPPLY',
       'ELECT &PLUMBING', 'MEAT-WHSE', 'VIDEO'], dtype=object)

In [106]:
k = pd.concat([data.loc[data['day']%6==0], data.loc[data['day']%7==0]])
k.head()

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
1755,1096,27031378941,6,913785,1,0.99,321,-1.0,40,2,0.0,0.0
1756,1096,27031378941,6,1082185,1,0.48,321,0.0,40,2,0.0,0.0
1757,1096,27031378941,6,1106523,1,2.32,321,0.0,40,2,0.0,0.0
1758,1096,27031378941,6,6533236,1,2.33,321,-1.66,40,2,0.0,0.0
1759,1096,27031378941,6,6773232,1,2.29,321,0.0,40,2,0.0,0.0


In [120]:
user_features.head()

Unnamed: 0,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,user_id
0,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,1
1,45-54,A,50-74K,Homeowner,2 Adults No Kids,2,None/Unknown,7
2,25-34,U,25-34K,Unknown,2 Adults Kids,3,1,8
3,25-34,U,75-99K,Homeowner,2 Adults Kids,4,2,13
4,45-54,B,50-74K,Homeowner,Single Female,1,None/Unknown,16


In [121]:
user_features['user_id'].nunique()

801

In [129]:
item_features_with_prices[item_features_with_prices['price'] >= 5]['item_id'].nunique()

18219

In [131]:
item_test = item_features.copy()

In [132]:
item_test.replace(' ', np.nan, inplace=True)

In [133]:
item_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 92353 entries, 0 to 92352
Data columns (total 7 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   item_id               92353 non-null  int64 
 1   manufacturer          92353 non-null  int64 
 2   department            92338 non-null  object
 3   brand                 92353 non-null  object
 4   commodity_desc        92338 non-null  object
 5   sub_commodity_desc    92338 non-null  object
 6   curr_size_of_product  61746 non-null  object
dtypes: int64(2), object(5)
memory usage: 4.9+ MB


In [134]:
# Assign the filled column back explicitly: an in-place fillna on a column
# selection is a chained operation that does not update `item_test` under
# pandas Copy-on-Write (and emits a FutureWarning on pandas 2.x).
item_test['curr_size_of_product'] = item_test['curr_size_of_product'].fillna('Unknown')

In [135]:
item_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 92353 entries, 0 to 92352
Data columns (total 7 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   item_id               92353 non-null  int64 
 1   manufacturer          92353 non-null  int64 
 2   department            92338 non-null  object
 3   brand                 92353 non-null  object
 4   commodity_desc        92338 non-null  object
 5   sub_commodity_desc    92338 non-null  object
 6   curr_size_of_product  92353 non-null  object
dtypes: int64(2), object(5)
memory usage: 4.9+ MB


In [160]:
result_lvl_1.head()

Unnamed: 0,user_id,actual,als_recs,prices_recommend,postfilter_als_recs,prices_recommend_postfilter,bpr_recs,postfilter_bpr_recs
0,1,"[853529, 865456, 867607, 872137, 874905, 87524...","[1100972, 5569374, 909268, 1105488, 965766, 10...","[2.1810317460317448, 3.0885897435897456, 2.017...","[1100972, 5569374, 909268, 1105488, 965766, 10...","[2.1810317460317448, 3.0885897435897456, 7.667...","[977658, 1128744, 896292, 1013895, 1115098, 95...","[977658, 1128744, 896292, 1013895, 954525, 923..."
1,2,"[15830248, 838136, 839656, 861272, 866211, 870...","[5569230, 1133018, 940947, 8090521, 1106523, 9...","[7.5499999999999705, 5.4060000000000015, 1.214...","[5569230, 1133018, 940947, 916122, 914190, 107...","[7.5499999999999705, 5.4060000000000015, 1.214...","[1068719, 5569845, 1004906, 5569230, 1042616, ...","[1068719, 5569845, 1004906, 1042616, 916122, 1..."
2,4,"[883932, 970760, 1035676, 1055863, 1097610, 67...","[1075368, 962229, 12301073, 5569230, 902172, 8...","[3.641684587813626, 2.895266666666666, 3.23040...","[1075368, 962229, 12301073, 5569230, 902172, 8...","[3.641684587813626, 2.895266666666666, 7.67382...","[902172, 1076161, 9677093, 929768, 1108994, 89...","[902172, 1076161, 9677093, 929768, 1108994, 89..."
3,6,"[1024306, 1102949, 6548453, 835394, 940804, 96...","[1000753, 1051516, 834826, 5585510, 1127179, 9...","[4.359743589743593, 2.945144230769237, 2.98254...","[1000753, 1051516, 834826, 5585510, 1127179, 9...","[4.359743589743593, 2.945144230769237, 8.33727...","[831557, 951834, 930118, 965267, 9926758, 1008...","[831557, 951834, 930118, 965267, 9926758, 1023..."
4,7,"[836281, 843306, 845294, 914190, 920456, 93886...","[1069312, 985999, 1101173, 916122, 12810393, 1...","[4.0226086956521785, 1.8193023255813947, 4.230...","[1069312, 985999, 1101173, 916122, 12810393, 1...","[4.0226086956521785, 1.8193023255813947, 9.980...","[5591154, 5981267, 844685, 1074172, 846634, 99...","[5591154, 5981267, 844685, 1074172, 846634, 87..."


In [161]:
result_lvl_1.rename(columns={'postfilter_als_recs': 'recommendations'}, inplace=True)

In [224]:
class DataTransformer:
    """Builds the dataset used to train / score the second-level model.

    The dataset is assembled from four DataFrames:
      - recommend_df: one row per user with columns 'user_id' and
        'recommendations' (a list-like of candidate item ids);
      - purchase_df: transactions ('user_id', 'basket_id', 'day', 'week_no',
        'item_id', 'quantity', 'sales_value');
      - item_features: product attributes;
      - user_features: household attributes.

    Features added by `fit_transform`:
      User features
        - mean_check: mean basket total;
        - mean_sum_purchase_{sub_comm_desc,comm_desc,department}: mean unit
          price paid by the user inside the candidate's category;
        - purchases_in_{sub_commodity_desc,commodity_desc,department}: total
          quantity bought by the user inside the candidate's category;
        - frequency: mean number of distinct baskets per 30-day month;
        - weekend_purchases_frac: weekend quantity / total quantity;
        - mean_sum_purchases_per_month: mean spend per 30-day month;
        - mean_quantity_per_basket: mean total quantity per basket.
      Item features
        - mean_purchases: mean weekly quantity sold of the item;
        - mean_item_purchases_per_{sub_comm_desc,comm_desc}: mean weekly
          quantity sold in the item's category;
        - price: mean unit price of the item;
        - mean_price_in_sub_comm_desc: mean unit price in the sub-commodity;
        - price/mean_price_sub_comm_desc: ratio of the two above.
      User-item features
        - mean_sum_{sub_comm_desc,comm_desc,department}-price: the user's mean
          unit price in the category minus the item's price;
        - purchases_department_diff: the user's mean weekly quantity in the
          department minus the all-user mean weekly quantity there.

    Parameters
    ----------
    weekend_remainders : iterable of int, default (6, 0)
        Values of ``day % 7`` that are treated as weekend days. The default
        takes the 6th and 7th day of every 7-day cycle counted from day 1.
        NOTE(review): the weekday of day 1 is not visible in this notebook
        (the sample rows show day 6 already in week_no 2) -- confirm the
        correct remainders for the dataset.
    """

    # Length of the pseudo-month (in days) used to bucket the running `day` counter.
    _DAYS_PER_MONTH = 30

    # Replacement values for missing item attributes, applied after the
    # single-space placeholder ' ' has been converted to NaN.
    _ITEM_FEATURE_FILLERS = (
        ('curr_size_of_product', 'Unknown'),
        ('department', 'GROCERY'),
        ('commodity_desc', 'BEERS/ALES'),
        ('sub_commodity_desc', 'BEERALEMALT LIQUORS'),
    )

    # Class-level default so instances created without __init__ still work.
    weekend_remainders = (6, 0)

    def __init__(self, weekend_remainders=(6, 0)):
        self.weekend_remainders = tuple(weekend_remainders)

    # ------------------------------------------------------------------ #
    # Shared private helpers
    # ------------------------------------------------------------------ #

    @staticmethod
    def _with_unit_price(df):
        """Return a copy of `df` with a 'price' column (sales_value / quantity).

        Rows whose price is not a finite number (division by a zero quantity)
        are dropped; missing values in unrelated columns do not drop a row.
        """
        price = (df['sales_value'] / df['quantity']).replace([np.inf, -np.inf], np.nan)
        return df.assign(price=price).dropna(subset=['price'])

    @staticmethod
    def _mean_of_group_sums(df, outer_keys, inner_key, value_col, feature_name):
        """Sum `value_col` per (outer_keys, inner_key), then average those sums per outer_keys."""
        outer_keys = list(outer_keys)
        sums = df.groupby(outer_keys + [inner_key])[value_col].sum().reset_index()
        means = sums.groupby(outer_keys)[value_col].mean().reset_index()
        return means.rename(columns={value_col: feature_name})

    def _mean_price_by(self, purchase_df, item_features, keys, feature_name):
        """Mean unit price of the purchased rows, grouped by `keys`."""
        merged = purchase_df.merge(item_features, on='item_id', how='left')
        priced = self._with_unit_price(merged)
        result = priced.groupby(list(keys))['price'].mean().reset_index()
        return result.rename(columns={'price': feature_name})

    @staticmethod
    def _total_quantity_by(purchase_df, item_features, keys, feature_name):
        """Total purchased quantity, grouped by `keys`."""
        merged = purchase_df.merge(item_features, on='item_id', how='left')
        result = merged.groupby(list(keys))['quantity'].sum().reset_index()
        return result.rename(columns={'quantity': feature_name})

    def _month_of(self, day):
        """Map the 1-based running day counter to a 1-based pseudo-month index."""
        return (day - 1) // self._DAYS_PER_MONTH + 1

    @staticmethod
    def _merge_feature(result, feature_df, on, fill=0):
        """Left-join `feature_df` onto `result` and fill gaps in the newly added columns.

        `fill` is either a scalar, or 'mean' / 'min' to use that statistic of
        the new column itself (computed ignoring missing values).
        """
        merged = result.merge(feature_df, on=on, how='left')
        existing = set(result.columns)
        for column in merged.columns:
            if column in existing:
                continue
            if fill == 'mean':
                value = merged[column].mean()
            elif fill == 'min':
                value = merged[column].min()
            else:
                value = fill
            merged[column] = merged[column].fillna(value)
        return merged

    def _prepare_item_features(self, item_features):
        """Return a normalised copy of `item_features`; the caller's frame is left untouched."""
        prepared = item_features.copy()
        prepared.columns = [col.lower() for col in prepared.columns]
        prepared = prepared.rename(columns={'product_id': 'item_id'})
        # A single space is used as a placeholder for "no value" in these columns.
        prepared = prepared.replace(' ', np.nan)
        for column, filler in self._ITEM_FEATURE_FILLERS:
            prepared[column] = prepared[column].fillna(filler)
        return prepared

    @staticmethod
    def _prepare_user_features(user_features):
        """Return a normalised copy of `user_features`; the caller's frame is left untouched."""
        prepared = user_features.copy()
        prepared.columns = [col.lower() for col in prepared.columns]
        return prepared.rename(columns={'household_key': 'user_id'})

    @staticmethod
    def _explode_recommendations(recommend_df):
        """Turn the per-user recommendation lists into one (user_id, item_id) row per candidate."""
        pairs = (
            recommend_df[['user_id', 'recommendations']]
            .explode('recommendations')
            .rename(columns={'recommendations': 'item_id'})
            .reset_index(drop=True)
        )
        # explode() yields an object column; restore a numeric dtype where possible
        # so the later merges on 'item_id' match the transactions' key dtype.
        pairs['item_id'] = pairs['item_id'].infer_objects()
        return pairs

    # ------------------------------------------------------------------ #
    # Feature builders
    # ------------------------------------------------------------------ #

    def _mean_check(self, purchase_df):
        """Mean basket total per user ('mean_check')."""
        return self._mean_of_group_sums(purchase_df, ['user_id'], 'basket_id',
                                        'sales_value', 'mean_check')

    def _mean_sum_purchase_sub_comm_desc(self, purchase_df, item_features):
        """Mean unit price paid by each user in each sub_commodity_desc."""
        return self._mean_price_by(purchase_df, item_features,
                                   ['user_id', 'sub_commodity_desc'],
                                   'mean_sum_purchase_sub_comm_desc')

    def _mean_sum_purchase_comm_desc(self, purchase_df, item_features):
        """Mean unit price paid by each user in each commodity_desc."""
        return self._mean_price_by(purchase_df, item_features,
                                   ['user_id', 'commodity_desc'],
                                   'mean_sum_purchase_comm_desc')

    def _mean_sum_purchase_department(self, purchase_df, item_features):
        """Mean unit price paid by each user in each department."""
        return self._mean_price_by(purchase_df, item_features,
                                   ['user_id', 'department'],
                                   'mean_sum_purchase_department')

    def _purchases_in_sub_comm_desc(self, purchase_df, item_features):
        """Total quantity bought by each user in each sub_commodity_desc."""
        return self._total_quantity_by(purchase_df, item_features,
                                       ['user_id', 'sub_commodity_desc'],
                                       'purchases_in_sub_commodity_desc')

    def _purchases_in_comm_desc(self, purchase_df, item_features):
        """Total quantity bought by each user in each commodity_desc."""
        return self._total_quantity_by(purchase_df, item_features,
                                       ['user_id', 'commodity_desc'],
                                       'purchases_in_commodity_desc')

    def _purchases_in_department(self, purchase_df, item_features):
        """Total quantity bought by each user in each department."""
        return self._total_quantity_by(purchase_df, item_features,
                                       ['user_id', 'department'],
                                       'purchases_in_department')

    def _purchase_frequency(self, purchase_df):
        """Mean number of distinct baskets per pseudo-month for each user ('frequency')."""
        monthly = purchase_df.assign(month=self._month_of(purchase_df['day']))
        baskets = (
            monthly.groupby(['user_id', 'month'])['basket_id']
            .nunique()
            .reset_index(name='frequency')
        )
        return baskets.groupby('user_id')['frequency'].mean().reset_index()

    def _weekend_purchases_frac(self, purchase_df):
        """Share of each user's purchased quantity that falls on weekend days.

        A day is a weekend day when ``day % 7`` is in `self.weekend_remainders`,
        so every day is counted at most once and the pattern repeats every
        seven days. Users without weekend purchases get 0.
        """
        total = purchase_df.groupby('user_id')['quantity'].sum().reset_index()

        is_weekend = (purchase_df['day'] % 7).isin(list(self.weekend_remainders))
        weekend = (
            purchase_df.loc[is_weekend]
            .groupby('user_id')['quantity']
            .sum()
            .reset_index()
            .rename(columns={'quantity': 'weekend_quantity'})
        )

        result = total.merge(weekend, on='user_id', how='left')
        result['weekend_quantity'] = result['weekend_quantity'].fillna(0)
        result['weekend_purchases_frac'] = result['weekend_quantity'] / result['quantity']

        return result.drop(['quantity', 'weekend_quantity'], axis=1)

    def _mean_sum_purchases_per_month(self, purchase_df):
        """Mean spend per pseudo-month for each user."""
        monthly = purchase_df.assign(month=self._month_of(purchase_df['day']))
        return self._mean_of_group_sums(monthly, ['user_id'], 'month',
                                        'sales_value', 'mean_sum_purchases_per_month')

    def _mean_quantity_per_basket(self, purchase_df):
        """Mean total quantity per basket for each user."""
        return self._mean_of_group_sums(purchase_df, ['user_id'], 'basket_id',
                                        'quantity', 'mean_quantity_per_basket')

    def _mean_purchases(self, purchase_df):
        """Mean weekly quantity sold of each item (over weeks in which it was sold)."""
        return self._mean_of_group_sums(purchase_df, ['item_id'], 'week_no',
                                        'quantity', 'mean_purchases')

    def _mean_item_purchases_per_sub_comm_desc(self, purchase_df, item_features):
        """Mean weekly quantity sold in each sub_commodity_desc.

        NOTE(review): the feature name suggests a per-item average, but the
        weekly totals are not divided by the number of items -- confirm intent.
        """
        merged = purchase_df.merge(item_features, on='item_id', how='left')
        return self._mean_of_group_sums(merged, ['sub_commodity_desc'], 'week_no',
                                        'quantity', 'mean_item_purchases_per_sub_comm_desc')

    def _mean_item_purchases_per_comm_desc(self, purchase_df, item_features):
        """Mean weekly quantity sold in each commodity_desc.

        NOTE(review): the feature name suggests a per-item average, but the
        weekly totals are not divided by the number of items -- confirm intent.
        """
        merged = purchase_df.merge(item_features, on='item_id', how='left')
        return self._mean_of_group_sums(merged, ['commodity_desc'], 'week_no',
                                        'quantity', 'mean_item_purchases_per_comm_desc')

    def _item_price(self, purchase_df):
        """Mean unit price of each item ('price')."""
        priced = self._with_unit_price(purchase_df)
        return priced.groupby('item_id')['price'].mean().reset_index()

    def _mean_price_in_sub_comm_desc(self, purchase_df, item_features):
        """Mean unit price of the purchases in each sub_commodity_desc."""
        return self._mean_price_by(purchase_df, item_features,
                                   ['sub_commodity_desc'],
                                   'mean_price_in_sub_comm_desc')

    def _purchases_department_diff(self, purchase_df, item_features):
        """User's mean weekly quantity in a department minus the all-user mean weekly quantity there."""
        merged = purchase_df.merge(item_features, on='item_id', how='left')

        all_users = self._mean_of_group_sums(merged, ['department'], 'week_no',
                                             'quantity', 'all_users')
        result = self._mean_of_group_sums(merged, ['user_id', 'department'], 'week_no',
                                          'quantity', 'quantity')
        result = result.merge(all_users, on='department', how='left')
        result['purchases_department_diff'] = result['quantity'] - result['all_users']

        return result.drop(['quantity', 'all_users'], axis=1)

    # ------------------------------------------------------------------ #
    # Public entry point
    # ------------------------------------------------------------------ #

    def fit_transform(self, recommend_df, purchase_df, item_features, user_features, with_targets=False):
        """Build the second-level dataset: one row per (user, recommended item).

        Parameters
        ----------
        recommend_df : DataFrame with 'user_id' and 'recommendations' columns.
        purchase_df : DataFrame of transactions; it is the source of every
            aggregate feature and, when `with_targets` is set, of the labels.
        item_features, user_features : attribute tables. Column names are
            lower-cased and 'product_id' / 'household_key' are renamed to
            'item_id' / 'user_id' on internal copies; the passed frames are
            not modified.
        with_targets : bool, default False
            When True, add a 'target' column that is 1 if the user bought the
            item in `purchase_df` and 0 otherwise.

        Returns
        -------
        DataFrame with the candidate pairs, the raw attributes and the
        engineered features listed in the class docstring.
        """
        item_features = self._prepare_item_features(item_features)
        user_features = self._prepare_user_features(user_features)

        result = self._explode_recommendations(recommend_df)

        if with_targets:
            # De-duplicate first: a repeat purchase must not multiply the candidate row.
            targets = purchase_df[['user_id', 'item_id']].drop_duplicates().assign(target=1)
            result = result.merge(targets, on=['user_id', 'item_id'], how='left')
            result['target'] = result['target'].fillna(0)

        result = result.merge(item_features, on='item_id', how='left')
        result = result.merge(user_features, on='user_id', how='left')
        # The literal (including its spelling) is kept as-is because it becomes a
        # category value in the produced dataset.
        result = result.fillna('Uknown')

        merge = self._merge_feature

        # User-level and user-category features; a missing value means "never bought" -> 0.
        result = merge(result, self._mean_check(purchase_df), 'user_id')
        result = merge(result, self._mean_sum_purchase_sub_comm_desc(purchase_df, item_features),
                       ['user_id', 'sub_commodity_desc'])
        result = merge(result, self._mean_sum_purchase_comm_desc(purchase_df, item_features),
                       ['user_id', 'commodity_desc'])
        result = merge(result, self._mean_sum_purchase_department(purchase_df, item_features),
                       ['user_id', 'department'])
        result = merge(result, self._purchases_in_sub_comm_desc(purchase_df, item_features),
                       ['user_id', 'sub_commodity_desc'])
        result = merge(result, self._purchases_in_comm_desc(purchase_df, item_features),
                       ['user_id', 'commodity_desc'])
        result = merge(result, self._purchases_in_department(purchase_df, item_features),
                       ['user_id', 'department'])
        result = merge(result, self._purchase_frequency(purchase_df), 'user_id')
        result = merge(result, self._weekend_purchases_frac(purchase_df), 'user_id')
        result = merge(result, self._mean_sum_purchases_per_month(purchase_df), 'user_id')
        result = merge(result, self._mean_quantity_per_basket(purchase_df), 'user_id')

        # Item-level sales volume; items / categories absent from purchase_df get 0.
        result = merge(result, self._mean_purchases(purchase_df), 'item_id')
        result = merge(result, self._mean_item_purchases_per_sub_comm_desc(purchase_df, item_features),
                       'sub_commodity_desc')
        result = merge(result, self._mean_item_purchases_per_comm_desc(purchase_df, item_features),
                       'commodity_desc')

        # Prices of unseen items / categories are imputed with the column mean.
        result = merge(result, self._item_price(purchase_df), 'item_id', fill='mean')
        result = merge(result, self._mean_price_in_sub_comm_desc(purchase_df, item_features),
                       'sub_commodity_desc', fill='mean')

        # Departments the user never bought in get the smallest observed difference.
        result = merge(result, self._purchases_department_diff(purchase_df, item_features),
                       ['user_id', 'department'], fill='min')

        result['price/mean_price_sub_comm_desc'] = result['price']/result['mean_price_in_sub_comm_desc']
        result['mean_sum_sub_comm_desc-price'] = result['mean_sum_purchase_sub_comm_desc'] - result['price']
        result['mean_sum_comm_desc-price'] = result['mean_sum_purchase_comm_desc'] - result['price']
        result['mean_sum_department-price'] = result['mean_sum_purchase_department'] - result['price']

        return result

In [225]:
trans = DataTransformer()

In [226]:
result_lvl_1.shape

(2154, 8)

In [227]:
%%time
train_lvl_2_2 = trans.fit_transform(result_lvl_1, data_train_lvl_2, item_features, user_features, with_targets=True)

Wall time: 10.2 s
