In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle

%matplotlib inline 

In [6]:
# Show up to 100 rows when a DataFrame or Series is displayed.
pd.options.display.max_rows = 100

## Load Data

In [7]:
# Aisle lookup table (aisle_id -> aisle name).
# `np.str` was only an alias of the builtin `str`; it was deprecated in NumPy 1.20 and
# removed in NumPy 1.24, so the builtin is used directly.
aisles = pd.read_csv(
    'data/aisles.csv',
    dtype={'aisle_id': np.uint16, 'aisle': str},
    usecols=['aisle_id', 'aisle'],
)

In [8]:
# Department lookup table (department_id -> department name).
# `np.str` was only an alias of the builtin `str`; it was deprecated in NumPy 1.20 and
# removed in NumPy 1.24, so the builtin is used directly.
departments = pd.read_csv(
    'data/departments.csv',
    dtype={'department_id': np.uint16, 'department': str},
    usecols=['department_id', 'department'],
)

In [9]:
# Line items of the "prior" orders, read with explicit unsigned integer dtypes.
order_products_prior = pd.read_csv(
    'data/order_products__prior.csv',
    usecols=['order_id', 'product_id', 'add_to_cart_order', 'reordered'],
    dtype={
        'order_id': np.uint32,
        'product_id': np.uint32,
        'add_to_cart_order': np.uint16,
        'reordered': np.uint16,
    },
)

In [10]:
# Line items of the "train" orders; same columns and dtypes as the prior line items.
order_product_train = pd.read_csv(
    'data/order_products__train.csv',
    usecols=['order_id', 'product_id', 'add_to_cart_order', 'reordered'],
    dtype={
        'order_id': np.uint32,
        'product_id': np.uint32,
        'add_to_cart_order': np.uint16,
        'reordered': np.uint16,
    },
)

In [11]:
# Order-level table: user, evaluation split, sequence number and timing of each order.
orders = pd.read_csv(
    'data/orders.csv',
    usecols=['order_id', 'user_id', 'eval_set', 'order_number', 'order_dow', 'order_hour_of_day', 'days_since_prior_order'],
    dtype={
        'order_id': np.uint32,
        'user_id': np.uint32,
        'eval_set': 'category',
        'order_number': np.uint16,
        'order_dow': np.uint16,
        'order_hour_of_day': np.uint8,
        # Float because the column contains missing values (NaN).
        'days_since_prior_order': np.float32,
    },
)

In [12]:
# Product catalogue: name plus the aisle and department each product belongs to.
# `np.str` was only an alias of the builtin `str`; it was deprecated in NumPy 1.20 and
# removed in NumPy 1.24, so the builtin is used directly.
products = pd.read_csv(
    'data/products.csv',
    dtype={
        'product_id': np.uint16,
        'product_name': str,
        'aisle_id': np.uint16,
        'department_id': np.uint16,
    },
    usecols=['product_id', 'product_name', 'aisle_id', 'department_id'],
)

## Generating Features

In [9]:
orders['days_since_first_order'] = orders.groupby('user_id')['days_since_prior_order'].cumsum()

In [10]:
orders.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,days_since_first_order
0,2539329,1,prior,1,2,8,,
1,2398795,1,prior,2,3,7,15.0,15.0
2,473747,1,prior,3,3,12,21.0,36.0
3,2254736,1,prior,4,4,7,29.0,65.0
4,431534,1,prior,5,4,15,28.0,93.0


In [11]:
orders.days_since_first_order.fillna(0, inplace = True)

In [12]:
order_products_prior.shape

(32434489, 4)

In [13]:
priordf = pd.merge(order_products_prior, orders, on = 'order_id', how = 'left')

In [14]:
priordf.shape

(32434489, 11)

In [15]:
# Preview the merged frame: line-item columns followed by the joined order columns.
priordf.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,days_since_first_order
0,2,33120,1,1,202279,prior,3,5,9,8.0,28.0
1,2,28985,2,1,202279,prior,3,5,9,8.0,28.0
2,2,9327,3,0,202279,prior,3,5,9,8.0,28.0
3,2,45918,4,1,202279,prior,3,5,9,8.0,28.0
4,2,30035,5,0,202279,prior,3,5,9,8.0,28.0


In [16]:
priordf['days_since_first_order'].fillna(0, inplace=True)

In [17]:
priordf.sort_values(['user_id', 'product_id','order_number'], inplace = True)

In [18]:
priordf['days_since_prior_product_purchase'] = priordf.groupby(['user_id', 'product_id'])['days_since_first_order'].transform(lambda x: x.diff())

In [45]:
# Bucket the hour of day into groups. pd.cut cannot express a bin that wraps around
# midnight, so the late-night bucket is cut as two intervals, (-1, 2] and (22, 25].
# Both intervals carry the SAME label so they form one '22-2' category (as in the
# recorded outputs below) instead of two artificial ones ('22-2-1' / '22-2-2').
# Duplicate labels require ordered=False (pandas >= 1.1).
hour_bins = [-1,2,6,10,14,18,22,25]
hour_group_names = ['22-2', '3-6', '7-10', '11-14', '15-18', '19-22', '22-2']
hour_gourp_names = hour_group_names  # original (misspelled) name kept for any cell that still uses it
priordf['order_hour_group'] = pd.cut(priordf.order_hour_of_day, bins = hour_bins, labels=hour_group_names, ordered=False)

In [133]:
priordf = pd.merge(priordf, products, on = 'product_id', how = 'left')

In [47]:
pickle.dump(priordf, open('data/pickle_files/priordf.p', 'wb'))

In [111]:
priordf = pickle.load(open('data/pickle_files/priordf.p', 'rb'))

In [15]:
#user_info

In [16]:
user_info = pd.DataFrame()
user_info['user_order_num'] = priordf.groupby('user_id')['order_id'].nunique().astype(np.uint16)

In [17]:
user_info['user_order_interval_mean'] = pd.DataFrame(priordf.groupby(['user_id', 'order_id'])['days_since_prior_order'].mean()).groupby(['user_id'])['days_since_prior_order'].mean().astype(np.float32)

In [18]:
user_info['user_order_interval_std'] = pd.DataFrame(priordf.groupby(['user_id', 'order_id'])['days_since_prior_order'].mean()).groupby(['user_id'])['days_since_prior_order'].std().astype(np.float32)

In [19]:
user_info['user_basket_size_mean'] = pd.DataFrame(priordf.groupby(['user_id', 'order_id'])['product_id'].count()).groupby('user_id')['product_id'].mean().astype(np.float32)

In [20]:
user_info['user_basket_size_std'] = pd.DataFrame(priordf.groupby(['user_id', 'order_id'])['product_id'].count()).groupby('user_id')['product_id'].std().astype(np.float32)

In [21]:
user_info['user_history'] = priordf.groupby('user_id')['days_since_first_order'].max().astype(np.uint16)

In [22]:
user_info.reset_index(inplace = True)

In [23]:
user_info.head()

Unnamed: 0,user_id,user_order_num,user_order_interval_mean,user_order_interval_std,user_basket_size_mean,user_basket_size_std,user_history
0,1,10,19.555555,9.395625,5.9,1.523884,176
1,2,14,15.230769,9.867064,13.928572,5.717238,198
2,3,12,12.090909,5.375026,7.333333,2.103388,133
3,4,5,13.75,9.5,3.6,2.073644,55
4,5,4,13.333333,4.932883,9.25,3.095696,40


In [24]:
# Persist the per-user features; the context manager closes the file handle.
with open('data/pickle_files/user_info.p', 'wb') as pickle_file:
    pickle.dump(user_info, pickle_file)

In [None]:
#product_info

In [25]:
product_info = pd.DataFrame()

In [26]:
product_info['product_user_num'] = priordf.groupby(['product_id'])['user_id'].nunique().astype(np.uint32)

In [27]:
product_info['product_order_num'] = priordf.groupby(['product_id'])['order_id'].nunique().astype(np.uint32)

In [28]:
product_info['product_order_interval_mean'] = priordf.groupby(['product_id'])['days_since_prior_product_purchase'].mean().astype(np.float32)

In [29]:
product_info['product_order_interval_std'] = priordf.groupby(['product_id'])['days_since_prior_product_purchase'].std().astype(np.float32)

In [30]:
product_info['product_reorder_num'] = priordf[priordf.reordered == 1].groupby(['product_id'])['order_id'].nunique().astype(np.uint32)

In [31]:
product_info['product_reorder_num'].fillna(0, inplace = True)

In [32]:
product_info['product_reorder_user_num'] = priordf[priordf.reordered == 1].groupby(['product_id'])['user_id'].nunique().astype(np.uint32)

In [33]:
product_info['product_reorder_user_num'].fillna(0, inplace = True)

In [34]:
product_info['product_reorder_ratio'] = product_info['product_reorder_num']/product_info['product_order_num']

In [35]:
product_info['product_reorder_user_ratio'] = product_info['product_reorder_user_num']/product_info['product_user_num']

In [36]:
product_info['product_add_to_cart_order_mean'] = priordf.groupby(['product_id'])['add_to_cart_order'].mean().astype(np.float32)

In [37]:
product_info['product_add_to_cart_order_std'] = priordf.groupby(['product_id'])['add_to_cart_order'].std().astype(np.float32)

In [38]:
product_info.reset_index(inplace = True)

In [39]:
product_info = pd.merge(product_info, products, on = 'product_id', how = 'left')

In [11]:
product_info['product_order_interval_mean_NaN'] = product_info.product_order_interval_mean.isnull()
product_info['product_order_interval_mean_NaN'].replace([False, True], [0, 1], inplace = True)
product_info['product_order_interval_std_NaN'] = product_info.product_order_interval_std.isnull()
product_info['product_order_interval_std_NaN'].replace([False, True], [0, 1], inplace = True)
product_info['product_add_to_cart_order_std_NaN'] = product_info.product_add_to_cart_order_std.isnull()
product_info['product_add_to_cart_order_std_NaN'].replace([False, True], [0, 1], inplace = True)

In [None]:
product_info.fillna(product_info.mean(), inplace = True)

In [13]:
product_info.head()

Unnamed: 0,product_id,product_user_num,product_order_num,product_order_interval_mean,product_order_interval_std,product_reorder_num,product_reorder_user_num,product_reorder_ratio,product_reorder_user_ratio,product_add_to_cart_order_mean,product_add_to_cart_order_std,product_name,aisle_id,department_id,product_order_interval_mean_NaN,product_order_interval_std_NaN,product_add_to_cart_order_std_NaN
0,1,716,1852,27.961267,37.873127,1136.0,276.0,0.613391,0.385475,5.801836,5.575389,Chocolate Sandwich Cookies,61,19,0,0,0
1,2,78,90,50.166668,52.135895,12.0,8.0,0.133333,0.102564,9.888889,7.821671,All-Seasons Salt,104,13,0,0,0
2,3,74,277,19.369457,24.833035,203.0,36.0,0.732852,0.486486,6.415163,6.472701,Robust Golden Unsweetened Oolong Tea,94,7,0,0,0
3,4,182,329,31.707483,28.790031,147.0,64.0,0.446809,0.351648,9.507599,6.861485,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,0,0,0
4,5,6,15,34.888889,36.511795,9.0,4.0,0.6,0.666667,6.466667,3.563038,Green Chile Anytime Sauce,5,13,0,0,0


In [14]:
# Persist the per-product features; the context manager closes the file handle.
with open('data/pickle_files/product_info.p', 'wb') as pickle_file:
    pickle.dump(product_info, pickle_file)

In [11]:
#user_product_info

In [42]:
user_product_info = pd.DataFrame(priordf.reset_index().groupby(['user_id', 'product_id'])['order_id'].nunique())

In [43]:
user_product_info.rename(columns = {'order_id': 'user_product_order_num'}, inplace = True)

In [44]:
user_product_info['user_product_last_purchase_day'] = priordf.reset_index().groupby(['user_id', 'product_id'])['days_since_first_order'].max().astype(np.uint32)

In [45]:
user_product_info.rename(columns = {'user_product_order_interval': 'user_product_order_interval_mean'}, inplace = True)

In [46]:
user_product_info['user_product_order_interval_mean'] = priordf.reset_index().groupby(['user_id', 'product_id'])['days_since_prior_product_purchase'].mean()

In [47]:
user_product_info['user_product_order_interval_std'] = priordf.reset_index().groupby(['user_id', 'product_id'])['days_since_prior_product_purchase'].std()

In [48]:
user_product_info['user_product_rank'] = user_product_info.groupby('user_id')['user_product_order_num'].rank(ascending=False)

In [49]:
user_product_info['user_product_add_order_mean'] = priordf.reset_index().groupby(['user_id', 'product_id'])['add_to_cart_order'].mean().astype(np.float32)

In [50]:
user_product_info['user_product_add_order_std'] = priordf.reset_index().groupby(['user_id', 'product_id'])['add_to_cart_order'].std().astype(np.float32)

In [51]:
user_product_info = pd.merge(pd.DataFrame(user_product_info).reset_index(), user_info.reset_index()[['user_id', 'user_order_num']], on='user_id', how = 'left')

In [52]:
user_product_info['user_product_reorder_ratio'] = user_product_info.user_product_order_num/user_product_info.user_order_num

In [53]:
user_product_info.drop('user_order_num', axis = 1, inplace = True)

In [5]:
user_product_info['user_product_order_interval_mean_NaN'] = user_product_info.user_product_order_interval_mean.isnull()
user_product_info['user_product_order_interval_mean_NaN'].replace([False, True], [0, 1], inplace = True)
user_product_info['user_product_order_interval_std_NaN'] = user_product_info.user_product_order_interval_std.isnull()
user_product_info['user_product_order_interval_std_NaN'].replace([False, True], [0, 1], inplace = True)
user_product_info['user_product_add_order_std_NaN'] = user_product_info.user_product_add_order_std.isnull()
user_product_info['user_product_add_order_std_NaN'].replace([False, True], [0, 1], inplace = True)

In [None]:
user_product_info.fillna(user_product_info.mean(), inplace = True)

In [9]:
user_product_info.head()

Unnamed: 0,user_id,product_id,user_product_order_num,user_product_last_purchase_day,user_product_order_interval_mean,user_product_order_interval_std,user_product_rank,user_product_add_order_mean,user_product_add_order_std,user_product_reorder_ratio,user_product_order_interval_mean_NaN,user_product_order_interval_std_NaN,user_product_add_order_std_NaN
0,1,196,10,176,19.555555,9.395625,1.5,1.4,0.966092,1.0,0,0,0
1,1,10258,9,176,20.125,9.876922,3.0,3.333333,1.322876,0.9,0,0,0
2,1,10326,1,93,42.677544,25.335363,14.5,5.0,4.219923,0.1,1,1,1
3,1,12427,10,176,19.555555,9.395625,1.5,3.3,2.406011,1.0,0,0,0
4,1,13032,3,176,80.5,51.618793,5.5,6.333333,1.527525,0.3,0,0,0


In [10]:
# Persist the per-(user, product) features; the context manager closes the file handle.
with open('data/pickle_files/user_product_info.p', 'wb') as pickle_file:
    pickle.dump(user_product_info, pickle_file)

In [64]:
#ordertime_info

In [48]:
ordertime_info = pd.DataFrame(priordf.reset_index().groupby(['order_dow', 'order_hour_group'])['order_id'].nunique())

In [49]:
ordertime_info.rename(columns = {'order_id': 'ordertime_order_num'}, inplace = True)

In [50]:
ordertime_info.reset_index(inplace=True)

In [51]:
ordertime_info.head()

Unnamed: 0,order_dow,order_hour_group,ordertime_order_num
0,0,11-14,195567
1,0,15-18,160171
2,0,19-22,61821
3,0,22-2,13619
4,0,3-6,5852


In [52]:
# Persist the order-time counts; the context manager closes the file handle.
with open('data/pickle_files/ordertime_info.p', 'wb') as pickle_file:
    pickle.dump(ordertime_info, pickle_file)

In [53]:
user_ordertime_info = pd.DataFrame(priordf.reset_index().groupby(['user_id','order_dow', 'order_hour_group'])['order_id'].nunique())

In [54]:
user_ordertime_info.rename(columns = {'order_id': 'user_ordertime_order_num'}, inplace = True)
user_ordertime_info.reset_index(inplace=True)

In [55]:
user_ordertime_info.head()

Unnamed: 0,user_id,order_dow,order_hour_group,user_ordertime_order_num
0,1,1,11-14,1
1,1,1,15-18,1
2,1,1,7-10,1
3,1,2,7-10,2
4,1,3,11-14,1


In [56]:
# Persist the per-user order-time counts; the context manager closes the file handle.
with open('data/pickle_files/user_ordertime_info.p', 'wb') as pickle_file:
    pickle.dump(user_ordertime_info, pickle_file)

In [57]:
product_ordertime_info = pd.DataFrame(priordf.reset_index().groupby(['product_id','order_dow', 'order_hour_group'])['order_id'].nunique())

In [58]:
product_ordertime_info.rename(columns = {'order_id': 'product_ordertime_order_num'}, inplace = True)
product_ordertime_info.reset_index(inplace=True)

In [59]:
product_ordertime_info.head()

Unnamed: 0,product_id,order_dow,order_hour_group,product_ordertime_order_num
0,1,0,11-14,51
1,1,0,15-18,62
2,1,0,19-22,42
3,1,0,22-2,11
4,1,0,3-6,2


In [60]:
# Persist the per-product order-time counts; the context manager closes the file handle.
with open('data/pickle_files/product_ordertime_info.p', 'wb') as pickle_file:
    pickle.dump(product_ordertime_info, pickle_file)

In [76]:
#aisle_info

In [126]:
aisle_info = pd.DataFrame()

In [135]:
aisle_info['aisle_user_num'] = priordf.groupby(['aisle_id'])['user_id'].nunique().astype(np.uint32)

In [137]:
aisle_info['aisle_order_num'] = priordf.groupby(['aisle_id'])['order_id'].nunique().astype(np.uint32)

In [139]:
aisle_info['aisle_order_interval_mean'] = product_info.groupby(['aisle_id'])['product_order_interval_mean'].mean().astype(np.uint32)

In [140]:
aisle_info['aisle_order_interval_std'] = product_info.groupby(['aisle_id'])['product_order_interval_mean'].std().astype(np.uint32)

In [141]:
aisle_info['aisle_reorder_num'] = priordf[priordf.reordered == 1].groupby(['aisle_id'])['order_id'].nunique().astype(np.uint32)

In [142]:
aisle_info['aisle_reorder_num'].fillna(0, inplace = True)

In [143]:
aisle_info['aisle_reorder_user_num'] = priordf[priordf.reordered == 1].groupby(['aisle_id'])['user_id'].nunique().astype(np.uint32)

In [144]:
aisle_info['aisle_reorder_user_num'].fillna(0, inplace = True)

In [145]:
aisle_info['aisle_reorder_ratio'] = aisle_info['aisle_reorder_num']/aisle_info['aisle_order_num']

In [146]:
aisle_info['aisle_reorder_user_ratio'] = aisle_info['aisle_reorder_user_num']/aisle_info['aisle_user_num']

In [148]:
aisle_info['aisle_product_add_to_cart_order_mean'] = product_info.groupby(['aisle_id'])['product_add_to_cart_order_mean'].mean().astype(np.float32)

In [149]:
aisle_info['aisle_product_add_to_cart_order_std'] = product_info.groupby(['aisle_id'])['product_add_to_cart_order_mean'].std().astype(np.float32)

In [150]:
aisle_info.reset_index(inplace = True)

In [151]:
aisle_info = pd.merge(aisle_info, aisles, on = 'aisle_id', how = 'left')

In [159]:
aisle_info.head()

Unnamed: 0,aisle_id,aisle_user_num,aisle_order_num,aisle_order_interval_mean,aisle_order_interval_std,aisle_reorder_num,aisle_reorder_user_num,aisle_reorder_ratio,aisle_reorder_user_ratio,aisle_product_add_to_cart_order_mean,aisle_product_add_to_cart_order_std,aisle
0,1,20711,63115,27,13,38470,9162,0.609522,0.442374,8.818574,1.806635,prepared soups salads
1,2,31222,77171,36,17,38747,11799,0.502093,0.377907,9.35891,2.217766,specialty cheeses
2,3,63592,278151,26,14,184142,35117,0.662022,0.552224,9.891664,2.654259,energy granola bars
3,4,53892,165541,36,15,88508,24734,0.534659,0.458955,10.449979,2.380114,instant foods
4,5,32312,58390,46,26,16977,7847,0.290752,0.242851,10.37053,2.503755,marinades meat preparation


In [160]:
# Persist the per-aisle features; the context manager closes the file handle.
with open('data/pickle_files/aisle_info.p', 'wb') as pickle_file:
    pickle.dump(aisle_info, pickle_file)

In [162]:
department_info = pd.DataFrame()

In [163]:
department_info['department_user_num'] = priordf.groupby(['department_id'])['user_id'].nunique().astype(np.uint32)

In [164]:
department_info['department_order_num'] = priordf.groupby(['department_id'])['order_id'].nunique().astype(np.uint32)

In [165]:
department_info['department_order_interval_mean'] = product_info.groupby(['department_id'])['product_order_interval_mean'].mean().astype(np.uint32)

In [166]:
department_info['department_order_interval_std'] = product_info.groupby(['department_id'])['product_order_interval_mean'].std().astype(np.uint32)

In [167]:
department_info['department_reorder_num'] = priordf[priordf.reordered == 1].groupby(['department_id'])['order_id'].nunique().astype(np.uint32)

In [168]:
department_info['department_reorder_num'].fillna(0, inplace = True)

In [169]:
department_info['department_reorder_user_num'] = priordf[priordf.reordered == 1].groupby(['department_id'])['user_id'].nunique().astype(np.uint32)

In [170]:
department_info['department_reorder_user_num'].fillna(0, inplace = True)

In [171]:
department_info['department_reorder_ratio'] = department_info['department_reorder_num']/department_info['department_order_num']

In [172]:
department_info['department_reorder_user_ratio'] = department_info['department_reorder_user_num']/department_info['department_user_num']

In [173]:
department_info['department_product_add_to_cart_order_mean'] = product_info.groupby(['department_id'])['product_add_to_cart_order_mean'].mean().astype(np.float32)

In [174]:
department_info['department_product_add_to_cart_order_std'] = product_info.groupby(['department_id'])['product_add_to_cart_order_mean'].std().astype(np.float32)

In [181]:
department_info.reset_index(inplace = True)

In [183]:
department_info = pd.merge(department_info, departments, on = 'department_id', how = 'left')

In [184]:
department_info.head()

Unnamed: 0,department_id,department_user_num,department_order_num,department_order_interval_mean,department_order_interval_std,department_reorder_num,department_reorder_user_num,department_reorder_ratio,department_reorder_user_ratio,department_product_add_to_cart_order_mean,department_product_add_to_cart_order_std,department
0,1,163233,1181018,31,14,761976,112199,0.645186,0.687355,9.354378,2.21593,frozen
1,2,17875,35056,40,29,14565,4843,0.415478,0.270937,8.194178,2.71967,other
2,3,140612,881556,30,14,589018,91541,0.668157,0.651018,8.493633,2.011495,bakery
3,4,193237,2409320,29,12,1962871,169909,0.814699,0.879278,9.039677,1.801582,produce
4,5,15798,84689,28,17,56064,8751,0.661999,0.553931,6.410433,2.907398,alcohol


In [185]:
# Persist the per-department features; the context manager closes the file handle.
with open('data/pickle_files/department_info.p', 'wb') as pickle_file:
    pickle.dump(department_info, pickle_file)