## Engineered Features Based on Characteristics of Products

In [1]:
import pandas as pd
import numpy as np
import scipy as sp

In [2]:
# Load the full dataset (previewed in the next cell) from a local pickle file.
# NOTE(review): unpickling can execute arbitrary code - only load files from a trusted source.
df_allData = pd.read_pickle('input/all_data.p')

In [3]:
# Preview the first five rows to check which columns were loaded.
df_allData.head(5)

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,aisle,department,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,SucceedingOrdCnt
0,2,33120,1,1,Organic-Egg-Whites,86,16,eggs,dairy eggs,202279,prior,3,5,9,8.0,6
1,2,28985,2,1,Michigan-Organic-Kale,83,4,fresh vegetables,produce,202279,prior,3,5,9,8.0,6
2,2,9327,3,0,Garlic-Powder,104,13,spices seasonings,pantry,202279,prior,3,5,9,8.0,6
3,2,45918,4,1,Coconut-Butter,19,13,oils vinegars,pantry,202279,prior,3,5,9,8.0,6
4,2,30035,5,0,Natural-Sweetener,17,13,baking ingredients,pantry,202279,prior,3,5,9,8.0,6


#### Product Purchase Hour and Day of Week (DOW)

In [4]:
# Count and distribution of products bought by hour of the day.
# item_hour_cnt: number of rows in df_allData per (product_id, order_hour_of_day).
hrGrp = (
    df_allData
    .groupby(['product_id', 'order_hour_of_day'])
    .size()
    .reset_index(name='item_hour_cnt')
)
# itemHrDist: share of each product's rows that fall in the given hour.
# Only the count column is transformed, using the 'sum' alias: passing np.sum
# to transform is deprecated in recent pandas and also summed unrelated columns.
hrGrp['itemHrDist'] = (
    hrGrp['item_hour_cnt']
    / hrGrp.groupby('product_id')['item_hour_cnt'].transform('sum')
)
hrGrp.head()

Unnamed: 0,product_id,order_hour_of_day,item_hour_cnt,itemHrDist
0,1,0,12,0.006224
1,1,1,12,0.006224
2,1,2,9,0.004668
3,1,3,5,0.002593
4,1,4,4,0.002075


In [5]:
# Count and distribution of distinct buyers of each product by hour of the day.
# Duplicate (user_id, product_id, order_hour_of_day) rows are dropped first, so
# unique_item_hour_cnt is the number of distinct users per (product, hour).
hrGrpUnq = (
    df_allData
    .drop_duplicates(['user_id', 'product_id', 'order_hour_of_day'])
    .groupby(['product_id', 'order_hour_of_day'])
    .size()
    .reset_index(name='unique_item_hour_cnt')
)
# unqItemHrDist: each hour's share of the product's de-duplicated rows.
# Only the count column is transformed, using the 'sum' alias: passing np.sum
# to transform is deprecated in recent pandas and also summed unrelated columns.
hrGrpUnq['unqItemHrDist'] = (
    hrGrpUnq['unique_item_hour_cnt']
    / hrGrpUnq.groupby('product_id')['unique_item_hour_cnt'].transform('sum')
)
hrGrpUnq.head()

Unnamed: 0,product_id,order_hour_of_day,unique_item_hour_cnt,unqItemHrDist
0,1,0,11,0.007412
1,1,1,10,0.006739
2,1,2,9,0.006065
3,1,3,5,0.003369
4,1,4,3,0.002022


In [6]:
# Persist both hour-of-day feature tables to disk.
pd.to_pickle(hrGrp, 'input/hrGrp.p')
pd.to_pickle(hrGrpUnq, 'input/hrGrpUnq.p')

#### *Day of Week*

In [7]:
# Count and distribution of products bought by day of the week.
# item_dow_cnt: number of rows in df_allData per (product_id, order_dow).
dowGrp = (
    df_allData
    .groupby(['product_id', 'order_dow'])
    .size()
    .reset_index(name='item_dow_cnt')
)
# itemDowDist: share of each product's rows that fall on the given weekday.
# Only the count column is transformed, using the 'sum' alias: passing np.sum
# to transform is deprecated in recent pandas and also summed unrelated columns.
dowGrp['itemDowDist'] = (
    dowGrp['item_dow_cnt']
    / dowGrp.groupby('product_id')['item_dow_cnt'].transform('sum')
)
dowGrp.head()

Unnamed: 0,product_id,order_dow,item_dow_cnt,itemDowDist
0,1,0,206,0.106846
1,1,1,414,0.21473
2,1,2,285,0.147822
3,1,3,272,0.141079
4,1,4,308,0.159751


In [8]:
# Count and distribution of distinct buyers of each product by day of the week.
# Duplicate (user_id, product_id, order_dow) rows are dropped first, so
# unique_item_dow_cnt is the number of distinct users per (product, weekday).
unqDowGrp = (
    df_allData
    .drop_duplicates(['user_id', 'product_id', 'order_dow'])
    .groupby(['product_id', 'order_dow'])
    .size()
    .reset_index(name='unique_item_dow_cnt')
)
# unqItemDowDist: each weekday's share of the product's de-duplicated rows.
# Only the count column is transformed, using the 'sum' alias: passing np.sum
# to transform is deprecated in recent pandas and also summed unrelated columns.
unqDowGrp['unqItemDowDist'] = (
    unqDowGrp['unique_item_dow_cnt']
    / unqDowGrp.groupby('product_id')['unique_item_dow_cnt'].transform('sum')
)
unqDowGrp.head()

Unnamed: 0,product_id,order_dow,unique_item_dow_cnt,unqItemDowDist
0,1,0,115,0.094495
1,1,1,212,0.174199
2,1,2,205,0.168447
3,1,3,203,0.166804
4,1,4,203,0.166804


In [9]:
# Persist both day-of-week feature tables to disk.
pd.to_pickle(dowGrp, 'input/dowGrp.p')
pd.to_pickle(unqDowGrp, 'input/unqDowGrp.p')

#### Product Combinations Purchased

In [10]:
# Load the per-order table (previewed in the next cell) from a local pickle file.
# NOTE(review): unpickling can execute arbitrary code - only load files from a trusted source.
df_all_order_details = pd.read_pickle('input/all_order_details.p')

In [11]:
# Preview the first five orders to check which columns were loaded.
df_all_order_details.head(5)

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_name,SucceedingOrdCnt,days_since_first_order,prev_prd,diff_prd,same_prd
0,2539329,1,prior,1,2,8,,"[Soda, Organic-Unsweetened-Vanilla-Almond-Milk...",10.0,,,"[Soda, Organic-Unsweetened-Vanilla-Almond-Milk...",[]
1,2398795,1,prior,2,3,7,15.0,"[Soda, Pistachios, Original-Beef-Jerky, Bag-of...",9.0,15.0,"[Soda, Organic-Unsweetened-Vanilla-Almond-Milk...","[Pistachios, Bag-of-Organic-Bananas, Cinnamon-...","[Soda, Original-Beef-Jerky, Aged-White-Cheddar..."
2,473747,1,prior,3,3,12,21.0,"[Soda, Original-Beef-Jerky, Pistachios, Organi...",8.0,36.0,"[Soda, Pistachios, Original-Beef-Jerky, Bag-of...","[Organic-String-Cheese, Creamy-Almond-Butter]","[Soda, Original-Beef-Jerky, Pistachios]"
3,2254736,1,prior,4,4,7,29.0,"[Soda, Original-Beef-Jerky, Pistachios, Organi...",7.0,65.0,"[Soda, Original-Beef-Jerky, Pistachios, Organi...",[XL-Pick-A-Size-Paper-Towel-Rolls],"[Soda, Original-Beef-Jerky, Pistachios, Organi..."
4,431534,1,prior,5,4,15,28.0,"[Soda, Original-Beef-Jerky, Pistachios, Organi...",6.0,93.0,"[Soda, Original-Beef-Jerky, Pistachios, Organi...","[Organic-Fuji-Apples, Honeycrisp-Apples, Bartl...","[Soda, Original-Beef-Jerky, Pistachios, Organi..."


In [12]:
# Order rows by user, then by that user's order sequence number.
df_all_order_details = df_all_order_details.sort_values(by=['user_id', 'order_number'])

In [13]:
# Attach, to every order, the order_id of the row directly above it within the
# same user (NaN for each user's first row), then renumber the index from 0.
# This relies on the frame already being sorted by user_id and order_number.
df_all_order_details = (
    df_all_order_details
    .assign(prev_order_id=lambda orders: orders.groupby('user_id')['order_id'].shift(1))
    .reset_index(drop=True)
)

In [14]:
# Read product details dataset.
# It is used further down to translate product names into product ids.
# NOTE(review): unpickling can execute arbitrary code - only load files from a trusted source.
df_pd = pd.read_pickle('input/product_detail.p')

In [15]:
# Collect, for every order, the ids of its products into one NumPy array.
order_item_array = (
    df_allData
    .groupby('order_id')['product_id']
    .apply(lambda ids: np.array(ids))
    .reset_index()
)

In [16]:
# Preview the first five orders with their product-id arrays.
order_item_array.head(5)

Unnamed: 0,order_id,product_id
0,1,"[49302, 11109, 10246, 49683, 43633, 13176, 472..."
1,2,"[33120, 28985, 9327, 45918, 30035, 17794, 4014..."
2,3,"[33754, 24838, 17704, 21903, 17668, 46667, 174..."
3,4,"[46842, 26434, 39758, 27761, 10054, 21351, 225..."
4,5,"[13176, 15005, 47329, 27966, 23909, 48370, 132..."


In [17]:
# Keep only orders whose SucceedingOrdCnt is greater than 1, then discard
# every row that has a missing value in any column.
# NOTE(review): dropna() has no subset, so a NaN in any column removes the row - confirm this is intended.
df_all_order_details_ = df_all_order_details.loc[
    df_all_order_details['SucceedingOrdCnt'].gt(1)
].dropna()

In [18]:
# Start with an empty container for the "previous>current" product pair keys.
item_to_item = list()

#### *List products bought in prior and succeeding orders*

In [19]:
from collections import Counter
from itertools import product
from operator import itemgetter
# Running tally of how often each product appears in a previous-order list (prev_prd).
iCounter = Counter()

In [20]:
# Tally product transitions between an order and the one before it.
#   item_to_item: occurrences of each "previous>current" product-name pair
#   iCounter:     occurrences of each product in a previous-order list
# Both counters are (re)created here so that re-running the cell cannot
# double-count. They are updated in place: materialising every pair string in
# one list, and rebuilding iCounter with `+=` (which rescans the whole counter
# on every row), is far slower and much heavier on memory.
item_to_item = Counter()
iCounter = Counter()
for priItem, curItem in df_all_order_details_[['prev_prd', 'product_name']].values:
    item_to_item.update(i1 + '>' + i2 for i1, i2 in product(priItem, curItem))
    iCounter.update(priItem)

In [21]:
# df1: one row per "before>after" key with its count, most frequent pairs first.
df1 = (
    pd.DataFrame.from_dict(item_to_item, orient='index')
    .reset_index()
    .rename(columns={'index': 'item', 0: 'cnt'})
    .sort_values('cnt', ascending=False)
)
# df2: one row per product with its total count in previous-order lists.
df2 = (
    pd.DataFrame.from_dict(iCounter, orient='index')
    .reset_index()
    .rename(columns={'index': 'before', 0: 'total_cnt'})
)
# The counters are no longer needed once converted; release them.
del item_to_item, iCounter

In [22]:
# Split each "before>after" key once on '>' and keep the first two pieces.
pair_parts = df1['item'].map(lambda key: key.split('>'))
df1['before'] = pair_parts.map(itemgetter(0))
df1['after'] = pair_parts.map(itemgetter(1))

In [24]:
# Discard pairs in which a product is paired with itself.
df1 = df1.loc[df1['before'].ne(df1['after'])]

In [25]:
# Attach each 'before' product's total count (total_cnt) to its pair rows.
df1 = pd.merge(df1, df2, on='before', how='left')

In [26]:
# before2after: pair count divided by the 'before' product's total count.
# Only the pair names and this ratio are kept.
df1 = df1.assign(before2after=df1['cnt'] / df1['total_cnt'])[
    ['before', 'after', 'before2after']
]

In [27]:
# Translate the product names on both sides of each pair into product ids.
# Only the name and id columns of df_pd take part in the merges: joining the
# whole product table twice would carry every other product column (duplicated
# with _x/_y suffixes) through the pair table for no benefit.
df1 = df1.merge(
    df_pd[['product_name', 'product_id']].rename(
        columns={'product_name': 'before', 'product_id': 'before_id'}),
    on='before', how='left')
df1 = df1.merge(
    df_pd[['product_name', 'product_id']].rename(
        columns={'product_name': 'after', 'product_id': 'after_id'}),
    on='after', how='left')

In [28]:
# Keep just the id pair and its transition ratio.
df1 = df1.loc[:, ['before_id', 'after_id', 'before2after']]

In [None]:
# Persist the order table, now including prev_order_id.
# NOTE(review): df1 (the before/after transition table) is not written to disk
# in any cell visible here - confirm whether it should be saved as well.
pd.to_pickle(df_all_order_details, 'input/all_order_details-FE3.p')

#### Products Frequently Bought Together

In [3]:
# Restrict to rows whose SucceedingOrdCnt exceeds 1. The cells below use this
# subset to compute, per product, the mean, min, max and standard deviation of
# the size of the orders the product appears in.
df_allDataT = df_allData.loc[df_allData['SucceedingOrdCnt'].gt(1)]

In [4]:
# totalSize: number of rows (items) in each order.
orderSize = df_allDataT.groupby('order_id').size().reset_index(name='totalSize')

In [5]:
# Attach each order's size to every one of its item rows.
df_allDataT = pd.merge(df_allDataT, orderSize, on='order_id', how='left')

In [6]:
# prodComb_mean: average size of the orders each product appears in.
prod = (
    df_allDataT
    .groupby('product_id')['totalSize']
    .mean()
    .to_frame(name='prodComb_mean')
)

In [7]:
# Add the min, max and standard deviation of order size per product,
# aligned to prod on its product_id index.
for stat in ('min', 'max', 'std'):
    prod['prodComb_' + stat] = df_allDataT.groupby('product_id')['totalSize'].agg(stat)
    

In [8]:
# Persist the per-product order-size statistics, with product_id as a column.
pd.to_pickle(prod.reset_index(), 'input/prodComb.p')