## Engineered Features Based on User purchase behavior

In [1]:
import pandas as pd
import numpy as np

In [2]:
df_allData = pd.read_pickle('input/all_data.p')

In [3]:
df_allOrdDet = pd.read_pickle('input/all_order_details.p')

#### Reorder and Unique Items

In [4]:
# Determine ratio of re-order and no re-order by using the reordered column provided in the base datasets.
reorder_ratio = df_allData.groupby(['order_id'])['reordered'].mean().reset_index()

In [5]:
reorder_ratio.columns = ['order_id','reorder_ratio']

In [6]:
df_allOrdDet = df_allOrdDet.merge(reorder_ratio,on='order_id',how='left')

In [7]:
df_allData['nonReOrd'] = 1-df_allData['reordered']

In [9]:
nonReOrdRatio = df_allData.groupby(['order_id'])['nonReOrd'].mean().reset_index()
nonReOrdRatio.columns = ['order_id','nonReOrdRatio']

In [10]:
df_allOrdDet = df_allOrdDet.merge(nonReOrdRatio,on='order_id',how='left')

In [11]:
# Using the non re-order column derived above determine unique items and their ratio to total orders.
uniItm = df_allData.groupby('order_id')['nonReOrd'].sum().reset_index()

In [12]:
uniItm.columns = ['order_id','nonReOrdTot']

In [13]:
df_allOrdDet = df_allOrdDet.merge(uniItm,on='order_id',how='left')

In [14]:
df_allOrdDet['uniItmTot']=df_allOrdDet.groupby('user_id')['nonReOrdTot'].cumsum()

In [15]:
df_allOrdDet['uniItmTot_Ratio']=df_allOrdDet['uniItmTot']/df_allOrdDet['order_number']

#### Ordered Items

In [17]:
# Total items ordered by a user in an order
OrdItem = df_allData.groupby('order_id').size().reset_index()

In [18]:
OrdItem.columns = ['order_id','ordItem']

In [19]:
df_allOrdDet = df_allOrdDet.merge(OrdItem,on='order_id',how='left')

In [20]:
df_allOrdDet['ordItemTot'] = df_allOrdDet.groupby('user_id')['ordItem'].cumsum()

In [21]:
df_allOrdDet['ordItemTot_Ratio'] = df_allOrdDet['ordItemTot']/df_allOrdDet['order_number']

#### Average Reorder Duration

In [22]:
# Average number of days between re-orders by user
user = df_allOrdDet.groupby('user_id')['days_since_prior_order'].mean().reset_index()
user.rename(columns={'days_since_prior_order_y':'AvgReOrdDuration'},inplace=True)
df_allOrdDet = df_allOrdDet.merge(user,on='user_id',how='left')

#### Most Frequent Order Day of week For Users

In [23]:
# Days when a user is most expected to make a purchase.
DOW_Mode=df_allOrdDet.groupby('user_id')['order_dow'].agg(pd.Series.mode).reset_index().rename(columns={'order_dow':
                                                                                                       'DOW_Mode'})
DOW_Mode.head()

Unnamed: 0,user_id,DOW_Mode
0,1,4
1,2,1
2,3,0
3,4,"[4, 5]"
4,5,"[0, 3]"


In [24]:
df_allOrdDet=df_allOrdDet.merge(DOW_Mode,on='user_id',how='left')

In [28]:
df_allOrdDet.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order_x,product_name,SucceedingOrdCnt,days_since_first_order,...,reorder_ratio,nonReOrdRatio,nonReOrdTot,uniItmTot,uniItmTot_Ratio,ordItem,ordItemTot,ordItemTot_Ratio,days_since_prior_order_y,DOW_Mode
0,2539329,1,prior,1,2,8,,"[Soda, Organic-Unsweetened-Vanilla-Almond-Milk...",10.0,,...,0.0,1.0,5.0,5.0,5.0,5.0,5.0,5.0,19.0,4
1,2398795,1,prior,2,3,7,15.0,"[Soda, Pistachios, Original-Beef-Jerky, Bag-of...",9.0,15.0,...,0.5,0.5,3.0,8.0,4.0,6.0,11.0,5.5,19.0,4
2,473747,1,prior,3,3,12,21.0,"[Soda, Original-Beef-Jerky, Pistachios, Organi...",8.0,36.0,...,0.6,0.4,2.0,10.0,3.333333,5.0,16.0,5.333333,19.0,4
3,2254736,1,prior,4,4,7,29.0,"[Soda, Original-Beef-Jerky, Pistachios, Organi...",7.0,65.0,...,1.0,0.0,0.0,10.0,2.5,5.0,21.0,5.25,19.0,4
4,431534,1,prior,5,4,15,28.0,"[Soda, Original-Beef-Jerky, Pistachios, Organi...",6.0,93.0,...,0.625,0.375,3.0,13.0,2.6,8.0,29.0,5.8,19.0,4


In [27]:
df_allOrdDet.to_pickle('input/all_order_details_Transformed.p')

#### Order Size

In [25]:
# Determine total number of products in an order as well as mean, median, max, min and standard deviation
col = ['order_id','user_id','product_id','order_number','SucceedingOrdCnt']

In [26]:
log=df_allData[col].sort_values('user_id')

In [29]:
log.head()

Unnamed: 0,order_id,user_id,product_id,order_number,SucceedingOrdCnt
24076667,2539329,1,26088,1,10
5212931,550135,1,13032,7,4
5212930,550135,1,25133,7,4
5212929,550135,1,12427,7,4
5212928,550135,1,10258,7,4


In [30]:
log_ = log[log['SucceedingOrdCnt']>0]

In [31]:
OrdGrpTbl = log_.groupby('order_id').size().to_frame()
OrdGrpTbl.columns=['order_size']
OrdGrpTbl.reset_index(inplace=True)

In [32]:
OrdGrpTbl = OrdGrpTbl.merge(log[['order_id','user_id']].drop_duplicates())

In [33]:
OrdGrpTbl.head()

Unnamed: 0,order_id,order_size,user_id
0,2,9,202279
1,3,8,205970
2,4,13,178520
3,5,26,156122
4,6,3,22352


In [35]:
user_osz = OrdGrpTbl.groupby(['user_id']).order_size.min().to_frame()

In [36]:
user_osz.columns = ['user_order_size-min']

In [37]:
user_osz['user_order_size-max'] = OrdGrpTbl.groupby(['user_id'])['order_size'].max()
user_osz['user_order_size-median'] = OrdGrpTbl.groupby(['user_id'])['order_size'].median()
user_osz['user_order_size-mean'] = OrdGrpTbl.groupby(['user_id'])['order_size'].mean()
user_osz['user_order_size-std'] = OrdGrpTbl.groupby(['user_id'])['order_size'].std()
user_osz.reset_index(inplace=True)

In [38]:
user_osz.to_pickle('input/userOrdSize.p')