## Combine Engineered Features for Prior Dataset

In [1]:
import pandas as pd
import numpy as np
import gc

In [2]:
df_allOrdDet = pd.read_pickle('input/all_order_details.p')

In [3]:
df_allOrdDet = df_allOrdDet[['order_id','user_id','order_number','SucceedingOrdCnt','eval_set','order_dow',
                             'order_hour_of_day']]

In [4]:
df_allOrdDet['eval_set'].unique()

array(['prior', 'train', 'test'], dtype=object)

In [5]:
df_allOrdDet.sort_values(['user_id','order_number','order_id'],inplace=True)

In [6]:
df_allOrdDet=df_allOrdDet.merge(pd.read_pickle('input/label.p')
                                [['order_id','product_id','reordered']],on='order_id',how='inner')

In [7]:
df_allOrdDet['eval_set'].unique()

array(['prior'], dtype=object)

In [8]:
df_allOrdDet=df_allOrdDet.merge(pd.read_pickle('input/product_detail.p')
                                [['product_id','aisle_id','department_id']],on='product_id',how='left')

In [9]:
df_allOrdDet.head()

Unnamed: 0,order_id,user_id,order_number,SucceedingOrdCnt,eval_set,order_dow,order_hour_of_day,product_id,reordered,aisle_id,department_id
0,2550362,1,10,1.0,prior,4,8,196,1,77,7
1,2550362,1,10,1.0,prior,4,8,10258,1,117,19
2,2550362,1,10,1.0,prior,4,8,10326,0,24,4
3,2550362,1,10,1.0,prior,4,8,12427,1,23,19
4,2550362,1,10,1.0,prior,4,8,13032,1,121,14


In [10]:
df_allOrdDet.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12084910 entries, 0 to 12084909
Data columns (total 11 columns):
order_id             int64
user_id              int64
order_number         int64
SucceedingOrdCnt     float64
eval_set             object
order_dow            int64
order_hour_of_day    int64
product_id           int64
reordered            int64
aisle_id             int64
department_id        int64
dtypes: float64(1), int64(9), object(1)
memory usage: 1.7 GB


In [11]:
df_allOrdDet.describe()

Unnamed: 0,order_id,user_id,order_number,SucceedingOrdCnt,order_dow,order_hour_of_day,product_id,reordered,aisle_id,department_id
count,12084910.0,12084910.0,12084910.0,12084910.0,12084910.0,12084910.0,12084910.0,12084910.0,12084910.0,12084910.0
mean,1717179.0,102998.7,26.45752,1.0,2.747434,13.52937,25517.1,0.1017412,71.01587,10.21775
std,988422.0,59430.54,22.56259,0.0,2.072485,4.225188,14220.81,0.3023077,38.07417,6.209747
min,16.0,1.0,2.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0
25%,859892.0,51590.0,10.0,1.0,1.0,10.0,13309.0,0.0,32.0,4.0
50%,1720821.0,102694.0,19.0,1.0,3.0,14.0,25636.0,0.0,81.0,11.0
75%,2575344.0,154429.0,37.0,1.0,5.0,16.0,38159.0,0.0,106.0,16.0
max,3421073.0,206209.0,99.0,1.0,6.0,23.0,49688.0,1.0,134.0,21.0


In [12]:
# Collect Order ids of succeeding 3 orders
for i in range(1,4):
    df_allOrdDet['t-{}_order_id'.format(i)] = df_allOrdDet.groupby('user_id')['order_id'].shift(i)

In [13]:
df_allOrdDet.isnull().sum()

order_id                  0
user_id                   0
order_number              0
SucceedingOrdCnt          0
eval_set                  0
order_dow                 0
order_hour_of_day         0
product_id                0
reordered                 0
aisle_id                  0
department_id             0
t-1_order_id         206209
t-2_order_id         411299
t-3_order_id         614574
dtype: int64

In [14]:
df_allOrdDet.dropna(inplace=True)

In [15]:
df_allOrdDet['SucceedingOrdCnt']=df_allOrdDet['SucceedingOrdCnt'].astype(int)

In [16]:
col = [c for c in df_allOrdDet.columns if 'order_id' in c]
for c in col:
    df_allOrdDet[c] = df_allOrdDet[c].map(int)

In [17]:
df_allOrdDet.reset_index(drop=1, inplace=True)

In [18]:
#df_allOrdDet=df_allOrdDet.merge(pd.read_pickle('input/all_data.p')[['order_id','product_id','order_hour_of_day','order_dow']],on=['order_id','product_id'],how='left')

In [19]:
df_allOrdDet.head()

Unnamed: 0,order_id,user_id,order_number,SucceedingOrdCnt,eval_set,order_dow,order_hour_of_day,product_id,reordered,aisle_id,department_id,t-1_order_id,t-2_order_id,t-3_order_id
0,2550362,1,10,1,prior,4,8,12427,1,23,19,2550362,2550362,2550362
1,2550362,1,10,1,prior,4,8,13032,1,121,14,2550362,2550362,2550362
2,2550362,1,10,1,prior,4,8,13176,0,24,4,2550362,2550362,2550362
3,2550362,1,10,1,prior,4,8,14084,0,91,16,2550362,2550362,2550362
4,2550362,1,10,1,prior,4,8,17122,0,24,4,2550362,2550362,2550362


## Combining Engineered Features
____________

#### Combining User Behavior Features

In [20]:
df_userBehavior = pd.read_pickle('input/userOrdSize.p')

In [21]:
df_userBehavior.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 206209 entries, 0 to 206208
Data columns (total 6 columns):
user_id                   206209 non-null int64
user_order_size-min       206209 non-null int64
user_order_size-max       206209 non-null int64
user_order_size-median    206209 non-null float64
user_order_size-mean      206209 non-null float64
user_order_size-std       206209 non-null float64
dtypes: float64(3), int64(3)
memory usage: 9.4 MB


In [22]:
#Downcast column datatypes to save memory
df_userBehavior=df_userBehavior.apply(pd.to_numeric,downcast='unsigned')
df_userBehavior.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 206209 entries, 0 to 206208
Data columns (total 6 columns):
user_id                   206209 non-null uint32
user_order_size-min       206209 non-null uint8
user_order_size-max       206209 non-null uint8
user_order_size-median    206209 non-null float64
user_order_size-mean      206209 non-null float64
user_order_size-std       206209 non-null float64
dtypes: float64(3), uint32(1), uint8(2)
memory usage: 5.9 MB


In [23]:
df_userBehavior.head()

Unnamed: 0,user_id,user_order_size-min,user_order_size-max,user_order_size-median,user_order_size-mean,user_order_size-std
0,1,4,9,5.5,5.9,1.523884
1,2,5,26,13.5,13.928571,5.717238
2,3,5,11,8.0,7.454545,2.161649
3,4,2,7,3.0,3.75,2.362908
4,5,5,12,10.0,9.25,3.095696


In [24]:
df_allOrdDet = df_allOrdDet.merge(df_userBehavior, on='user_id',how='left')

In [25]:
df_allOrdDet.head()

Unnamed: 0,order_id,user_id,order_number,SucceedingOrdCnt,eval_set,order_dow,order_hour_of_day,product_id,reordered,aisle_id,department_id,t-1_order_id,t-2_order_id,t-3_order_id,user_order_size-min,user_order_size-max,user_order_size-median,user_order_size-mean,user_order_size-std
0,2550362,1,10,1,prior,4,8,12427,1,23,19,2550362,2550362,2550362,4,9,5.5,5.9,1.523884
1,2550362,1,10,1,prior,4,8,13032,1,121,14,2550362,2550362,2550362,4,9,5.5,5.9,1.523884
2,2550362,1,10,1,prior,4,8,13176,0,24,4,2550362,2550362,2550362,4,9,5.5,5.9,1.523884
3,2550362,1,10,1,prior,4,8,14084,0,91,16,2550362,2550362,2550362,4,9,5.5,5.9,1.523884
4,2550362,1,10,1,prior,4,8,17122,0,24,4,2550362,2550362,2550362,4,9,5.5,5.9,1.523884


#### Combining Item Characteristics Features

In [26]:
df_itemChar = pd.read_pickle('input/hrGrp.p')
df_itemChar.head()

Unnamed: 0,product_id,order_hour_of_day,item_hour_cnt,itemHrDist
0,1,0,12,0.006224
1,1,1,12,0.006224
2,1,2,9,0.004668
3,1,3,5,0.002593
4,1,4,4,0.002075


In [27]:
pd.read_pickle('input/hrGrpUnq.p').head()

Unnamed: 0,product_id,order_hour_of_day,unique_item_hour_cnt,unqItemHrDist
0,1,0,11,0.007412
1,1,1,10,0.006739
2,1,2,9,0.006065
3,1,3,5,0.003369
4,1,4,3,0.002022


In [28]:
df_itemChar = df_itemChar.merge(pd.read_pickle('input/hrGrpUnq.p'),on=['product_id','order_hour_of_day'],how='left')

In [29]:
df_itemChar=df_itemChar.apply(pd.to_numeric,downcast='unsigned')
df_itemChar.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 777214 entries, 0 to 777213
Data columns (total 6 columns):
product_id              777214 non-null uint16
order_hour_of_day       777214 non-null uint8
item_hour_cnt           777214 non-null uint16
itemHrDist              777214 non-null float64
unique_item_hour_cnt    777214 non-null uint16
unqItemHrDist           777214 non-null float64
dtypes: float64(2), uint16(3), uint8(1)
memory usage: 23.0 MB


In [30]:
df_allOrdDet = df_allOrdDet.merge(df_itemChar,on=['product_id','order_hour_of_day'],how='left')

In [31]:
df_itemChar2 = pd.read_pickle('input/dowGrp.p')

In [32]:
df_itemChar2 = df_itemChar2.merge(pd.read_pickle('input/unqDowGrp.p'),on=['product_id','order_dow'],how='left')

In [33]:
df_itemChar2=df_itemChar2.apply(pd.to_numeric,downcast='unsigned')
df_itemChar2.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 319956 entries, 0 to 319955
Data columns (total 6 columns):
product_id             319956 non-null uint16
order_dow              319956 non-null uint8
item_dow_cnt           319956 non-null uint32
itemDowDist            319956 non-null float64
unique_item_dow_cnt    319956 non-null uint16
unqItemDowDist         319956 non-null float64
dtypes: float64(2), uint16(2), uint32(1), uint8(1)
memory usage: 10.1 MB


In [34]:
df_allOrdDet = df_allOrdDet.merge(df_itemChar2,on=['product_id','order_dow'],how='left')

In [35]:
df_allOrdDet = df_allOrdDet.merge(pd.read_pickle('input/prodComb.p'),on=['product_id'],how='left')

In [36]:
df_allOrdDet.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11470336 entries, 0 to 11470335
Data columns (total 31 columns):
order_id                  int64
user_id                   int64
order_number              int64
SucceedingOrdCnt          int32
eval_set                  object
order_dow                 int64
order_hour_of_day         int64
product_id                int64
reordered                 int64
aisle_id                  int64
department_id             int64
t-1_order_id              int64
t-2_order_id              int64
t-3_order_id              int64
user_order_size-min       uint8
user_order_size-max       uint8
user_order_size-median    float64
user_order_size-mean      float64
user_order_size-std       float64
item_hour_cnt             float64
itemHrDist                float64
unique_item_hour_cnt      float64
unqItemHrDist             float64
item_dow_cnt              float64
itemDowDist               float64
unique_item_dow_cnt       float64
unqItemDowDist            float6

In [37]:
df_allOrdDet.head()

Unnamed: 0,order_id,user_id,order_number,SucceedingOrdCnt,eval_set,order_dow,order_hour_of_day,product_id,reordered,aisle_id,...,unique_item_hour_cnt,unqItemHrDist,item_dow_cnt,itemDowDist,unique_item_dow_cnt,unqItemDowDist,prodComb_mean,prodComb_min,prodComb_max,prodComb_std
0,2550362,1,10,1,prior,4,8,12427,1,23,...,234.0,0.053843,872.0,0.130208,510.0,0.146805,8.634077,1,62,6.98822
1,2550362,1,10,1,prior,4,8,13032,1,121,...,187.0,0.063888,556.0,0.142418,352.0,0.143673,9.492694,1,100,7.732432
2,2550362,1,10,1,prior,4,8,13176,0,24,...,13480.0,0.054178,46358.0,0.117383,24169.0,0.129753,13.651778,1,145,8.621557
3,2550362,1,10,1,prior,4,8,14084,0,91,...,557.0,0.052448,1932.0,0.11724,998.0,0.126042,13.867454,1,91,8.472359
4,2550362,1,10,1,prior,4,8,17122,0,24,...,685.0,0.062437,1744.0,0.119863,1204.0,0.131255,11.989662,1,70,8.9415


#### Combining User Product Interaction Features

In [38]:
df_totalBuys=pd.read_pickle('input/totalbuys.p')

In [39]:
df_totalBuys=df_totalBuys.apply(pd.to_numeric,downcast='unsigned')
df_totalBuys.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12084910 entries, 0 to 12084909
Data columns (total 5 columns):
user_id            uint32
order_id           uint32
product_id         uint16
total_buy          uint8
total_buy_ratio    float64
dtypes: float64(1), uint16(1), uint32(2), uint8(1)
memory usage: 311.2 MB


In [40]:
df_allOrdDet = df_allOrdDet.merge(df_totalBuys,on=['user_id','product_id'],how='left')

In [41]:
prdOrdPoss=pd.read_pickle('input/prdOrdPoss.p')

In [42]:
prdOrdPoss=prdOrdPoss.apply(pd.to_numeric,downcast='unsigned')
prdOrdPoss.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12084910 entries, 0 to 12084909
Data columns (total 7 columns):
user_id        uint32
product_id     uint16
count          uint8
minPrdOrd      uint8
maxOrd         uint8
possibility    uint8
ordPoss        float64
dtypes: float64(1), uint16(1), uint32(1), uint8(4)
memory usage: 299.7 MB


In [43]:
df_allOrdDet = df_allOrdDet.merge(prdOrdPoss,on=['user_id','product_id'],how='left')

In [44]:
df_allOrdDet.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11470336 entries, 0 to 11470335
Data columns (total 39 columns):
order_id_x                int64
user_id                   int64
order_number              int64
SucceedingOrdCnt          int32
eval_set                  object
order_dow                 int64
order_hour_of_day         int64
product_id                int64
reordered                 int64
aisle_id                  int64
department_id             int64
t-1_order_id              int64
t-2_order_id              int64
t-3_order_id              int64
user_order_size-min       uint8
user_order_size-max       uint8
user_order_size-median    float64
user_order_size-mean      float64
user_order_size-std       float64
item_hour_cnt             float64
itemHrDist                float64
unique_item_hour_cnt      float64
unqItemHrDist             float64
item_dow_cnt              float64
itemDowDist               float64
unique_item_dow_cnt       float64
unqItemDowDist            float6

In [45]:
df_allOrdDet=df_allOrdDet.drop(['order_id_y'],axis=1)
df_allOrdDet.rename(columns={'order_id_x':'order_id'},inplace=True)

In [46]:
df_allOrdDet.head()

Unnamed: 0,order_id,user_id,order_number,SucceedingOrdCnt,eval_set,order_dow,order_hour_of_day,product_id,reordered,aisle_id,...,prodComb_min,prodComb_max,prodComb_std,total_buy,total_buy_ratio,count,minPrdOrd,maxOrd,possibility,ordPoss
0,2550362,1,10,1,prior,4,8,12427,1,23,...,1,62,6.98822,9,1.0,9,1,9,9,1.0
1,2550362,1,10,1,prior,4,8,13032,1,121,...,1,100,7.732432,2,0.222222,2,2,9,8,0.25
2,2550362,1,10,1,prior,4,8,13176,0,24,...,1,145,8.621557,2,2.0,2,2,9,8,0.25
3,2550362,1,10,1,prior,4,8,14084,0,91,...,1,91,8.472359,1,inf,1,1,9,9,0.111111
4,2550362,1,10,1,prior,4,8,17122,0,24,...,1,70,8.9415,1,0.25,1,5,9,5,0.2


#### Combining Day and Hour Features

In [47]:
df_dowOrdTrd = pd.read_pickle('input/dowOrdTrd.p')

In [48]:
#df_dowOrdTrd=df_dowOrdTrd.apply(pd.to_numeric,downcast='unsigned')
#df_dowOrdTrd.info(memory_usage='deep')

In [49]:
df_allOrdDet = df_allOrdDet.merge(df_dowOrdTrd,on='order_dow',how='left')

In [50]:
df_hourOrdTrd=pd.read_pickle('input/hourOrdTrd.p').reset_index()

In [51]:
#df_hourOrdTrd=df_hourOrdTrd.apply(pd.to_numeric,downcast='unsigned')
#df_hourOrdTrd.info(memory_usage='deep')

In [52]:
df_allOrdDet = df_allOrdDet.merge(df_hourOrdTrd,on='order_hour_of_day',how='left')

### Export All Engineered Features

In [53]:
df_allOrdDet.isnull().sum()

order_id                      0
user_id                       0
order_number                  0
SucceedingOrdCnt              0
eval_set                      0
order_dow                     0
order_hour_of_day             0
product_id                    0
reordered                     0
aisle_id                      0
department_id                 0
t-1_order_id                  0
t-2_order_id                  0
t-3_order_id                  0
user_order_size-min           0
user_order_size-max           0
user_order_size-median        0
user_order_size-mean          0
user_order_size-std           0
item_hour_cnt             86756
itemHrDist                86756
unique_item_hour_cnt      86756
unqItemHrDist             86756
item_dow_cnt              17474
itemDowDist               17474
unique_item_dow_cnt       17474
unqItemDowDist            17474
prodComb_mean                 0
prodComb_min                  0
prodComb_max                  0
prodComb_std                245
total_bu

In [54]:
len(df_allOrdDet.index)

11470336

In [56]:
df_allOrdDet.dropna(inplace=True)

In [57]:
df_allOrdDet.to_pickle('input/all_Eng_Features.p')

In [58]:
df_allOrdDet['eval_set'].unique()

array(['prior'], dtype=object)

In [59]:
df_allOrdDet['SucceedingOrdCnt'].describe()

count    11375135.0
mean            1.0
std             0.0
min             1.0
25%             1.0
50%             1.0
75%             1.0
max             1.0
Name: SucceedingOrdCnt, dtype: float64

In [60]:
df_allOrdDet['reordered'].describe()

count    1.137514e+07
mean     1.005957e-01
std      3.007927e-01
min      0.000000e+00
25%      0.000000e+00
50%      0.000000e+00
75%      0.000000e+00
max      1.000000e+00
Name: reordered, dtype: float64