In [103]:
#import data science libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

#data types
import collections

#import general libraries
import gc
import time
import random
import datetime

#deep learning 
import tensorflow as tf

In [104]:
products = pd.read_csv('../data/products.csv')
print(products.shape)
products.head()

(49688, 4)


Unnamed: 0,product_id,product_name,aisle_id,department_id
0,1,Chocolate Sandwich Cookies,61,19
1,2,All-Seasons Salt,104,13
2,3,Robust Golden Unsweetened Oolong Tea,94,7
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1
4,5,Green Chile Anytime Sauce,5,13


In [105]:
aisles = pd.read_csv('../data/aisles.csv')
departments = pd.read_csv('../data/departments.csv')

In [106]:
orders = pd.read_csv('../data/orders.csv')
print(orders.shape)

(3421083, 7)


In [133]:
orders.head(12)

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,cum_days_since_prior_order
0,2539329,1,prior,1,2,8,,0.0
1,2398795,1,prior,2,3,7,15.0,15.0
2,473747,1,prior,3,3,12,21.0,36.0
3,2254736,1,prior,4,4,7,29.0,65.0
4,431534,1,prior,5,4,15,28.0,93.0
5,3367565,1,prior,6,2,7,19.0,112.0
6,550135,1,prior,7,1,9,20.0,132.0
7,3108588,1,prior,8,1,14,14.0,146.0
8,2295261,1,prior,9,1,16,0.0,146.0
9,2550362,1,prior,10,4,8,30.0,176.0


In [132]:
orders['cum_days_since_prior_order'] = orders.days_since_prior_order.cumsum()
orders['cum_days_since_prior_order'] = orders['cum_days_since_prior_order'].fillna(0)

In [108]:
orders_products_prior = pd.read_csv('../data/order_products__prior.csv')
print(orders_products_prior.shape)
orders_products_prior.head(5)

(32434489, 4)


Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,2,33120,1,1
1,2,28985,2,1
2,2,9327,3,0
3,2,45918,4,1
4,2,30035,5,0


In [109]:
orders_products_prior_sample = orders_products_prior[orders_products_prior.order_id <= 1000000]
orders_products_prior_sample.shape

(9473192, 4)

### Merge order_products and orders dataset 

In [134]:
# MEGA DATASET
sample_join_orders = orders_products_prior_sample.set_index('order_id').join(orders.set_index('order_id'), how='left').reset_index()

In [135]:
print(sample_join_orders.shape)
sample_join_orders.head()

(9473192, 11)


Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,cum_days_since_prior_order
0,2,33120,1,1,202279,prior,3,5,9,8.0,35054882.0
1,2,28985,2,1,202279,prior,3,5,9,8.0,35054882.0
2,2,9327,3,0,202279,prior,3,5,9,8.0,35054882.0
3,2,45918,4,1,202279,prior,3,5,9,8.0,35054882.0
4,2,30035,5,0,202279,prior,3,5,9,8.0,35054882.0


In [13]:
last_order_dict = (sample_join_orders
                   .groupby('order_id')
                   .agg({'add_to_cart_order':'max'})
                   .to_dict()['add_to_cart_order'])

In [112]:
sample_join_orders['last_order'] = sample_join_orders['order_id'].map(last_order_dict)
sample_join_orders['last_order_flag'] = (np.where(sample_join_orders['last_order'] == sample_join_orders['add_to_cart_order'],
                                         1,0))

In [113]:
print(sample_join_orders.shape)
sample_join_orders.head(10)

(9473192, 12)


Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,last_order,last_order_flag
0,2,33120,1,1,202279,prior,3,5,9,8.0,9,0
1,2,28985,2,1,202279,prior,3,5,9,8.0,9,0
2,2,9327,3,0,202279,prior,3,5,9,8.0,9,0
3,2,45918,4,1,202279,prior,3,5,9,8.0,9,0
4,2,30035,5,0,202279,prior,3,5,9,8.0,9,0
5,2,17794,6,1,202279,prior,3,5,9,8.0,9,0
6,2,40141,7,1,202279,prior,3,5,9,8.0,9,0
7,2,1819,8,1,202279,prior,3,5,9,8.0,9,0
8,2,43668,9,0,202279,prior,3,5,9,8.0,9,1
9,3,33754,1,1,205970,prior,16,5,17,12.0,8,0


In [114]:
orders_products = sample_join_orders.merge(products, on='product_id', how='outer')
orders_products_aisles = orders_products.merge(aisles, on='aisle_id', how='outer')
orders_products_departments = orders_products_aisles.merge(departments, on='department_id', how='outer')

In [115]:
columns_df_product_embeddings = ['user_id','order_id','order_number','add_to_cart_order','product_id','product_name']

In [116]:
sorted_orders_products_departments = orders_products_departments.sort_values(['user_id',
                                                                              'order_number',
                                                                              'order_id',
                                                                              'add_to_cart_order'])

In [117]:
sorted_orders_products_departments.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,last_order,last_order_flag,product_name,aisle_id,department_id,aisle,department
7178054,473747.0,196,1.0,1.0,1.0,prior,3.0,3.0,12.0,21.0,5.0,0.0,Soda,77,7,soft drinks,beverages
6250952,473747.0,12427,2.0,1.0,1.0,prior,3.0,3.0,12.0,21.0,5.0,0.0,Original Beef Jerky,23,19,popcorn jerky,snacks
6302341,473747.0,10258,3.0,1.0,1.0,prior,3.0,3.0,12.0,21.0,5.0,0.0,Pistachios,117,19,nuts seeds dried fruit,snacks
1177272,473747.0,25133,4.0,0.0,1.0,prior,3.0,3.0,12.0,21.0,5.0,0.0,Organic String Cheese,21,16,packaged cheese,dairy eggs
4637501,473747.0,30450,5.0,0.0,1.0,prior,3.0,3.0,12.0,21.0,5.0,1.0,Creamy Almond Butter,88,13,spreads,pantry


In [118]:
#df_product_embeddings = sorted_orders_products_departments[columns_df_product_embeddings]
#df_product_embeddings.head(20)

In [None]:
# num_aggregations_from_product_level = {
#     'reordered': ['mean', 'median', 'sum'], #products reordered by user
#     'order_id': ['count'], #products reordered by user
#     'product_id': ['mean', 'median', 'count'], #products reordered by user
#     'department_id': ['mean', 'median', 'sum'], #products reordered by user

# }

In [158]:
#intermediate dataset for product couts per user per order
basket_stats = (sorted_orders_products_departments
                    .groupby(['user_id', 'order_id'])
                    .agg({'product_id':'count','department_id':'nunique', 'aisle_id':'nunique'})
                    .reset_index()
                    .rename(columns={'product_id':'count_products',
                                     'department_id':'count_departments'
                                     ,'aisle_id':'count_aisles'}))

# user_id basket_stats
user_id_basket_stats = (basket_stats
                            .groupby('user_id').agg({'count_products':['mean','max','min'],
                                                     'count_departments':['mean','max','min'], 
                                                     'count_aisles':['mean','max','min']}))

In [159]:
#Number of items reordered per user
items_reordered_per_user = (sorted_orders_products_departments
                                .groupby('user_id')
                                .agg({'reordered':['sum', 'mean']}))

#number of orders per user
number_of_orders_per_user = (sorted_orders_products_departments
                                .groupby('user_id')
                                .agg({'order_id':'count'}))

#feature engineering from orders dataset
days_between_orders = (orders
                           .groupby('user_id')
                           .agg({'days_since_prior_order': ['min','max','mean','median']}))
customer_lifetime = (orders
                           .groupby('user_id')
                           .agg({'cum_days_since_prior_order': 'max'}))

In [None]:
list_groupby_features = [basket_stats,
                         user_id_basket_stats,
                         items_reordered_per_user, 
                         days_between_orders]

for df in list_groupby_features:
    df.columns = pd.Index([e[0] + "_" + e[1].upper() for e in df.columns.tolist()])


In [156]:
days_between_orders.head()

Unnamed: 0_level_0,days_since_prior_order_MIN,days_since_prior_order_MAX,days_since_prior_order_MEAN,days_since_prior_order_MEDIAN
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0.0,30.0,19.0,19.5
2,3.0,30.0,16.285714,13.0
3,7.0,21.0,12.0,11.0
4,0.0,30.0,17.0,19.0
5,6.0,19.0,11.5,10.5


In [157]:
user_id_basket_stats.head()

Unnamed: 0_level_0,count_products,count_products,count_products,count_departments,count_departments,count_departments,count_aisles,count_aisles,count_aisles
Unnamed: 0_level_1,mean,max,min,mean,max,min,mean,max,min
user_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
1.0,6.0,8,5,4.0,4,4,5.0,5,5
2.0,15.0,16,13,6.666667,8,5,9.0,11,7
3.0,6.666667,9,5,4.0,5,2,5.666667,9,3
4.0,4.5,7,2,3.5,5,2,3.5,5,2
5.0,12.0,12,12,5.0,5,5,10.0,10,10


In [None]:
#build vanilla classifier

In [None]:
#similarity between user and item (both being vectors)