In [1]:
#import data science libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

#data types
import collections

#import general libraries
import gc
import time
import random
import datetime

#deep learning 
import tensorflow as tf

  from ._conv import register_converters as _register_converters


In [3]:
products = pd.read_csv('./data/products.csv')
print(products.shape)
products.head()

(49688, 4)


Unnamed: 0,product_id,product_name,aisle_id,department_id
0,1,Chocolate Sandwich Cookies,61,19
1,2,All-Seasons Salt,104,13
2,3,Robust Golden Unsweetened Oolong Tea,94,7
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1
4,5,Green Chile Anytime Sauce,5,13


In [4]:
aisles = pd.read_csv('./data/aisles.csv')
departments = pd.read_csv('./data/departments.csv')

In [5]:
orders = pd.read_csv('./data/orders.csv')
print(orders.shape)

(3421083, 7)


In [12]:
orders.head(13)

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0
5,3367565,1,prior,6,2,7,19.0
6,550135,1,prior,7,1,9,20.0
7,3108588,1,prior,8,1,14,14.0
8,2295261,1,prior,9,1,16,0.0
9,2550362,1,prior,10,4,8,30.0


In [7]:
train_eval = orders[orders.eval_set=='train']

In [8]:
train_eval.shape

(131209, 7)

In [9]:
test_eval = orders[orders.eval_set=='test']

In [10]:
test_eval.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
38,2774568,3,test,13,5,15,11.0
44,329954,4,test,6,3,12,30.0
53,1528013,6,test,4,3,16,22.0
96,1376945,11,test,8,6,11,8.0
102,1356845,12,test,6,1,20,30.0


In [11]:
orders[orders.order_id == 17]

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
611292,17,36855,test,5,6,15,1.0


In [13]:
orders['cum_days_since_prior_order'] = orders.groupby(['user_id'])['days_since_prior_order'].cumsum()
orders['cum_days_since_prior_order'] = orders['cum_days_since_prior_order'].fillna(0)

In [15]:
orders_products_train = pd.read_csv('./data/order_products__train.csv')
print(orders_products_train.shape)
orders_products_train.head(5)

(1384617, 4)


Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,1,49302,1,1
1,1,11109,2,1
2,1,10246,3,0
3,1,49683,4,0
4,1,43633,5,1


In [16]:
orders_products_train[orders_products_train.order_id == 2774568]

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered


In [17]:
orders_products_train.isnull().sum()

order_id             0
product_id           0
add_to_cart_order    0
reordered            0
dtype: int64

In [18]:
orders_products_prior = pd.read_csv('./data/order_products__prior.csv')
print(orders_products_prior.shape)
orders_products_prior.head(5)

(32434489, 4)


Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,2,33120,1,1
1,2,28985,2,1
2,2,9327,3,0
3,2,45918,4,1
4,2,30035,5,0


In [19]:
orders_products_prior[orders_products_prior.order_id == 2774568]

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered


### Create master df 

In [20]:
order_products_prior_df = pd.read_csv('data/order_products__prior.csv', engine='c',
                                          dtype={'order_id': np.int32, 'product_id': np.int32,
                                                 'add_to_cart_order': np.int16, 'reordered': np.int8})

order_products_train_df = pd.read_csv('data/order_products__train.csv', engine='c',
                                      dtype={'order_id': np.int32, 'product_id': np.int32,
                                             'add_to_cart_order': np.int16, 'reordered': np.int8})

orders_df = pd.read_csv('data/orders.csv', engine='c',
                        dtype={'order_id': np.int32, 'user_id': np.int32, 'order_number': np.int32,
                               'order_dow': np.int8, 'order_hour_of_day': np.int8,
                               'days_since_prior_order': np.float16})

products_df = pd.read_csv("data/products.csv", engine='c')

df_train = orders_df.merge(order_products_train_df, how='inner', on='order_id')
df_train = df_train.merge(products_df, how='inner', on='product_id')
df_train.sort_values(['user_id', 'order_number'], axis=0, inplace=True)

df_prior = orders_df.merge(order_products_prior_df, how='inner', on='order_id')
df_prior = df_prior.merge(products_df, how='inner', on='product_id')
df_prior.sort_values(['user_id', 'order_number'], axis=0, inplace=True)

df = pd.concat([df_prior, df_train])

In [22]:
print(df.shape)
df.head()

(33819106, 13)


Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id
0,2539329,1,prior,1,2,8,,196,1,0,Soda,77,7
35791,2539329,1,prior,1,2,8,,14084,2,0,Organic Unsweetened Vanilla Almond Milk,91,16
51726,2539329,1,prior,1,2,8,,12427,3,0,Original Beef Jerky,23,19
58202,2539329,1,prior,1,2,8,,26088,4,0,Aged White Cheddar Popcorn,23,19
60725,2539329,1,prior,1,2,8,,26405,5,0,XL Pick-A-Size Paper Towel Rolls,54,17


In [None]:
df.to_csv('./data/df_products_orders_all.csv', index=False)

### Take Sample of dataset

In [109]:
orders_products_prior_sample = orders_products_prior[orders_products_prior.order_id <= 1000000]
orders_products_prior_sample.shape

(9473192, 4)

### Merge order_products and orders dataset 

In [134]:
# MEGA SAMPLE DATASET
sample_join_orders = orders_products_prior_sample.set_index('order_id').join(orders.set_index('order_id'), how='left').reset_index()

In [135]:
print(sample_join_orders.shape)
sample_join_orders.head()

(9473192, 11)


Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,cum_days_since_prior_order
0,2,33120,1,1,202279,prior,3,5,9,8.0,35054882.0
1,2,28985,2,1,202279,prior,3,5,9,8.0,35054882.0
2,2,9327,3,0,202279,prior,3,5,9,8.0,35054882.0
3,2,45918,4,1,202279,prior,3,5,9,8.0,35054882.0
4,2,30035,5,0,202279,prior,3,5,9,8.0,35054882.0


In [268]:
sample_join_orders[sample_join_orders.user_id == 36855]

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,cum_days_since_prior_order
2225204,234692,21709,1,1,36855,prior,2,0,9,30.0,6381669.0
2225205,234692,13107,2,0,36855,prior,2,0,9,30.0,6381669.0
2225206,234692,38777,3,0,36855,prior,2,0,9,30.0,6381669.0
2225207,234692,48896,4,0,36855,prior,2,0,9,30.0,6381669.0
2225208,234692,1283,5,0,36855,prior,2,0,9,30.0,6381669.0
2225209,234692,13535,6,0,36855,prior,2,0,9,30.0,6381669.0
2225210,234692,47766,7,1,36855,prior,2,0,9,30.0,6381669.0
2225211,234692,39928,8,0,36855,prior,2,0,9,30.0,6381669.0
2225212,234692,31964,9,0,36855,prior,2,0,9,30.0,6381669.0
8514419,898818,47766,1,0,36855,prior,1,1,13,,0.0


In [13]:
last_order_dict = (sample_join_orders
                   .groupby('order_id')
                   .agg({'add_to_cart_order':'max'})
                   .to_dict()['add_to_cart_order'])

In [112]:
sample_join_orders['last_order'] = sample_join_orders['order_id'].map(last_order_dict)
sample_join_orders['last_order_flag'] = (np.where(sample_join_orders['last_order'] == sample_join_orders['add_to_cart_order'],
                                         1,0))

In [113]:
print(sample_join_orders.shape)
sample_join_orders.head(10)

(9473192, 12)


Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,last_order,last_order_flag
0,2,33120,1,1,202279,prior,3,5,9,8.0,9,0
1,2,28985,2,1,202279,prior,3,5,9,8.0,9,0
2,2,9327,3,0,202279,prior,3,5,9,8.0,9,0
3,2,45918,4,1,202279,prior,3,5,9,8.0,9,0
4,2,30035,5,0,202279,prior,3,5,9,8.0,9,0
5,2,17794,6,1,202279,prior,3,5,9,8.0,9,0
6,2,40141,7,1,202279,prior,3,5,9,8.0,9,0
7,2,1819,8,1,202279,prior,3,5,9,8.0,9,0
8,2,43668,9,0,202279,prior,3,5,9,8.0,9,1
9,3,33754,1,1,205970,prior,16,5,17,12.0,8,0


In [114]:
orders_products = sample_join_orders.merge(products, on='product_id', how='outer')
orders_products_aisles = orders_products.merge(aisles, on='aisle_id', how='outer')
orders_products_departments = orders_products_aisles.merge(departments, on='department_id', how='outer')

In [115]:
columns_df_product_embeddings = ['user_id','order_id','order_number','add_to_cart_order','product_id','product_name']

In [116]:
sorted_orders_products_departments = orders_products_departments.sort_values(['user_id',
                                                                              'order_number',
                                                                              'order_id',
                                                                              'add_to_cart_order'])

In [117]:
sorted_orders_products_departments.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,last_order,last_order_flag,product_name,aisle_id,department_id,aisle,department
7178054,473747.0,196,1.0,1.0,1.0,prior,3.0,3.0,12.0,21.0,5.0,0.0,Soda,77,7,soft drinks,beverages
6250952,473747.0,12427,2.0,1.0,1.0,prior,3.0,3.0,12.0,21.0,5.0,0.0,Original Beef Jerky,23,19,popcorn jerky,snacks
6302341,473747.0,10258,3.0,1.0,1.0,prior,3.0,3.0,12.0,21.0,5.0,0.0,Pistachios,117,19,nuts seeds dried fruit,snacks
1177272,473747.0,25133,4.0,0.0,1.0,prior,3.0,3.0,12.0,21.0,5.0,0.0,Organic String Cheese,21,16,packaged cheese,dairy eggs
4637501,473747.0,30450,5.0,0.0,1.0,prior,3.0,3.0,12.0,21.0,5.0,1.0,Creamy Almond Butter,88,13,spreads,pantry


In [118]:
#df_product_embeddings = sorted_orders_products_departments[columns_df_product_embeddings]
#df_product_embeddings.head(20)

In [212]:
#intermediate dataset for product couts per user per order
basket_stats = (sorted_orders_products_departments
                    .groupby(['user_id', 'order_id'])
                    .agg({'product_id':'count','department_id':'nunique', 'aisle_id':'nunique'})
                    .reset_index()
                    .rename(columns={'product_id':'count_products',
                                     'department_id':'count_departments'
                                     ,'aisle_id':'count_aisles'}))

# user_id basket_stats
user_id_basket_stats = (basket_stats
                            .groupby('user_id')
                            .agg({'count_products':['mean','max','min'],
                                'count_departments':['mean','max','min'], 
                                'count_aisles':['mean','max','min']}))

In [213]:
#Number of items reordered per user
items_reordered_per_user = (sorted_orders_products_departments
                                .groupby('user_id')
                                .agg({'reordered':['sum', 'mean']}))

#number of orders per user
number_of_orders_per_user = (sorted_orders_products_departments
                                .groupby('user_id')
                                .agg({'order_id':'count'})
                                .rename(columns={'order_id':'count_orders'})
                                .reset_index())

#feature engineering from orders dataset
days_between_orders = (orders
                           .groupby('user_id')
                           .agg({'days_since_prior_order': ['min','max','mean','median']}))
customer_lifetime = (orders
                        .groupby('user_id')
                        .agg({'cum_days_since_prior_order': 'max'})
                        .rename(columns={'cum_days_since_prior_order':'max_cum_days_since_prior_order'})
                        .reset_index())

In [214]:
list_groupby_features = [user_id_basket_stats,
                         items_reordered_per_user, 
                         days_between_orders]

for df in list_groupby_features:
    df.columns = pd.Index([e[0] + "_" + e[1].upper() for e in df.columns.tolist()])

In [218]:
#build user_df
users_df = user_id_basket_stats.reset_index().merge(items_reordered_per_user.reset_index(), on='user_id')
users_df = users_df.merge(days_between_orders.reset_index(), on='user_id')
users_df = users_df.merge(number_of_orders_per_user, on='user_id')
users_df = users_df.merge(customer_lifetime, on='user_id')

In [219]:
users_df.head()

Unnamed: 0,user_id,count_products_MEAN,count_products_MAX,count_products_MIN,count_departments_MEAN,count_departments_MAX,count_departments_MIN,count_aisles_MEAN,count_aisles_MAX,count_aisles_MIN,reordered_SUM,reordered_MEAN,days_since_prior_order_MIN,days_since_prior_order_MAX,days_since_prior_order_MEAN,days_since_prior_order_MEDIAN,count_orders,max_cum_days_since_prior_order
0,1.0,6.0,8,5,4.0,4,4,5.0,5,5,13.0,0.722222,0.0,30.0,19.0,19.5,18,190.0
1,2.0,15.0,16,13,6.666667,8,5,9.0,11,7,20.0,0.444444,3.0,30.0,16.285714,13.0,45,418.0
2,3.0,6.666667,9,5,4.0,5,2,5.666667,9,3,13.0,0.65,7.0,21.0,12.0,11.0,20,562.0
3,4.0,4.5,7,2,3.5,5,2,3.5,5,2,1.0,0.111111,0.0,30.0,17.0,19.0,9,647.0
4,5.0,12.0,12,12,5.0,5,5,10.0,10,10,8.0,0.666667,6.0,19.0,11.5,10.5,12,693.0


In [220]:
#build vanilla classifier
users_df.isnull().sum()

user_id                           0
count_products_MEAN               0
count_products_MAX                0
count_products_MIN                0
count_departments_MEAN            0
count_departments_MAX             0
count_departments_MIN             0
count_aisles_MEAN                 0
count_aisles_MAX                  0
count_aisles_MIN                  0
reordered_SUM                     0
reordered_MEAN                    0
days_since_prior_order_MIN        0
days_since_prior_order_MAX        0
days_since_prior_order_MEAN       0
days_since_prior_order_MEDIAN     0
count_orders                      0
max_cum_days_since_prior_order    0
dtype: int64

In [None]:
#similarity between user and item (both being vectors)