### Sample Keras [Implementation](https://www.kaggle.com/nikhithn/sample-keras-nn-implementation/code) 

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import csv
import subprocess
from tqdm import tqdm

from keras_tqdm import TQDMNotebookCallback
from keras.models import Sequential
from keras.layers import Dense
from keras import optimizers

import sklearn

import gc

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_colwidth', -1)

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))


In [3]:
trainingOrderItems = pd.read_csv("data/order_products__train.csv")
priorOrderItems = pd.read_csv("data/order_products__prior.csv")

orders = pd.read_csv("data/orders.csv")

aisles = pd.read_csv("data/aisles.csv")
departments = pd.read_csv("data/departments.csv")
products = pd.read_csv("data/products.csv")

In [4]:
priorOrderItems.shape

(32434489, 4)

In [5]:
priorOrderItems.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,2,33120,1,1
1,2,28985,2,1
2,2,9327,3,0
3,2,45918,4,1
4,2,30035,5,0


In [6]:
orders.shape

(3421083, 7)

In [7]:
#get reorder frequency of product ids + count of orders product id  shows up in
product_reorders = (priorOrderItems.groupby(['product_id'])
                   .agg({'order_id':'count','reordered':'mean'})
                   .rename(columns={'order_id':'product_total_orders', 
                                    'reordered':'product_reorder_rate'}))

In [8]:
product_reorders.head()

Unnamed: 0_level_0,product_total_orders,product_reorder_rate
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1852,0.613391
2,90,0.133333
3,277,0.732852
4,329,0.446809
5,15,0.6


In [9]:
#get basket_size per order
order_group = priorOrderItems.groupby("order_id").agg({'add_to_cart_order':'max'}).rename(columns={'add_to_cart_order':'basket_size'})
priorOrderItems = priorOrderItems.merge(order_group, left_on='order_id', right_index=True)

del order_group
gc.collect()

7

In [10]:
priorOrderItems.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,basket_size
0,2,33120,1,1,9
1,2,28985,2,1,9
2,2,9327,3,0,9
3,2,45918,4,1,9
4,2,30035,5,0,9


In [11]:
# Let's now look at products by user
orderItems = orders.merge(priorOrderItems, on='order_id')
orderItems['user_product_id'] = 1000000*orderItems['user_id']+orderItems['product_id']

userProducts = orderItems[orderItems['eval_set']=='prior']
userProducts = userProducts[['user_id','order_dow','order_hour_of_day',
                             'days_since_prior_order','product_id',
                             'add_to_cart_order','user_product_id', 
                             'basket_size', 'order_number', 'order_id']]

userProducts['orders'] = userProducts['user_id']
userAvgProducts = userProducts.groupby('user_product_id').aggregate({'user_id':'mean',
                                                                     'order_dow':'mean',
                                                                     'order_hour_of_day':'mean',
                                                                    'days_since_prior_order':'mean', 
                                                                     'orders':'count',
                                                                    'product_id':'mean',
                                                                     'add_to_cart_order':'mean',
                                                                     'basket_size':'mean'})

del orderItems
gc.collect()

14

In [12]:
userAvgProducts.head()

Unnamed: 0_level_0,user_id,order_dow,order_hour_of_day,days_since_prior_order,orders,product_id,add_to_cart_order,basket_size
user_product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1000196,1,2.5,10.3,19.555556,10,196,1.4,5.9
1010258,1,2.555556,10.555556,19.555556,9,10258,3.333333,6.0
1010326,1,4.0,15.0,28.0,1,10326,5.0,8.0
1012427,1,2.5,10.3,19.555556,10,12427,3.3,5.9
1013032,1,2.666667,8.0,21.666667,3,13032,6.333333,6.666667


In [13]:
lastOrderId = userProducts[userProducts.groupby(['user_product_id'])['order_number'].transform(max) == userProducts['order_number']]
lastOrderId = lastOrderId[['user_product_id','order_id']]
lastOrderId.rename(columns={'order_id':'last_order_id'}, inplace=True)
userAvgProducts = userAvgProducts.merge(lastOrderId, right_on='user_product_id', left_index=True)
userAvgProducts.set_index('user_product_id', inplace=True)

del userProducts
del lastOrderId
gc.collect()

userAvgProducts.head()

Unnamed: 0_level_0,user_id,order_dow,order_hour_of_day,days_since_prior_order,orders,product_id,add_to_cart_order,basket_size,last_order_id
user_product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1000196,1,2.5,10.3,19.555556,10,196,1.4,5.9,2550362
1010258,1,2.555556,10.555556,19.555556,9,10258,3.333333,6.0,2550362
1010326,1,4.0,15.0,28.0,1,10326,5.0,8.0,431534
1012427,1,2.5,10.3,19.555556,10,12427,3.3,5.9,2550362
1013032,1,2.666667,8.0,21.666667,3,13032,6.333333,6.666667,2550362


In [14]:
user_distinct_products = (userAvgProducts.groupby('user_id')
                          .agg({'product_id':'count'})
                          .rename(columns={'product_id':'distinct_products'}))
                          
user_distinct_products['user_reordered_products'] = (userAvgProducts[userAvgProducts['orders']>1]
                                                         .groupby('user_id')
                                                         .count()['product_id'])
user_distinct_products.head()

Unnamed: 0_level_0,distinct_products,user_reordered_products
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,18,10.0
2,102,37.0
3,33,19.0
4,17,1.0
5,23,8.0


In [15]:
userAvgProducts = user_distinct_products.merge(userAvgProducts.reset_index(),on='user_id')
userAvgProducts['user_reorder_rate'] = userAvgProducts['user_reordered_products']/userAvgProducts['distinct_products']
userAvgProducts.head()

del user_distinct_products
gc.collect()

97

In [16]:
userAvgProducts.shape

(13307953, 13)

In [17]:
features = userAvgProducts.set_index('user_product_id')

del userAvgProducts
gc.collect()

features.head()

Unnamed: 0_level_0,user_id,distinct_products,user_reordered_products,order_dow,order_hour_of_day,days_since_prior_order,orders,product_id,add_to_cart_order,basket_size,last_order_id,user_reorder_rate
user_product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1000196,1,18,10.0,2.5,10.3,19.555556,10,196,1.4,5.9,2550362,0.555556
1010258,1,18,10.0,2.555556,10.555556,19.555556,9,10258,3.333333,6.0,2550362,0.555556
1010326,1,18,10.0,4.0,15.0,28.0,1,10326,5.0,8.0,431534,0.555556
1012427,1,18,10.0,2.5,10.3,19.555556,10,12427,3.3,5.9,2550362,0.555556
1013032,1,18,10.0,2.666667,8.0,21.666667,3,13032,6.333333,6.666667,2550362,0.555556


In [18]:
features.columns = ['user_id','user_distinct_products','user_reordered_products',
                    'user_product_dow','user_product_hod',
                    'user_product_dsp','user_product_orders','product_id',
                    'user_product_addCart', 'user_avg_basket_size',
                    'user_product_last_order_id','user_reorder_rate']

In [19]:
print(features.shape)
features.head()

(13307953, 12)


Unnamed: 0_level_0,user_id,user_distinct_products,user_reordered_products,user_product_dow,user_product_hod,user_product_dsp,user_product_orders,product_id,user_product_addCart,user_avg_basket_size,user_product_last_order_id,user_reorder_rate
user_product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1000196,1,18,10.0,2.5,10.3,19.555556,10,196,1.4,5.9,2550362,0.555556
1010258,1,18,10.0,2.555556,10.555556,19.555556,9,10258,3.333333,6.0,2550362,0.555556
1010326,1,18,10.0,4.0,15.0,28.0,1,10326,5.0,8.0,431534,0.555556
1012427,1,18,10.0,2.5,10.3,19.555556,10,12427,3.3,5.9,2550362,0.555556
1013032,1,18,10.0,2.666667,8.0,21.666667,3,13032,6.333333,6.666667,2550362,0.555556


In [20]:
userOrders = orders[orders['eval_set']=='prior'].groupby('user_id').aggregate({'order_number':'max'})
features = userOrders.reset_index().merge(features.reset_index(), on='user_id')

del userOrders
gc.collect()

features.head()

Unnamed: 0,user_id,order_number,user_product_id,user_distinct_products,user_reordered_products,user_product_dow,user_product_hod,user_product_dsp,user_product_orders,product_id,user_product_addCart,user_avg_basket_size,user_product_last_order_id,user_reorder_rate
0,1,10,1000196,18,10.0,2.5,10.3,19.555556,10,196,1.4,5.9,2550362,0.555556
1,1,10,1010258,18,10.0,2.555556,10.555556,19.555556,9,10258,3.333333,6.0,2550362,0.555556
2,1,10,1010326,18,10.0,4.0,15.0,28.0,1,10326,5.0,8.0,431534,0.555556
3,1,10,1012427,18,10.0,2.5,10.3,19.555556,10,12427,3.3,5.9,2550362,0.555556
4,1,10,1013032,18,10.0,2.666667,8.0,21.666667,3,13032,6.333333,6.666667,2550362,0.555556


In [21]:
features['order_number'] = features['user_product_orders']/features['order_number']
features.rename(columns={'order_number': 'user_product_reorder_rate'}, inplace=True)
features = features.set_index('user_product_id')

In [22]:
features=features.merge(product_reorders, left_on='product_id',right_index=True)
features['product_reorders'] = features['product_reorder_rate'] * features['product_total_orders']
features.head()

Unnamed: 0_level_0,user_id,user_product_reorder_rate,user_distinct_products,user_reordered_products,user_product_dow,user_product_hod,user_product_dsp,user_product_orders,product_id,user_product_addCart,user_avg_basket_size,user_product_last_order_id,user_reorder_rate,product_total_orders,product_reorder_rate,product_reorders
user_product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1000196,1,1.0,18,10.0,2.5,10.3,19.555556,10,196,1.4,5.9,2550362,0.555556,35791,0.77648,27791.0
15000196,15,0.227273,13,9.0,1.4,11.0,8.6,5,196,2.2,2.8,487368,0.692308,35791,0.77648,27791.0
19000196,19,0.333333,133,39.0,5.0,14.333333,6.666667,3,196,6.333333,31.333333,532817,0.293233,35791,0.77648,27791.0
21000196,21,0.030303,102,30.0,3.0,10.0,6.0,1,196,2.0,7.0,1573906,0.294118,35791,0.77648,27791.0
31000196,31,0.1,190,66.0,3.0,9.5,7.5,2,196,15.5,27.5,2231262,0.347368,35791,0.77648,27791.0


In [23]:
user_order_group = (orders[orders['eval_set']=='prior']
                    .groupby('user_id')
                    .agg({'order_id':'count',
                          'days_since_prior_order':'mean'})
                    .rename(columns={'order_id':'count_user_orders',
                                    'days_since_prior_order':'user_order_dsp'}))

features = features.reset_index().merge(user_order_group.reset_index(),on='user_id')

del user_order_group
gc.collect()

21

In [24]:
features = features.set_index('user_product_id')
features.head()

Unnamed: 0_level_0,user_id,user_product_reorder_rate,user_distinct_products,user_reordered_products,user_product_dow,user_product_hod,user_product_dsp,user_product_orders,product_id,user_product_addCart,user_avg_basket_size,user_product_last_order_id,user_reorder_rate,product_total_orders,product_reorder_rate,product_reorders,count_user_orders,user_order_dsp,user_total_products
user_product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
1000196,1,1.0,18,10.0,2.5,10.3,19.555556,10,196,1.4,5.9,2550362,0.555556,35791,0.77648,27791.0,10,19.555556,59.0
1010258,1,0.9,18,10.0,2.555556,10.555556,19.555556,9,10258,3.333333,6.0,2550362,0.555556,1946,0.713772,1389.0,10,19.555556,60.0
1010326,1,0.1,18,10.0,4.0,15.0,28.0,1,10326,5.0,8.0,431534,0.555556,5526,0.652009,3603.0,10,19.555556,80.0
1012427,1,1.0,18,10.0,2.5,10.3,19.555556,10,12427,3.3,5.9,2550362,0.555556,6476,0.740735,4797.0,10,19.555556,59.0
1013032,1,0.3,18,10.0,2.666667,8.0,21.666667,3,13032,6.333333,6.666667,2550362,0.555556,3751,0.657158,2465.0,10,19.555556,66.666667


In [25]:
features['user_total_products'] = features['user_avg_basket_size']*features['count_user_orders']
features.head()

Unnamed: 0,user_product_id,user_id,user_product_reorder_rate,user_distinct_products,user_reordered_products,user_product_dow,user_product_hod,user_product_dsp,user_product_orders,product_id,user_product_addCart,user_avg_basket_size,user_product_last_order_id,user_reorder_rate,product_total_orders,product_reorder_rate,product_reorders,count_user_orders,user_order_dsp,user_total_products
0,1000196,1,1.0,18,10.0,2.5,10.3,19.555556,10,196,1.4,5.9,2550362,0.555556,35791,0.77648,27791.0,10,19.555556,59.0
1,1010258,1,0.9,18,10.0,2.555556,10.555556,19.555556,9,10258,3.333333,6.0,2550362,0.555556,1946,0.713772,1389.0,10,19.555556,60.0
2,1010326,1,0.1,18,10.0,4.0,15.0,28.0,1,10326,5.0,8.0,431534,0.555556,5526,0.652009,3603.0,10,19.555556,80.0
3,1012427,1,1.0,18,10.0,2.5,10.3,19.555556,10,12427,3.3,5.9,2550362,0.555556,6476,0.740735,4797.0,10,19.555556,59.0
4,1013032,1,0.3,18,10.0,2.666667,8.0,21.666667,3,13032,6.333333,6.666667,2550362,0.555556,3751,0.657158,2465.0,10,19.555556,66.666667


In [28]:
n = 10000
list_df = [features[i:i+n] for i in range(0, features.shape[0], n)]

list_df[0].to_csv("data/features_wo_embeddings.csv", index=False)

for l in list_df[1:]: 
    l.to_csv("data/features_wo_embeddings.csv", index=False, header=False, mode='a')

In [27]:
features.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13307953 entries, 0 to 13307952
Data columns (total 20 columns):
user_product_id               int64
user_id                       int64
user_product_reorder_rate     float64
user_distinct_products        int64
user_reordered_products       float64
user_product_dow              float64
user_product_hod              float64
user_product_dsp              float64
user_product_orders           int64
product_id                    int64
user_product_addCart          float64
user_avg_basket_size          float64
user_product_last_order_id    int64
user_reorder_rate             float64
product_total_orders          int64
product_reorder_rate          float64
product_reorders              float64
count_user_orders             int64
user_order_dsp                float64
user_total_products           float64
dtypes: float64(12), int64(8)
memory usage: 2.1 GB


In [29]:
gc.collect()

706

### Add Embeddings

In [39]:
sample_embeddings = pd.read_csv('data/sample__product_embeddings_results_10000.csv')

In [41]:
sample_embeddings = sample_embeddings.merge(products[['product_id','product_name']], on='product_name')

In [43]:
sample_embeddings.drop(['product_name'], axis=1, inplace=True)

In [45]:
features = features.reset_index().merge(sample_embeddings,on='product_id')

In [46]:
print(features.shape)
features.head()

(13307950, 320)


Unnamed: 0,user_product_id,user_id,user_product_reorder_rate,user_distinct_products,user_reordered_products,user_product_dow,user_product_hod,user_product_dsp,user_product_orders,product_id,user_product_addCart,user_avg_basket_size,user_product_last_order_id,user_reorder_rate,product_total_orders,product_reorder_rate,product_reorders,count_user_orders,user_order_dsp,user_total_products,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,...,250,251,252,253,254,255,256,257,258,259,260,261,262,263,264,265,266,267,268,269,270,271,272,273,274,275,276,277,278,279,280,281,282,283,284,285,286,287,288,289,290,291,292,293,294,295,296,297,298,299
0,1000196,1,1.0,18,10.0,2.5,10.3,19.555556,10,196,1.4,5.9,2550362,0.555556,35791,0.77648,27791.0,10,19.555556,59.0,-0.048631,0.069633,0.01825,0.005274,-0.047201,0.039925,0.032421,0.017653,-0.069377,0.022546,0.038971,0.009014,0.089305,-0.015833,0.020525,-0.032151,-0.022865,-0.109547,-0.0056,0.044165,-0.003064,0.07041,0.034775,-0.036171,0.03227,-0.029892,0.033016,-0.014076,0.028392,-0.038009,...,-0.043994,0.062889,0.019922,0.107018,0.073617,-0.15235,-0.07208,-0.020137,0.00611,-0.038584,0.015883,0.0797,-0.003314,-0.027355,0.174971,-0.013496,0.029454,0.044987,0.048319,0.056494,0.032231,-0.018851,-0.081571,-0.038643,-0.079232,-0.016469,-0.147787,-0.037878,0.065717,0.158066,-0.06919,-0.005415,-0.070279,0.031691,-0.058964,-0.055847,-0.079774,0.098916,-0.004713,0.129708,-0.023472,0.044532,-0.033235,-0.021021,-0.073629,0.028521,0.026339,-0.052897,0.03465,0.035633
1,15000196,15,0.227273,13,9.0,1.4,11.0,8.6,5,196,2.2,2.8,487368,0.692308,35791,0.77648,27791.0,22,10.809524,61.6,-0.048631,0.069633,0.01825,0.005274,-0.047201,0.039925,0.032421,0.017653,-0.069377,0.022546,0.038971,0.009014,0.089305,-0.015833,0.020525,-0.032151,-0.022865,-0.109547,-0.0056,0.044165,-0.003064,0.07041,0.034775,-0.036171,0.03227,-0.029892,0.033016,-0.014076,0.028392,-0.038009,...,-0.043994,0.062889,0.019922,0.107018,0.073617,-0.15235,-0.07208,-0.020137,0.00611,-0.038584,0.015883,0.0797,-0.003314,-0.027355,0.174971,-0.013496,0.029454,0.044987,0.048319,0.056494,0.032231,-0.018851,-0.081571,-0.038643,-0.079232,-0.016469,-0.147787,-0.037878,0.065717,0.158066,-0.06919,-0.005415,-0.070279,0.031691,-0.058964,-0.055847,-0.079774,0.098916,-0.004713,0.129708,-0.023472,0.044532,-0.033235,-0.021021,-0.073629,0.028521,0.026339,-0.052897,0.03465,0.035633
2,19000196,19,0.333333,133,39.0,5.0,14.333333,6.666667,3,196,6.333333,31.333333,532817,0.293233,35791,0.77648,27791.0,9,9.5,282.0,-0.048631,0.069633,0.01825,0.005274,-0.047201,0.039925,0.032421,0.017653,-0.069377,0.022546,0.038971,0.009014,0.089305,-0.015833,0.020525,-0.032151,-0.022865,-0.109547,-0.0056,0.044165,-0.003064,0.07041,0.034775,-0.036171,0.03227,-0.029892,0.033016,-0.014076,0.028392,-0.038009,...,-0.043994,0.062889,0.019922,0.107018,0.073617,-0.15235,-0.07208,-0.020137,0.00611,-0.038584,0.015883,0.0797,-0.003314,-0.027355,0.174971,-0.013496,0.029454,0.044987,0.048319,0.056494,0.032231,-0.018851,-0.081571,-0.038643,-0.079232,-0.016469,-0.147787,-0.037878,0.065717,0.158066,-0.06919,-0.005415,-0.070279,0.031691,-0.058964,-0.055847,-0.079774,0.098916,-0.004713,0.129708,-0.023472,0.044532,-0.033235,-0.021021,-0.073629,0.028521,0.026339,-0.052897,0.03465,0.035633
3,21000196,21,0.030303,102,30.0,3.0,10.0,6.0,1,196,2.0,7.0,1573906,0.294118,35791,0.77648,27791.0,33,9.90625,231.0,-0.048631,0.069633,0.01825,0.005274,-0.047201,0.039925,0.032421,0.017653,-0.069377,0.022546,0.038971,0.009014,0.089305,-0.015833,0.020525,-0.032151,-0.022865,-0.109547,-0.0056,0.044165,-0.003064,0.07041,0.034775,-0.036171,0.03227,-0.029892,0.033016,-0.014076,0.028392,-0.038009,...,-0.043994,0.062889,0.019922,0.107018,0.073617,-0.15235,-0.07208,-0.020137,0.00611,-0.038584,0.015883,0.0797,-0.003314,-0.027355,0.174971,-0.013496,0.029454,0.044987,0.048319,0.056494,0.032231,-0.018851,-0.081571,-0.038643,-0.079232,-0.016469,-0.147787,-0.037878,0.065717,0.158066,-0.06919,-0.005415,-0.070279,0.031691,-0.058964,-0.055847,-0.079774,0.098916,-0.004713,0.129708,-0.023472,0.044532,-0.033235,-0.021021,-0.073629,0.028521,0.026339,-0.052897,0.03465,0.035633
4,31000196,31,0.1,190,66.0,3.0,9.5,7.5,2,196,15.5,27.5,2231262,0.347368,35791,0.77648,27791.0,20,4.894737,550.0,-0.048631,0.069633,0.01825,0.005274,-0.047201,0.039925,0.032421,0.017653,-0.069377,0.022546,0.038971,0.009014,0.089305,-0.015833,0.020525,-0.032151,-0.022865,-0.109547,-0.0056,0.044165,-0.003064,0.07041,0.034775,-0.036171,0.03227,-0.029892,0.033016,-0.014076,0.028392,-0.038009,...,-0.043994,0.062889,0.019922,0.107018,0.073617,-0.15235,-0.07208,-0.020137,0.00611,-0.038584,0.015883,0.0797,-0.003314,-0.027355,0.174971,-0.013496,0.029454,0.044987,0.048319,0.056494,0.032231,-0.018851,-0.081571,-0.038643,-0.079232,-0.016469,-0.147787,-0.037878,0.065717,0.158066,-0.06919,-0.005415,-0.070279,0.031691,-0.058964,-0.055847,-0.079774,0.098916,-0.004713,0.129708,-0.023472,0.044532,-0.033235,-0.021021,-0.073629,0.028521,0.026339,-0.052897,0.03465,0.035633


In [None]:
features = features.set_index('user_product_id')

### Split Train Test

In [31]:
# Ready the inputs into model
train_orders = orders[orders['eval_set']=='train']
test_orders = orders[orders['eval_set']=='test']

In [32]:
test_orders.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
38,2774568,3,test,13,5,15,11.0
44,329954,4,test,6,3,12,30.0
53,1528013,6,test,4,3,16,22.0
96,1376945,11,test,8,6,11,8.0
102,1356845,12,test,6,1,20,30.0


In [18]:
test_orders.shape

NameError: name 'test_orders' is not defined

In [None]:
train_features = pd.merge(train_orders, features.reset_index(), on='user_id')
test_features = pd.merge(test_orders, features.reset_index(), on='user_id')

In [None]:
train_features.shape

In [None]:
test_features.shape

In [None]:
test_features.head()

In [None]:
# del train_orders
# del test_orders
# del features

In [None]:
train_features['diff_order_hod'] = abs(train_features['user_product_hod']-train_features['order_hour_of_day']).map(lambda x: min(x, 24-x))
train_features['ratio_dsp'] = train_features['user_product_dsp']/train_features['days_since_prior_order']
train_features['diff_dow'] = abs(train_features['user_product_dow']-train_features['order_dow']).map(lambda x: min (x, 7-x))
train_features['user_product_orders_since_last'] = train_features['user_orders'] - train_features['user_product_last_order_id'].map(orders.order_number)
train_features['user_product_hour_vs_last'] = abs(train_features['order_hour_of_day'] - train_features['user_product_last_order_id'].map(orders.order_hour_of_day)).map(lambda x: min(x, 24-x))

test_features['diff_order_hod'] = abs(test_features['user_product_hod']-test_features['order_hour_of_day']).map(lambda x: min(x, 24-x))
test_features['ratio_dsp'] = test_features['user_product_dsp']/test_features['days_since_prior_order']
test_features['diff_dow'] = abs(test_features['user_product_dow']-test_features['order_dow']).map(lambda x: min (x, 7-x))
test_features['user_product_orders_since_last'] = test_features['user_orders'] - test_features['user_product_last_order_id'].map(orders.order_number)
test_features['user_product_hour_vs_last'] = abs(test_features['order_hour_of_day'] - test_features['user_product_last_order_id'].map(orders.order_hour_of_day)).map(lambda x: min(x, 24-x))

In [None]:
train_features.sort_values(['order_id','product_id'], inplace=True)
test_features.sort_values(['order_id','product_id'], inplace=True)

In [None]:
train_features.drop(['eval_set','order_number', 'product_name'], axis=1, inplace=True)
test_features.drop(['eval_set','order_number','product_name'], axis=1, inplace=True)

In [None]:
train_orders = orders[orders['eval_set']=='train']
trainProducts = pd.merge(train_orders,trainingOrderItems, on='order_id')

In [None]:
trainProducts = trainProducts.groupby('user_id')['product_id'].apply(set)
trainProducts.head()

In [60]:
trainLabels = []
for row in tqdm(train_features.itertuples()):
    trainLabels += [row.product_id in trainProducts[row.user_id]]
print(len(trainLabels))
print(train_features.shape)


8474661it [02:16, 62308.56it/s]


8474661
(8474661, 29)


In [61]:
trainLabels[:5]

[False, False, True, False, True]

In [62]:
num_feature_list=['diff_dow','ratio_dsp','diff_order_hod', 'user_avg_basket_size', 'user_product_hour_vs_last',
                  'days_since_prior_order','user_product_reorder_rate','user_product_dow','user_reordered_products',
                  'user_product_hod','user_product_dsp','user_product_orders','user_product_addCart',
                  'product_reorder_rate','product_total_orders', 'user_reorder_rate', 'user_distinct_products',
                  'user_product_orders_since_last']

In [63]:
train_features[num_feature_list] = (train_features[num_feature_list]-train_features[num_feature_list].mean())/((train_features[num_feature_list].max()-train_features[num_feature_list].min()))
test_features[num_feature_list] = (test_features[num_feature_list]-test_features[num_feature_list].mean())/((test_features[num_feature_list].max()-test_features[num_feature_list].min()))

In [64]:
train_features.head()

Unnamed: 0,order_id,user_id,order_dow,order_hour_of_day,days_since_prior_order,user_product_id,user_product_reorder_rate,user_distinct_products,user_reordered_products,user_product_dow,user_product_hod,user_product_dsp,user_product_orders,product_id,user_product_addCart,user_avg_basket_size,user_product_last_order_id,user_reorder_rate,product_total_orders,product_reorder_rate,product_reorders,user_orders,user_order_dsp,user_total_products,diff_order_hod,ratio_dsp,diff_dow,user_product_orders_since_last,user_product_hour_vs_last
4629306,1,112108,4,10,-0.178176,112108002067,0.181188,-0.140862,-0.098748,-0.292511,-0.153366,-0.18977,-0.01465,2067,-0.015349,-0.052874,2541372,0.268539,-0.040995,0.076044,1902.0,3,11.0,24.0,-0.319506,,0.4003,-0.052444,-0.298248
4629308,1,112108,4,10,-0.178176,112108005707,0.517923,-0.140862,-0.098748,-0.209177,0.085764,-0.056437,-0.004446,5707,-0.029238,-0.06329,186706,0.268539,-0.0451,0.044001,690.0,3,11.0,19.5,0.138827,,0.257443,-0.031931,-0.381581
4629307,1,112108,4,10,-0.178176,112108011109,0.517923,-0.140862,-0.098748,-0.292511,-0.066409,-0.18977,-0.004446,11109,-0.036182,-0.052874,2541372,0.268539,-0.038172,0.190429,3192.0,3,11.0,24.0,-0.15284,,0.4003,-0.052444,-0.298248
4629304,1,112108,4,10,-0.178176,112108014947,0.854658,-0.140862,-0.098748,-0.236955,0.064025,-0.056437,0.005758,14947,-0.038497,-0.059818,186706,0.268539,0.013487,0.295131,23463.0,3,11.0,21.0,0.09716,,0.305062,-0.031931,-0.381581
4629302,1,112108,4,10,-0.178176,112108022035,0.517923,-0.140862,-0.098748,-0.292511,-0.066409,-0.18977,-0.004446,22035,-0.029238,-0.052874,2541372,0.268539,0.078646,0.244622,45639.0,3,11.0,24.0,-0.15284,,0.4003,-0.052444,-0.298248


In [65]:
cat_feature_list = []
feature_list = num_feature_list+cat_feature_list

In [66]:
X_train = train_features[feature_list].fillna(0).as_matrix()[:500000]
Y_train = numpy.array(trainLabels).astype('int8')[:500000]

  """Entry point for launching an IPython kernel.


In [67]:
Y_train

array([0, 0, 1, ..., 0, 0, 0], dtype=int8)

In [68]:
# fix random seed for reproducibility
numpy.random.seed(7)

# create model
model = Sequential()
model.add(Dense(13, input_dim=len(feature_list), activation='relu'))
model.add(Dense(250, activation='relu'))
model.add(Dense(250, activation='relu'))
model.add(Dense(250, activation='relu'))
model.add(Dense(250, activation='relu'))
model.add(Dense(250, activation='relu'))
model.add(Dense(250, activation='relu'))
model.add(Dense(250, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

In [69]:
# Compile model
adam = optimizers.Adam()

# Fit the model
model.compile(loss='binary_crossentropy', optimizer=adam, metrics=['accuracy'])
model.fit(X_train, Y_train, epochs=10,verbose=0, callbacks=[TQDMNotebookCallback()])





<keras.callbacks.History at 0x181ef79048>

In [70]:
X_test = test_features[feature_list].fillna(0).as_matrix()
test_preds = model.predict(X_test)


  """Entry point for launching an IPython kernel.


In [71]:
test_preds

array([[0.07427749],
       [0.11459035],
       [0.05726245],
       ...,
       [0.06803884],
       [0.11764333],
       [0.10903908]], dtype=float32)

In [72]:
past_order_id = -1
reorderedProducts = []
output = []
i = 0 
maxProd = [0,0]

In [73]:
test_features.head()

Unnamed: 0,order_id,user_id,order_dow,order_hour_of_day,days_since_prior_order,user_product_id,user_product_reorder_rate,user_distinct_products,user_reordered_products,user_product_dow,user_product_hod,user_product_dsp,user_product_orders,product_id,user_product_addCart,user_avg_basket_size,user_product_last_order_id,user_reorder_rate,product_total_orders,product_reorder_rate,product_reorders,user_orders,user_order_dsp,user_total_products,diff_order_hod,ratio_dsp,diff_dow,user_product_orders_since_last,user_product_hour_vs_last
858095,17,36855,6,15,-0.446237,36855001283,0.096375,-0.145215,-0.14103,-0.458892,-0.196971,0.576189,-0.014692,1283,-0.040674,-0.061448,234692,-0.16614,-0.047506,-0.351908,6.0,4,14.0,36.0,0.181576,,-0.170464,-0.06782,-0.132405
858092,17,36855,6,15,-0.446237,36855006291,0.096375,-0.145215,-0.14103,0.541108,0.02042,-0.223811,-0.014692,6291,-0.002212,-0.061448,3248434,-0.16614,-0.043403,0.090071,1217.0,4,14.0,36.0,-0.235091,,-0.456178,-0.231922,-0.215738
858088,17,36855,6,15,-0.446237,36855007035,0.096375,-0.145215,-0.14103,-0.292225,-0.023058,,-0.014692,7035,-0.059905,-0.098485,898818,-0.16614,-0.043477,-0.060709,924.0,4,14.0,20.0,-0.151758,,0.115251,-0.072948,-0.215738
858094,17,36855,6,15,-0.446237,36855011494,0.096375,-0.145215,-0.14103,-0.292225,-0.023058,,-0.014692,11494,-0.040674,-0.098485,898818,-0.16614,-0.044793,-0.184154,476.0,4,14.0,20.0,-0.151758,,0.115251,-0.072948,-0.215738
858097,17,36855,6,15,-0.446237,36855013107,0.601477,-0.145215,-0.14103,0.152219,-0.066536,0.042856,0.005716,13107,-0.066315,-0.07688,1058761,-0.16614,-0.046778,0.108324,237.0,4,14.0,29.333333,-0.068424,,0.210489,-0.098589,-0.132405


In [74]:
for row in tqdm(test_features.itertuples()):
    if (row.order_id!=past_order_id):
        if (past_order_id==-1):
            pass
        else:
            if (reorderedProducts == []):
                reorderedProducts.append(maxProd[0])
            output.append([past_order_id," ".join(reorderedProducts)])
            reorderedProducts = []
            maxProd = [0,0]
        past_order_id = row.order_id
        
    if (test_preds[i]>.2):
            reorderedProducts.append(str(row.product_id))
    else:
        if (test_preds[i] > maxProd[1]):
            maxProd = [str(row.product_id),test_preds[i]]
    i+=1


4833292it [00:26, 185618.68it/s]


In [75]:
output.append([past_order_id," ".join(reorderedProducts)])

output[5]

[313, '12779 13198 14077 21903 25890 45007 46906 49683']

In [79]:
output[1][1]

'21137 39180 39475 43504 47029 47766 47792'

In [83]:
order_id_list = []
product_output_list = []

for order_id in range(0,len(output)):
    order_id_list.append(output[order_id][0])
    
for product_list in range(0,len(output)):
    product_output_list.append(output[product_list][1]) 

In [84]:
product_output_list[5]

'12779 13198 14077 21903 25890 45007 46906 49683'

In [85]:
kernel_submission = pd.DataFrame({'order_id':order_id_list, 'products':product_output_list})
print(kernel_submission.shape)
kernel_submission.head()

(75000, 2)


Unnamed: 0,order_id,products
0,17,13107 21463 21709 47766
1,34,21137 39180 39475 43504 47029 47766 47792
2,137,2326 23794 24852 25890 38689 41787 43352
3,182,5479 9337 13629 21903 24009 27104 30391 33000 34243 35951 39275 47209 47672
4,257,1025 4605 13176 13870 21137 24838 24852 27104 27966 28476 29837 30233 30391 36929 38558 45013 47766 49235


In [86]:
pwd

'/Users/waficel-assi/instacart-kaggle-competition'

In [87]:
kernel_submission.to_csv('./data/keras_kernel_submission_sample.csv', index=False)