### Sample Keras [Implementation](https://www.kaggle.com/nikhithn/sample-keras-nn-implementation/code) 

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import csv
import subprocess
from tqdm import tqdm

from keras_tqdm import TQDMNotebookCallback
from keras.models import Sequential
from keras.layers import Dense
from keras import optimizers

import sklearn

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_colwidth', -1)

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))


In [3]:
trainingOrderItems = pd.read_csv("data/order_products__train.csv")
priorOrderItems = pd.read_csv("data/order_products__prior.csv")

orders = pd.read_csv("data/orders.csv")

aisles = pd.read_csv("data/aisles.csv")
departments = pd.read_csv("data/departments.csv")
products = pd.read_csv("data/products.csv")

In [4]:
priorOrderItems.shape

(32434489, 4)

In [9]:
priorOrderItems.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,2,33120,1,1
1,2,28985,2,1
2,2,9327,3,0
3,2,45918,4,1
4,2,30035,5,0


In [5]:
orders.shape

(3421083, 7)

In [55]:
#get reorder frequency of product ids + count of orders product id  shows up in
product_reorders = (priorOrderItems.groupby(['product_id'])
                   .agg({'order_id':'count','reordered':'mean'})
                   .rename(columns={'order_id':'product_total_orders', 
                                    'reordered':'product_reorder_rate'}))

In [56]:
product_reorders.head()

Unnamed: 0_level_0,product_total_orders,product_reorder_rate
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1852,0.613391
2,90,0.133333
3,277,0.732852
4,329,0.446809
5,15,0.6


In [37]:
#get basket_size per order
order_group = priorOrderItems.groupby("order_id").agg({'add_to_cart_order':'max'}).rename(columns={'add_to_cart_order':'basket_size'})
priorOrderItems = priorOrderItems.merge(order_group, left_on='order_id', right_index=True)

In [38]:
priorOrderItems.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,basket_size
0,2,33120,1,1,9
1,2,28985,2,1,9
2,2,9327,3,0,9
3,2,45918,4,1,9
4,2,30035,5,0,9


In [39]:
# Let's now look at products by user
orderItems = orders.merge(priorOrderItems, on='order_id')
orderItems['user_product_id'] = 1000000*orderItems['user_id']+orderItems['product_id']

userProducts = orderItems[orderItems['eval_set']=='prior']
userProducts = userProducts[['user_id','order_dow','order_hour_of_day',
                             'days_since_prior_order','product_id',
                             'add_to_cart_order','user_product_id', 
                             'basket_size', 'order_number', 'order_id']]

userProducts['orders'] = userProducts['user_id']
userAvgProducts = userProducts.groupby('user_product_id').aggregate({'user_id':'mean',
                                                                     'order_dow':'mean',
                                                                     'order_hour_of_day':'mean',
                                                                    'days_since_prior_order':'mean', 
                                                                     'orders':'count',
                                                                    'product_id':'mean',
                                                                     'add_to_cart_order':'mean',
                                                                     'basket_size':'mean'})
del orderItems

In [40]:
userAvgProducts.head()

Unnamed: 0_level_0,user_id,order_dow,order_hour_of_day,days_since_prior_order,orders,product_id,add_to_cart_order,basket_size
user_product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1000196,1,2.5,10.3,19.555556,10,196,1.4,5.9
1010258,1,2.555556,10.555556,19.555556,9,10258,3.333333,6.0
1010326,1,4.0,15.0,28.0,1,10326,5.0,8.0
1012427,1,2.5,10.3,19.555556,10,12427,3.3,5.9
1013032,1,2.666667,8.0,21.666667,3,13032,6.333333,6.666667


In [41]:
lastOrderId = userProducts[userProducts.groupby(['user_product_id'])['order_number'].transform(max) == userProducts['order_number']]
lastOrderId = lastOrderId[['user_product_id','order_id']]
lastOrderId.rename(columns={'order_id':'last_order_id'}, inplace=True)
userAvgProducts = userAvgProducts.merge(lastOrderId, right_on='user_product_id', left_index=True)
userAvgProducts.set_index('user_product_id', inplace=True)
userAvgProducts.head()

Unnamed: 0_level_0,user_id,order_dow,order_hour_of_day,days_since_prior_order,orders,product_id,add_to_cart_order,basket_size,last_order_id
user_product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1000196,1,2.5,10.3,19.555556,10,196,1.4,5.9,2550362
1010258,1,2.555556,10.555556,19.555556,9,10258,3.333333,6.0,2550362
1010326,1,4.0,15.0,28.0,1,10326,5.0,8.0,431534
1012427,1,2.5,10.3,19.555556,10,12427,3.3,5.9,2550362
1013032,1,2.666667,8.0,21.666667,3,13032,6.333333,6.666667,2550362


In [45]:
user_distinct_products = (userAvgProducts.groupby('user_id')
                          .agg({'product_id':'count'})
                          .rename(columns={'product_id':'distinct_products'}))
                          
user_distinct_products['user_reordered_products'] = (userAvgProducts[userAvgProducts['orders']>1]
                                                         .groupby('user_id')
                                                         .count()['product_id'])
user_distinct_products.head()

Unnamed: 0_level_0,distinct_products,user_reordered_products
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,18,10.0
2,102,37.0
3,33,19.0
4,17,1.0
5,23,8.0


In [46]:
userAvgProducts = user_distinct_products.merge(userAvgProducts.reset_index(),on='user_id')
userAvgProducts['user_reorder_rate'] = userAvgProducts['user_reordered_products']/userAvgProducts['distinct_products']
userAvgProducts.head()

Unnamed: 0,user_id,distinct_products,user_reordered_products,user_product_id,order_dow,order_hour_of_day,days_since_prior_order,orders,product_id,add_to_cart_order,basket_size,last_order_id,user_reorder_rate
0,1,18,10.0,1000196,2.5,10.3,19.555556,10,196,1.4,5.9,2550362,0.555556
1,1,18,10.0,1010258,2.555556,10.555556,19.555556,9,10258,3.333333,6.0,2550362,0.555556
2,1,18,10.0,1010326,4.0,15.0,28.0,1,10326,5.0,8.0,431534,0.555556
3,1,18,10.0,1012427,2.5,10.3,19.555556,10,12427,3.3,5.9,2550362,0.555556
4,1,18,10.0,1013032,2.666667,8.0,21.666667,3,13032,6.333333,6.666667,2550362,0.555556


In [47]:
userAvgProducts.shape

(13307953, 13)

In [65]:
features = userAvgProducts.set_index('user_product_id')
features.head()

Unnamed: 0_level_0,user_id,distinct_products,user_reordered_products,order_dow,order_hour_of_day,days_since_prior_order,orders,product_id,add_to_cart_order,basket_size,last_order_id,user_reorder_rate
user_product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1000196,1,18,10.0,2.5,10.3,19.555556,10,196,1.4,5.9,2550362,0.555556
1010258,1,18,10.0,2.555556,10.555556,19.555556,9,10258,3.333333,6.0,2550362,0.555556
1010326,1,18,10.0,4.0,15.0,28.0,1,10326,5.0,8.0,431534,0.555556
1012427,1,18,10.0,2.5,10.3,19.555556,10,12427,3.3,5.9,2550362,0.555556
1013032,1,18,10.0,2.666667,8.0,21.666667,3,13032,6.333333,6.666667,2550362,0.555556


In [66]:
features.columns = ['user_id','user_distinct_products','user_reordered_products',
                    'user_product_dow','user_product_hod',
                    'user_product_dsp','user_product_orders','product_id',
                    'user_product_addCart', 'user_avg_basket_size',
                    'user_product_last_order_id','user_reorder_rate']

In [67]:
print(features.shape)
features.head()

(13307953, 12)


Unnamed: 0_level_0,user_id,user_distinct_products,user_reordered_products,user_product_dow,user_product_hod,user_product_dsp,user_product_orders,product_id,user_product_addCart,user_avg_basket_size,user_product_last_order_id,user_reorder_rate
user_product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1000196,1,18,10.0,2.5,10.3,19.555556,10,196,1.4,5.9,2550362,0.555556
1010258,1,18,10.0,2.555556,10.555556,19.555556,9,10258,3.333333,6.0,2550362,0.555556
1010326,1,18,10.0,4.0,15.0,28.0,1,10326,5.0,8.0,431534,0.555556
1012427,1,18,10.0,2.5,10.3,19.555556,10,12427,3.3,5.9,2550362,0.555556
1013032,1,18,10.0,2.666667,8.0,21.666667,3,13032,6.333333,6.666667,2550362,0.555556


In [68]:
userOrders = orders[orders['eval_set']=='prior'].groupby('user_id').aggregate({'order_number':'max'})

In [69]:
print(userOrders.shape)
userOrders.head()

(206209, 1)


Unnamed: 0_level_0,order_number
user_id,Unnamed: 1_level_1
1,10
2,14
3,12
4,5
5,4


In [70]:
features = userOrders.reset_index().merge(features.reset_index(), on='user_id')
features.head()

Unnamed: 0,user_id,order_number,user_product_id,user_distinct_products,user_reordered_products,user_product_dow,user_product_hod,user_product_dsp,user_product_orders,product_id,user_product_addCart,user_avg_basket_size,user_product_last_order_id,user_reorder_rate
0,1,10,1000196,18,10.0,2.5,10.3,19.555556,10,196,1.4,5.9,2550362,0.555556
1,1,10,1010258,18,10.0,2.555556,10.555556,19.555556,9,10258,3.333333,6.0,2550362,0.555556
2,1,10,1010326,18,10.0,4.0,15.0,28.0,1,10326,5.0,8.0,431534,0.555556
3,1,10,1012427,18,10.0,2.5,10.3,19.555556,10,12427,3.3,5.9,2550362,0.555556
4,1,10,1013032,18,10.0,2.666667,8.0,21.666667,3,13032,6.333333,6.666667,2550362,0.555556


In [71]:
features['order_number'] = features['user_product_orders']/features['order_number']
features.rename(columns={'order_number': 'user_product_reorder_rate'}, inplace=True)
features = features.set_index('user_product_id')

In [72]:
features=features.merge(product_reorders, left_on='product_id',right_index=True)
features['product_reorders'] = features['product_reorder_rate'] * features['product_total_orders']
features.head()

Unnamed: 0_level_0,user_id,user_product_reorder_rate,user_distinct_products,user_reordered_products,user_product_dow,user_product_hod,user_product_dsp,user_product_orders,product_id,user_product_addCart,user_avg_basket_size,user_product_last_order_id,user_reorder_rate,product_total_orders,product_reorder_rate,product_reorders
user_product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1000196,1,1.0,18,10.0,2.5,10.3,19.555556,10,196,1.4,5.9,2550362,0.555556,35791,0.77648,27791.0
15000196,15,0.227273,13,9.0,1.4,11.0,8.6,5,196,2.2,2.8,487368,0.692308,35791,0.77648,27791.0
19000196,19,0.333333,133,39.0,5.0,14.333333,6.666667,3,196,6.333333,31.333333,532817,0.293233,35791,0.77648,27791.0
21000196,21,0.030303,102,30.0,3.0,10.0,6.0,1,196,2.0,7.0,1573906,0.294118,35791,0.77648,27791.0
31000196,31,0.1,190,66.0,3.0,9.5,7.5,2,196,15.5,27.5,2231262,0.347368,35791,0.77648,27791.0


In [73]:
user_order_group = (orders[orders['eval_set']=='prior']
                    .groupby('user_id')
                    .agg({'order_id':'count',
                          'days_since_prior_order':'mean'})
                    .rename(columns={'order_id':'count_user_orders',
                                    'days_since_prior_order':'user_order_dsp'}))
user_order_group.head()

Unnamed: 0_level_0,count_user_orders,user_order_dsp
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,10,19.555556
2,14,15.230769
3,12,12.090909
4,5,13.75
5,4,13.333333


In [74]:
features = features.reset_index().merge(user_order_group.reset_index(),on='user_id')
features = features.set_index('user_product_id')
features['user_total_products'] = features['user_avg_basket_size']*features['count_user_orders']

In [75]:
features.head()

Unnamed: 0_level_0,user_id,user_product_reorder_rate,user_distinct_products,user_reordered_products,user_product_dow,user_product_hod,user_product_dsp,user_product_orders,product_id,user_product_addCart,user_avg_basket_size,user_product_last_order_id,user_reorder_rate,product_total_orders,product_reorder_rate,product_reorders,count_user_orders,user_order_dsp,user_total_products
user_product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
1000196,1,1.0,18,10.0,2.5,10.3,19.555556,10,196,1.4,5.9,2550362,0.555556,35791,0.77648,27791.0,10,19.555556,59.0
1010258,1,0.9,18,10.0,2.555556,10.555556,19.555556,9,10258,3.333333,6.0,2550362,0.555556,1946,0.713772,1389.0,10,19.555556,60.0
1010326,1,0.1,18,10.0,4.0,15.0,28.0,1,10326,5.0,8.0,431534,0.555556,5526,0.652009,3603.0,10,19.555556,80.0
1012427,1,1.0,18,10.0,2.5,10.3,19.555556,10,12427,3.3,5.9,2550362,0.555556,6476,0.740735,4797.0,10,19.555556,59.0
1013032,1,0.3,18,10.0,2.666667,8.0,21.666667,3,13032,6.333333,6.666667,2550362,0.555556,3751,0.657158,2465.0,10,19.555556,66.666667


### Add Embeddings

In [80]:
sample_embeddings = pd.read_csv('data/sample__product_embeddings_results_10000.csv')

In [85]:
sample_embeddings = sample_embeddings.merge(products[['product_id','product_name']], on='product_name')

In [86]:
sample_embeddings.head()

Unnamed: 0,product_name,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,...,251,252,253,254,255,256,257,258,259,260,261,262,263,264,265,266,267,268,269,270,271,272,273,274,275,276,277,278,279,280,281,282,283,284,285,286,287,288,289,290,291,292,293,294,295,296,297,298,299,product_id
0,Banana,-0.066329,0.104867,0.024469,-0.033735,-0.050062,0.062402,0.016188,-0.029837,-0.006496,-0.044408,0.010496,-0.006934,0.080748,0.009459,0.008132,-0.011283,-0.029852,-0.12386,-0.03963,0.045593,-0.001091,0.057942,-0.018238,-0.080673,0.032567,-0.053717,0.037731,-0.024818,-0.004728,-0.041005,-0.063337,0.024319,-0.156829,-0.106846,-0.094582,0.018267,-0.100205,-0.071329,-0.01145,-0.060211,0.004815,-0.024693,-0.048072,0.009413,0.052281,0.011263,0.04578,-0.035466,0.030604,...,0.04052,-0.023742,0.085781,0.05394,-0.14502,-0.023445,-0.055318,-0.001625,-0.019949,-0.035682,0.054186,0.034749,-0.035738,0.148132,-0.01888,-0.000479,0.053934,0.01789,0.098763,-0.02606,0.001845,-0.121398,0.022681,-0.047729,-0.004826,-0.140754,-0.047253,0.053384,0.173778,-0.019768,0.027524,-0.05359,-0.005808,-0.034241,-0.064685,-0.094907,0.055859,-0.042542,0.109612,-0.002538,0.041345,-0.025529,-0.056968,-0.044174,0.030172,0.073603,-0.011741,0.052059,0.005534,24852
1,Bag of Organic Bananas,-0.066627,0.101306,0.008572,-2.5e-05,-0.036207,0.068493,0.012462,-0.042329,-0.002884,-0.029232,0.013908,0.000536,0.095057,-0.006788,0.050196,-0.023852,-0.032272,-0.1063,-0.023011,0.045814,0.010012,0.041671,-0.005364,-0.072528,0.034719,-0.046941,0.061004,-0.02614,0.016941,-0.046702,-0.054286,0.043781,-0.134836,-0.088285,-0.072889,0.02089,-0.058478,-0.083389,-0.011517,-0.062716,-0.013763,-0.031309,-0.047674,-0.016712,0.059,-0.000353,0.066537,-0.040011,0.03918,...,0.053752,-0.053323,0.099725,0.068075,-0.163714,-0.059472,-0.040178,0.010304,-0.05869,-0.032019,0.064264,0.031698,-0.02012,0.163105,0.003185,0.000442,0.077272,0.050757,0.087335,0.004912,-2e-06,-0.096418,0.009511,-0.043714,-0.002529,-0.138472,-0.064208,0.048529,0.153773,-0.041611,0.019613,-0.054578,0.031385,-0.028236,-0.056266,-0.07466,0.040972,-0.020841,0.133806,-0.000776,0.065239,-0.039335,-0.021015,-0.029158,0.017478,0.06623,0.013782,0.044743,0.022831,13176
2,Organic Strawberries,-0.070595,0.101302,0.002609,-0.012077,-0.033811,0.060255,-5.1e-05,-0.017269,-0.025632,-0.020555,0.019651,0.002034,0.092079,-0.003857,0.025978,-0.029851,-0.044397,-0.121127,-0.020989,0.05347,0.001963,0.067228,-0.002433,-0.061598,0.03877,-0.048752,0.044253,-0.021213,0.024708,-0.030917,-0.051659,-0.000695,-0.104469,-0.118458,-0.081147,0.050035,-0.060831,-0.067912,-0.017715,-0.050913,-0.017566,-0.02126,-0.055228,-0.013133,0.036887,-0.001758,0.072912,-0.056613,0.009606,...,0.060597,-0.017857,0.07925,0.061965,-0.153607,-0.052757,-0.044504,0.024185,-0.030431,-0.015183,0.07825,0.012549,-0.028284,0.164213,0.000771,0.019387,0.058388,0.037943,0.066984,0.007627,-0.001745,-0.088315,-0.015895,-0.048965,-0.007577,-0.116612,-0.056021,0.065049,0.163335,-0.04909,0.011107,-0.074846,0.029828,-0.043497,-0.068011,-0.0688,0.045139,-0.02795,0.142671,9.4e-05,0.067583,-0.041444,-0.027554,-0.024504,0.026834,0.070014,-0.036618,0.036141,0.01403,21137
3,Organic Baby Spinach,-0.061919,0.099162,0.010594,-0.014413,-0.028219,0.056061,0.009835,-0.022766,-0.033674,-0.018461,0.017196,-0.001575,0.089812,-0.004539,0.028994,-0.036098,-0.038788,-0.111499,-0.014256,0.053073,-0.001637,0.061981,0.005863,-0.066435,0.036565,-0.041932,0.044879,-0.031704,0.032779,-0.025617,-0.047014,0.009682,-0.100401,-0.110113,-0.071418,0.037565,-0.059379,-0.06179,-0.01335,-0.051914,-0.017408,-0.02618,-0.048856,-0.010244,0.03796,0.004692,0.082896,-0.054901,0.01537,...,0.052383,-0.013989,0.095883,0.065473,-0.160168,-0.058274,-0.042624,0.024536,-0.033527,-0.006106,0.07401,0.009827,-0.022269,0.167314,-0.005068,0.020342,0.069324,0.044136,0.060092,0.006877,-0.007597,-0.084507,-0.00415,-0.055515,-0.009068,-0.126374,-0.059648,0.069644,0.165102,-0.053618,0.015085,-0.075858,0.035796,-0.05065,-0.058581,-0.071324,0.067867,-0.028835,0.134164,-0.006271,0.071999,-0.036615,-0.030392,-0.033181,0.024443,0.057378,-0.038917,0.03153,0.021238,21903
4,Organic Hass Avocado,-0.053906,0.084408,0.011947,0.000929,-0.030131,0.050616,0.008229,-0.037045,-0.033453,-0.015213,0.006196,0.00721,0.08137,-0.014393,0.040744,-0.045629,-0.035848,-0.09952,-0.012261,0.061416,0.00785,0.048024,0.016454,-0.063064,0.055723,-0.049058,0.050735,-0.023525,0.041115,-0.028615,-0.03623,-0.005202,-0.107794,-0.102412,-0.069873,0.037217,-0.051181,-0.07195,-0.014714,-0.054045,-0.016852,-0.024077,-0.05419,-0.017076,0.032944,0.002704,0.097501,-0.051995,0.027206,...,0.050052,-0.01927,0.103203,0.074646,-0.158682,-0.075419,-0.050585,0.024413,-0.048141,-0.004899,0.076452,0.009516,-0.015986,0.163062,0.006944,0.023883,0.056937,0.046951,0.050126,0.011183,0.000315,-0.084061,-0.005261,-0.051258,-0.013627,-0.127084,-0.038918,0.074319,0.161964,-0.056272,0.010216,-0.059318,0.047057,-0.044848,-0.051766,-0.073076,0.06022,-0.018409,0.148073,5e-06,0.066826,-0.026363,-0.037771,-0.044428,0.035692,0.052863,-0.043461,0.028577,0.023018,47209


In [None]:
features.drop('product_name', axis=1, inplace=True)

In [None]:
features = features.merge(sample_embeddings, on='product_id')
features.drop('product_name', axis=1, inplace=True)

In [None]:
print(features.shape)
features.head()

### Split Train Test

In [34]:
# Ready the inputs into model
train_orders = orders[orders['eval_set']=='train']
test_orders = orders[orders['eval_set']=='test']

In [35]:
test_orders.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
38,2774568,3,test,13,5,15,11.0
44,329954,4,test,6,3,12,30.0
53,1528013,6,test,4,3,16,22.0
96,1376945,11,test,8,6,11,8.0
102,1356845,12,test,6,1,20,30.0


In [36]:
test_orders.shape

(75000, 7)

In [37]:
train_features = pandas.merge(train_orders, features.reset_index(), on='user_id')
test_features = pandas.merge(test_orders, features.reset_index(), on='user_id')

In [52]:
train_features.shape

(8474661, 26)

In [38]:
test_features.shape

(4833292, 26)

In [53]:
test_features.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,user_product_id,user_product_reorder_rate,user_distinct_products,user_reordered_products,user_product_dow,user_product_hod,user_product_dsp,user_product_orders,product_id,user_product_addCart,user_avg_basket_size,user_product_last_order_id,user_reorder_rate,product_total_orders,product_reorder_rate,product_reorders,user_orders,user_order_dsp,user_total_products
0,2774568,3,test,13,5,15,11.0,3016797,0.25,33,19.0,0.333333,15.0,7.0,3,16797,4.0,8.0,3160850,0.575758,142951,0.698155,99802.0,12,12.090909,96.0
1,2774568,3,test,13,5,15,11.0,3047766,0.75,33,19.0,0.888889,16.111111,12.125,9,47766,3.777778,7.222222,1402502,0.575758,176815,0.758103,134044.0,12,12.090909,86.666667
2,2774568,3,test,13,5,15,11.0,3000248,0.083333,33,19.0,3.0,19.0,9.0,1,248,3.0,9.0,444309,0.575758,6371,0.400251,2550.0,12,12.090909,108.0
3,2774568,3,test,13,5,15,11.0,3001005,0.083333,33,19.0,3.0,16.0,17.0,1,1005,5.0,6.0,676467,0.575758,463,0.440605,204.0,12,12.090909,72.0
4,2774568,3,test,13,5,15,11.0,3001819,0.25,33,19.0,0.666667,16.333333,11.333333,3,1819,2.666667,7.333333,1839752,0.575758,2424,0.492162,1193.0,12,12.090909,88.0


In [54]:
# del train_orders
# del test_orders
# del features

In [55]:
train_features['diff_order_hod'] = abs(train_features['user_product_hod']-train_features['order_hour_of_day']).map(lambda x: min(x, 24-x))
train_features['ratio_dsp'] = train_features['user_product_dsp']/train_features['days_since_prior_order']
train_features['diff_dow'] = abs(train_features['user_product_dow']-train_features['order_dow']).map(lambda x: min (x, 7-x))
train_features['user_product_orders_since_last'] = train_features['user_orders'] - train_features['user_product_last_order_id'].map(orders.order_number)
train_features['user_product_hour_vs_last'] = abs(train_features['order_hour_of_day'] - train_features['user_product_last_order_id'].map(orders.order_hour_of_day)).map(lambda x: min(x, 24-x))

test_features['diff_order_hod'] = abs(test_features['user_product_hod']-test_features['order_hour_of_day']).map(lambda x: min(x, 24-x))
test_features['ratio_dsp'] = test_features['user_product_dsp']/test_features['days_since_prior_order']
test_features['diff_dow'] = abs(test_features['user_product_dow']-test_features['order_dow']).map(lambda x: min (x, 7-x))
test_features['user_product_orders_since_last'] = test_features['user_orders'] - test_features['user_product_last_order_id'].map(orders.order_number)
test_features['user_product_hour_vs_last'] = abs(test_features['order_hour_of_day'] - test_features['user_product_last_order_id'].map(orders.order_hour_of_day)).map(lambda x: min(x, 24-x))

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,user_product_id,user_product_reorder_rate,user_distinct_products,user_reordered_products,user_product_dow,user_product_hod,user_product_dsp,user_product_orders,product_id,user_product_addCart,user_avg_basket_size,user_product_last_order_id,user_reorder_rate,product_total_orders,product_reorder_rate,product_reorders,user_orders,user_order_dsp,user_total_products,diff_order_hod,ratio_dsp,diff_dow,user_product_orders_since_last,user_product_hour_vs_last
0,2774568,3,test,13,5,15,11.0,3016797,0.25,33,19.0,0.333333,15.0,7.0,3,16797,4.0,8.0,3160850,0.575758,142951,0.698155,99802.0,12,12.090909,96.0,0.0,0.636364,2.333333,7.0,1.0
1,2774568,3,test,13,5,15,11.0,3047766,0.75,33,19.0,0.888889,16.111111,12.125,9,47766,3.777778,7.222222,1402502,0.575758,176815,0.758103,134044.0,12,12.090909,86.666667,1.111111,1.102273,2.888889,-14.0,0.0
2,2774568,3,test,13,5,15,11.0,3000248,0.083333,33,19.0,3.0,19.0,9.0,1,248,3.0,9.0,444309,0.575758,6371,0.400251,2550.0,12,12.090909,108.0,4.0,0.818182,2.0,-1.0,4.0
3,2774568,3,test,13,5,15,11.0,3001005,0.083333,33,19.0,3.0,16.0,17.0,1,1005,5.0,6.0,676467,0.575758,463,0.440605,204.0,12,12.090909,72.0,1.0,1.545455,2.0,6.0,2.0
4,2774568,3,test,13,5,15,11.0,3001819,0.25,33,19.0,0.666667,16.333333,11.333333,3,1819,2.666667,7.333333,1839752,0.575758,2424,0.492162,1193.0,12,12.090909,88.0,1.333333,1.030303,2.666667,-14.0,9.0


In [56]:
train_features.sort_values(['order_id','product_id'], inplace=True)
test_features.sort_values(['order_id','product_id'], inplace=True)

In [57]:
train_features.drop(['eval_set','order_number'], axis=1, inplace=True)
test_features.drop(['eval_set','order_number'], axis=1, inplace=True)

In [58]:
train_orders = orders[orders['eval_set']=='train']
trainProducts = pandas.merge(train_orders,trainingOrderItems, on='order_id')

In [59]:
trainProducts = trainProducts.groupby('user_id')['product_id'].apply(set)
trainProducts.head()

user_id
1    {196, 26405, 27845, 46149, 13032, 39657, 26088, 25133, 38928, 10258, 49235}                                                                                                                                        
2    {24838, 11913, 45066, 31883, 48523, 38547, 24852, 32792, 7963, 22559, 12324, 33957, 22825, 9387, 45613, 22963, 48821, 2361, 41787, 5699, 13640, 5450, 19019, 16589, 45645, 21329, 1757, 12007, 34284, 26352, 31612}
5    {40706, 21413, 20843, 48204, 21616, 19057, 20114, 15349, 16185}                                                                                                                                                    
7    {17638, 29894, 47272, 45066, 13198, 37999, 40852, 12053, 43967}                                                                                                                                                    
8    {27104, 15937, 5539, 41540, 31717, 48230, 22247, 41259, 37803, 21903, 10960, 7058, 4853, 47766, 48121, 25659, 23165, 28

In [60]:
trainLabels = []
for row in tqdm(train_features.itertuples()):
    trainLabels += [row.product_id in trainProducts[row.user_id]]
print(len(trainLabels))
print(train_features.shape)


8474661it [02:16, 62308.56it/s]


8474661
(8474661, 29)


In [61]:
trainLabels[:5]

[False, False, True, False, True]

In [62]:
num_feature_list=['diff_dow','ratio_dsp','diff_order_hod', 'user_avg_basket_size', 'user_product_hour_vs_last',
                  'days_since_prior_order','user_product_reorder_rate','user_product_dow','user_reordered_products',
                  'user_product_hod','user_product_dsp','user_product_orders','user_product_addCart',
                  'product_reorder_rate','product_total_orders', 'user_reorder_rate', 'user_distinct_products',
                  'user_product_orders_since_last']

In [63]:
train_features[num_feature_list] = (train_features[num_feature_list]-train_features[num_feature_list].mean())/((train_features[num_feature_list].max()-train_features[num_feature_list].min()))
test_features[num_feature_list] = (test_features[num_feature_list]-test_features[num_feature_list].mean())/((test_features[num_feature_list].max()-test_features[num_feature_list].min()))

In [64]:
train_features.head()

Unnamed: 0,order_id,user_id,order_dow,order_hour_of_day,days_since_prior_order,user_product_id,user_product_reorder_rate,user_distinct_products,user_reordered_products,user_product_dow,user_product_hod,user_product_dsp,user_product_orders,product_id,user_product_addCart,user_avg_basket_size,user_product_last_order_id,user_reorder_rate,product_total_orders,product_reorder_rate,product_reorders,user_orders,user_order_dsp,user_total_products,diff_order_hod,ratio_dsp,diff_dow,user_product_orders_since_last,user_product_hour_vs_last
4629306,1,112108,4,10,-0.178176,112108002067,0.181188,-0.140862,-0.098748,-0.292511,-0.153366,-0.18977,-0.01465,2067,-0.015349,-0.052874,2541372,0.268539,-0.040995,0.076044,1902.0,3,11.0,24.0,-0.319506,,0.4003,-0.052444,-0.298248
4629308,1,112108,4,10,-0.178176,112108005707,0.517923,-0.140862,-0.098748,-0.209177,0.085764,-0.056437,-0.004446,5707,-0.029238,-0.06329,186706,0.268539,-0.0451,0.044001,690.0,3,11.0,19.5,0.138827,,0.257443,-0.031931,-0.381581
4629307,1,112108,4,10,-0.178176,112108011109,0.517923,-0.140862,-0.098748,-0.292511,-0.066409,-0.18977,-0.004446,11109,-0.036182,-0.052874,2541372,0.268539,-0.038172,0.190429,3192.0,3,11.0,24.0,-0.15284,,0.4003,-0.052444,-0.298248
4629304,1,112108,4,10,-0.178176,112108014947,0.854658,-0.140862,-0.098748,-0.236955,0.064025,-0.056437,0.005758,14947,-0.038497,-0.059818,186706,0.268539,0.013487,0.295131,23463.0,3,11.0,21.0,0.09716,,0.305062,-0.031931,-0.381581
4629302,1,112108,4,10,-0.178176,112108022035,0.517923,-0.140862,-0.098748,-0.292511,-0.066409,-0.18977,-0.004446,22035,-0.029238,-0.052874,2541372,0.268539,0.078646,0.244622,45639.0,3,11.0,24.0,-0.15284,,0.4003,-0.052444,-0.298248


In [65]:
cat_feature_list = []
feature_list = num_feature_list+cat_feature_list

In [66]:
X_train = train_features[feature_list].fillna(0).as_matrix()[:500000]
Y_train = numpy.array(trainLabels).astype('int8')[:500000]

  """Entry point for launching an IPython kernel.


In [67]:
Y_train

array([0, 0, 1, ..., 0, 0, 0], dtype=int8)

In [68]:
# fix random seed for reproducibility
numpy.random.seed(7)

# create model
model = Sequential()
model.add(Dense(13, input_dim=len(feature_list), activation='relu'))
model.add(Dense(250, activation='relu'))
model.add(Dense(250, activation='relu'))
model.add(Dense(250, activation='relu'))
model.add(Dense(250, activation='relu'))
model.add(Dense(250, activation='relu'))
model.add(Dense(250, activation='relu'))
model.add(Dense(250, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

In [69]:
# Compile model
adam = optimizers.Adam()

# Fit the model
model.compile(loss='binary_crossentropy', optimizer=adam, metrics=['accuracy'])
model.fit(X_train, Y_train, epochs=10,verbose=0, callbacks=[TQDMNotebookCallback()])





<keras.callbacks.History at 0x181ef79048>

In [70]:
X_test = test_features[feature_list].fillna(0).as_matrix()
test_preds = model.predict(X_test)


  """Entry point for launching an IPython kernel.


In [71]:
test_preds

array([[0.07427749],
       [0.11459035],
       [0.05726245],
       ...,
       [0.06803884],
       [0.11764333],
       [0.10903908]], dtype=float32)

In [72]:
past_order_id = -1
reorderedProducts = []
output = []
i = 0 
maxProd = [0,0]

In [73]:
test_features.head()

Unnamed: 0,order_id,user_id,order_dow,order_hour_of_day,days_since_prior_order,user_product_id,user_product_reorder_rate,user_distinct_products,user_reordered_products,user_product_dow,user_product_hod,user_product_dsp,user_product_orders,product_id,user_product_addCart,user_avg_basket_size,user_product_last_order_id,user_reorder_rate,product_total_orders,product_reorder_rate,product_reorders,user_orders,user_order_dsp,user_total_products,diff_order_hod,ratio_dsp,diff_dow,user_product_orders_since_last,user_product_hour_vs_last
858095,17,36855,6,15,-0.446237,36855001283,0.096375,-0.145215,-0.14103,-0.458892,-0.196971,0.576189,-0.014692,1283,-0.040674,-0.061448,234692,-0.16614,-0.047506,-0.351908,6.0,4,14.0,36.0,0.181576,,-0.170464,-0.06782,-0.132405
858092,17,36855,6,15,-0.446237,36855006291,0.096375,-0.145215,-0.14103,0.541108,0.02042,-0.223811,-0.014692,6291,-0.002212,-0.061448,3248434,-0.16614,-0.043403,0.090071,1217.0,4,14.0,36.0,-0.235091,,-0.456178,-0.231922,-0.215738
858088,17,36855,6,15,-0.446237,36855007035,0.096375,-0.145215,-0.14103,-0.292225,-0.023058,,-0.014692,7035,-0.059905,-0.098485,898818,-0.16614,-0.043477,-0.060709,924.0,4,14.0,20.0,-0.151758,,0.115251,-0.072948,-0.215738
858094,17,36855,6,15,-0.446237,36855011494,0.096375,-0.145215,-0.14103,-0.292225,-0.023058,,-0.014692,11494,-0.040674,-0.098485,898818,-0.16614,-0.044793,-0.184154,476.0,4,14.0,20.0,-0.151758,,0.115251,-0.072948,-0.215738
858097,17,36855,6,15,-0.446237,36855013107,0.601477,-0.145215,-0.14103,0.152219,-0.066536,0.042856,0.005716,13107,-0.066315,-0.07688,1058761,-0.16614,-0.046778,0.108324,237.0,4,14.0,29.333333,-0.068424,,0.210489,-0.098589,-0.132405


In [74]:
for row in tqdm(test_features.itertuples()):
    if (row.order_id!=past_order_id):
        if (past_order_id==-1):
            pass
        else:
            if (reorderedProducts == []):
                reorderedProducts.append(maxProd[0])
            output.append([past_order_id," ".join(reorderedProducts)])
            reorderedProducts = []
            maxProd = [0,0]
        past_order_id = row.order_id
        
    if (test_preds[i]>.2):
            reorderedProducts.append(str(row.product_id))
    else:
        if (test_preds[i] > maxProd[1]):
            maxProd = [str(row.product_id),test_preds[i]]
    i+=1


4833292it [00:26, 185618.68it/s]


In [75]:
output.append([past_order_id," ".join(reorderedProducts)])

output[5]

[313, '12779 13198 14077 21903 25890 45007 46906 49683']

In [79]:
output[1][1]

'21137 39180 39475 43504 47029 47766 47792'

In [83]:
order_id_list = []
product_output_list = []

for order_id in range(0,len(output)):
    order_id_list.append(output[order_id][0])
    
for product_list in range(0,len(output)):
    product_output_list.append(output[product_list][1]) 

In [84]:
product_output_list[5]

'12779 13198 14077 21903 25890 45007 46906 49683'

In [85]:
kernel_submission = pandas.DataFrame({'order_id':order_id_list, 'products':product_output_list})
print(kernel_submission.shape)
kernel_submission.head()

(75000, 2)


Unnamed: 0,order_id,products
0,17,13107 21463 21709 47766
1,34,21137 39180 39475 43504 47029 47766 47792
2,137,2326 23794 24852 25890 38689 41787 43352
3,182,5479 9337 13629 21903 24009 27104 30391 33000 34243 35951 39275 47209 47672
4,257,1025 4605 13176 13870 21137 24838 24852 27104 27966 28476 29837 30233 30391 36929 38558 45013 47766 49235


In [86]:
pwd

'/Users/waficel-assi/instacart-kaggle-competition'

In [87]:
kernel_submission.to_csv('./data/keras_kernel_submission_sample.csv', index=False)