In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [2]:
# Read the four raw CSV tables (same files, same order as before) and bind
# them to their module-level names in one unpacking assignment.
all_products, all_orders, prior_orders, train_set = (
    pd.read_csv(csv_path)
    for csv_path in (
        'products.csv',  # 49688 (figure noted by the original author; presumably the row count)
        'orders.csv',
        'order_products__prior.csv',
        'order_products__train.csv',
    )
)

In [3]:
# Left-join the order-level columns (user_id, eval_set, order_number, ...)
# onto every prior order line, keyed by order_id.
prior_orders_extended = pd.merge(prior_orders, all_orders, how='left', on='order_id')

## FEATURES

In [4]:
prior_orders_extended.columns

Index(['order_id', 'product_id', 'add_to_cart_order', 'reordered', 'user_id',
       'eval_set', 'order_number', 'order_dow', 'order_hour_of_day',
       'days_since_prior_order'],
      dtype='object')

In [5]:
prior_orders_extended.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2,33120,1,1,202279,prior,3,5,9,8.0
1,2,28985,2,1,202279,prior,3,5,9,8.0
2,2,9327,3,0,202279,prior,3,5,9,8.0
3,2,45918,4,1,202279,prior,3,5,9,8.0
4,2,30035,5,0,202279,prior,3,5,9,8.0


### CUSTOMER

In [6]:
# Per-user basket statistics computed over the prior order lines.
cust_attribs1 = prior_orders_extended.groupby('user_id').agg({
    'product_id': ['count', 'nunique'],
    'order_id': 'nunique',
    'reordered': 'sum',
})
# Flatten the two-level (column, aggregate) header into "column_aggregate".
cust_attribs1.columns = ['_'.join(level_pair) for level_pair in cust_attribs1.columns]
cust_attribs1 = cust_attribs1.reset_index().rename(columns={
    'product_id_count': 'total_basket_size',
    'product_id_nunique': 'unique_basket_size',
    'order_id_nunique': 'user_num_prior_orders',
    'reordered_sum': 'reordered_basket_size',
})
cust_attribs1.head()

Unnamed: 0,user_id,total_basket_size,unique_basket_size,user_num_prior_orders,reordered_basket_size
0,1,59,18,10,41
1,2,195,102,14,93
2,3,88,33,12,55
3,4,18,17,5,1
4,5,37,23,4,14


In [7]:
#cust_attribs2 = prior_orders_extended.groupby('user_id')['product_id'].apply(set).reset_index().rename(columns={'product_id':'products_list'})
#cust_attribs2.head()

In [8]:
# Per-user ordering cadence, computed from all_orders (not only the prior
# order lines): mean gap between orders and total number of orders.
cust_attribs3 = (
    all_orders
    .groupby('user_id')
    .agg({'days_since_prior_order': 'mean', 'order_id': 'count'})
    .rename(columns={
        'days_since_prior_order': 'user_avg_frequency',
        'order_id': 'user_num_orders',
    })
    .reset_index()
)
cust_attribs3.head()

Unnamed: 0,user_id,user_avg_frequency,user_num_orders
0,1,19.0,11
1,2,16.285714,15
2,3,12.0,13
3,4,17.0,6
4,5,11.5,5


In [9]:
#cust_attribs = cust_attribs1.merge(cust_attribs2,how='left',on='user_id').\
#     merge(cust_attribs3,how='left',on='user_id')
#del cust_attribs1,cust_attribs2,cust_attribs3
# Combine the two per-user tables, then drop the intermediates.
cust_attribs = pd.merge(cust_attribs1, cust_attribs3, how='left', on='user_id')
del cust_attribs1, cust_attribs3

# Ratio features derived from the per-user totals.
cust_attribs = cust_attribs.assign(
    average_basket_size=cust_attribs['total_basket_size'] / cust_attribs['user_num_prior_orders'],
    user_repeatability=cust_attribs['reordered_basket_size'] / cust_attribs['total_basket_size'],
)
cust_attribs.head()

Unnamed: 0,user_id,total_basket_size,unique_basket_size,user_num_prior_orders,reordered_basket_size,user_avg_frequency,user_num_orders,average_basket_size,user_repeatability
0,1,59,18,10,41,19.0,11,5.9,0.694915
1,2,195,102,14,93,16.285714,15,13.928571,0.476923
2,3,88,33,12,55,12.0,13,7.333333,0.625
3,4,18,17,5,1,17.0,6,3.6,0.055556
4,5,37,23,4,14,11.5,5,9.25,0.378378


### CUST-PRODUCT

In [10]:
# Per (user, product) pair over the prior order lines: last order number in
# which the product appeared, number of order lines, mean cart position and
# number of reorders.
t = prior_orders_extended.groupby(['user_id', 'product_id']).agg({
    'order_number': ['max', 'count'],
    'add_to_cart_order': 'mean',
    'reordered': 'sum',
})
# Flatten the two-level (column, aggregate) header into "column_aggregate".
t.columns = ['_'.join(level_pair) for level_pair in t.columns]
t = t.reset_index().rename(columns={
    'order_number_max': 'user_prod_last_order_num',
    'order_number_count': 'user_prod_num_orders',
    'add_to_cart_order_mean': 'avg_prod_basket_position',
    'reordered_sum': 'user_prod_num_reorders',
})
t.head()

Unnamed: 0,user_id,product_id,user_prod_last_order_num,user_prod_num_orders,avg_prod_basket_position,user_prod_num_reorders
0,1,196,10,10,1.4,9
1,1,10258,10,9,3.333333,8
2,1,10326,5,1,5.0,0
3,1,12427,10,10,3.3,9
4,1,13032,10,3,6.333333,2


In [11]:
# Keep, for every (user, product) pair, the prior order line whose
# order_number equals the pair's last order number; the inner join therefore
# brings in the order_id of that last order alongside the aggregates in `t`.
user_prod_attribs = pd.merge(
    prior_orders_extended[['user_id', 'product_id', 'order_id', 'order_number']],
    t,
    how='inner',
    left_on=['user_id', 'product_id', 'order_number'],
    right_on=['user_id', 'product_id', 'user_prod_last_order_num'],
)
user_prod_attribs.head()

Unnamed: 0,user_id,product_id,order_id,order_number,user_prod_last_order_num,user_prod_num_orders,avg_prod_basket_position,user_prod_num_reorders
0,202279,9327,2,3,3,1,3.0,0
1,202279,1819,2,3,3,2,9.5,1
2,205970,17668,3,16,16,6,9.0,5
3,156122,9633,5,42,42,16,10.8125,15
4,156122,12962,5,42,42,5,17.2,4


User-product frequency would make more sense than the user's average order frequency, because product quantities are not given. E.g. one user's usual order might be 1 kg of pulses ordered once a week, while another's might be 2 kg ordered once every two weeks. So this might be a valuable feature. TODO: work on adding this later - it requires quite a bit of tweaking, as order dates are not given as timestamps.

In [12]:
# order_number duplicates user_prod_last_order_num after the join above, so
# drop it; the remaining order_id is the id of the pair's last prior order.
user_prod_attribs = (
    user_prod_attribs
    .drop('order_number', axis=1)
    .rename(columns={'order_id': 'user_prod_last_order_id'})
)
# Share of the pair's order lines that were flagged as reorders.
user_prod_attribs['user_prod_repeatability'] = (
    user_prod_attribs['user_prod_num_reorders'] / user_prod_attribs['user_prod_num_orders']
)

### PRODUCT

In [13]:
# Per-product totals over the prior order lines: reorders and order lines.
prod_attribs = prior_orders_extended.groupby('product_id').agg({'reordered': ['sum', 'count']})
prod_attribs.columns = ['_'.join(level_pair) for level_pair in prod_attribs.columns]
prod_attribs = (
    prod_attribs
    .reset_index()
    .rename(columns={
        'reordered_sum': 'prod_num_reorders',
        'reordered_count': 'prod_num_orders',
    })
)
# Share of the product's order lines that were reorders.
prod_attribs['prod_repeatability'] = prod_attribs['prod_num_reorders'].div(
    prod_attribs['prod_num_orders'])
prod_attribs.head()

Unnamed: 0,product_id,prod_num_reorders,prod_num_orders,prod_repeatability
0,1,1136,1852,0.613391
1,2,12,90,0.133333
2,3,203,277,0.732852
3,4,147,329,0.446809
4,5,9,15,0.6


### PREPARING TRAINING DATA

In [14]:
print(train_set.columns)
# Split the order headers by their eval_set tag.
test_orders = all_orders.loc[all_orders['eval_set'] == 'test']
train_orders = all_orders.loc[all_orders['eval_set'] == 'train']
all_orders.head()

Index(['order_id', 'product_id', 'add_to_cart_order', 'reordered'], dtype='object')


Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0


In [15]:
print("Number of orders in train_set : ",train_set.order_id.nunique())
print("Number of orders in train_orders : ",train_orders.order_id.nunique())
print("Train set: ",train_set.shape)
print("Train orders: ",train_orders.shape)

Number of orders in train_set :  131209
Number of orders in train_orders :  131209
Train set:  (1384617, 4)
Train orders:  (131209, 7)


In [16]:
train_set.reordered.value_counts()

1    828824
0    555793
Name: reordered, dtype: int64

The given training data set and the test data set derived from orders have different columns. We'll pick order_id, product_id and the reordered flag from the given training set, map them to the train-tagged rows of the all_orders set, and from there add the features we generated. The user-product pair can be taken as the index, the rest as features and reordered/not as the label, and the model can be trained.

In [17]:
# Build the training frame: start from the labelled train order lines and
# left-join, in turn, the order header, product, customer, product catalogue
# and user-product feature tables. Left joins keep one row per train_set row.
train_df = (
    train_set
    .merge(train_orders, how='left', on='order_id')
    .merge(prod_attribs, how='left', on='product_id')
    .merge(cust_attribs, how='left', on='user_id')
    .merge(all_products, how='left', on='product_id')
    .merge(user_prod_attribs, how='left', on=['user_id', 'product_id'])
)

print(train_orders.shape)
print(train_df.shape)
print(train_df.columns)
print("Number of orders in train_df : ",train_df.order_id.nunique())
print("Number of users in train_df : ",train_df.user_id.nunique())
# Vectorised NaN count; the previous builtin sum() over an element-wise
# `x != x` comparison iterated the whole Series at Python level.
print("Any NaNs in target var : ",train_df['reordered'].isnull().sum())
# Rebind instead of mutating in place so the cell reads as a plain pipeline.
train_df = train_df.set_index(['user_id', 'product_id'])
train_df.head()

(131209, 7)
(1384617, 30)
Index(['order_id', 'product_id', 'add_to_cart_order', 'reordered', 'user_id',
       'eval_set', 'order_number', 'order_dow', 'order_hour_of_day',
       'days_since_prior_order', 'prod_num_reorders', 'prod_num_orders',
       'prod_repeatability', 'total_basket_size', 'unique_basket_size',
       'user_num_prior_orders', 'reordered_basket_size', 'user_avg_frequency',
       'user_num_orders', 'average_basket_size', 'user_repeatability',
       'product_name', 'aisle_id', 'department_id', 'user_prod_last_order_id',
       'user_prod_last_order_num', 'user_prod_num_orders',
       'avg_prod_basket_position', 'user_prod_num_reorders',
       'user_prod_repeatability'],
      dtype='object')
Number of orders in train_df :  131209
Number of users in train_df :  131209
Any NaNs in target var :  0


Unnamed: 0_level_0,Unnamed: 1_level_0,order_id,add_to_cart_order,reordered,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,prod_num_reorders,prod_num_orders,...,user_repeatability,product_name,aisle_id,department_id,user_prod_last_order_id,user_prod_last_order_num,user_prod_num_orders,avg_prod_basket_position,user_prod_num_reorders,user_prod_repeatability
user_id,product_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
112108,49302,1,1,1,train,4,4,10,9.0,101.0,163.0,...,0.428571,Bulgarian Yogurt,120,16,2541372.0,2.0,2.0,2.5,1.0,0.5
112108,11109,1,2,1,train,4,4,10,9.0,3192.0,4472.0,...,0.428571,Organic 4% Milk Fat Whole Milk Cottage Cheese,108,16,2541372.0,2.0,2.0,4.0,1.0,0.5
112108,10246,1,3,0,train,4,4,10,9.0,12498.0,23826.0,...,0.428571,Organic Celery Hearts,83,4,,,,,,
112108,49683,1,4,0,train,4,4,10,9.0,67313.0,97315.0,...,0.428571,Cucumber Kirby,83,4,,,,,,
112108,43633,1,5,1,train,4,4,10,9.0,312.0,653.0,...,0.428571,Lightly Smoked Sardines in Olive Oil,95,15,186706.0,3.0,2.0,3.5,1.0,0.5


In [18]:
train_df.isnull().sum()

order_id                         0
add_to_cart_order                0
reordered                        0
eval_set                         0
order_number                     0
order_dow                        0
order_hour_of_day                0
days_since_prior_order           0
prod_num_reorders                9
prod_num_orders                  9
prod_repeatability               9
total_basket_size                0
unique_basket_size               0
user_num_prior_orders            0
reordered_basket_size            0
user_avg_frequency               0
user_num_orders                  0
average_basket_size              0
user_repeatability               0
product_name                     0
aisle_id                         0
department_id                    0
user_prod_last_order_id     555793
user_prod_last_order_num    555793
user_prod_num_orders        555793
avg_prod_basket_position    555793
user_prod_num_reorders      555793
user_prod_repeatability     555793
dtype: int64

The NaNs in these variables come from the user_prod dataframe, meaning the user-product combination in the training data is new. Verifying this below. 
For imputing these missing values, we can ignore user_prod_last_order_id and user_prod_last_order_num, as we are not using them anywhere at the moment. For the rest: 
since we have a single order per user, we can replace user_prod_num_orders with 1, the average basket position with the current basket position, and user_prod_num_reorders and repeatability with 0.

9 rows have NaNs on product related columns, which means the product appeared for the first time. Dropping these rows. 

In [19]:
train_df[train_df['user_prod_last_order_id'].isnull()].head(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,order_id,add_to_cart_order,reordered,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,prod_num_reorders,prod_num_orders,...,user_repeatability,product_name,aisle_id,department_id,user_prod_last_order_id,user_prod_last_order_num,user_prod_num_orders,avg_prod_basket_position,user_prod_num_reorders,user_prod_repeatability
user_id,product_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
112108,10246,1,3,0,train,4,4,10,9.0,12498.0,23826.0,...,0.428571,Organic Celery Hearts,83,4,,,,,,
112108,49683,1,4,0,train,4,4,10,9.0,67313.0,97315.0,...,0.428571,Cucumber Kirby,83,4,,,,,,
112108,13176,1,6,0,train,4,4,10,9.0,315913.0,379450.0,...,0.428571,Bag of Organic Bananas,24,4,,,,,,
112108,47209,1,7,0,train,4,4,10,9.0,170131.0,213584.0,...,0.428571,Organic Hass Avocado,24,4,,,,,,
79431,39612,36,1,0,train,23,6,18,30.0,250.0,728.0,...,0.459893,Grated Pecorino Romano Cheese,2,16,,,,,,


In [20]:
prior_orders_extended[(prior_orders_extended['user_id']==112108) & (prior_orders_extended['product_id']==10246)]

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order


In [21]:
# Impute the user-product features for pairs that never occurred in the prior
# orders. Columns are assigned explicitly: `train_df.col.fillna(..., inplace=True)`
# mutates an intermediate Series, which is deprecated chained assignment and
# does not update the frame when pandas Copy-on-Write is active.
#
# NOTE(review): the number of imputed rows (555793) equals the number of
# reordered == 0 rows in train_set, so these imputed values appear to coincide
# exactly with the negative label, and avg_prod_basket_position is filled from
# the train order itself. Confirm this is not leaking the target to the model.
train_df['user_prod_num_orders'] = train_df['user_prod_num_orders'].fillna(1)
# Where no prior average exists, fall back to the position in the current order.
train_df['avg_prod_basket_position'] = train_df['add_to_cart_order'].where(
    train_df['avg_prod_basket_position'].isnull(), train_df['avg_prod_basket_position'])
train_df['user_prod_num_reorders'] = train_df['user_prod_num_reorders'].fillna(0)
train_df['user_prod_repeatability'] = train_df['user_prod_repeatability'].fillna(0)
# Drop the few rows whose product has no prior-order statistics at all.
train_df = train_df.dropna(subset=['prod_num_orders'], axis=0)
train_df.isnull().sum()

order_id                         0
add_to_cart_order                0
reordered                        0
eval_set                         0
order_number                     0
order_dow                        0
order_hour_of_day                0
days_since_prior_order           0
prod_num_reorders                0
prod_num_orders                  0
prod_repeatability               0
total_basket_size                0
unique_basket_size               0
user_num_prior_orders            0
reordered_basket_size            0
user_avg_frequency               0
user_num_orders                  0
average_basket_size              0
user_repeatability               0
product_name                     0
aisle_id                         0
department_id                    0
user_prod_last_order_id     555784
user_prod_last_order_num    555784
user_prod_num_orders             0
avg_prod_basket_position         0
user_prod_num_reorders           0
user_prod_repeatability          0
dtype: int64

In [22]:
# Numeric model inputs (order, user, user-product and product level).
num_features = [
    'order_hour_of_day', 'days_since_prior_order',
    'total_basket_size', 'unique_basket_size', 'user_num_prior_orders',
    'reordered_basket_size', 'user_avg_frequency', 'user_num_orders',
    'average_basket_size', 'user_repeatability',
    'user_prod_num_orders', 'avg_prod_basket_position',
    'user_prod_num_reorders', 'user_prod_repeatability',
    'prod_num_reorders', 'prod_num_orders', 'prod_repeatability',
]
# Integer-coded categorical inputs.
cat_features = ['aisle_id', 'department_id', 'order_dow']
# De-duplicate while preserving list order. The previous set union produced an
# ordering that depends on string hashing and can change between kernel runs.
tot_features = list(dict.fromkeys(num_features + cat_features))

In [23]:
len(num_features)

17

In [24]:
import matplotlib.pyplot as plt
import seaborn as sns

# 6 x 3 grid of panels, filled row by row with one distribution plot per
# numeric feature.
fig, ax = plt.subplots(nrows=6, ncols=3, figsize=(30, 30))
flat_ax = ax.ravel()
for feature, panel in zip(num_features, flat_ax):
    sns.distplot(train_df[feature], ax=panel)

In [25]:
def get_normalization_parameters(traindf, features):
    """Collect per-feature summary statistics of ``traindf``.

    Args:
        traindf: DataFrame holding the columns named in ``features``.
        features: iterable of column names to summarise.

    Returns:
        dict mapping each feature name to a dict with the keys ``'mean'``,
        ``'std'``, ``'min'`` and ``'max'`` (in that order), each value cast to
        ``np.float32``.
    """
    stat_names = ('mean', 'std', 'min', 'max')

    def summarize(column):
        # Each statistic is the pandas Series method of the same name.
        series = traindf[column]
        return {stat: np.float32(getattr(series, stat)()) for stat in stat_names}

    return {column: summarize(column) for column in features}
#normalization_parameters = get_normalization_parameters(x_train, num_features)
normalization_parameters = get_normalization_parameters(train_df, num_features)
print(normalization_parameters)

{'order_hour_of_day': {'mean': 13.577586, 'std': 4.238456, 'min': 0.0, 'max': 23.0}, 'days_since_prior_order': {'mean': 17.066156, 'std': 10.42642, 'min': 0.0, 'max': 30.0}, 'total_basket_size': {'mean': 207.57744, 'std': 247.88081, 'min': 3.0, 'max': 3638.0}, 'unique_basket_size': {'mean': 81.414764, 'std': 62.598648, 'min': 1.0, 'max': 726.0}, 'user_num_prior_orders': {'mean': 16.091412, 'std': 16.614069, 'min': 3.0, 'max': 99.0}, 'reordered_basket_size': {'mean': 126.162674, 'std': 197.51443, 'min': 0.0, 'max': 3167.0}, 'user_avg_frequency': {'mean': 15.318091, 'std': 6.795476, 'min': 0.0, 'max': 30.0}, 'user_num_orders': {'mean': 17.091412, 'std': 16.614069, 'min': 4.0, 'max': 100.0}, 'average_basket_size': {'mean': 12.952141, 'std': 6.7662363, 'min': 1.0, 'max': 70.25}, 'user_repeatability': {'mean': 0.44721356, 'std': 0.20843686, 'min': 0.0, 'max': 0.9895288}, 'user_prod_num_orders': {'mean': 3.462198, 'std': 5.584559, 'min': 1.0, 'max': 99.0}, 'avg_prod_basket_position': {'mean'

In [None]:
from sklearn.pipeline import Pipeline, make_pipeline, FeatureUnion

In [None]:
get_normalization_parameters

In [26]:
# Min-max scale every numeric feature into a new "<feature>_s" column using
# the min/max stored in normalization_parameters.
# NOTE: a constant column would have max == min and divide by zero here.
num_features_s = []
for feature in num_features:
    feature_min = normalization_parameters[feature]['min']
    feature_max = normalization_parameters[feature]['max']
    train_df[feature + '_s'] = (train_df[feature] - feature_min) / (feature_max - feature_min)
    num_features_s.append(feature + '_s')
    print(feature)
# De-duplicate while preserving list order. The previous set union produced a
# column order that depends on string hashing and can change between kernel
# runs, which in turn changed the column order of the model inputs.
tot_features_s = list(dict.fromkeys(num_features_s + cat_features))

order_hour_of_day
days_since_prior_order
total_basket_size
unique_basket_size
user_num_prior_orders
reordered_basket_size
user_avg_frequency
user_num_orders
average_basket_size
user_repeatability
user_prod_num_orders
avg_prod_basket_position
user_prod_num_reorders
user_prod_repeatability
prod_num_reorders
prod_num_orders
prod_repeatability


In [None]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

In [None]:
from sklearn.pipeline import Pipeline, make_pipeline, FeatureUnion

In [27]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation. The split is seeded (same value as
# the TensorFlow seed used in this notebook) so reruns give the same rows.
x_train, x_eval, y_train, y_eval = train_test_split(
    train_df[tot_features_s], train_df['reordered'], test_size=0.2, random_state=123)

In [28]:
# Free the large intermediates that are no longer needed. train_set and
# all_products are deliberately kept: populate_features reads both from module
# scope when it is called, and deleting them here made that call raise NameError.
del train_df, all_orders, prior_orders_extended, prior_orders

In [29]:
import tensorflow as tf

# Turn on eager execution (TF 1.x API); a later cell evaluates
# fc.input_layer(...) directly and displays the resulting tensor.
tf.enable_eager_execution()

# Only emit TensorFlow log messages at ERROR level.
tf.logging.set_verbosity(tf.logging.ERROR)
# Seed TensorFlow's random number generation.
tf.set_random_seed(123)

In [30]:
fc = tf.feature_column


def std_scaler(feature, mean, std):
    """Return the z-score of ``feature`` given ``mean`` and ``std``."""
    return (feature - mean) / std


# Categorical inputs: one-hot (indicator) columns whose vocabulary is the set
# of values seen in x_train for that column.
all_features = [
    fc.indicator_column(
        fc.categorical_column_with_vocabulary_list(column_name, x_train[column_name].unique()))
    for column_name in cat_features
]

# Numeric inputs: the already min-max scaled "_s" columns, passed through as
# float32 without a normalizer_fn.
# Normalizing is creating a problem - check later.
# NOTE(review): the abandoned attempt looked up normalization_parameters with
# the "_s"-suffixed names (the dict is keyed by the unsuffixed names) and used
# a lambda that closed over per-iteration mean/std variables, so every column
# would have seen the values of the last iteration - likely the "problem".
all_features += [
    fc.numeric_column(column_name, dtype=tf.float32)
    for column_name in num_features_s
]

In [31]:
print(len(all_features))
all_features

20


[_IndicatorColumn(categorical_column=_VocabularyListCategoricalColumn(key='aisle_id', vocabulary_list=(3, 93, 8, 114, 124, 4, 84, 91, 48, 38, 24, 59, 130, 83, 21, 107, 36, 120, 32, 123, 115, 52, 117, 88, 19, 16, 112, 12, 53, 78, 1, 67, 81, 64, 5, 23, 98, 7, 20, 116, 61, 122, 26, 29, 92, 74, 104, 127, 111, 50, 131, 77, 69, 110, 128, 86, 129, 96, 109, 35, 94, 49, 37, 42, 106, 45, 121, 57, 66, 17, 2, 39, 108, 100, 79, 133, 30, 55, 75, 85, 54, 31, 72, 95, 27, 58, 63, 13, 41, 51, 105, 89, 99, 62, 119, 76, 125, 34, 43, 71, 40, 9, 15, 90, 82, 47, 25, 68, 103, 56, 70, 132, 14, 46, 22, 60, 80, 65, 28, 73, 6, 101, 118, 44, 97, 87, 126, 11, 33, 18, 102, 134, 113, 10), dtype=tf.int64, default_value=-1, num_oov_buckets=0)),
 _IndicatorColumn(categorical_column=_VocabularyListCategoricalColumn(key='department_id', vocabulary_list=(19, 3, 17, 5, 9, 16, 14, 1, 4, 15, 7, 13, 20, 12, 11, 18, 6, 21, 8, 10, 2), dtype=tf.int64, default_value=-1, num_oov_buckets=0)),
 _IndicatorColumn(categorical_column=_Vo

In [32]:
example = x_train.head(1)

In [33]:
fc.input_layer(dict(example), all_features)

<tf.Tensor: id=317, shape=(1, 179), dtype=float32, numpy=
array([[1.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.

In [34]:
x_train.shape,x_eval.shape

((1107686, 20), (276922, 20))

In [35]:
NUM_EXAMPLES = len(y_train)  # number of rows in the full training split


def make_input_fn(X, y, n_epochs=None, shuffle=True, batch_size=None):
    """Build an Estimator ``input_fn`` over the in-memory frame ``X`` and labels ``y``.

    Args:
        X: DataFrame of features; converted with ``dict(X)`` into a mapping of
            column name to column.
        y: labels aligned with ``X``.
        n_epochs: number of passes over the data; ``None`` repeats indefinitely.
        shuffle: whether to shuffle the rows.
        batch_size: rows per batch; defaults to ``len(y)`` so that one batch is
            exactly one pass over the data that was passed in.

    Returns:
        A zero-argument function returning a ``tf.data.Dataset`` of
        ``(features, labels)`` batches.
    """
    # Size the shuffle buffer and batch from the data actually passed in. The
    # global NUM_EXAMPLES (full split) was used before; because repeat() comes
    # before batch(), a single batch then spanned many repeated passes over
    # the small head() subsets used below.
    num_rows = max(len(y), 1)  # shuffle()/batch() require a positive size
    if batch_size is None:
        batch_size = num_rows

    def input_fn():
        dataset = tf.data.Dataset.from_tensor_slices((dict(X), y))
        if shuffle:
            dataset = dataset.shuffle(num_rows)
        # For training, cycle through the dataset as many times as needed (n_epochs=None).
        dataset = dataset.repeat(n_epochs)
        # In-memory training: by default the whole input is one batch.
        dataset = dataset.batch(batch_size)
        return dataset
    return input_fn


# Small subsets keep this smoke-test run fast.
train_input_fn = make_input_fn(x_train.head(800), y_train.head(800))
eval_input_fn = make_input_fn(x_eval.head(1000), y_eval.head(1000), shuffle=False, n_epochs=1)

In [36]:
# Baseline: a linear classifier over the feature columns, trained for at most
# 10 steps and then scored on the evaluation input.
linear_est = tf.estimator.LinearClassifier(feature_columns=all_features)
linear_est.train(input_fn=train_input_fn, max_steps=10)

results = linear_est.evaluate(input_fn=eval_input_fn)
# 'accuracy_baseline' is the evaluation metric reported here as the dummy model.
print('Accuracy : ', results['accuracy'])
print('Dummy model: ', results['accuracy_baseline'])

Accuracy :  0.681
Dummy model:  0.613


In [None]:
# Unexecuted scratch cell: grid search over a random-forest pipeline step
# named 'rf' (the 'rf__' prefixes address that step's parameters).
# NOTE(review): GridSearchCV, modeling_pipeline and X_train (capital X; the
# split above is named x_train) are not defined anywhere in the visible
# notebook, so this cell presumably fails on a fresh "Run All" - confirm and
# either complete or remove it.
param_grid = {'rf__n_estimators': [50, 100, 150], 'rf__max_depth': [10, 20, 30]}
rf_cv = GridSearchCV(modeling_pipeline, param_grid, iid=False, cv=2)
rf_cv.fit(X_train, y_train)
# Best parameter combination found by the search.
rf_tuned_hyperparameters = rf_cv.best_params_

In [37]:
def populate_features(df,chosen_features):
    """Left-join the precomputed feature tables onto ``df`` and select columns.

    The joins are applied in this order: ``cust_attribs`` on user_id,
    ``user_prod_attribs`` on user_id, ``prod_attribs`` on product_id,
    ``train_set`` on (order_id, product_id) and ``all_products`` on product_id.
    All five frames are read from module scope at call time, so they must
    still be defined when this function is called.

    NOTE(review): user_prod_attribs is joined on user_id alone, so presumably
    ``df`` is order-level (no product_id column) and this join expands each
    row into one row per product the user has in user_prod_attribs - confirm
    against the callers.

    Args:
        df: DataFrame to enrich; must provide the join keys listed above.
        chosen_features: column labels to return.

    Returns:
        The enriched frame restricted to ``chosen_features``.
    """
    join_plan = (
        (cust_attribs, 'user_id'),
        (user_prod_attribs, 'user_id'),
        (prod_attribs, 'product_id'),
        (train_set, ['order_id', 'product_id']),
        (all_products, 'product_id'),
    )
    df_new = df
    for feature_table, join_keys in join_plan:
        df_new = df_new.merge(feature_table, how='left', on=join_keys)
    return df_new[chosen_features]