In [1]:
import pandas as pd
import numpy as np
import gc
import spacy
from tqdm import tqdm
import matplotlib.pyplot as plt
import sys
import pickle

In [2]:
order_prior = pd.read_csv("order_products__prior.csv")
orders      = pd.read_csv("orders.csv")

In [3]:
temp = order_prior[["order_id",'reordered']].groupby('order_id').agg('sum')

none_prior=pd.DataFrame({'order_id':temp[temp.reordered==0].index,'product_id':'None','add_to_cart_order':0,'reordered':1})
none_prior.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,6,,0,1
1,7,,0,1
2,13,,0,1
3,20,,0,1
4,24,,0,1


In [4]:
prior_data = pd.concat([order_prior.drop(['add_to_cart_order'],axis=1),none_prior.drop(['add_to_cart_order'],axis=1)])
prior_data = prior_data.merge(orders,on='order_id').fillna(0)

prior_data.head()

Unnamed: 0,order_id,product_id,reordered,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2,33120,1,202279,prior,3,5,9,8.0
1,2,28985,1,202279,prior,3,5,9,8.0
2,2,9327,0,202279,prior,3,5,9,8.0
3,2,45918,1,202279,prior,3,5,9,8.0
4,2,30035,0,202279,prior,3,5,9,8.0


In [5]:
prior_data.user_id[prior_data.product_id!='None'].nunique()

206209

In [6]:
orders

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0
...,...,...,...,...,...,...,...
3421078,2266710,206209,prior,10,5,18,29.0
3421079,1854736,206209,prior,11,4,10,30.0
3421080,626363,206209,prior,12,1,12,18.0
3421081,2977660,206209,prior,13,1,12,7.0


In [7]:
test_order = orders[orders.eval_set=='test']
test_order = test_order.drop(['eval_set'],axis=1)
#prior_data = prior_data[prior_data.reordered==1]
temp = prior_data[['user_id','product_id']].drop_duplicates()
test_data = test_order.merge(temp,on='user_id')
test_data.head()

Unnamed: 0,order_id,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_id
0,2774568,3,13,5,15,11.0,38596
1,2774568,3,13,5,15,11.0,21903
2,2774568,3,13,5,15,11.0,248
3,2774568,3,13,5,15,11.0,40604
4,2774568,3,13,5,15,11.0,8021


In [8]:
test_data.shape

(4908292, 7)

In [9]:
test_data.order_dow = test_data.order_dow.astype(np.int8)
test_data.order_hour_of_day = test_data.order_hour_of_day.astype(np.int8)
test_data.order_number = test_data.order_number.astype(np.int8)
test_data.days_since_prior_order = test_data.days_since_prior_order.astype(np.int8)

In [10]:
test_data['cos_cyclic_hour'] = np.cos((2*np.pi*test_data.order_hour_of_day)/24).astype(np.float16)
test_data['sin_cyclic_hour'] = np.sin((2*np.pi*test_data.order_hour_of_day)/24).astype(np.float16)
test_data['cos_cyclic_day']  = np.cos((2*np.pi*test_data.order_dow)/7).astype(np.float32)
test_data['sin_cyclic_day']  = np.sin((2*np.pi*test_data.order_dow)/7).astype(np.float32)

In [11]:
day_prior_ratio = pickle.load(open('day_prior_ratio.pkl','rb'))
hour_ratio      = pickle.load(open('hour_of_week.pkl','rb'))
day_week_ratio  = pickle.load(open('day_of_week.pkl','rb'))
user_product    = pickle.load(open('user_product_ratio.pkl','rb'))
product_day_week_ratio  = pickle.load(open('product_day_week_ratio.pkl','rb'))
product_hour_ratio=pickle.load(open('product_hour_ratio.pkl','rb'))
user_day_ratio  = pickle.load(open('user_day_ratio.pkl','rb'))  
user_hour_ratio = pickle.load(open('user_hour_ratio.pkl','rb'))
user_since_product = pickle.load(open('user_since_product.pkl','rb'))
user_times_product = pickle.load(open('user_times_product.pkl','rb'))

In [12]:
test_data = test_data.merge(day_prior_ratio,on='days_since_prior_order')

In [13]:
test_data = test_data.merge(hour_ratio,on='order_hour_of_day')

In [14]:
test_data = test_data.merge(product_hour_ratio,on=['product_id','order_hour_of_day'],how='left')

In [15]:
test_data = test_data.merge(day_week_ratio,on='order_dow')

In [16]:
test_data = test_data.merge(product_day_week_ratio,on=['product_id','order_dow'],how='left')

In [17]:
test_data = test_data.merge(user_product,on=['user_id','product_id'],how='left').fillna(0)

In [18]:
test_data = test_data.merge(user_day_ratio,on=['user_id','order_dow'],how='left')

In [19]:
test_data = test_data.merge(user_hour_ratio,on=['user_id','order_hour_of_day'],how='left')

In [20]:
test_data = test_data.merge(user_since_product,on=['user_id','product_id'],how='left')

In [21]:
test_data = test_data.merge(user_times_product,on=['user_id','product_id'],how='left')

In [22]:
test_data.isna().sum()

order_id                        0
user_id                         0
order_number                    0
order_dow                       0
order_hour_of_day               0
days_since_prior_order          0
product_id                      0
cos_cyclic_hour                 0
sin_cyclic_hour                 0
cos_cyclic_day                  0
sin_cyclic_day                  0
daypriorratio                   0
hod_ratio                       0
product_hour_ratio              0
dow_ratio                       0
product_day_ratio               0
user_product_ratio              0
user_day_ratio             684510
user_hour_ratio           1516943
since_order_product         75000
user_times_product        2899174
dtype: int64

In [23]:
test_data = test_data.fillna(0)

In [24]:
pro_pro = []
for i in tqdm(range(len(test_data))):
    if test_data.loc[i,'since_order_product']==0:
        pro_pro.append(test_data.loc[i,'days_since_prior_order'])
    else:
        pro_pro.append(test_data.loc[i,'since_order_product'])
test_data['pro_pro'] = pro_pro

100%|█████████████████████████████████████████████████████████████████████| 4908292/4908292 [03:34<00:00, 22870.55it/s]


-> These weighted features are referenced from https://github.com/alexanderrich/instacart-analysis/blob/master/preprocess.py

In [25]:
test_data['weight7days_sin_since_product'] = (1.01 + np.sin(2*np.pi*(test_data['pro_pro']/7)))/2
test_data['weight7days_cos_since_product'] = (1.01 + np.cos(2*np.pi*(test_data['pro_pro']/7)))/2

In [26]:
test_data['weight14days_sin_since_product'] = (1.01 + np.sin(2*np.pi*(test_data['pro_pro']/14)))/2
test_data['weight14days_cos_since_product'] = (1.01 + np.cos(2*np.pi*(test_data['pro_pro']/14)))/2

In [27]:
test_data['weight30days_sin_since_product'] = (1.01 + np.sin(2*np.pi*(test_data['pro_pro']/30)))/2
test_data['weight30days_cos_since_product'] = (1.01 + np.cos(2*np.pi*(test_data['pro_pro']/30)))/2

In [28]:
test_data = test_data.drop(['pro_pro'],axis=1)

In [29]:
test_data.shape,test_data.drop_duplicates().shape,test_data.isna().sum()

((4908292, 27),
 (4908292, 27),
 order_id                          0
 user_id                           0
 order_number                      0
 order_dow                         0
 order_hour_of_day                 0
 days_since_prior_order            0
 product_id                        0
 cos_cyclic_hour                   0
 sin_cyclic_hour                   0
 cos_cyclic_day                    0
 sin_cyclic_day                    0
 daypriorratio                     0
 hod_ratio                         0
 product_hour_ratio                0
 dow_ratio                         0
 product_day_ratio                 0
 user_product_ratio                0
 user_day_ratio                    0
 user_hour_ratio                   0
 since_order_product               0
 user_times_product                0
 weight7days_sin_since_product     0
 weight7days_cos_since_product     0
 weight14days_sin_since_product    0
 weight14days_cos_since_product    0
 weight30days_sin_since_product    0
 weigh

In [30]:
products = pickle.load(open('products_pca.pkl','rb'))

In [31]:
col = ['pca'+str(i) for i in range(0,30)]
products[col] = products[col].astype(np.float32)

In [32]:
test_data = test_data.merge(products,on='product_id')

In [33]:
test_data.shape

(4908292, 57)

In [34]:
sys.getsizeof(test_data)

1592911624

In [35]:
test_data.isna().sum().sum()

0

In [36]:
pickle.dump(test_data,open('test_data_2.pkl','wb'))