In [1]:
import pandas as pd # dataframes
import numpy as np # numeric dtypes (int8/int16/int32/float16) used to shrink the loaded frames
import nltk # text preprocessing & manipulation
import matplotlib.pyplot as plt # plotting
import seaborn as sns # plotting

from functools import partial # pre-binds keyword arguments of pd.to_numeric further down
color = sns.color_palette() # seaborn colour palette, kept in a variable for later plots
import warnings
warnings.filterwarnings('ignore') # suppresses every warning globally (deprecation notices included)

In [2]:
# aisles lookup table (aisle_id -> aisle name)
aisles = pd.read_csv('data/aisles.csv', engine='c')
n_aisles = len(aisles)
print('Total aisles: {}'.format(n_aisles))
aisles.head()

Total aisles: 134


Unnamed: 0,aisle_id,aisle
0,1,prepared soups salads
1,2,specialty cheeses
2,3,energy granola bars
3,4,instant foods
4,5,marinades meat preparation


In [3]:
# departments lookup table (department_id -> department name)
departments = pd.read_csv('data/departments.csv', engine='c')
n_departments = len(departments)
print('Total departments: {}'.format(n_departments))
departments.head()

Total departments: 21


Unnamed: 0,department_id,department
0,1,frozen
1,2,other
2,3,bakery
3,4,produce
4,5,alcohol


In [4]:
# products catalogue; each row carries the aisle_id and department_id it belongs to
products = pd.read_csv('data/products.csv', engine='c')
n_products = len(products)
print('Total products: {}'.format(n_products))
products.head()

Total products: 49688


Unnamed: 0,product_id,product_name,aisle_id,department_id
0,1,Chocolate Sandwich Cookies,61,19
1,2,All-Seasons Salt,104,13
2,3,Robust Golden Unsweetened Oolong Tea,94,7
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1
4,5,Green Chile Anytime Sauce,5,13


In [5]:
# Build one product lookup table by left-joining the department and aisle
# names onto the products frame.
# (Original author's note: this merge turned out to be unnecessary.)
goods = (
    products
    .merge(departments, how='left', on='department_id')
    .merge(aisles, how='left', on='aisle_id')
)
# the separate products/aisles frames are dropped; note this makes the cell
# fail if it is re-run without reloading them first
del products, aisles
goods.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,department,aisle
0,1,Chocolate Sandwich Cookies,61,19,snacks,cookies cakes
1,2,All-Seasons Salt,104,13,pantry,spices seasonings
2,3,Robust Golden Unsweetened Oolong Tea,94,7,beverages,tea
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,frozen,frozen meals
4,5,Green Chile Anytime Sauce,5,13,pantry,marinades meat preparation


In [6]:
# train dataset
# explicit narrow dtypes reduce memory usage; this `dtype` mapping is also
# what the prior order-products load relies on
dtype = {
    'order_id': np.int32,
    'product_id': np.int32,
    'add_to_cart_order': np.int16,
    'reordered': np.int8,
}
op_train = pd.read_csv('data/order_products__train.csv', engine='c', dtype=dtype)
# DataFrame.info() prints its report itself and returns None, so it is called
# bare -- wrapping it in print() only appends a stray "None" line
op_train.info()
op_train.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1384617 entries, 0 to 1384616
Data columns (total 4 columns):
order_id             1384617 non-null int32
product_id           1384617 non-null int32
add_to_cart_order    1384617 non-null int16
reordered            1384617 non-null int8
dtypes: int16(1), int32(2), int8(1)
memory usage: 14.5 MB
None


Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,1,49302,1,1
1,1,11109,2,1
2,1,10246,3,0
3,1,49683,4,0
4,1,43633,5,1


In [7]:
# test dataset: the sample submission lists one row per order to predict
test = pd.read_csv('data/sample_submission.csv', engine='c')
n_test_orders = len(test)
print('Total orders(test): {}'.format(n_test_orders))
test.head()

Total orders(test): 75000


Unnamed: 0,order_id,products
0,17,39276 29259
1,34,39276 29259
2,137,39276 29259
3,182,39276 29259
4,257,39276 29259


In [8]:
# prior order-products
# relies on the `dtype` mapping defined when loading the train file; the
# orders cell rebinds `dtype` afterwards, so this cell must run before it
op_prior = pd.read_csv('data/order_products__prior.csv', engine='c', dtype=dtype)
# info() writes its report to stdout and returns None, so no print() wrapper
op_prior.info()
op_prior.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32434489 entries, 0 to 32434488
Data columns (total 4 columns):
order_id             int32
product_id           int32
add_to_cart_order    int16
reordered            int8
dtypes: int16(1), int32(2), int8(1)
memory usage: 340.3 MB
None


Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,2,33120,1,1
1,2,28985,2,1
2,2,9327,3,0
3,2,45918,4,1
4,2,30035,5,0


In [9]:
# orders
# NOTE: this rebinds the `dtype` name that the order-products loads used
dtype = {
    'order_id': np.int32,
    'user_id': np.int32,
    'order_number': np.int32,
    'order_dow': np.int8,
    'order_hour_of_day': np.int8,
    'days_since_prior_order': np.float16,
}
orders = pd.read_csv('data/orders.csv', engine='c', dtype=dtype)
# replace the eval_set strings with int8 codes to also reduce memory usage:
# prior -> 0, train -> 1, test -> 2
eval_set_codes = {'prior': 0, 'train': 1, 'test': 2}
orders['eval_set'] = orders['eval_set'].replace(eval_set_codes).astype(np.int8)
# info() prints its own report and returns None, so it is called bare
orders.info()
orders.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3421083 entries, 0 to 3421082
Data columns (total 7 columns):
order_id                  int32
user_id                   int32
eval_set                  int8
order_number              int32
order_dow                 int8
order_hour_of_day         int8
days_since_prior_order    float16
dtypes: float16(1), int32(3), int8(3)
memory usage: 55.5 MB
None


Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,0,1,2,8,
1,2398795,1,0,2,3,7,15.0
2,473747,1,0,3,3,12,21.0
3,2254736,1,0,4,4,7,29.0
4,431534,1,0,5,4,15,28.0


In [10]:
%%time
# for train
# One shared column-wise downcaster: integer-valued columns are shrunk to the
# smallest integer dtype that fits, columns that cannot be parsed as numbers
# are returned unchanged (errors='ignore').
# `partial` comes from the import cell at the top of the notebook; apply+partial
# runs at pretty much the same speed as the equivalent lambda.
downcast_ints = partial(pd.to_numeric, errors='ignore', downcast='integer')

# attach the order-level columns from `orders` to every train row
order_details = pd.merge(
    left=op_train,
    right=orders,
    how='left',
    on='order_id',
).apply(downcast_ints)

# attach the aisle and department ids of each product
order_details = pd.merge(
    left=order_details,
    right=goods[['product_id', 'aisle_id', 'department_id']].apply(downcast_ints),
    how='left',
    on='product_id',
)

# shapes are printed side by side so the row counts before/after the left
# joins can be compared
print(order_details.shape, op_train.shape)

# op_train could be deleted here to free memory (left disabled)
# del op_train
# info() prints its own report and returns None, so it is called bare; this
# form is valid on both Python 2 and Python 3
order_details.info()
order_details.head()

((1384617, 12), (1384617, 4))
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1384617 entries, 0 to 1384616
Data columns (total 12 columns):
order_id                  1384617 non-null int32
product_id                1384617 non-null int32
add_to_cart_order         1384617 non-null int8
reordered                 1384617 non-null int8
user_id                   1384617 non-null int32
eval_set                  1384617 non-null int8
order_number              1384617 non-null int8
order_dow                 1384617 non-null int8
order_hour_of_day         1384617 non-null int8
days_since_prior_order    1384617 non-null int8
aisle_id                  1384617 non-null int16
department_id             1384617 non-null int8
dtypes: int16(1), int32(3), int8(8)
memory usage: 39.6 MB
None
Wall time: 1.69 s


In [11]:
%%time
# for prior
# Product -> aisle/department id lookup with integer columns downcast.
# errors must be one of 'ignore' / 'raise' / 'coerce'; pandas versions that
# validate the argument reject any other spelling with a ValueError.
goods_ids = goods[['product_id', 'aisle_id', 'department_id']].apply(
    lambda x: pd.to_numeric(x, errors='ignore', downcast='integer'))

# Enrich the prior rows (product ids first, then order-level columns) and
# stack them under the train rows already held in order_details.
# NOTE: this cell appends to order_details, so re-running it without first
# re-running the train cell stacks the prior rows a second time.
# The enriched prior frame is built inline so no extra reference to it
# outlives the concat.
order_details = pd.concat([
    order_details,
    pd.merge(
        left=pd.merge(
            left=op_prior,
            right=goods_ids,
            how='left',
            on='product_id'),
        right=orders,
        how='left',
        on='order_id'),
])
# make sure we didn't forget to retain test dataset :D
# (eval_set == 2 is the code assigned to 'test' when orders was loaded)
test_orders = orders[orders.eval_set == 2]
# op_prior and orders could be deleted here to free memory (left disabled)
# del op_prior, orders

Wall time: 13 s


In [13]:
# persisting the combined order_details frame is currently disabled
# order_details.to_csv('agg.csv', index=False)
# write the test-set orders to disk without the index column
# NOTE(review): 'tset.csv' looks like a typo for 'test.csv' -- confirm nothing
# reads this file name before renaming it
test_orders.to_csv('tset.csv', index=False)

In [80]:
%%time
# baseline
# keep only the order_details rows of users that also appear in test_orders
test_history = order_details.loc[order_details['user_id'].isin(test_orders['user_id'])]
# highest order_number present in that history, per user
last_orders = test_history.groupby('user_id').order_number.max()

def get_last_orders_reordered(history=None, last=None, targets=None):
    """Build the baseline submission: products reordered in each user's last order.

    For every user, the rows of their highest-numbered order that are flagged
    ``reordered == 1`` are collected and attached to that user's target order.

    Parameters
    ----------
    history : DataFrame, optional
        Order-product rows with at least ``user_id``, ``order_number``,
        ``product_id`` and ``reordered`` columns. Defaults to the notebook
        global ``test_history``.
    last : Series, optional
        Highest ``order_number`` per user, indexed by ``user_id`` and named
        ``order_number``. Defaults to the notebook global ``last_orders``.
    targets : DataFrame, optional
        Orders to predict, with ``user_id`` and ``order_id`` columns.
        Defaults to the notebook global ``test_orders``.

    Returns
    -------
    DataFrame
        Columns ``order_id`` and ``products``; ``products`` is a
        space-separated string of product ids in ascending order, or the
        string ``'None'`` when the last order had no reordered product.
    """
    # fall back to the notebook-level frames so a bare call keeps working
    history = test_history if history is None else history
    last = last_orders if last is None else last
    targets = test_orders if targets is None else targets

    # reordered rows of each user's last order; users without any such row
    # survive the left join with a missing product_id
    last_reordered = pd.merge(
        left=last.reset_index(),
        right=history[history.reordered == 1],
        how='left',
        on=['user_id', 'order_number'],
    )[['user_id', 'product_id']]

    # attach the target order_id; -1 marks "no reordered product"
    per_order = pd.merge(
        left=last_reordered,
        right=targets[['user_id', 'order_id']],
        how='left',
        on='user_id',
    ).fillna(-1)

    def _join_ids(ids):
        # product_id is float after the left join, hence int(); sorting makes
        # the string independent of set iteration order
        return ' '.join(str(int(pid)) for pid in sorted(set(ids)))

    t = (
        per_order.groupby('order_id')['product_id']
        .apply(_join_ids)
        .reset_index()
        .replace(to_replace='-1', value='None')
    )
    t.columns = ['order_id', 'products']
    return t

# build the baseline submission frame (columns: order_id, products);
# writing it to disk is currently disabled below
a = get_last_orders_reordered()
# get_last_orders_reordered().to_csv('less_dumb_subm_last_order_reordered_only.csv', 
#                          encoding='utf-8', 
#                          index=False)

Wall time: 7.61 s
