In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import time
import gc

path_data = 'data/'


In [2]:
priors = pd.read_csv(path_data + 'order_products__prior.csv', 
                     dtype={
                            'order_id': np.int32,
                            'product_id': np.uint16,
                            'add_to_cart_order': np.int16,
                            'reordered': np.int8})
train = pd.read_csv(path_data + 'order_products__train.csv', 
                dtype={
                        'order_id': np.int32,
                        'product_id': np.uint16,
                        'add_to_cart_order': np.int16,
                        'reordered': np.int8})
'''
--------------------------------order--------------------------------
* This file tells us which set (prior, train, test) an order belongs
* Unique in order_id
* order_id in train, prior, test has no intersection
* this is the #order_number order of this user
'''
orders = pd.read_csv(path_data + 'orders.csv', 
                     dtype={
                            'order_id': np.int32,
                            'user_id': np.int64,
                            'eval_set': 'category',
                            'order_number': np.int16,
                            'order_dow': np.int8,
                            'order_hour_of_day': np.int8,
                            'days_since_prior_order': np.float32})

In [3]:
products = pd.read_csv(path_data + 'products.csv', 
                       dtype={
                            'product_id': np.uint32,
                            'order_id': np.int32,
                            'aisle_id': np.uint8,
                            'department_id': np.uint8},
                            usecols=['product_id', 'aisle_id', 'department_id'])
aisles = pd.read_csv(path_data + "aisles.csv")
departments = pd.read_csv(path_data + "departments.csv")
sample_submission = pd.read_csv(path_data + "sample_submission.csv")

In [4]:
# works in a way of context manager,
# combined with "with" statement
class tick_tock:
    def __init__(self, process_name, verbose=1):
        self.process_name = process_name
        self.verbose = verbose
    def __enter__(self):
        if self.verbose:
            print(self.process_name + " begin ......")
            self.begin_time = time.time()
    def __exit__(self, type, value, traceback):
        if self.verbose:
            end_time = time.time()
            print(self.process_name + " end ......")
            print('time lapsed {0} s \n'.format(end_time - self.begin_time))

In [5]:
def ka_add_groupby_features_1_vs_n(df, group_columns_list, agg_dict, only_new_feature=True):
    '''Create statistical columns, group by [N columns] and compute stats on [N column]

       Parameters
       ----------
       df: pandas dataframe
          Features matrix
       group_columns_list: list_like
          List of columns you want to group with, could be multiple columns
       agg_dict: python dictionary

       Return
       ------
       new pandas dataframe with original columns and new added columns

       Example
       -------
       {real_column_name: {your_specified_new_column_name : method}}
       agg_dict = {'user_id':{'prod_tot_cnts':'count'},
                   'reordered':{'reorder_tot_cnts_of_this_prod':'sum'},
                   'user_buy_product_times': {'prod_order_once':lambda x: sum(x==1),
                                              'prod_order_more_than_once':lambda x: sum(x==2)}}
       ka_add_stats_features_1_vs_n(train, ['product_id'], agg_dict)
    '''
    with tick_tock("add stats features"):
        try:
            if type(group_columns_list) == list:
                pass
            else:
                raise TypeError(group_columns_list + " should be a list")
        except TypeError as e:
            print(e)
            raise

        df_new = df.copy()
        grouped = df_new.groupby(group_columns_list)

        the_stats = grouped.agg(agg_dict)
        the_stats.columns = the_stats.columns.droplevel(0)
        the_stats.reset_index(inplace=True)
        if only_new_feature:
            df_new = the_stats
        else:
            df_new = pd.merge(left=df_new, right=the_stats, on=group_columns_list, how='left')

    return df_new

def ka_add_groupby_features_n_vs_1(df, group_columns_list, target_columns_list, methods_list, keep_only_stats=True, verbose=1):
    '''Create statistical columns, group by [N columns] and compute stats on [1 column]

       Parameters
       ----------
       df: pandas dataframe
          Features matrix
       group_columns_list: list_like
          List of columns you want to group with, could be multiple columns
       target_columns_list: list_like
          column you want to compute stats, need to be a list with only one element
       methods_list: list_like
          methods that you want to use, all methods that supported by groupby in Pandas

       Return
       ------
       new pandas dataframe with original columns and new added columns

       Example
       -------
       ka_add_stats_features_n_vs_1(train, group_columns_list=['x0'], target_columns_list=['x10'])
    '''
    with tick_tock("add stats features", verbose):
        dicts = {"group_columns_list": group_columns_list , "target_columns_list": target_columns_list, "methods_list" :methods_list}

        for k, v in dicts.items():
            try:
                if type(v) == list:
                    pass
                else:
                    raise TypeError(k + "should be a list")
            except TypeError as e:
                print(e)
                raise

        grouped_name = ''.join(group_columns_list)
        target_name = ''.join(target_columns_list)
        combine_name = [[grouped_name] + [method_name] + [target_name] for method_name in methods_list]

        df_new = df.copy()
        grouped = df_new.groupby(group_columns_list)

        the_stats = grouped[target_name].agg(methods_list).reset_index()
        the_stats.columns = [grouped_name] + ['_%s_%s_by_%s' % (grouped_name, method_name, target_name) 
                                              for (grouped_name, method_name, target_name) in combine_name]
        if keep_only_stats:
            return the_stats
        else:
            df_new = pd.merge(left=df_new, right=the_stats, on=group_columns_list, how='left')
        return df_new

In [17]:
priors.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32434489 entries, 0 to 32434488
Data columns (total 4 columns):
order_id             int32
product_id           uint16
add_to_cart_order    int16
reordered            int8
dtypes: int16(1), int32(1), int8(1), uint16(1)
memory usage: 278.4 MB


In [23]:
# Products information ----------------------------------------------------------------
# add order information to priors set
# inner/right use less memory than left/outer, why?
priors_orders_detail = orders.merge(right=priors, how='inner', on='order_id')
# priors_orders_detail = pd.merge(left=priors, right=orders, how='left', on='order_id')
priors_orders_detail.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32434489 entries, 0 to 32434488
Data columns (total 10 columns):
order_id                  int32
user_id                   int64
eval_set                  object
order_number              int16
order_dow                 int8
order_hour_of_day         int8
days_since_prior_order    float32
product_id                uint16
add_to_cart_order         int16
reordered                 int8
dtypes: float32(1), int16(2), int32(1), int64(1), int8(3), object(1), uint16(1)
memory usage: 1.2+ GB


In [26]:
# create new variables
# _user_buy_product_times: how many times a user has bought this product
# add a couple 3, 4, < 5 ...
priors_orders_detail.loc[:,'_user_buy_product_times'] = priors_orders_detail.groupby(
    ['user_id', 'product_id']).cumcount() + 1
priors_orders_detail.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,_user_buy_product_times
0,2539329,1,prior,1,2,8,,196,1,0,1
1,2539329,1,prior,1,2,8,,14084,2,0,1
2,2539329,1,prior,1,2,8,,12427,3,0,1
3,2539329,1,prior,1,2,8,,26088,4,0,1
4,2539329,1,prior,1,2,8,,26405,5,0,1


In [28]:
agg_dict = {'user_id':{'_prod_tot_cnts':'count'}, 
            'reordered':{'_prod_reorder_tot_cnts':'sum'}, 
            '_user_buy_product_times': {'_prod_buy_first_time_total_cnt':lambda x: sum(x==1),
                                        '_prod_buy_second_time_total_cnt':lambda x: sum(x==2),
#                                         '_prod_buy_more_than_5_time_total_cnt':lambda x: sum(x>5),
#                                         '_prod_buy_more_than_10_time_total_cnt':lambda x: sum(x>10),
#                                         '_prod_buy_more_than_15_time_total_cnt':lambda x: sum(x>15),
                                       }}
prd = ka_add_groupby_features_1_vs_n(priors_orders_detail, ['product_id'], agg_dict)
prd.head()

add stats features begin ......
add stats features end ......
time lapsed 89.375 s 



Unnamed: 0,product_id,_prod_buy_second_time_total_cnt,_prod_buy_first_time_total_cnt,_prod_tot_cnts,_prod_reorder_tot_cnts
0,1,276,716,1852,1136.0
1,2,8,78,90,12.0
2,3,36,74,277,203.0
3,4,64,182,329,147.0
4,5,4,6,15,9.0


In [30]:
#　reordered_times + 1, why?
prd['_prod_reorder_prob'] = prd._prod_buy_second_time_total_cnt / prd._prod_buy_first_time_total_cnt
prd['_prod_reorder_ratio'] = prd._prod_reorder_tot_cnts / prd._prod_tot_cnts
prd['_prod_reorder_times'] = 1 + prd._prod_reorder_tot_cnts / prd._prod_buy_first_time_total_cnt
prd.head()

Unnamed: 0,product_id,_prod_buy_second_time_total_cnt,_prod_buy_first_time_total_cnt,_prod_tot_cnts,_prod_reorder_tot_cnts,_prod_reorder_prob,_prod_reorder_ratio,_prod_reorder_times
0,1,276,716,1852,1136.0,0.385475,0.613391,2.586592
1,2,8,78,90,12.0,0.102564,0.133333,1.153846
2,3,36,74,277,203.0,0.486486,0.732852,3.743243
3,4,64,182,329,147.0,0.351648,0.446809,1.807692
4,5,4,6,15,9.0,0.666667,0.6,2.5


In [31]:
# user part
# _user_total_orders: 用户的总订单数
# 可以考虑加入其它统计指标++++++++++++++++++++++++++
# _user_sum_days_since_prior_order: 距离上次购买时间(和),这个只能在orders表里面计算，
# priors_orders_detail不是在order level上面unique
# _user_mean_days_since_prior_order: 距离上次购买时间(均值)
# I don't think the sum will help a lot, but first let's figure out a product line
agg_dict_2 = {'order_number':{'_user_total_orders':'max'},
              'days_since_prior_order':{'_user_sum_days_since_prior_order':'sum', 
                                        '_user_mean_days_since_prior_order': 'mean'}}
users = ka_add_groupby_features_1_vs_n(orders[orders.eval_set == 'prior'], ['user_id'], agg_dict_2)
users.head()

add stats features begin ......
add stats features end ......
time lapsed 0.323999881744 s 



Unnamed: 0,user_id,_user_mean_days_since_prior_order,_user_sum_days_since_prior_order,_user_total_orders
0,1,19.555555,176.0,10
1,2,15.230769,198.0,14
2,3,12.090909,133.0,12
3,4,13.75,55.0,5
4,5,13.333333,40.0,4


In [32]:
# _user_reorder_ratio: reorder的总次数 / 第一单后买后的总次数
# _user_total_products: 用户购买的总商品数
# _user_distinct_products: 用户购买的unique商品数
agg_dict_3 = {'reordered':
              {
                  '_user_reorder_ratio': 
                       lambda x: sum(priors_orders_detail.ix[x.index,'reordered']==1)/
                                 sum(priors_orders_detail.ix[x.index,'order_number'] > 1)
              },
              'product_id':
              {
                  '_user_total_products':'count', 
                  '_user_distinct_products': lambda x: x.nunique()         
              }
             }
us = ka_add_groupby_features_1_vs_n(priors_orders_detail, ['user_id'], agg_dict_3)
us.head()

add stats features begin ......
add stats features end ......
time lapsed 457.193000078 s 



Unnamed: 0,user_id,_user_total_products,_user_distinct_products,_user_reorder_ratio
0,1,59,18,0
1,2,195,102,0
2,3,88,33,0
3,4,18,17,0
4,5,37,23,0


In [33]:
users = users.merge(us, how='inner')
users.head()

Unnamed: 0,user_id,_user_mean_days_since_prior_order,_user_sum_days_since_prior_order,_user_total_orders,_user_total_products,_user_distinct_products,_user_reorder_ratio
0,1,19.555555,176.0,10,59,18,0
1,2,15.230769,198.0,14,195,102,0
2,3,12.090909,133.0,12,88,33,0
3,4,13.75,55.0,5,18,17,0
4,5,13.333333,40.0,4,37,23,0


In [32]:
# 平均每单的商品数
# 每单中最多的商品数，最少的商品数++++++++++++++
users['_user_average_basket'] = users._user_total_products / users._user_total_orders
# Last order
us = orders[orders.eval_set != "prior"][['user_id', 'order_id', 'eval_set', 'days_since_prior_order']]
us.rename(index=str, columns={'days_since_prior_order': 'time_since_last_order'}, inplace=True)
us.head()

Unnamed: 0,user_id,order_id,eval_set,time_since_last_order
10,1,1187899,train,14.0
25,2,1492625,train,30.0
38,3,2774568,test,11.0
44,4,329954,test,30.0
49,5,2196797,train,6.0


In [33]:
users = users.merge(us, how='inner')
users.head()

Unnamed: 0,user_id,_user_mean_days_since_prior_order,_user_sum_days_since_prior_order,_user_total_orders,_user_total_products,_user_distinct_products,_user_reorder_ratio,_user_average_basket,order_id,eval_set,time_since_last_order
0,1,19.555555,176.0,10,59,18,0,5.9,1187899,train,14.0
1,2,15.230769,198.0,14,195,102,0,13.928571,1492625,train,30.0
2,3,12.090909,133.0,12,88,33,0,7.333333,2774568,test,11.0
3,4,13.75,55.0,5,18,17,0,3.6,329954,test,30.0
4,5,13.333333,40.0,4,37,23,0,9.25,2196797,train,6.0


In [34]:
# 这里应该还有很多变量可以被添加
# _up_order_count: 用户购买该商品的次数
# _up_first_order_number: 用户第一次购买该商品所处的订单数
# _up_last_order_number: 用户最后一次购买该商品所处的订单数
# _up_average_cart_position: 该商品被添加到购物篮中的平均位置
agg_dict_4 = {'order_number':{'_up_order_count': 'count', 
                              '_up_first_order_number': 'min', 
                              '_up_last_order_number':'max'}, 
              'add_to_cart_order':{'_up_average_cart_position': 'mean'}}

data = ka_add_groupby_features_1_vs_n(df=priors_orders_detail, 
                                                      group_columns_list=['user_id', 'product_id'], 
                                                      agg_dict=agg_dict_4)

data.head()

add stats features begin ......
add stats features end ......
time lapsed 14.1860001087 s 



Unnamed: 0,user_id,product_id,_up_average_cart_position,_up_order_count,_up_first_order_number,_up_last_order_number
0,1,196,1.4,10,1,10
1,1,10258,3.333333,9,2,10
2,1,10326,5.0,1,5,5
3,1,12427,3.3,10,1,10
4,1,13032,6.333333,3,2,10


In [35]:
data = data.merge(prd, how='inner', on='product_id').merge(users, how='inner', on='user_id')
data.head()

Unnamed: 0,user_id,product_id,_up_average_cart_position,_up_order_count,_up_first_order_number,_up_last_order_number,_prod_buy_second_time_total_cnt,_prod_buy_first_time_total_cnt,_prod_tot_cnts,_prod_reorder_tot_cnts,_prod_reorder_prob,_prod_reorder_ratio,_prod_reorder_times,_user_mean_days_since_prior_order,_user_sum_days_since_prior_order,_user_total_orders,_user_total_products,_user_distinct_products,_user_reorder_ratio
0,1,196,1.4,10,1,10,4660,8000,35791,27791.0,0.5825,0.77648,4.473875,19.555555,176.0,10,59,18,0
1,1,10258,3.333333,9,2,10,308,557,1946,1389.0,0.552962,0.713772,3.493716,19.555555,176.0,10,59,18,0
2,1,10326,5.0,1,5,5,1003,1923,5526,3603.0,0.521581,0.652009,2.873635,19.555555,176.0,10,59,18,0
3,1,12427,3.3,10,1,10,889,1679,6476,4797.0,0.529482,0.740735,3.857058,19.555555,176.0,10,59,18,0
4,1,13032,6.333333,3,2,10,617,1286,3751,2465.0,0.479782,0.657158,2.916796,19.555555,176.0,10,59,18,0


In [36]:
# 该商品购买次数 / 总的订单数
# 最近一次购买商品 - 最后一次购买该商品
# 该商品购买次数 / 第一次购买该商品到最后一次购买商品的的订单数
data['_up_order_rate'] = data._up_order_count / data._user_total_orders
data['_up_order_since_last_order'] = data._user_total_orders - data._up_last_order_number
data['_up_order_rate_since_first_order'] = data._up_order_count / (data._user_total_orders - data._up_first_order_number + 1)

# add user_id to train set
train = train.merge(right=orders[['order_id', 'user_id']], how='left', on='order_id')
data = data.merge(train[['user_id', 'product_id', 'reordered']], on=['user_id', 'product_id'], how='left')
data.head()

Unnamed: 0,user_id,product_id,_up_average_cart_position,_up_order_count,_up_first_order_number,_up_last_order_number,_prod_buy_second_time_total_cnt,_prod_buy_first_time_total_cnt,_prod_tot_cnts,_prod_reorder_tot_cnts,...,_user_mean_days_since_prior_order,_user_sum_days_since_prior_order,_user_total_orders,_user_total_products,_user_distinct_products,_user_reorder_ratio,_up_order_rate,_up_order_since_last_order,_up_order_rate_since_first_order,reordered
0,1,196,1.4,10,1,10,4660,8000,35791,27791.0,...,19.555555,176.0,10,59,18,0,1.0,0,1.0,1.0
1,1,10258,3.333333,9,2,10,308,557,1946,1389.0,...,19.555555,176.0,10,59,18,0,0.9,0,1.0,1.0
2,1,10326,5.0,1,5,5,1003,1923,5526,3603.0,...,19.555555,176.0,10,59,18,0,0.1,5,0.166667,
3,1,12427,3.3,10,1,10,889,1679,6476,4797.0,...,19.555555,176.0,10,59,18,0,1.0,0,1.0,
4,1,13032,6.333333,3,2,10,617,1286,3751,2465.0,...,19.555555,176.0,10,59,18,0,0.3,0,0.333333,1.0


In [37]:
# release Memory
del train, prd, users
# gc.collect()
# release Memory
del priors_orders_detail, orders
gc.collect()
# save data to be called by another script
# data.to_csv('df_rdy/df%s.csv' %data.shape[1], index=False)
# Actually, to load it again, we need to specify dtype, which is problematic in pipeline.

In [35]:
import xgboost
from sklearn.model_selection import train_test_split
train = data.loc[data.eval_set == "train",:]
train.drop(['eval_set', 'user_id', 'product_id', 'order_id'], axis=1, inplace=True)
train.loc[:, 'reordered'] = train.reordered.fillna(0)

X_test_real = data.loc[data.eval_set == "test",:]

# subsample 让training时间更短
X_train, X_test, y_train, y_test = train_test_split(train.drop('reordered', axis=1), train.reordered,
                                                    test_size=0.9, random_state=42)
d_train = xgboost.DMatrix(X_train, y_train)
xgb_params = {
    "objective"         : "reg:logistic"
    ,"eval_metric"      : "logloss"
    ,"eta"              : 0.1
    ,"max_depth"        : 6
    ,"min_child_weight" :10
    ,"gamma"            :0.70
    ,"subsample"        :0.76
    ,"colsample_bytree" :0.95
    ,"alpha"            :2e-05
    ,"lambda"           :10
}

watchlist= [(d_train, "train")]
bst = xgboost.train(params=xgb_params, dtrain=d_train, num_boost_round=80, evals=watchlist, verbose_eval=10)
xgboost.plot_importance(bst)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


[0]	train-logloss:0.625588
[10]	train-logloss:0.335118
[20]	train-logloss:0.268198
[30]	train-logloss:0.251
[40]	train-logloss:0.246311
[50]	train-logloss:0.244751
[60]	train-logloss:0.243935
[70]	train-logloss:0.243328
[79]	train-logloss:0.242816


<matplotlib.axes._subplots.AxesSubplot at 0x1d6476d8>

In [25]:
# d_test = xgboost.DMatrix(X_test_real.drop(['eval_set', 'user_id', 'order_id', 'reordered', 'product_id'], axis=1))
X_test_real.loc[:,'reordered'] = (bst.predict(d_test) > 0.21).astype(int)
X_test_real.loc[:, 'product_id'] = X_test_real.product_id.astype(str)
submit = ka_add_groupby_features_n_vs_1(X_test_real[X_test_real.reordered == 1], 
                                               group_columns_list=['order_id'],
                                               target_columns_list= ['product_id'],
                                               methods_list=[lambda x: ' '.join(set(x))], keep_only_stats=True)
submit.columns = sample_submission.columns.tolist()
submit_final = sample_submission[['order_id']].merge(submit, how='left').fillna('None')
submit_final.to_csv("python_test.csv", index=False)

add stats features begin ......
add stats features end ......
time lapsing 3.1819999218 s 

