In [46]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import os

from IPython.display import Image

import warnings
warnings.filterwarnings('ignore')

import featuretools as ft

import utils

import lightgbm as lgb

from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error

import time

In [4]:
data_dir = '../data/'
filenames = os.listdir(data_dir)
for i, ele in enumerate(filenames):
    print(i, ele)

0 olist_customers_dataset.csv
1 olist_geolocation_dataset.csv
2 olist_orders_dataset.csv
3 olist_order_items_dataset.csv
4 olist_order_payments_dataset.csv
5 olist_order_reviews_dataset.csv
6 olist_products_dataset.csv
7 olist_sellers_dataset.csv
8 processed
9 product_category_name_translation.csv


In [6]:
df_customers = pd.read_csv(data_dir+filenames[0])
df_orders = pd.read_csv(data_dir+filenames[2], parse_dates=['order_purchase_timestamp',
                                                            'order_approved_at', 
                                                            'order_delivered_carrier_date',
                                                            'order_delivered_customer_date',
                                                            'order_estimated_delivery_date'])

df_items = pd.read_csv(data_dir+filenames[3], parse_dates=['shipping_limit_date'])
# df_payments = pd.read_csv(data_dir+filenames[4])
df_payments = utils.read_pickles('../data/processed/payments')

  0%|          | 0/3 [00:00<?, ?it/s]100%|██████████| 3/3 [00:00<00:00, 35.71it/s]


In [7]:
# df_items.head()

In [8]:
# payment_sequentiala customer may pay an order with more than one payment method. If he does so,
# a sequence will be created to accommodate all payments.

# df_payments = df_payments.groupby('order_id').agg({
#                                 'payment_sequential': 'sum',
#                                 'payment_type': [lambda x:x.value_counts().index[0], 'nunique', 'count'],
#                                 'payment_installments': 'sum',
#                                 'payment_value': 'sum'})

# df_payments.reset_index(inplace=True)

# df_payments.columns = ['order_id', 'payment_sequential', 'payment_type_mode', 'payment_type_counts', 
#                        'orders_count', 'payment_installments', 'payment_value']

In [9]:
# utils.to_pickles(df_payments, '../data/processed/payments')

In [10]:
# Create entityset
entity_set = ft.EntitySet(id = 'order_val')

entity_set = entity_set.entity_from_dataframe(
    entity_id='orders',
    dataframe=df_orders,
    index='order_id'
#     variable_types={
#         'plan': ft.variable_types.Categorical
#     }
)

# Create item entity
entity_set = entity_set.entity_from_dataframe(
    entity_id='items',
    dataframe=df_items,
    make_index=True,
    index = 'item_id',
    time_index = 'shipping_limit_date'
)

# Create payments entity
entity_set = entity_set.entity_from_dataframe(
    entity_id='payments',
    dataframe=df_payments,
    index='order_id'
)

# Create customers entity
# entity_set = entity_set.entity_from_dataframe(
#     entity_id='customers',
#     dataframe=df_customers,
#     index = 'customer_id'
# )

entity_set

Entityset: order_val
  Entities:
    orders [Rows: 99441, Columns: 8]
    items [Rows: 112650, Columns: 8]
    payments [Rows: 99440, Columns: 7]
  Relationships:
    No relationships

In [11]:
# Create the relationships (parents-->child) (parent have one id child may have multiple parent ids)
relationship_payments_orders = ft.Relationship(entity_set['orders']['order_id'],
                                               entity_set['payments']['order_id'])

relationship_items_orders = ft.Relationship(entity_set['orders']['order_id'],
                                            entity_set['items']['order_id'])

# relationship_customer_orders = ft.Relationship(entity_set['customers']['customer_id'],
#                                                entity_set['orders']['customer_id'])

# Add the relationships to the entity set
entity_set = entity_set.add_relationship(relationship_payments_orders)
entity_set = entity_set.add_relationship(relationship_items_orders)
# entity_set = entity_set.add_relationship(relationship_customer_orders)

# Check entity_set
entity_set

Entityset: order_val
  Entities:
    orders [Rows: 99441, Columns: 8]
    items [Rows: 112650, Columns: 8]
    payments [Rows: 99440, Columns: 7]
  Relationships:
    payments.order_id -> orders.order_id
    items.order_id -> orders.order_id

In [47]:
transformation_feats = ['years', 'month', 'weekday', 'subtract', 'time_since_previous']
aggregate_feats = ['mean', 'std']

df_features, feature_names = ft.dfs(entityset = entity_set,
                                      target_entity = "payments",
                                      trans_primitives = transformation_feats,
                                      agg_primitives = aggregate_feats,                  
                                      max_depth = 2,
                                      verbose = True)

Built 54 features
Elapsed: 00:00 | Remaining: ? | Progress:   0%|          | Calculated: 0/10 chunksElapsed: 00:01 | Remaining: 00:15 | Progress:  10%|█         | Calculated: 1/10 chunksElapsed: 00:03 | Remaining: 00:14 | Progress:  20%|██        | Calculated: 2/10 chunksElapsed: 00:05 | Remaining: 00:12 | Progress:  30%|███       | Calculated: 3/10 chunksElapsed: 00:06 | Remaining: 00:10 | Progress:  40%|████      | Calculated: 4/10 chunksElapsed: 00:08 | Remaining: 00:08 | Progress:  50%|█████     | Calculated: 5/10 chunksElapsed: 00:09 | Remaining: 00:06 | Progress:  60%|██████    | Calculated: 6/10 chunksElapsed: 00:11 | Remaining: 00:04 | Progress:  70%|███████   | Calculated: 7/10 chunksElapsed: 00:12 | Remaining: 00:03 | Progress:  80%|████████  | Calculated: 8/10 chunksElapsed: 00:13 | Remaining: 00:01 | Progress:  90%|█████████ | Calculated: 9/10 chunksElapsed: 00:14 | Remaining: 00:00 | Progress: 100%|██████████| Calculated: 10/10 chunks


In [48]:
## years missing

In [49]:
df_features.head()

Unnamed: 0_level_0,payment_sequential,payment_type_mode,payment_type_counts,orders_count,payment_installments,payment_value,payment_installments - payment_type_counts,payment_type_counts - payment_value,payment_installments - payment_value,payment_value - payment_sequential,...,orders.MONTH(order_purchase_timestamp),orders.MONTH(order_approved_at),orders.MONTH(order_delivered_carrier_date),orders.MONTH(order_delivered_customer_date),orders.MONTH(order_estimated_delivery_date),orders.WEEKDAY(order_purchase_timestamp),orders.WEEKDAY(order_approved_at),orders.WEEKDAY(order_delivered_carrier_date),orders.WEEKDAY(order_delivered_customer_date),orders.WEEKDAY(order_estimated_delivery_date)
order_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00010242fe8c5a6d1ba2dd792cb16214,1,credit_card,1,1,2,72.19,1,-71.19,-70.19,71.19,...,9,9.0,9.0,9.0,9,2,2.0,1.0,2.0,4
00018f77f2f0320c557190d7a144bdd3,1,credit_card,1,1,3,259.83,2,-258.83,-256.83,258.83,...,4,4.0,5.0,5.0,5,2,2.0,3.0,4.0,0
000229ec398224ef6ca0657da4fc703e,1,credit_card,1,1,5,216.87,4,-215.87,-211.87,215.87,...,1,1.0,1.0,1.0,2,6,6.0,1.0,0.0,0
00024acbcdf0a6daa1e931b038114c75,1,credit_card,1,1,2,25.78,1,-24.78,-23.78,24.78,...,8,8.0,8.0,8.0,8,2,2.0,4.0,1.0,0
00042b26cf59d7ce69dfabb4e55b4fd9,1,credit_card,1,1,3,218.04,2,-217.04,-215.04,217.04,...,2,2.0,2.0,3.0,3,5,5.0,3.0,2.0,4


In [55]:
target = df_features.pop('payment_value')
df_features['payment_type_mode'] = df_features['payment_type_mode'].astype('category').cat.codes.values
df_features['orders.order_status'] = df_features['orders.order_status'].astype('category').cat.codes.values
df_features.drop('orders.customer_id', axis=1, inplace=True)

x_train, x_test, y_train, y_test = train_test_split(df_features, target, test_size=0.33, random_state=42)

In [51]:
param = {'num_leaves': 31,
         'min_data_in_leaf': 32, 
         'objective':'regression',
         'max_depth': -1,
         'learning_rate': 0.005,
         "min_child_samples": 20,
         "boosting": "gbdt",
         "feature_fraction": 0.9,
         "bagging_freq": 1,
         "bagging_fraction": 0.9 ,
         "bagging_seed": 11,
         "metric": 'rmse',
         "lambda_l1": 0.1,
         "nthread": 4,
         "verbosity": -1}

In [57]:
folds = KFold(n_splits=3, shuffle=True, random_state=15)
oof = np.zeros(len(x_train))
predictions = np.zeros(len(x_test))
start = time.time()
feature_importance_df = pd.DataFrame()

features = list(x_train.columns)

for fold_, (trn_idx, val_idx) in enumerate(folds.split(x_train.values, y_train.values)):
    print("fold n°{}".format(fold_))
    trn_data = lgb.Dataset(x_train.iloc[trn_idx][features], label=y_train.iloc[trn_idx])
#                            , categorical_feature=categorical_feats)
    val_data = lgb.Dataset(x_train.iloc[val_idx][features], label=y_train.iloc[val_idx])
#                            , categorical_feature=categorical_feats)

    num_round = 1000
    clf = lgb.train(param, trn_data, num_round, valid_sets = [trn_data, val_data],
                    verbose_eval=100, early_stopping_rounds = 200)
    oof[val_idx] = clf.predict(x_train.iloc[val_idx][features], num_iteration=clf.best_iteration)
    
    fold_importance_df = pd.DataFrame()
    fold_importance_df["feature"] = features
    fold_importance_df["importance"] = clf.feature_importance()
    fold_importance_df["fold"] = fold_ + 1
    feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
    
    predictions += clf.predict(x_test[features], num_iteration=clf.best_iteration) / folds.n_splits

print("CV score: {:<8.5f}".format(mean_squared_error(oof, y_train)**0.5))

fold n°0
Training until validation scores don't improve for 200 rounds.
[100]	training's rmse: 137.221	valid_1's rmse: 165.324
[200]	training's rmse: 90.6117	valid_1's rmse: 124.703
[300]	training's rmse: 65.2312	valid_1's rmse: 104.569
[400]	training's rmse: 52.3811	valid_1's rmse: 95.1728
[500]	training's rmse: 46.2403	valid_1's rmse: 90.8028
[600]	training's rmse: 43.2572	valid_1's rmse: 88.624
[700]	training's rmse: 41.6111	valid_1's rmse: 87.3837
[800]	training's rmse: 40.5324	valid_1's rmse: 86.671
[900]	training's rmse: 39.6781	valid_1's rmse: 86.1703
[1000]	training's rmse: 38.9355	valid_1's rmse: 85.7178
Did not meet early stopping. Best iteration is:
[1000]	training's rmse: 38.9355	valid_1's rmse: 85.7178
fold n°1
Training until validation scores don't improve for 200 rounds.
[100]	training's rmse: 150.588	valid_1's rmse: 134.293
[200]	training's rmse: 106.936	valid_1's rmse: 86.0383
[300]	training's rmse: 84.8664	valid_1's rmse: 60.2128
[400]	training's rmse: 74.5358	valid_1