## Import libraries

In [1]:
import pandas as pd
import numpy as np

import os

import featuretools as ft

import utils

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Root folder holding the raw CSV exports and the `processed` sub-folder.
data_dir = '../data/'

# Snapshot of the directory contents, printed as "<position> <name>" pairs.
filenames = os.listdir(data_dir)
for entry in enumerate(filenames):
    print(*entry)

0 olist_customers_dataset.csv
1 olist_geolocation_dataset.csv
2 olist_orders_dataset.csv
3 olist_order_items_dataset.csv
4 olist_order_payments_dataset.csv
5 olist_order_reviews_dataset.csv
6 olist_products_dataset.csv
7 olist_sellers_dataset.csv
8 processed
9 product_category_name_translation.csv


## Read files / outputs of preprocessing

In [3]:
# Load the preprocessed tables written by the preprocessing step.
df_customers = utils.read_pickles('../data/processed/customers')
df_orders = utils.read_pickles('../data/processed/orders')

# The order-items table is read straight from the raw CSV. The file is named
# explicitly instead of being picked by its position in `filenames`:
# os.listdir() returns entries in arbitrary order, so a positional index can
# silently select a different file on another machine or after the directory
# contents change.
items_csv = data_dir + 'olist_order_items_dataset.csv'
df_items = pd.read_csv(items_csv, parse_dates=['shipping_limit_date'])

df_payments = utils.read_pickles('../data/processed/payments')
df_products = utils.read_pickles('../data/processed/products')
df_sellers = utils.read_pickles('../data/processed/sellers')

  0%|          | 0/3 [00:00<?, ?it/s] 33%|███▎      | 1/3 [00:00<00:00,  3.98it/s]100%|██████████| 3/3 [00:00<00:00,  8.90it/s]
  0%|          | 0/3 [00:00<?, ?it/s]100%|██████████| 3/3 [00:00<00:00, 34.09it/s]
  0%|          | 0/3 [00:00<?, ?it/s]100%|██████████| 3/3 [00:00<00:00, 36.60it/s]
  0%|          | 0/3 [00:00<?, ?it/s]100%|██████████| 3/3 [00:00<00:00, 73.18it/s]
  0%|          | 0/3 [00:00<?, ?it/s]100%|██████████| 3/3 [00:00<00:00, 93.76it/s]


## Dropping variables:
* `customer_unique_id` and `customer_id` do not add any value to the dataset
* the items' `price` and `freight_value` are removed to avoid **any information leakage**

In [19]:
# Remove identifier columns that carry no signal and the item-level monetary
# columns that would leak the target.
# The drops rebind the names instead of mutating in place, and
# errors='ignore' skips columns that are already gone, so this cell can be
# re-run without raising KeyError.
df_customers = df_customers.drop(columns=['customer_unique_id'], errors='ignore')
df_orders = df_orders.drop(columns=['customer_id'], errors='ignore')
df_items = df_items.drop(columns=['price', 'freight_value'], errors='ignore')

## Defining entities and specific datatypes

In [30]:
# Start from an empty EntitySet and register one entity per table.
entity_set = ft.EntitySet(id='order_val')

# Shorthand for the variable type forced onto the text-like columns below.
categorical = ft.variable_types.Categorical

# One keyword-argument bundle per entity, in registration order.
# `items` and `payments` have no natural key, so an index column is generated
# for them (make_index=True); `items` also declares a time index.
entity_specs = [
    dict(entity_id='orders',
         dataframe=df_orders,
         index='order_id',
         variable_types={'order_status': categorical}),
    dict(entity_id='items',
         dataframe=df_items,
         make_index=True,
         index='item_id',
         time_index='shipping_limit_date'),
    dict(entity_id='payments',
         dataframe=df_payments,
         make_index=True,
         index='payment_id',
         variable_types={'payment_type': categorical}),
    dict(entity_id='customers',
         dataframe=df_customers,
         index='customer_id',
         variable_types={'customer_city': categorical,
                         'customer_state': categorical}),
    dict(entity_id='products',
         dataframe=df_products,
         index='product_id',
         variable_types={'product_category_name': categorical}),
    dict(entity_id='sellers',
         dataframe=df_sellers,
         index='seller_id',
         variable_types={'seller_city': categorical,
                         'seller_state': categorical}),
]

for spec in entity_specs:
    entity_set = entity_set.entity_from_dataframe(**spec)

# Display the resulting entity set (no relationships defined yet).
entity_set

Entityset: order_val
  Entities:
    orders [Rows: 99441, Columns: 7]
    items [Rows: 112650, Columns: 6]
    payments [Rows: 103886, Columns: 6]
    customers [Rows: 99441, Columns: 5]
    products [Rows: 32951, Columns: 9]
    sellers [Rows: 3095, Columns: 4]
  Relationships:
    No relationships

## Define relationships among datasets

In [31]:
# Each relationship links a parent variable (first argument) to the matching
# variable in a child entity (second argument).
# `orders.order_id` is the parent key for three of the five links.
orders_key = entity_set['orders']['order_id']

relationship_orders_payments = ft.Relationship(
    orders_key, entity_set['payments']['order_id'])
relationship_orders_items = ft.Relationship(
    orders_key, entity_set['items']['order_id'])
relationship_orders_customers = ft.Relationship(
    orders_key, entity_set['customers']['order_id'])
relationship_products_items = ft.Relationship(
    entity_set['products']['product_id'], entity_set['items']['product_id'])
relationship_sellers_items = ft.Relationship(
    entity_set['sellers']['seller_id'], entity_set['items']['seller_id'])

# Register the relationships on the entity set, in the order defined above.
for relationship in (
    relationship_orders_payments,
    relationship_orders_items,
    relationship_orders_customers,
    relationship_products_items,
    relationship_sellers_items,
):
    entity_set = entity_set.add_relationship(relationship)

# Display the entity set to confirm the relationships were added.
entity_set

Entityset: order_val
  Entities:
    orders [Rows: 99441, Columns: 7]
    items [Rows: 112650, Columns: 6]
    payments [Rows: 103886, Columns: 6]
    customers [Rows: 99441, Columns: 5]
    products [Rows: 32951, Columns: 9]
    sellers [Rows: 3095, Columns: 4]
  Relationships:
    payments.order_id -> orders.order_id
    items.order_id -> orders.order_id
    customers.order_id -> orders.order_id
    items.product_id -> products.product_id
    items.seller_id -> sellers.seller_id

## Building featureset

In [34]:
# Primitive names handed to Deep Feature Synthesis.
transformation_feats = ['year', 'month', 'weekday', 'subtract']
aggregate_feats = ['sum', 'mean', 'std', 'mode']

# Dry run with features_only=True: the call's single return value is bound to
# `feature_names`, and no feature matrix is unpacked from it.
dfs_settings = dict(
    entityset=entity_set,
    target_entity="orders",
    trans_primitives=transformation_feats,
    agg_primitives=aggregate_feats,
    max_depth=2,
    verbose=True,
)
feature_names = ft.dfs(features_only=True, **dfs_settings)

Built 294 features


In [35]:
# Full Deep Feature Synthesis run on the `orders` entity: returns the computed
# feature matrix together with the feature definitions.
dfs_settings = dict(
    entityset=entity_set,
    target_entity="orders",
    trans_primitives=transformation_feats,
    agg_primitives=aggregate_feats,
    max_depth=2,
    verbose=True,
)
df_features, feature_names = ft.dfs(**dfs_settings)

Built 294 features

Elapsed: 00:00 | Remaining: ? | Progress:   0%|          | Calculated: 0/11 chunks
Elapsed: 00:38 | Remaining: 06:27 | Progress:   9%|▉         | Calculated: 1/11 chunks
Elapsed: 01:14 | Remaining: 05:33 | Progress:  18%|█▊        | Calculated: 2/11 chunks
Elapsed: 01:49 | Remaining: 04:52 | Progress:  27%|██▋       | Calculated: 3/11 chunks
Elapsed: 03:03 | Remaining: 05:21 | Progress:  36%|███▋      | Calculated: 4/11 chunks
Elapsed: 04:15 | Remaining: 05:06 | Progress:  45%|████▌     | Calculated: 5/11 chunks
Elapsed: 05:18 | Remaining: 04:25 | Progress:  55%|█████▍    | Calculated: 6/11 chunks
Elapsed: 06:19 | Remaining: 03:36 | Progress:  64%|██████▎   | Calculated: 7/11 chunks
Elapsed: 07:19 | Remaining: 02:44 | Progress:  73%|███████▎  | Calculated: 8/11 chunks
Elapsed: 08:01 | Remaining: 01:47 | Progress:  82%|████████▏ | Calculated: 9/11 chunks
Elapsed: 08:46 | Remaining: 00:52 | Progress:  91%|█████████ | Calculated: 10/11 chunks
Elapsed: 08:47

## Eliminating features containing target information

In [38]:
# Collect every generated feature whose name mentions `payment_value`; these
# are derived from the payment amount and would leak the target.
drop_cols = [col for col in df_features.columns if 'payment_value' in col]

# SUM(payments.payment_value) is the target itself, so it must be kept.
# list.remove raises ValueError if the target column is unexpectedly missing.
drop_cols.remove('SUM(payments.payment_value)')

# Record the dropped feature names. The context manager guarantees the file is
# flushed and closed; a bare open() without close() can leave it empty.
file = '../results/dropped_cols_ft6.2.txt'
with open(file, 'w') as outfile:
    outfile.write("\n".join(drop_cols))

df_features.drop(drop_cols, axis=1, inplace=True)

# Last expression of the cell, so the number of dropped features is displayed.
len(drop_cols)

92

In [52]:
# Give the summed payment value the column name `target`.
target_mapping = {'SUM(payments.payment_value)': 'target'}
df_features.rename(columns=target_mapping, inplace=True)

## Drop cols with > 10% missing values

In [66]:
rows = df_features.shape[0]

# Percentage of missing entries per column, computed for all columns at once.
missing_pct = df_features.isna().sum() / rows * 100

# Columns with more than 10% missing values are discarded.
missing_cols = missing_pct[missing_pct > 10].index.tolist()

df_features.drop(missing_cols, axis=1, inplace=True)

## Save the output

In [70]:
# Hand the final feature table to the project's pickle writer.
feature_level_dir = '../data/processed/feature_level'
utils.to_pickles(df_features, feature_level_dir)


0it [00:00, ?it/s]
1it [00:00,  7.72it/s]
2it [00:00,  7.74it/s]
3it [00:00,  7.88it/s]
