# Data Exploration

## Import libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

## Load data

In [4]:
customer_df = pd.read_csv("../data/customers_dataset.csv")
geolocation_df = pd.read_csv("../data/geolocation_dataset.csv")
order_item_df = pd.read_csv("../data/order_items_dataset.csv")
order_payments_df = pd.read_csv("../data/order_payments_dataset.csv")
order_reviews_df = pd.read_csv("../data/order_reviews_dataset.csv")
orders_df = pd.read_csv("../data/orders_dataset.csv")
product_category_name_df = pd.read_csv("../data/product_category.csv")
products_df = pd.read_csv("../data/products_dataset.csv")
seller_df = pd.read_csv("../data/sellers_dataset.csv")

In [19]:
display(customer_df.head(1))
display(geolocation_df.head(1))
display(order_item_df.head(1))
display(order_payments_df.head(1))
display(orders_df.head(1))

Unnamed: 0,customer_id,customer_unique_id,customer_zip_code_prefix,customer_city,customer_state
0,06b8999e2fba1a1fbc88172c00ba8bc7,861eff4711a542e4b93843c6dd7febb0,14409,franca,SP


Unnamed: 0,geolocation_zip_code_prefix,geolocation_lat,geolocation_lng,geolocation_city,geolocation_state
0,1037,-23.545621,-46.639292,sao paulo,SP


Unnamed: 0,order_id,order_item_id,product_id,seller_id,shipping_limit_date,price,freight_value
0,00010242fe8c5a6d1ba2dd792cb16214,1,4244733e06e7ecb4970a6e2683c13e61,48436dade18ac8b2bce089ec2a041202,2017-09-19 09:45:35,58.9,13.29


Unnamed: 0,order_id,payment_sequential,payment_type,payment_installments,payment_value
0,b81ef226f3fe1789b1e8b2acac839d17,1,credit_card,8,99.33


Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date
0,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,delivered,2017-10-02 10:56:33,2017-10-02 11:07:15,2017-10-04 19:55:00,2017-10-10 21:25:13,2017-10-18 00:00:00


## Early Explore Data

In [20]:
all_dfs = [
    customer_df,
    geolocation_df,
    order_item_df,
    order_payments_df,
    order_reviews_df,
    orders_df,
    product_category_name_df,
    products_df,
    seller_df,
]

df_names = [
    "customer_df",
    "geolocation_df",
    "order_item_df",
    "order_payments_df",
    "order_reviews_df",
    "orders_df",
    "product_category_name_df",
    "products_df",
    "seller_df",
]

In [25]:
for i, df in enumerate(all_dfs):
    print(f"Datafram Info: {df_names[i]}")
    print(df.info())
    print("\n" + "=" * 50 + "\n")

Datafram Info: customer_df
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99441 entries, 0 to 99440
Data columns (total 5 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   customer_id               99441 non-null  object
 1   customer_unique_id        99441 non-null  object
 2   customer_zip_code_prefix  99441 non-null  int64 
 3   customer_city             99441 non-null  object
 4   customer_state            99441 non-null  object
dtypes: int64(1), object(4)
memory usage: 3.8+ MB
None


Datafram Info: geolocation_df
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000163 entries, 0 to 1000162
Data columns (total 5 columns):
 #   Column                       Non-Null Count    Dtype  
---  ------                       --------------    -----  
 0   geolocation_zip_code_prefix  1000163 non-null  int64  
 1   geolocation_lat              1000163 non-null  float64
 2   geolocation_lng              1000163 non-nu

## Data Merging

In [28]:
customer_orders_df = pd.merge(customer_df, orders_df, on="customer_id", how="inner")

print(customer_orders_df.info())
display(customer_orders_df.head(1))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99441 entries, 0 to 99440
Data columns (total 12 columns):
 #   Column                         Non-Null Count  Dtype 
---  ------                         --------------  ----- 
 0   customer_id                    99441 non-null  object
 1   customer_unique_id             99441 non-null  object
 2   customer_zip_code_prefix       99441 non-null  int64 
 3   customer_city                  99441 non-null  object
 4   customer_state                 99441 non-null  object
 5   order_id                       99441 non-null  object
 6   order_status                   99441 non-null  object
 7   order_purchase_timestamp       99441 non-null  object
 8   order_approved_at              99281 non-null  object
 9   order_delivered_carrier_date   97658 non-null  object
 10  order_delivered_customer_date  96476 non-null  object
 11  order_estimated_delivery_date  99441 non-null  object
dtypes: int64(1), object(11)
memory usage: 9.1+ MB
None


Unnamed: 0,customer_id,customer_unique_id,customer_zip_code_prefix,customer_city,customer_state,order_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date
0,06b8999e2fba1a1fbc88172c00ba8bc7,861eff4711a542e4b93843c6dd7febb0,14409,franca,SP,00e7ee1b050b8499577073aeb2a297a1,delivered,2017-05-16 15:05:35,2017-05-16 15:22:12,2017-05-23 10:47:57,2017-05-25 10:35:35,2017-06-05 00:00:00


## Early Feature Engineering for Segmentation

In [45]:
order_counts = (
    customer_orders_df.groupby("customer_unique_id")["order_id"].nunique().reset_index()
)

order_counts.rename(columns={"order_id": "order_count"}, inplace=True)
order_counts.head(1)

Unnamed: 0,customer_unique_id,order_count
0,0000366f3b9a7992bf8c76cfdf3221e2,1


In [46]:
# merge order_count to customer_orders_df
customer_orders_df = pd.merge(
    customer_orders_df, order_counts, on="customer_unique_id", how="left"
)

customer_orders_df.head(1)

Unnamed: 0,customer_id,customer_unique_id,customer_zip_code_prefix,customer_city,customer_state,order_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date,order_count_x,order_count_y,order_count
0,06b8999e2fba1a1fbc88172c00ba8bc7,861eff4711a542e4b93843c6dd7febb0,14409,franca,SP,00e7ee1b050b8499577073aeb2a297a1,delivered,2017-05-16 15:05:35,2017-05-16 15:22:12,2017-05-23 10:47:57,2017-05-25 10:35:35,2017-06-05 00:00:00,1,1,1


## cleaning Column After Merging

In [47]:
customer_orders_df.drop(columns=["order_count_x", "order_count_y"], inplace=True)
customer_orders_df.head(1)

Unnamed: 0,customer_id,customer_unique_id,customer_zip_code_prefix,customer_city,customer_state,order_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date,order_count
0,06b8999e2fba1a1fbc88172c00ba8bc7,861eff4711a542e4b93843c6dd7febb0,14409,franca,SP,00e7ee1b050b8499577073aeb2a297a1,delivered,2017-05-16 15:05:35,2017-05-16 15:22:12,2017-05-23 10:47:57,2017-05-25 10:35:35,2017-06-05 00:00:00,1


## calculate total purchase value per customer

In [None]:
# merge customer_orders with order_item
customer_orders_df = pd.merge(
    customer_orders_df, order_item_df, on="order_id", how="inner"
)

# calculate total price per item
customer_orders_df["total_price"] = (
    customer_orders_df["price"] * customer_orders_df["order_item_id"]
)

# calculate total value purchase per customer_unique_id
total_spending = (
    customer_orders_df.groupby("customer_unique_id")["total_price"].sum().reset_index()
)
total_spending.rename(columns={"total_price": "total_spending"}, inplace=True)

In [54]:
total_spending.head()

Unnamed: 0,customer_unique_id,total_spending
0,0000366f3b9a7992bf8c76cfdf3221e2,129.9
1,0000b849f77a49e4a4ce2b2a4ca5be3f,18.9
2,0000f46a3911fa3c0805444483337064,69.0
3,0000f6ccb0745a6a4b88665a16c9f078,25.99
4,0004aac84e0df4da2b147fca70cf8255,180.0


In [58]:
# merge total_spending to customer_order_df
customer_order_df = pd.merge(
    customer_orders_df, total_spending, on="customer_unique_id", how="left"
)

customer_orders_df.head(1)

Unnamed: 0,customer_id,customer_unique_id,customer_zip_code_prefix,customer_city,customer_state,order_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date,order_count,order_item_id_x,product_id_x,seller_id_x,shipping_limit_date_x,price_x,freight_value_x,total_price,order_item_id_y,product_id_y,seller_id_y,shipping_limit_date_y,price_y,freight_value_y,order_item_id,product_id,seller_id,shipping_limit_date,price,freight_value
0,06b8999e2fba1a1fbc88172c00ba8bc7,861eff4711a542e4b93843c6dd7febb0,14409,franca,SP,00e7ee1b050b8499577073aeb2a297a1,delivered,2017-05-16 15:05:35,2017-05-16 15:22:12,2017-05-23 10:47:57,2017-05-25 10:35:35,2017-06-05 00:00:00,1,1,a9516a079e37a9c9c36b9b78b10169e8,7c67e1448b00f6e969d365cea6b010ab,2017-05-22 15:22:12,124.99,21.88,124.99,1,a9516a079e37a9c9c36b9b78b10169e8,7c67e1448b00f6e969d365cea6b010ab,2017-05-22 15:22:12,124.99,21.88,1,a9516a079e37a9c9c36b9b78b10169e8,7c67e1448b00f6e969d365cea6b010ab,2017-05-22 15:22:12,124.99,21.88
