In [61]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

In [62]:
# Show all orders
orders_data = pd.read_csv("/kaggle/input/brazilian-ecommerce/olist_orders_dataset.csv", 
                          parse_dates = [
                              "order_purchase_timestamp",
                              "order_approved_at",
                              "order_delivered_carrier_date",
                              "order_delivered_customer_date",
                              "order_estimated_delivery_date"
                          ])
orders_data.head(10)

Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date
0,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,delivered,2017-10-02 10:56:33,2017-10-02 11:07:15,2017-10-04 19:55:00,2017-10-10 21:25:13,2017-10-18
1,53cdb2fc8bc7dce0b6741e2150273451,b0830fb4747a6c6d20dea0b8c802d7ef,delivered,2018-07-24 20:41:37,2018-07-26 03:24:27,2018-07-26 14:31:00,2018-08-07 15:27:45,2018-08-13
2,47770eb9100c2d0c44946d9cf07ec65d,41ce2a54c0b03bf3443c3d931a367089,delivered,2018-08-08 08:38:49,2018-08-08 08:55:23,2018-08-08 13:50:00,2018-08-17 18:06:29,2018-09-04
3,949d5b44dbf5de918fe9c16f97b45f8a,f88197465ea7920adcdbec7375364d82,delivered,2017-11-18 19:28:06,2017-11-18 19:45:59,2017-11-22 13:39:59,2017-12-02 00:28:42,2017-12-15
4,ad21c59c0840e6cb83a9ceb5573f8159,8ab97904e6daea8866dbdbc4fb7aad2c,delivered,2018-02-13 21:18:39,2018-02-13 22:20:29,2018-02-14 19:46:34,2018-02-16 18:17:02,2018-02-26
5,a4591c265e18cb1dcee52889e2d8acc3,503740e9ca751ccdda7ba28e9ab8f608,delivered,2017-07-09 21:57:05,2017-07-09 22:10:13,2017-07-11 14:58:04,2017-07-26 10:57:55,2017-08-01
6,136cce7faa42fdb2cefd53fdc79a6098,ed0271e0b7da060a393796590e7b737a,invoiced,2017-04-11 12:22:08,2017-04-13 13:25:17,NaT,NaT,2017-05-09
7,6514b8ad8028c9f2cc2374ded245783f,9bdf08b4b3b52b5526ff42d37d47f222,delivered,2017-05-16 13:10:30,2017-05-16 13:22:11,2017-05-22 10:07:46,2017-05-26 12:55:51,2017-06-07
8,76c6e866289321a7c93b82b54852dc33,f54a9f0e6b351c431402b8461ea51999,delivered,2017-01-23 18:29:09,2017-01-25 02:50:47,2017-01-26 14:16:31,2017-02-02 14:08:10,2017-03-06
9,e69bfb5eb88e0ed6a785585b27e16dbf,31ad1d1b63eb9962463f764d4e6e0c9d,delivered,2017-07-29 11:55:02,2017-07-29 12:05:32,2017-08-10 19:45:24,2017-08-16 17:14:30,2017-08-23


In [63]:
orders_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99441 entries, 0 to 99440
Data columns (total 8 columns):
 #   Column                         Non-Null Count  Dtype         
---  ------                         --------------  -----         
 0   order_id                       99441 non-null  object        
 1   customer_id                    99441 non-null  object        
 2   order_status                   99441 non-null  object        
 3   order_purchase_timestamp       99441 non-null  datetime64[ns]
 4   order_approved_at              99281 non-null  datetime64[ns]
 5   order_delivered_carrier_date   97658 non-null  datetime64[ns]
 6   order_delivered_customer_date  96476 non-null  datetime64[ns]
 7   order_estimated_delivery_date  99441 non-null  datetime64[ns]
dtypes: datetime64[ns](5), object(3)
memory usage: 6.1+ MB


In [64]:
# How many total orders are there?
total_orders = orders_data.shape[0]
total_orders

99441

In [65]:
# List all the distict order status?
orders_data["order_status"].unique()

array(['delivered', 'invoiced', 'shipped', 'processing', 'unavailable',
       'canceled', 'created', 'approved'], dtype=object)

In [66]:
# How many distinct order statuses are there?
orders_data["order_status"].nunique()

8

In [67]:
# How many orders are in each order status?
order_by_status = orders_data.groupby("order_status").size().reset_index(name = "order_count")
order_by_status

Unnamed: 0,order_status,order_count
0,approved,2
1,canceled,625
2,created,5
3,delivered,96478
4,invoiced,314
5,processing,301
6,shipped,1107
7,unavailable,609


In [68]:
tab10_colors = ['#5e5c52','#526334','#2ca02c','#848044','#213330', '#b1ccb6','#e377c2', '#4a73ba']
fig = px.bar(
    order_by_status,  
    x = "order_status", 
    y = "order_count", 
    color = tab10_colors, 
    labels = {"order_status": "Status", "order_count": "Order Count"},
    title = "Order count by status"
)
fig.show()

In [69]:
# How many orders were delivered?
orders_data[orders_data["order_status"] == "delivered"].shape[0]

96478

In [70]:
# How many unique customers placed orders?
orders_data["customer_id"].nunique()

99441

In [71]:
# What’s the earliest and latest order purchase date?
earliest_date = orders_data["order_purchase_timestamp"].min()
latest_date = orders_data["order_purchase_timestamp"].max()
earliest_date, latest_date

(Timestamp('2016-09-04 21:15:19'), Timestamp('2018-10-17 17:30:18'))

In [72]:
# How many orders were delivered late? (Delivered date > estimated delivery date)
late_orders = orders_data[
    (orders_data["order_delivered_customer_date"].notnull()) &
    (orders_data["order_estimated_delivery_date"].notnull()) &
    (orders_data["order_delivered_customer_date"] > orders_data["order_estimated_delivery_date"])
]
late_orders.shape[0]

7827

In [73]:
# How many orders have missing delivery timestamps?
missing_timestamp_delivery_orders = orders_data[orders_data["order_delivered_customer_date"].isna()]
missing_timestamp_delivery_orders.shape[0]

2965

In [74]:
# List all possible combinations of order status and delivery status presence.
order_delivered = orders_data[(orders_data["order_status"] == "delivered") & (orders_data["order_delivered_customer_date"].notnull())].shape[0]
order_not_delivered = orders_data[(orders_data["order_status"] != "delivered") | (orders_data["order_delivered_customer_date"].isna())].shape[0]
order_delivered, order_not_delivered

(96470, 2971)

In [75]:
# What is the average delivery time (in days) for delivered orders?
valid_orders = orders_data[
    (orders_data["order_delivered_customer_date"].notna()) & 
    (orders_data["order_approved_at"].notna())
]
orders_data["delivery_in_days"] = (valid_orders["order_delivered_customer_date"] - valid_orders["order_approved_at"]).dt.days

In [76]:
average_delivery_time = orders_data[orders_data["order_status"] == "delivered"]["delivery_in_days"].mean().round(2)
average_delivery_time

11.64

In [77]:
# How many orders were placed each month?
monthly_orders = (
    orders_data[orders_data["order_approved_at"].notnull()]
    .groupby(orders_data["order_approved_at"].dt.to_period("M"))
    .size()
    .reset_index(name = "orders_count")
    .rename(columns = {"order_approved_at": "order_month"})
    .sort_values(by = "order_month", ascending = True)
)
monthly_orders.head(10)

Unnamed: 0,order_month,orders_count
0,2016-09,1
1,2016-10,320
2,2016-12,1
3,2017-01,760
4,2017-02,1765
5,2017-03,2689
6,2017-04,2374
7,2017-05,3693
8,2017-06,3252
9,2017-07,3974


In [78]:
monthly_orders["order_month"] = monthly_orders["order_month"].dt.to_timestamp()
fig = px.line(
    monthly_orders, 
    x = "order_month", 
    y = "orders_count", 
    labels = {"order_month": "Date", "orders_count": "Count"},
    title = "Number of orders placed in each month",
    markers = True
)
fig.show()

In [79]:
# Which day had the most orders
most_ordered_date = (
    orders_data[orders_data["order_approved_at"].notnull()]
    .groupby(orders_data["order_approved_at"].dt.date)
    .size()
    .reset_index(name = "orders_count")
    .rename(columns = {"order_approved_at": "order_date"})
    .sort_values(by = "orders_count", ascending = False)
)
most_ordered_date.head(1)

Unnamed: 0,order_date,orders_count
484,2018-04-24,990


In [80]:
# How many orders were canceled after being shipped?
cancelled_orders_after_ship = orders_data[(orders_data["order_status"] == "canceled") & (orders_data["order_delivered_customer_date"].notnull())]
cancelled_orders_after_ship.shape[0]

6

In [81]:
# What is the average time between order purchase and order approval?
mask = orders_data[orders_data["order_approved_at"].notna() & orders_data["order_purchase_timestamp"].notna()]
average_time_betn = ((mask["order_approved_at"] - mask["order_purchase_timestamp"]).dt.total_seconds() / 86400).mean()
average_time_betn

0.43412892924665386

In [82]:
# What is the distribution of order statuses by month?
# Convert to monthly period
orders_data["order_month"] = orders_data["order_purchase_timestamp"].dt.to_period("M")

# Group by month and order status, and count
distribution = (
    orders_data[orders_data["order_purchase_timestamp"].notna()]
    .groupby(["order_month", "order_status"])
    .size()
    .reset_index(name="orders_count")
)

distribution.head(10)

Unnamed: 0,order_month,order_status,orders_count
0,2016-09,canceled,2
1,2016-09,delivered,1
2,2016-09,shipped,1
3,2016-10,canceled,24
4,2016-10,delivered,265
5,2016-10,invoiced,18
6,2016-10,processing,2
7,2016-10,shipped,8
8,2016-10,unavailable,7
9,2016-12,delivered,1


In [83]:
# List customers who placed more than 5 orders.
customer_orders = orders_data.groupby("customer_id").size().reset_index(name = "orders_count")
customer_orders[customer_orders["orders_count"] > 5]

Unnamed: 0,customer_id,orders_count


In [84]:
# Which customers placed multiple orders on the same day?
orders_data["order_date"] = orders_data["order_purchase_timestamp"].dt.date
multiple_orders = (
    orders_data.groupby(["customer_id", "order_date"])
    .size()
    .reset_index(name="order_count")
)
multiple_orders = multiple_orders[multiple_orders["order_count"] > 1]
multiple_orders.head(10)

Unnamed: 0,customer_id,order_date,order_count


In [85]:
# How many orders are still waiting for approved?
orders_data[(orders_data["order_purchase_timestamp"].notna()) & (orders_data["order_approved_at"].isna())].shape[0]

160

In [100]:
# Which month had the highest delivery delays on average?
valid_orders = orders_data[(orders_data["order_delivered_customer_date"].notna()) & (orders_data["order_estimated_delivery_date"].notna())].copy()

valid_orders["delay_days"] = (
    (valid_orders["order_delivered_customer_date"] - valid_orders["order_estimated_delivery_date"])
    .dt.total_seconds() / 86400
)

delayed_orders = valid_orders[valid_orders["delay_days"] > 0]

avg_delay_by_month = (
    delayed_orders.groupby(delayed_orders["order_approved_at"].dt.to_period("M"))["delay_days"]
    .mean().round(2)
    .reset_index(name = "avg_delay")
    .sort_values(by = "order_approved_at", ascending = True)
)
avg_delay_by_month.head(1)

Unnamed: 0,order_approved_at,avg_delay
0,2016-09,36.32


In [101]:
# Which customer had the highest average delivery time?
valid_orders = orders_data[(orders_data["order_delivered_customer_date"].notna()) & (orders_data["order_approved_at"].notna())].copy()

valid_orders["delivery_time"] = (
    (valid_orders["order_delivered_customer_date"] - valid_orders["order_approved_at"])
    .dt.total_seconds() / 86400
)

avg_delivery_per_customer = (
    valid_orders.groupby("customer_id")["delivery_time"]
    .mean().round(2)
    .reset_index(name = "avg_delivery_time")
    .sort_values("avg_delivery_time", ascending = False)
)
avg_delivery_per_customer.head(1)

Unnamed: 0,customer_id,avg_delivery_time
44110,75683a92331068e2d281b11a7866ba44,208.5


In [102]:
# Rank customers by number of delivered orders.
delivered_orders = orders_data[(orders_data["order_status"] == "delivered") | (orders_data["order_delivered_customer_date"].notna())].copy()

customer_delivered_count = (
    delivered_orders.groupby("customer_id")
    .size()
    .reset_index(name = "orders_count")
    .sort_values("orders_count", ascending = False)
)
customer_delivered_count["rank_customer"] = customer_delivered_count.groupby("customer_id").cumcount() + 1
customer_delivered_count.head(10)

Unnamed: 0,customer_id,orders_count,rank_customer
0,00012a2ce6f8dcda20d059ce98491703,1,1
64319,aa5e0a0c36015d39de864189a2b00b09,1,1
64328,aa6190994371836fc1d4596edd827abd,1,1
64327,aa606152f23a5fa9ba3d0b87f7a492e8,1,1
64326,aa601b3c45980c0918042d5ca7a25054,1,1
64325,aa5f8aa4c21fe89a04e8d221d4cd8ac1,1,1
64324,aa5f87b79875f43b90b310decb62356e,1,1
64323,aa5f5931b8901ae1ca4b8302f9b94c02,1,1
64322,aa5e99afaf6a4dce3da0dee9f41beac8,1,1
64321,aa5e567592c7fce76e3937e35c6a4ecb,1,1


In [103]:
# What is the percentage of orders delivered on or before the estimated delivery date?
orders_delivered_on_time = orders_data[
    (orders_data["order_delivered_customer_date"].notna()) &
    (orders_data["order_estimated_delivery_date"].notna()) &
    (orders_data["order_delivered_customer_date"] <= orders_data["order_estimated_delivery_date"])
]
percentage_delivered_on_time = np.round((orders_delivered_on_time.shape[0] / total_orders) * 100, 1)
percentage_delivered_on_time

89.1

In [104]:
# What’s the average delivery delay by order status?
valid_orders = orders_data[orders_data["delay_days"].notna()]

average_delay_per_status = (
    valid_orders.groupby("order_status")["delay_days"]
    .mean().round(2)
    .reset_index(name = "avg_delay")
)
average_delay_per_status.head(10)

Unnamed: 0,order_status,avg_delay
0,canceled,-27.16
1,delivered,-11.18


In [105]:
# Find top 5 dates with the most delayed deliveries.
valid_delay_orders = orders_data.dropna(subset=["delay_days", "order_approved_at"]).copy()
valid_delay_orders = valid_delay_orders[valid_delay_orders["delay_days"] > 0]

valid_delay_orders["order_approved_at"] = pd.to_datetime(valid_delay_orders["order_approved_at"]).dt.date

delayed_delivery_dates = (
    valid_delay_orders.groupby("order_approved_at")
    .size()
    .reset_index(name="orders_count")
    .sort_values("orders_count", ascending=False)
)

print(delayed_delivery_dates.head(5))

    order_approved_at  orders_count
308        2017-11-25           155
307        2017-11-24           152
311        2017-11-28            98
409        2018-03-06            98
546        2018-08-07            93


In [106]:
# Which order had the longest delay from purchase to delivery?
valid_delivery_after_purchase = orders_data.dropna(subset = ["order_purchase_timestamp", "order_delivered_customer_date"]).copy()
valid_delivery_after_purchase["delay_from_purchase"] = (
    valid_delivery_after_purchase["order_delivered_customer_date"] - valid_delivery_after_purchase["order_purchase_timestamp"]
).dt.total_seconds() / (3600 * 24)
valid_delivery_after_purchase.sort_values("delay_from_purchase", ascending = False)
valid_delivery_after_purchase.head(1)

Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date,delivery_in_days,order_month,order_date,delay_days,delay_from_purchase
0,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,delivered,2017-10-02 10:56:33,2017-10-02 11:07:15,2017-10-04 19:55:00,2017-10-10 21:25:13,2017-10-18,8.0,2017-10,2017-10-02,-7.107488,8.436574


In [107]:
# Find the average approval time for each order status.
valid_approval_orders = orders_data.dropna(subset = ["order_approved_at", "order_purchase_timestamp"]).copy()
valid_approval_orders = valid_approval_orders[valid_approval_orders["order_approved_at"] >= valid_approval_orders["order_purchase_timestamp"]]
valid_approval_orders["approval_time"] = (
    valid_approval_orders["order_approved_at"] - valid_approval_orders["order_purchase_timestamp"]
).dt.total_seconds() / (3600 * 24)

avg_approval_time_orders = (
    valid_approval_orders.groupby("order_status")["approval_time"]
    .mean().round(2)
    .reset_index(name = "avg_approval_time")
)
avg_approval_time_orders.head(10)

Unnamed: 0,order_status,avg_approval_time
0,approved,2.9
1,canceled,0.61
2,delivered,0.43
3,invoiced,0.38
4,processing,0.71
5,shipped,0.49
6,unavailable,1.02


In [108]:
# Which statuses are more likely to result in delivery delays?
valid_orders = orders_data[
    (orders_data["order_delivered_customer_date"].notna()) &
    (orders_data["order_estimated_delivery_date"].notna())
].copy()

valid_orders["is_delayed"] = (valid_orders["order_delivered_customer_date"] > valid_orders["order_estimated_delivery_date"]).astype(int)

status_delay_stats = (
    valid_orders.groupby("order_status")["is_delayed"]
    .agg(
        total_orders = "count",
        delayed_orders = "sum"
    )
    .reset_index()
)
status_delay_stats["delay_perc"] = np.round((status_delay_stats["delayed_orders"] / status_delay_stats["total_orders"]) * 100, 2)
status_delay_stats.head(10)

Unnamed: 0,order_status,total_orders,delayed_orders,delay_perc
0,canceled,6,1,16.67
1,delivered,96470,7826,8.11


In [109]:
# How many orders were approved but never shipped?
valid_orders = orders_data[(orders_data["order_approved_at"].notna()) & (orders_data["order_delivered_customer_date"].isna())]
valid_orders.shape[0]

2819