In [1]:
# Import Library
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from statsmodels.stats.proportion import proportions_ztest
from scipy.stats import normaltest, chi2_contingency, mannwhitneyu, ttest_ind, kstest
import statsmodels.api as sm
import os
import missingno as msno

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
pd.set_option('display.max_colwidth', None)

In [12]:
# List of date columns for each Olist dataset
# This dictionary maps each dataset filename to a list of columns that should be parsed as dates.
date_cols = {
    'olist_orders_dataset.csv': [
        'order_purchase_timestamp',
        'order_approved_at',
        'order_delivered_carrier_date',
        'order_delivered_customer_date',
        'order_estimated_delivery_date',
    ],
    'olist_order_items_dataset.csv': [
        'shipping_limit_date',
    ],
    'olist_order_reviews_dataset.csv': [
        'review_creation_date',
        'review_answer_timestamp',
    ],
    # The following datasets have NO date columns:
    # 'olist_customers_dataset.csv'
    # 'olist_geolocation_dataset.csv'
    # 'olist_order_payments_dataset.csv'
    # 'olist_products_dataset.csv'
    # 'olist_sellers_dataset.csv'
    # 'product_category_name_translation.csv'
    'master_olist_dataset.csv': [
        'order_purchase_timestamp',
        'order_approved_at',
        'order_delivered_carrier_date',
        'order_delivered_customer_date',
        'order_estimated_delivery_date',
        'shipping_limit_date',
        'review_creation_date',
        'review_answer_timestamp',
    ]
}

def read_olist_csv(path):
    """
    Reads an Olist CSV and parses dates for the correct columns.
    Args:
        path (str): Path to the CSV file.
    Returns:
        pd.DataFrame: Loaded dataframe with date columns parsed as datetime.
    """
    # Extract just the filename, e.g., 'olist_orders_dataset.csv'
    filename = os.path.basename(path)
    # Get the correct date columns for this file, or an empty list
    parse_dates = date_cols.get(filename, [])
    print(parse_dates)  # Debugging: print the date columns being parsed
    # Read the CSV, parsing the specified date columns (if any)
    return pd.read_csv(path, parse_dates=parse_dates)

Load original (raw) datasets:

In [3]:
original_df_orders           = read_olist_csv('../data/original_data/olist_orders_dataset.csv')
original_df_customers        = read_olist_csv('../data/original_data/olist_customers_dataset.csv')
original_df_order_items      = read_olist_csv('../data/original_data/olist_order_items_dataset.csv')
original_df_order_payments   = read_olist_csv('../data/original_data/olist_order_payments_dataset.csv')
original_df_reviews          = read_olist_csv('../data/original_data/olist_order_reviews_dataset.csv')
original_df_products         = read_olist_csv('../data/original_data/olist_products_dataset.csv')
original_df_prod_cat_tr      = read_olist_csv('../data/original_data/product_category_name_translation.csv')
original_df_sellers          = read_olist_csv('../data/original_data/olist_sellers_dataset.csv')
original_df_geolocation      = read_olist_csv('../data/original_data/olist_geolocation_dataset.csv')

Load cleaned datasets

In [19]:
cleaned_delivered_df_orders           = read_olist_csv('../data/cleaned_data/olist_orders_dataset.csv')
cleaned_df_customers        = read_olist_csv('../data/cleaned_data/olist_customers_dataset.csv')
cleaned_delivered_df_order_items      = read_olist_csv('../data/cleaned_data/olist_order_items_dataset.csv')
cleaned_delivered_df_order_payments   = read_olist_csv('../data/cleaned_data/olist_order_payments_dataset.csv')
cleaned_delivered_df_reviews          = read_olist_csv('../data/cleaned_data/olist_order_reviews_dataset.csv')
cleaned_df_products         = read_olist_csv('../data/cleaned_data/olist_products_dataset.csv')
cleaned_df_prod_cat_tr      = read_olist_csv('../data/cleaned_data/product_category_name_translation.csv')
cleaned_df_sellers          = read_olist_csv('../data/cleaned_data/olist_sellers_dataset.csv')
cleaned_df_geolocation      = read_olist_csv('../data/cleaned_data/olist_geolocation_dataset.csv')

['order_purchase_timestamp', 'order_approved_at', 'order_delivered_carrier_date', 'order_delivered_customer_date', 'order_estimated_delivery_date']
[]
['shipping_limit_date']
[]
['review_creation_date', 'review_answer_timestamp']
[]
[]
[]
[]


In [9]:
df_meta = pd.read_csv("../data/cleaned_data/olist_orders_dataset.csv", nrows=0)
print(df_meta.columns.tolist())

['order_id', 'customer_id', 'order_status', 'order_purchase_timestamp', 'order_approved_at', 'order_delivered_carrier_date', 'order_delivered_customer_date', 'order_estimated_delivery_date']


In [15]:
read_olist_csv('../data/cleaned_data/olist_orders_dataset.csv').describe()

['order_purchase_timestamp', 'order_approved_at', 'order_delivered_carrier_date', 'order_delivered_customer_date', 'order_estimated_delivery_date']


Unnamed: 0,order_purchase_timestamp,order_delivered_customer_date,order_estimated_delivery_date
count,96282,96282,96282
mean,2018-01-01 17:15:44.571612672,2018-01-14 06:54:24.274360576,2018-01-25 10:57:44.273696
min,2016-09-15 12:16:38,2016-10-11 13:46:32,2016-10-04 00:00:00
25%,2017-09-13 23:10:07,2017-09-25 21:22:16.750000128,2017-10-05 00:00:00
50%,2018-01-20 13:59:55.500000,2018-02-02 16:26:08,2018-02-16 00:00:00
75%,2018-05-05 09:58:34.500000,2018-05-15 19:48:47.750000128,2018-05-28 00:00:00
max,2018-08-29 15:00:37,2018-10-17 13:22:46,2018-10-25 00:00:00


In [16]:
for col in [
    "order_purchase_timestamp",
    "order_approved_at",
    "order_delivered_carrier_date",
    "order_delivered_customer_date",
    "order_estimated_delivery_date"
]:
    cleaned_delivered_df_orders[col] = pd.to_datetime(cleaned_delivered_df_orders[col], errors="coerce", infer_datetime_format=True)

  cleaned_delivered_df_orders[col] = pd.to_datetime(cleaned_delivered_df_orders[col], errors="coerce", infer_datetime_format=True)
  cleaned_delivered_df_orders[col] = pd.to_datetime(cleaned_delivered_df_orders[col], errors="coerce", infer_datetime_format=True)
  cleaned_delivered_df_orders[col] = pd.to_datetime(cleaned_delivered_df_orders[col], errors="coerce", infer_datetime_format=True)
  cleaned_delivered_df_orders[col] = pd.to_datetime(cleaned_delivered_df_orders[col], errors="coerce", infer_datetime_format=True)
  cleaned_delivered_df_orders[col] = pd.to_datetime(cleaned_delivered_df_orders[col], errors="coerce", infer_datetime_format=True)


In [20]:
cleaned_delivered_df_orders.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 96282 entries, 0 to 96281
Data columns (total 8 columns):
 #   Column                         Non-Null Count  Dtype         
---  ------                         --------------  -----         
 0   order_id                       96282 non-null  object        
 1   customer_id                    96282 non-null  object        
 2   order_status                   96282 non-null  object        
 3   order_purchase_timestamp       96282 non-null  datetime64[ns]
 4   order_approved_at              96282 non-null  object        
 5   order_delivered_carrier_date   96282 non-null  object        
 6   order_delivered_customer_date  96282 non-null  datetime64[ns]
 7   order_estimated_delivery_date  96282 non-null  datetime64[ns]
dtypes: datetime64[ns](3), object(5)
memory usage: 5.9+ MB


In [18]:
cleaned_delivered_df_orders[cleaned_delivered_df_orders['order_approved_at'].isna()]

Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date
87500,14a54a1ffd16f037929c4553a244f9ed,f51dd48ef39ca937e5f1c9e87f6f2881,delivered,2018-04-02 12:50:49,NaT,2018-04-04 21:58:23,2018-05-05 16:51:54,2018-05-07
87501,d725835898fee2ec8063d4aa035bf2dc,02e9e4af4b3ac6cd523d656776e0580b,delivered,2018-01-04 18:27:34,NaT,2018-01-05 21:09:41,2018-01-16 21:09:13,2018-01-31
87502,561d2ce55a3b6c040a6ba77fa5428056,ae9dcc4124363f82adbd2148d6e240b7,delivered,2017-10-09 14:53:21,NaT,2017-10-10 19:43:53,2017-11-15 01:11:56,2017-11-17
87503,36d2f4c466cc35f518fac94f87653109,ae276554ae951afd264f67a513ae3b0c,delivered,2018-08-16 08:20:16,NaT,2018-08-20 15:16:00,2018-08-22 21:38:36,2018-08-28
87504,3bfd4e221ebf6b1a72ee73a74bdeec8b,157174b2ef72b1c700c5b65e7536d6ba,delivered,2017-02-20 10:34:41,NaT,2017-02-21 15:53:09,2017-03-03 12:29:10,2017-03-27
...,...,...,...,...,...,...,...,...
96277,9c5dedf39a927c1b2549525ed64a053c,39bd1228ee8140590ac3aca26f2dfe00,delivered,2017-03-09 09:54:05,NaT,2017-03-10 11:18:03,2017-03-17 15:08:01,2017-03-28
96278,63943bddc261676b46f01ca7ac2f7bd8,1fca14ff2861355f6e5f14306ff977a7,delivered,2018-02-06 12:58:58,NaT,2018-02-07 23:22:42,2018-02-28 17:37:56,2018-03-02
96279,83c1379a015df1e13d02aae0204711ab,1aa71eb042121263aafbe80c1b562c9c,delivered,2017-08-27 14:46:43,NaT,2017-08-28 20:52:26,2017-09-21 11:24:17,2017-09-27
96280,11c177c8e97725db2631073c19f07b62,b331b74b18dc79bcdf6532d51e1637c1,delivered,2018-01-08 21:28:27,NaT,2018-01-12 15:35:03,2018-01-25 23:32:54,2018-02-15


In [21]:
cleaned_delivered_df_orders[cleaned_delivered_df_orders['order_id'] == '14a54a1ffd16f037929c4553a244f9ed']

Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date
87500,14a54a1ffd16f037929c4553a244f9ed,f51dd48ef39ca937e5f1c9e87f6f2881,delivered,2018-04-02 12:50:49,2018-04-03 08:09:01,2018-04-04 21:58:23,2018-05-05 16:51:54,2018-05-07


Load master dataset

In [5]:
master_olist_dataset = read_olist_csv('../data/cleaned_data/master_olist_dataset.csv')

In [6]:
master_olist_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 115093 entries, 0 to 115092
Data columns (total 41 columns):
 #   Column                         Non-Null Count   Dtype         
---  ------                         --------------   -----         
 0   order_id                       115093 non-null  object        
 1   customer_id                    115093 non-null  object        
 2   order_status                   115093 non-null  object        
 3   order_purchase_timestamp       115093 non-null  datetime64[ns]
 4   order_approved_at              115093 non-null  object        
 5   order_delivered_carrier_date   115093 non-null  object        
 6   order_delivered_customer_date  115093 non-null  datetime64[ns]
 7   order_estimated_delivery_date  115093 non-null  datetime64[ns]
 8   customer_unique_id             115093 non-null  object        
 9   customer_zip_code_prefix       115093 non-null  int64         
 10  customer_city                  115093 non-null  object        
 11  

In [7]:
master_olist_dataset.describe()

Unnamed: 0,order_purchase_timestamp,order_delivered_customer_date,order_estimated_delivery_date,customer_zip_code_prefix,order_item_id,shipping_limit_date,price,freight_value,product_name_lenght,product_description_lenght,...,product_weight_g,product_length_cm,product_height_cm,product_width_cm,seller_zip_code_prefix,payment_sequential,payment_installments,payment_value,review_score,review_answer_timestamp
count,115093,115093,115093,115093.0,115093.0,115093,115093.0,115093.0,115093.0,115093.0,...,115093.0,115093.0,115093.0,115093.0,115093.0,115093.0,115093.0,115093.0,113721.0,113721
mean,2017-12-31 11:30:04.998384128,2018-01-12 23:24:51.927893248,2018-01-24 07:25:19.567653888,35057.944697,1.196554,2018-01-07 02:08:09.010356992,120.060541,19.988354,48.119729,774.216355,...,2107.770021,30.242352,16.587812,23.062628,24451.798702,1.091248,2.938737,171.93889,4.0825,2018-01-16 08:06:48.084285440
min,2016-10-03 09:44:50,2016-10-11 13:46:32,2016-10-27 00:00:00,1004.0,1.0,2016-10-08 10:34:01,0.85,0.0,0.0,0.0,...,0.0,7.0,2.0,6.0,1001.0,1.0,0.0,0.0,1.0,2016-10-16 03:20:17
25%,2017-09-12 22:11:35,2017-09-24 13:14:56,2017-10-04 00:00:00,11250.0,1.0,2017-09-20 02:43:52,39.9,13.08,42.0,340.0,...,300.0,18.0,8.0,15.0,6429.0,1.0,1.0,60.85,4.0,2017-09-28 00:37:34
50%,2018-01-18 22:23:16,2018-02-01 12:46:41,2018-02-15 00:00:00,24310.0,1.0,2018-01-25 17:00:30,74.9,16.28,51.0,594.0,...,700.0,25.0,13.0,20.0,13660.0,1.0,2.0,108.12,5.0,2018-02-05 00:55:20
75%,2018-05-04 03:59:20,2018-05-14 22:56:29,2018-05-25 00:00:00,58701.0,1.0,2018-05-10 09:50:40,133.0,21.17,57.0,977.0,...,1800.0,38.0,20.0,30.0,27930.0,1.0,4.0,188.93,5.0,2018-05-20 11:36:07
max,2018-08-29 15:00:37,2018-10-17 13:22:46,2018-10-25 00:00:00,99980.0,21.0,2020-04-09 22:35:08,6735.0,409.68,76.0,3992.0,...,40425.0,105.0,105.0,118.0,99730.0,26.0,24.0,13664.08,5.0,2018-10-29 12:27:35
std,,,,29844.633374,0.700871,,183.015633,15.734193,11.483035,652.666471,...,3775.318176,16.131124,13.426735,11.731936,27584.181337,0.687378,2.774541,266.193435,1.346116,


In [None]:
# 4. Core delay & flag features
master_olist_dataset['late_flag']    = master_olist_dataset['order_delivered_customer_date'] > master_olist_dataset['order_estimated_delivery_date']
master_olist_dataset['late_days']    = (master_olist_dataset['order_delivered_customer_date'] - master_olist_dataset['order_estimated_delivery_date']).dt.days
master_olist_dataset['dispatch_delay'] = (master_olist_dataset['order_delivered_carrier_date'] - master_olist_dataset['shipping_limit_date']).dt.days
master_olist_dataset['dispatch_time'] = (master_olist_dataset['order_delivered_carrier_date'] - master_olist_dataset['order_approved_at']).dt.days
master_olist_dataset['transit_time']  = (master_olist_dataset['order_delivered_customer_date']  - master_olist_dataset['order_delivered_carrier_date']).dt.days

# 5. Review bucket
master_olist_dataset['review_bucket'] = master_olist_dataset['review_score'].apply(lambda x: 'good (4-5)' if x > 3 else 'bad (1-3)')

TypeError: cannot subtract DatetimeArray from ndarray

In [None]:

# 6. Customer repeat & first-order flags
master_olist_dataset = master_olist_dataset.sort_values(['customer_unique_id','order_purchase_timestamp'])
master_olist_dataset['order_rank']        = master_olist_dataset.groupby('customer_unique_id')['order_purchase_timestamp'] \
                              .rank(method='first')
master_olist_dataset['first_order_flag']  = master_olist_dataset['order_rank'] == 1

order_counts = master_olist_dataset.groupby('customer_unique_id')['order_id'].nunique()
master_olist_dataset['customer_repeat_flag'] = master_olist_dataset['customer_unique_id'].map(order_counts > 1)

# 7. Retention delta (Δ repeat-rate between on-time vs. late first orders)
firsts = master_olist_dataset[master_olist_dataset['first_order_flag']]
ret = firsts.groupby('late_flag').agg(
    total_customers=('customer_unique_id','nunique'),
    repeat_customers=('customer_repeat_flag', 'sum')
).reset_index()
ret['repeat_rate'] = ret['repeat_customers']/ret['total_customers']

rr_on_time = ret.loc[ret['late_flag']==False, 'repeat_rate'].iloc[0]
rr_late    = ret.loc[ret['late_flag']==True,  'repeat_rate'].iloc[0]
retention_delta = rr_on_time - rr_late

# 8. Impact model: lost revenue from late first orders
late_first_count = ret.loc[ret['late_flag']==True, 'total_customers'].iloc[0]
avg_order_value  = master_olist_dataset['payment_value'].mean()   # adjust column as needed
lost_revenue     = retention_delta * late_first_count * avg_order_value

# 9. Seller- and state-level aggregates
seller_agg = master_olist_dataset.groupby('seller_id').agg(
    total_orders   = ('order_id','nunique'),
    late_orders    = ('late_flag','sum'),
    late_rate      = ('late_flag','mean'),
    avg_late_days  = ('late_days','mean')
).reset_index()

state_agg = master_olist_dataset.groupby('customer_state').agg(
    total_orders   = ('order_id','nunique'),
    late_orders    = ('late_flag','sum'),
    late_rate      = ('late_flag','mean'),
    avg_late_days  = ('late_days','mean')
).reset_index()

# 10. Write outputs
os.makedirs(OUTPUT_DIR, exist_ok=True)
master_olist_dataset.to_csv(MASTER_OUT, index=False)
seller_agg.to_csv(SELLER_OUT, index=False)
state_agg.to_csv(STATE_OUT, index=False)

# 11. (Optional) print key metrics
print(f"Retention Δ (on-time vs late first orders): {retention_delta:.4f}")
print(f"Estimated lost revenue from late first orders: {lost_revenue:,.2f}")
