In [1]:
# Data Handling & Manipulation
import pandas as pd                                         # for dataframes, CSV/Excel reading, tabular data manipulation
import numpy as np                                          # for numerical operations and array handling
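import datetime                                             # for date and time manipulation (NOTE: the next line rebinds the name `datetime` to the class, so the module is no longer reachable under this name)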
from datetime import datetime, timedelta                    # for date arithmetic (e.g., adding days)
from workalendar.america import Brazil                      # for Brazilian holidays (e.g., to check if a date is a holiday)

# Data Visualisation (Static)
import matplotlib.pyplot as plt                             # for creating static plots
from matplotlib.ticker import FuncFormatter                 # for customising tick labels (e.g., currency, %)
import seaborn as sns                                       # for statistical visualisation (heatmaps, distplots, etc.)

# Data Visualisation (Interactive)
import plotly.express as px                                 # for quick and interactive visualisation
import plotly.graph_objects as go                           # for custom interactive plots
from plotly.subplots import make_subplots                   # for interactive subplots

# Statistical Testing & Inference
from statsmodels.stats.proportion import proportions_ztest  # for comparing proportions (e.g., late vs. on-time)
from scipy.stats import (
    normaltest,                                             # for checking normality
    chi2_contingency,                                       # for categorical association
    mannwhitneyu,                                           # for non-parametric testing
    ttest_ind,                                              # for independent sample t-test
    f_oneway,                                               # for one-way ANOVA
    kruskal,                                                # for Kruskal-Wallis test
    kstest,                                                 # for Kolmogorov-Smirnov test
    spearmanr,                                              # for Spearman rank correlation
    pointbiserialr                                          # for point-biserial correlation (binary vs. continuous)
)
import statsmodels.api as sm                                # for advanced statistical modelling and diagnostics
import statsmodels.formula.api as smf                       # for formula-based statistical models
from statsmodels.stats.multicomp import pairwise_tukeyhsd   # for post-hoc tests after ANOVA

# Data Quality & Missing Value Visualisation
import missingno as msno                                    # for visualising missing data patterns

# System & Settings
import os                                                   # for file handling and directory operations
import warnings                                             # to suppress or manage warning messages
# Ignore every warning whose category is FutureWarning (other warning categories are still shown):
warnings.filterwarnings("ignore", category=FutureWarning)
pd.set_option('display.max_colwidth', None)                 # display full content in cells (useful for text data)

In [2]:
# Date columns per Olist dataset:
# Each key is a dataset filename; each value lists the columns to parse as dates when that file is read.
# The following datasets have NO date columns and are therefore not listed:
#   'olist_customers_dataset.csv'
#   'olist_geolocation_dataset.csv'
#   'olist_order_payments_dataset.csv'
#   'olist_products_dataset.csv'
#   'olist_sellers_dataset.csv'
#   'product_category_name_translation.csv'
date_cols = {
    'olist_orders_dataset.csv': [
        'order_purchase_timestamp',
        'order_approved_at',
        'order_delivered_carrier_date',
        'order_delivered_customer_date',
        'order_estimated_delivery_date',
    ],
    'olist_order_items_dataset.csv': ['shipping_limit_date'],
    'olist_order_reviews_dataset.csv': ['review_creation_date', 'review_answer_timestamp'],
}

# The master dataset's date columns are the three lists above, concatenated in that order
# (orders, then order items, then reviews).
date_cols['master_olist_dataset.csv'] = [
    column
    for source_file in (
        'olist_orders_dataset.csv',
        'olist_order_items_dataset.csv',
        'olist_order_reviews_dataset.csv',
    )
    for column in date_cols[source_file]
]

def read_olist_csv(path):
    """
    Load an Olist CSV file, parsing its known date columns as datetimes.

    The file's base name (directory part stripped) is looked up in the
    module-level ``date_cols`` mapping. Files without an entry there are
    read with an empty ``parse_dates`` list, i.e. no date parsing.

    Args:
        path (str): Path to the CSV file.
    Returns:
        pd.DataFrame: The file's contents, with the mapped columns parsed as dates.
    """
    # Look up by bare filename (e.g. 'olist_orders_dataset.csv') so the directory does not matter.
    return pd.read_csv(path, parse_dates=date_cols.get(os.path.basename(path), []))

In [3]:
# Load the ML-ready dataset. Its filename has no entry in `date_cols`,
# so read_olist_csv parses no columns as dates for this file.
df = read_olist_csv('../data/cleaned_data/olist_ml_ready_dataset.csv')
# Preview the first rows:
df.head()

Unnamed: 0,purchase_to_approve_hrs,approve_to_estimated_days,approve_to_shipping_limit_days,purchase_hour,purchase_dow,purchase_month,is_weekend,is_brazil_holiday,distance_km,same_state,...,total_order_lifetime,sum_freight_value,price,total_payment_value,customer_state,seller_state,freight_value,product_category_name_english,payment_types,is_late
0,0.178333,15,4,10,0,10,0,0,18.657513,1,...,53,8.72,29.99,38.71,SP,SP,8.72,housewares,"credit_card, voucher",False
1,30.713889,17,4,20,1,7,0,0,861.068703,0,...,125,22.76,118.7,141.46,BA,SP,22.76,perfumery,boleto,False
2,0.276111,26,5,8,2,8,0,0,514.560686,0,...,1148,19.22,159.9,179.12,GO,SP,19.22,auto,credit_card,False
3,0.298056,26,5,19,5,11,1,0,1821.871635,0,...,155,27.2,45.0,72.2,RN,MG,27.2,pet_shop,credit_card,False
4,1.030556,12,5,21,1,2,0,0,29.623876,1,...,171,8.72,19.9,28.62,SP,SP,8.72,stationery,credit_card,False


In [4]:
# Print each column's name, non-null count and dtype:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 109933 entries, 0 to 109932
Data columns (total 29 columns):
 #   Column                                          Non-Null Count   Dtype  
---  ------                                          --------------   -----  
 0   purchase_to_approve_hrs                         109933 non-null  float64
 1   approve_to_estimated_days                       109933 non-null  int64  
 2   approve_to_shipping_limit_days                  109933 non-null  int64  
 3   purchase_hour                                   109933 non-null  int64  
 4   purchase_dow                                    109933 non-null  int64  
 5   purchase_month                                  109933 non-null  int64  
 6   is_weekend                                      109933 non-null  int64  
 7   is_brazil_holiday                               109933 non-null  int64  
 8   distance_km                                     109933 non-null  float64
 9   same_state                

In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns:
df.describe()

Unnamed: 0,purchase_to_approve_hrs,approve_to_estimated_days,approve_to_shipping_limit_days,purchase_hour,purchase_dow,purchase_month,is_weekend,is_brazil_holiday,distance_km,same_state,...,seller_30d_late_rate_is_dispatch_late_smoothed,seller_30d_order_count,seller_90d_late_rate_is_dispatch_late_raw,seller_90d_late_rate_is_dispatch_late_smoothed,seller_90d_order_count,total_order_lifetime,sum_freight_value,price,total_payment_value,freight_value
count,109933.0,109933.0,109933.0,109933.0,109933.0,109933.0,109933.0,109933.0,109933.0,109933.0,...,109933.0,109933.0,109933.0,109933.0,109933.0,109933.0,109933.0,109933.0,109933.0,109933.0
mean,10.533222,22.912974,5.985355,14.751258,2.746537,6.030701,0.227302,0.027699,596.771304,0.36182,...,0.123733,35.30988,0.044501,0.095948,90.02809,422.691303,27.248761,120.044161,179.50991,19.949499
std,21.000989,8.832961,5.467422,5.319016,1.963767,3.23358,0.419091,0.164109,587.318831,0.480529,...,0.124669,49.067543,0.115396,0.114392,127.36208,556.063689,33.270072,182.449223,271.586656,15.701392
min,0.0,-7.0,-7.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.002604,1.0,0.0,0.001502,1.0,1.0,0.0,0.85,9.59,0.0
25%,0.216389,17.0,4.0,11.0,1.0,3.0,0.0,0.0,185.951083,0.0,...,0.037037,6.0,0.0,0.023158,12.0,57.0,14.29,39.9,65.55,13.08
50%,0.350556,22.0,6.0,15.0,3.0,6.0,0.0,0.0,433.348916,0.0,...,0.083333,15.0,0.0,0.054054,35.0,171.0,18.16,74.9,114.34,16.26
75%,15.194722,28.0,6.0,19.0,4.0,8.0,0.0,0.0,793.859072,1.0,...,0.166667,42.0,0.035714,0.125,105.0,521.0,29.17,134.5,194.91,21.15
max,741.443611,153.0,1051.0,23.0,6.0,12.0,1.0,1.0,3398.54822,1.0,...,0.916667,382.0,1.0,0.923077,699.0,1992.0,1794.96,6735.0,13664.08,409.68


In [6]:
# Summary statistics for every column; non-numeric columns get unique/top/freq instead of mean/std/etc.:
df.describe(include='all')

Unnamed: 0,purchase_to_approve_hrs,approve_to_estimated_days,approve_to_shipping_limit_days,purchase_hour,purchase_dow,purchase_month,is_weekend,is_brazil_holiday,distance_km,same_state,...,total_order_lifetime,sum_freight_value,price,total_payment_value,customer_state,seller_state,freight_value,product_category_name_english,payment_types,is_late
count,109933.0,109933.0,109933.0,109933.0,109933.0,109933.0,109933.0,109933.0,109933.0,109933.0,...,109933.0,109933.0,109933.0,109933.0,109933,109933,109933.0,109933,109933,109933
unique,,,,,,,,,,,...,,,,,27,22,,74,6,2
top,,,,,,,,,,,...,,,,,SP,SP,,bed_bath_table,credit_card,False
freq,,,,,,,,,,,...,,,,,46345,78416,,10929,81901,101225
mean,10.533222,22.912974,5.985355,14.751258,2.746537,6.030701,0.227302,0.027699,596.771304,0.36182,...,422.691303,27.248761,120.044161,179.50991,,,19.949499,,,
std,21.000989,8.832961,5.467422,5.319016,1.963767,3.23358,0.419091,0.164109,587.318831,0.480529,...,556.063689,33.270072,182.449223,271.586656,,,15.701392,,,
min,0.0,-7.0,-7.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.85,9.59,,,0.0,,,
25%,0.216389,17.0,4.0,11.0,1.0,3.0,0.0,0.0,185.951083,0.0,...,57.0,14.29,39.9,65.55,,,13.08,,,
50%,0.350556,22.0,6.0,15.0,3.0,6.0,0.0,0.0,433.348916,0.0,...,171.0,18.16,74.9,114.34,,,16.26,,,
75%,15.194722,28.0,6.0,19.0,4.0,8.0,0.0,0.0,793.859072,1.0,...,521.0,29.17,134.5,194.91,,,21.15,,,
