In [None]:
import pandas as pd
import numpy as np

# Read the master dataset CSV from the current working directory
df = pd.read_csv('master_olist_dataset.csv')

# Parse every timestamp column used below. With errors='coerce', values that
# cannot be parsed become NaT instead of raising.
for ts_col in (
    'order_delivered_customer_date',
    'order_estimated_delivery_date',
    'order_approved_at',
):
    df[ts_col] = pd.to_datetime(df[ts_col], errors='coerce')

delivered_at = df['order_delivered_customer_date']
estimated_at = df['order_estimated_delivery_date']
approved_at = df['order_approved_at']

# 1. Target: 1 when the delivery timestamp is strictly after the estimate.
# NOTE: a comparison involving NaT evaluates to False, so rows with a missing
# delivery or estimate date end up labelled 0 here.
df['is_late'] = delivered_at.gt(estimated_at).astype(int)

# 2. Calendar features taken from the approval timestamp
# (dayofweek: Monday=0 ... Sunday=6)
df['approval_dayofweek'] = approved_at.dt.dayofweek
df['approval_hour'] = approved_at.dt.hour
df['approval_month'] = approved_at.dt.month

# 3. Whole days between approval and the estimated delivery date
estimate_gap = estimated_at - approved_at
df['days_approval_to_estimate'] = estimate_gap.dt.days

# 4. Whole days between approval and the actual delivery.
# This is only known after the outcome (leakage), so keep it for exploration
# and out of the model features.
delivery_gap = delivered_at - approved_at
df['days_approval_to_delivered'] = delivery_gap.dt.days

# 5. Encode Categorical
# The state match has to be evaluated on the raw values BEFORE encoding:
# once seller_state is replaced by integer category codes, comparing it with
# the unencoded customer_state no longer compares like with like, and
# same_state would come out as 0 for (practically) every row.
same_state_mask = df['customer_state'] == df['seller_state']

for col in ['seller_state', 'product_category_name_english', 'payment_type']:
    # .cat.codes assigns one integer per distinct value; missing values get -1
    df[col] = df[col].astype('category').cat.codes

# 6. Fill missing seller performance if available
if 'seller_90d_performance' in df.columns:
    # Impute gaps with the column mean computed over the whole frame
    df['seller_90d_performance'] = df['seller_90d_performance'].fillna(df['seller_90d_performance'].mean())
else:
    # placeholder, since the column may be from another table
    df['seller_90d_performance'] = 0

# 7. Distance features (if coordinates available, otherwise use states)
# You may need to merge latitude/longitude from customer and seller tables for actual distance.
# 1 when customer and seller share a state; rows where either state is
# missing compare as not equal and therefore get 0.
df['same_state'] = same_state_mask.astype(int)

# 8. Order Value: item price plus freight
df['order_value'] = df['price'] + df['freight_value']

# 9. Assemble the model inputs
# Feature names are grouped by theme; the concatenation order below fixes the
# column order of X. days_approval_to_delivered is deliberately not listed,
# because it is only known once the order has been delivered.
timing_features = [
    'approval_dayofweek',
    'approval_hour',
    'approval_month',
    'days_approval_to_estimate',
]
value_features = ['freight_value', 'order_value']
seller_features = ['seller_90d_performance']
encoded_features = [
    'seller_state',
    'product_category_name_english',
    'payment_type',
]
location_features = ['same_state']

# Extend the groups above (or add a new one) to include more features
features = (
    timing_features
    + value_features
    + seller_features
    + encoded_features
    + location_features
)

X = df[features]
y = df['is_late']

# X (feature matrix) and y (binary target) can now go to a train-test split and modeling.
