Objective: Create meaningful new features based on the available data.

In [1]:
import pandas as pd

# Read the cleaned orders table; the order-date column is handed to
# parse_dates so pandas attempts to parse it as datetimes at load time.
cleaned_orders_path = '../data/processed/cleaned_orders.csv'
date_columns = ['order_date_dateorders']
df = pd.read_csv(cleaned_orders_path, parse_dates=date_columns)

In [2]:
# Feature engineering: derive delay, margin, calendar and order-quality columns.
# Every new column is computed from the input columns only and added through a
# single assign(), so re-running this cell recomputes the same values.

# Denominator for the margin ratio. A row with zero sales has no defined
# margin, so zeros are masked to NaN instead of being swapped for an arbitrary
# divisor (dividing by 1 would just report the raw benefit as the "ratio").
sales = df['sales_per_customer']
nonzero_sales = sales.where(sales.ne(0))

order_dates = df['order_date_dateorders']

df = df.assign(
    # 1. Shipping delay (used instead of delivery delay): actual minus
    #    scheduled shipping time. Negative = early, 0 = on time, positive = late.
    shipping_delay=df['days_for_shipping_real'] - df['days_for_shipment_scheduled'],
    # 2. Profit margin ratio: benefit per order relative to sales per customer.
    #    NaN where sales are zero (see denominator note above).
    profit_margin_ratio=df['benefit_per_order'] / nonzero_sales,
    # 3. Time-based features extracted from the order date.
    order_year=order_dates.dt.year,
    order_month=order_dates.dt.month,
    order_weekday=order_dates.dt.dayofweek,  # pandas convention: Monday=0 ... Sunday=6
    # 4. A perfect order is on time (late_delivery_risk == 0) and
    #    profitable (benefit_per_order > 0); stored as 0/1.
    is_perfect_order=(
        (df['late_delivery_risk'] == 0) & (df['benefit_per_order'] > 0)
    ).astype(int),
)


In [3]:
# Write the feature-engineered frame to CSV without the index column.
# Note: the confirmation message below prints the path without the leading '../'.
final_features_path = '../data/processed/final_features.csv'
df.to_csv(final_features_path, index=False)
print("Feature-engineered data saved successfully to 'data/processed/final_features.csv'")

Feature-engineered data saved successfully to 'data/processed/final_features.csv'
