In [10]:
# Import libraries
import pandas as pd
import numpy as np

In [12]:
# Read the processed train and test tables from disk
TRAIN_PATH = "../data/processed/train.parquet"
TEST_PATH = "../data/processed/test.parquet"

train = pd.read_parquet(TRAIN_PATH)
test = pd.read_parquet(TEST_PATH)

In [13]:
# Sanity check: report the (rows, columns) of each split, then preview train
for label, frame in (("Train shape:", train), ("Test shape:", test)):
    print(label, frame.shape)
print(train.head())

Train shape: (91259, 7)
Test shape: (22815, 7)
                           order_id order_purchase_timestamp  delivery_delay  \
0  c66d11687a807ceb590246a11aa1bc48      2017-07-12 10:10:23           -13.0   
1  8aad27637248090b933a0697be57c443      2018-03-15 15:54:57           -12.0   
2  ca8ee1fce14344746b41f8e2d2c1f4bf      2018-03-30 14:48:28           -10.0   
3  fdc0c4c2efc395cf5b80b60f573097cd      2017-11-08 21:35:07            -5.0   
4  867effa42bd3592c4dbada5a0ebc42bc      2017-06-11 20:27:59           -18.0   

   delivery_time  payment_value  product_category_name  is_returned  
0            7.0         140.08                     57            0  
1           13.0         250.74                     32            0  
2           10.0          70.27                     66            0  
3           14.0         133.76                     11            0  
4            7.0         354.08                     12            0  


# Feature Engineering

In [14]:
# 1. Derive the hour of day (0-23) at which each order was purchased.
# The same transformation is applied to both splits so their columns stay aligned.
for frame in (train, test):
    purchase_ts = pd.to_datetime(frame['order_purchase_timestamp'])
    frame['order_hour'] = purchase_ts.dt.hour

In [None]:
# 2. Price per item (disabled example: requires an 'order_item_id' column, which the processed data loaded above does not contain)
# Note: the formula below divides 'payment_value' by 'order_item_id'; no separate 'price' column is used.
# Keep it commented out unless item-level information has been merged in from the raw data.
# train['price_per_item'] = train['payment_value'] / train['order_item_id']
# test['price_per_item'] = test['payment_value'] / test['order_item_id']

In [15]:
# 3. Additional features (example)
# Binary indicator: 1 when delivery_delay is strictly positive, otherwise 0.
# NOTE(review): a missing delivery_delay compares False and is therefore
# flagged 0 -- confirm the processed data contains no NaN delays.
for frame in (train, test):
    is_late = frame['delivery_delay'].gt(0)
    frame['late_delivery_flag'] = is_late.astype(int)

In [None]:
# 4. Optional: standardize numeric features (disabled example; the scaler is fit on train only and then applied to test)
# from sklearn.preprocessing import StandardScaler
# scaler = StandardScaler()
# numeric_features = ['delivery_delay', 'delivery_time', 'payment_value', 'order_hour']
# train[numeric_features] = scaler.fit_transform(train[numeric_features])
# test[numeric_features] = scaler.transform(test[numeric_features])

In [17]:
# 5. Split each frame into model inputs (X) and the label (y)
target = 'is_returned'

# Columns excluded from the inputs: the label itself, the order identifier
# and the purchase timestamp.
non_feature_cols = [target, 'order_id', 'order_purchase_timestamp']
features = train.columns.drop(non_feature_cols)

# The feature list is derived from train and reused as-is for test.
X_train, y_train = train[features], train[target]
X_test, y_test = test[features], test[target]

print("Features used for training:", features, sep="\n")
print("\nSample of features data:", X_train.head(), sep="\n")

Features used for training:
Index(['delivery_delay', 'delivery_time', 'payment_value',
       'product_category_name', 'order_hour', 'late_delivery_flag'],
      dtype='object')

Sample of features data:
   delivery_delay  delivery_time  payment_value  product_category_name  \
0           -13.0            7.0         140.08                     57   
1           -12.0           13.0         250.74                     32   
2           -10.0           10.0          70.27                     66   
3            -5.0           14.0         133.76                     11   
4           -18.0            7.0         354.08                     12   

   order_hour  late_delivery_flag  
0          10                   0  
1          15                   0  
2          14                   0  
3          21                   0  
4          20                   0  


In [20]:
# Persist the feature matrices and labels for the model-training step.
# Labels are Series, so they are wrapped in a one-column frame before writing.
artifacts = [
    (X_train, "../data/processed/X_train.parquet"),
    (y_train.to_frame(), "../data/processed/y_train.parquet"),
    (X_test, "../data/processed/X_test.parquet"),
    (y_test.to_frame(), "../data/processed/y_test.parquet"),
]
for table, destination in artifacts:
    # index=False: the row index is not written to the file
    table.to_parquet(destination, index=False)

print("✅ Feature engineering complete and data saved.")

✅ Feature engineering complete and data saved.
