## Amazon Delivery Time Prediction - Feature Engineering

In [31]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

In [77]:
# Read the output of the cleaning step into the working frame `df`.
cleaned_csv = 'cleaned_data.csv'
df = pd.read_csv(filepath_or_buffer=cleaned_csv)

In [79]:
# Preview the first five rows to sanity-check the load.
df.head(n=5)

Unnamed: 0,Order_ID,Agent_Age,Agent_Rating,Store_Latitude,Store_Longitude,Drop_Latitude,Drop_Longitude,Order_Date,Order_Time,Pickup_Time,...,Pickup_Hour,Pickup_Minute,Order_Day,Order_Month,Weather_Encoded,Traffic_Encoded,Vehicle_Encoded,Area_Encoded,Category_Encoded,Order_Day_Encoded
0,ialx566343618,37,4.9,22.745049,75.892471,22.765049,75.912471,2022-03-19,11:30:00,11:45:00,...,11,45,Saturday,3,4,0,1,3,2,2
1,akqg208421122,34,4.5,12.913041,77.683237,13.043041,77.813237,2022-03-25,19:45:00,19:50:00,...,19,50,Friday,3,3,1,2,0,4,0
2,njpu434582536,23,4.4,12.914264,77.6784,12.924264,77.6884,2022-03-19,08:30:00,08:45:00,...,8,45,Saturday,3,2,2,1,3,14,2
3,rjto796129700,38,4.7,11.003669,76.976494,11.053669,77.026494,2022-04-05,18:00:00,18:10:00,...,18,10,Tuesday,4,4,3,1,0,3,5
4,zguw716275638,32,4.6,12.972793,80.249982,13.012793,80.289982,2022-03-26,13:30:00,13:45:00,...,13,45,Saturday,3,0,0,2,0,15,2


In [81]:
# Standardising scaler used further down on the numerical columns.
# The arguments are spelled out for readability; they are the library defaults.
scaler = StandardScaler(copy=True, with_mean=True, with_std=True)

In [83]:
# Minutes elapsed between the order being placed and the pickup.
#
# Both columns are parsed with '%H:%M:%S', i.e. they carry a time of day but no
# date. A plain subtraction therefore goes hugely negative whenever the pickup
# falls after midnight (order 23:55, pickup 00:05 -> -1430 instead of 10).
order_clock = pd.to_datetime(df['Order_Time'], format='%H:%M:%S')
pickup_clock = pd.to_datetime(df['Pickup_Time'], format='%H:%M:%S')
raw_gap_minutes = (pickup_clock - order_clock).dt.total_seconds() / 60

# Wrap the gap onto a 24-hour clock so midnight crossings come out positive.
# Non-negative gaps are left unchanged and NaN stays NaN.
# NOTE(review): assumes every pickup happens within 24 hours of its order - confirm.
MINUTES_PER_DAY = 24 * 60
df['Order_to_Pickup_Minutes'] = raw_gap_minutes % MINUTES_PER_DAY

In [85]:
# Binary indicator columns (1 = match, 0 = no match).
weekend_days = ['Saturday', 'Sunday']
peak_hours = [8, 9, 12, 13, 17, 18, 19]  # order hours treated as peak

# Flag orders whose weekday name is a weekend day.
weekend_mask = df['Order_Day'].isin(weekend_days)
df['Is_Weekend'] = weekend_mask.astype(int)

# Flag orders placed during one of the peak hours listed above.
peak_mask = df['Order_Hour'].isin(peak_hours)
df['Is_Peak_Hour'] = peak_mask.astype(int)

In [87]:
# Bucket Distance into equal-frequency bins, one per label (four labels -> quartiles).
# This runs on the unscaled Distance values; the scaling cell comes later.
distance_labels = ['Short', 'Medium', 'Long', 'Very Long']
distance_bins = pd.qcut(df['Distance'], q=len(distance_labels), labels=distance_labels)
df['Distance_Category'] = distance_bins

In [89]:
# Standardise the continuous columns with the scaler created earlier.
# The scaled values overwrite the raw columns in `df`, so the original units
# are no longer available after this cell.
# NOTE(review): the scaler is fitted on the whole dataset; if a train/test split
# happens downstream this leaks test-set statistics - confirm and consider
# fitting on the training portion only.
numerical_cols = ['Distance', 'Order_to_Pickup_Minutes', 'Agent_Age', 'Agent_Rating']
scaled_values = scaler.fit_transform(df[numerical_cols])
df[numerical_cols] = scaled_values

In [91]:
# Count missing values per column after feature creation.
df.isna().sum()

Order_ID                   0
Agent_Age                  0
Agent_Rating               0
Store_Latitude             0
Store_Longitude            0
Drop_Latitude              0
Drop_Longitude             0
Order_Date                 0
Order_Time                 0
Pickup_Time                0
Weather                    0
Traffic                    0
Vehicle                    0
Area                       0
Delivery_Time              0
Category                   0
Distance                   0
Order_Hour                 0
Order_Minute               0
Pickup_Hour                0
Pickup_Minute              0
Order_Day                  0
Order_Month                0
Weather_Encoded            0
Traffic_Encoded            0
Vehicle_Encoded            0
Area_Encoded               0
Category_Encoded           0
Order_Day_Encoded          0
Order_to_Pickup_Minutes    0
Is_Weekend                 0
Is_Peak_Hour               0
Distance_Category          0
dtype: int64

In [93]:
# Write the engineered frame to disk without the row index, then confirm.
featured_csv = 'featured_data.csv'
df.to_csv(featured_csv, index=False)
print("\nFeatured data saved to 'featured_data.csv'")


Featured data saved to 'featured_data.csv'


In [95]:
# List the columns this notebook added on top of the cleaned dataset.
# nrows=0 parses only the header row, so the column names are obtained without
# loading the whole CSV a second time.
original_columns = set(pd.read_csv('cleaned_data.csv', nrows=0).columns)
new_features = set(df.columns) - original_columns
print("\nNew features created:")
# Sorted so the report is stable from run to run (set order is arbitrary).
for feature in sorted(new_features):
    print(f"- {feature}")


New features created:
- Distance_Category
- Is_Peak_Hour
- Is_Weekend
- Order_to_Pickup_Minutes
