## Amazon Delivery Time Prediction - Data Preparation

In [119]:
# Import necessary libraries
import pandas as pd
import numpy as np
from datetime import datetime
from geopy.distance import geodesic

### Load and preprocess the dataset

In [121]:
# Read the raw delivery records into `df` and preview the first five rows
df = pd.read_csv('amazon_delivery.csv')
df.head(n=5)

Unnamed: 0,Order_ID,Agent_Age,Agent_Rating,Store_Latitude,Store_Longitude,Drop_Latitude,Drop_Longitude,Order_Date,Order_Time,Pickup_Time,Weather,Traffic,Vehicle,Area,Delivery_Time,Category
0,ialx566343618,37,4.9,22.745049,75.892471,22.765049,75.912471,2022-03-19,11:30:00,11:45:00,Sunny,High,motorcycle,Urban,120,Clothing
1,akqg208421122,34,4.5,12.913041,77.683237,13.043041,77.813237,2022-03-25,19:45:00,19:50:00,Stormy,Jam,scooter,Metropolitian,165,Electronics
2,njpu434582536,23,4.4,12.914264,77.6784,12.924264,77.6884,2022-03-19,08:30:00,08:45:00,Sandstorms,Low,motorcycle,Urban,130,Sports
3,rjto796129700,38,4.7,11.003669,76.976494,11.053669,77.026494,2022-04-05,18:00:00,18:10:00,Sunny,Medium,motorcycle,Metropolitian,105,Cosmetics
4,zguw716275638,32,4.6,12.972793,80.249982,13.012793,80.289982,2022-03-26,13:30:00,13:45:00,Cloudy,High,scooter,Metropolitian,150,Toys


In [123]:
# Dataset dimensions as a (rows, columns) tuple
df.shape

(43739, 16)

In [125]:
# Count missing entries per column (isna is the canonical name; isnull is its alias)
df.isna().sum()

Order_ID            0
Agent_Age           0
Agent_Rating       54
Store_Latitude      0
Store_Longitude     0
Drop_Latitude       0
Drop_Longitude      0
Order_Date          0
Order_Time          0
Pickup_Time         0
Weather            91
Traffic             0
Vehicle             0
Area                0
Delivery_Time       0
Category            0
dtype: int64

In [127]:
# Parse the order date, then reduce both clock columns to datetime.time values.
# errors='coerce' turns any value that does not match the given format into NaT
# instead of raising.
df['Order_Date'] = pd.to_datetime(df['Order_Date'], format='%Y-%m-%d', errors='coerce')
for time_col in ('Order_Time', 'Pickup_Time'):
    parsed_times = pd.to_datetime(df[time_col], format='%H:%M:%S', errors='coerce')
    df[time_col] = parsed_times.dt.time

In [129]:
# Cast both clock columns to their string representation
for time_col in ('Order_Time', 'Pickup_Time'):
    df[time_col] = df[time_col].astype(str)

In [131]:
# Inspect the column dtypes after the date/time conversions above
df.dtypes

Order_ID                   object
Agent_Age                   int64
Agent_Rating              float64
Store_Latitude            float64
Store_Longitude           float64
Drop_Latitude             float64
Drop_Longitude            float64
Order_Date         datetime64[ns]
Order_Time                 object
Pickup_Time                object
Weather                    object
Traffic                    object
Vehicle                    object
Area                       object
Delivery_Time               int64
Category                   object
dtype: object

### Feature engineering

In [133]:
# Calculate the geodesic distance (in kilometres, via geopy) between each
# store and its drop location, and store it in df['Distance'].
coord_cols = ['Store_Latitude', 'Store_Longitude', 'Drop_Latitude', 'Drop_Longitude']

# A row is usable only when all four coordinates are present. geodesic() is
# never called on rows with a missing coordinate.
has_all_coords = df[coord_cols].notna().all(axis=1)

# Rows with a missing coordinate get NaN rather than 0: a distance of 0 would be
# indistinguishable from "store and drop are at the same place".
# Iterating the columns with zip avoids building a Series per row (iterrows).
# NOTE(review): no range or plausibility check is applied to the coordinates here.
distances = [
    geodesic((store_lat, store_lon), (drop_lat, drop_lon)).kilometers if usable else np.nan
    for store_lat, store_lon, drop_lat, drop_lon, usable in zip(
        df['Store_Latitude'], df['Store_Longitude'],
        df['Drop_Latitude'], df['Drop_Longitude'],
        has_all_coords,
    )
]

df['Distance'] = distances

In [135]:
# Extract time features
def _hour_minute_lists(times):
    """Split a column of 'HH:MM:SS' values into hour and minute lists.

    Parameters
    ----------
    times : pd.Series
        Clock values; each element is stringified before parsing, so both
        strings and datetime.time objects are accepted.

    Returns
    -------
    (list of int, list of int)
        Hours and minutes, one entry per input row, in input order.
        Any value that does not parse as '%H:%M:%S' (missing values and
        tokens such as 'NaT', 'NaN' or 'None') yields 0 for both.
    """
    parsed = pd.to_datetime(times.astype(str), format='%H:%M:%S', errors='coerce')
    # NOTE(review): the 0 fallback makes an unparseable time look like 00:00;
    # kept so the generated columns stay integer-valued as before.
    hours = parsed.dt.hour.fillna(0).astype(int).tolist()
    minutes = parsed.dt.minute.fillna(0).astype(int).tolist()
    return hours, minutes


# Same extraction for both clock columns; results stay plain lists so they can
# be assigned to the dataframe positionally.
order_hours, order_minutes = _hour_minute_lists(df['Order_Time'])
pickup_hours, pickup_minutes = _hour_minute_lists(df['Pickup_Time'])

In [137]:
# Attach the extracted hour/minute values as new columns, in this order
time_feature_values = {
    'Order_Hour': order_hours,
    'Order_Minute': order_minutes,
    'Pickup_Hour': pickup_hours,
    'Pickup_Minute': pickup_minutes,
}
for feature_name, feature_values in time_feature_values.items():
    df[feature_name] = feature_values

In [139]:
# Derive calendar features from the order date:
# Order_Day is the weekday name, Order_Month the month number
order_date = df['Order_Date']
df['Order_Day'] = order_date.dt.day_name()
df['Order_Month'] = order_date.dt.month

In [141]:
# Write the engineered dataset to disk without the row index, then confirm on stdout
df.to_csv(path_or_buf='prepared_data.csv', index=False)
print("\nPrepared data saved to 'prepared_data.csv'")


Prepared data saved to 'prepared_data.csv'
