In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from geopy.distance import geodesic

In [2]:
# Load the raw delivery records.
# The path uses "/" because a backslash is only treated as a separator on
# Windows; a forward slash resolves correctly on Windows, Linux and macOS.
data = pd.read_csv("data/delivery_data.csv")

In [3]:
# Keep only the rows that have a value in every column
data = data.dropna(axis=0, how='any')


In [4]:
# Discard rows that are exact repeats of an earlier row (first occurrence is kept)
data = data.drop_duplicates(keep='first')

In [5]:
# Distance between the pick-up (store) location and the delivery location
def calculate_distance(row):
    """Return the geodesic distance in kilometres from store to drop point.

    ``row`` must support key lookup for 'Store_Latitude', 'Store_Longitude',
    'Drop_Latitude' and 'Drop_Longitude'. Each location is handed to
    ``geodesic`` as a (latitude, longitude) tuple.
    """
    pickup_point = row['Store_Latitude'], row['Store_Longitude']
    dropoff_point = row['Drop_Latitude'], row['Drop_Longitude']
    trip = geodesic(pickup_point, dropoff_point)
    return trip.kilometers

# Evaluate calculate_distance once per order (axis=1 passes each row) and
# attach the result as the Distance_km column
data = data.assign(Distance_km=data.apply(calculate_distance, axis=1))

In [6]:
# Convert the order date to a datetime column
data['Order_Date'] = pd.to_datetime(data['Order_Date'])

# Derive calendar features from the parsed order date
order_dt = data['Order_Date'].dt  # datetime accessor shared by every feature below
data = data.assign(
    Order_Year=order_dt.year,
    Order_Month=order_dt.month,
    Order_Day=order_dt.day_of_year,    # day of the year, not day of the month
    Week_day=order_dt.day_of_week,     # numeric day of the week
    Weekday_Name=order_dt.day_name(),  # day of the week as text
)

In [7]:
# Drop unnecessary columns: the order identifier, plus the raw coordinates and
# raw order date that the distance and calendar features were derived from
columns_to_drop = [
    'Order_ID',
    'Store_Latitude',
    'Store_Longitude',
    'Drop_Latitude',
    'Drop_Longitude',
    'Order_Date',
]
data = data.drop(columns=columns_to_drop)

In [8]:
#data.to_csv("data_cleaned.csv")

In [9]:
# Parse the HH:MM:SS pickup strings. Only a time is supplied, so the date part
# of the parsed values is a placeholder; it is replaced with 2023-01-01 exactly
# as before, but with vectorised arithmetic instead of a per-row
# Timestamp.replace call.
pickup_parsed = pd.to_datetime(data['Pickup_Time'], format='%H:%M:%S')
time_of_day = pickup_parsed - pickup_parsed.dt.normalize()  # offset from midnight
data['Pickup_Time'] = pd.Timestamp(year=2023, month=1, day=1) + time_of_day

# Pickup time expressed as whole minutes since midnight (seconds are not counted)
data['Pickup_Time_minutes'] = data['Pickup_Time'].dt.hour * 60 + data['Pickup_Time'].dt.minute

### Exploratory Data Analysis

In [10]:
# Summary statistics of the frame; displayed as this cell's output.
# NOTE(review): in the recorded output Distance_km has a maximum far above its
# 75th percentile — worth checking for outlying coordinates before modelling.
data.describe()

Unnamed: 0,Agent_Age,Agent_Rating,Pickup_Time,Delivery_Time,Distance_km,Order_Year,Order_Month,Order_Day,Week_day,Pickup_Time_minutes
count,43594.0,43594.0,43594,43594.0,43594.0,43594.0,43594.0,43594.0,43594.0,43594.0
mean,29.555719,4.635287,2023-01-01 17:37:46.282056960,124.916433,26.65442,2022.0,2.981052,72.700326,3.000092,1057.771368
min,20.0,2.5,2023-01-01 00:00:00,10.0,1.463837,2022.0,2.0,42.0,0.0,0.0
25%,25.0,4.5,2023-01-01 14:35:00,90.0,4.654125,2022.0,3.0,63.0,1.0,875.0
50%,30.0,4.7,2023-01-01 19:10:00,125.0,9.204106,2022.0,3.0,74.0,3.0,1150.0
75%,35.0,4.9,2023-01-01 21:35:00,160.0,13.660855,2022.0,3.0,86.0,5.0,1295.0
max,39.0,5.0,2023-01-01 23:55:00,270.0,6852.617172,2022.0,4.0,96.0,6.0,1435.0
std,5.760689,0.313827,,51.941975,298.155434,0.0,0.545886,15.463604,1.969743,321.520109


### Data Preprocessing

In [11]:
# Remove the raw time columns and the calendar helper columns that are not
# carried forward
data = data.drop(
    ['Order_Time', 'Pickup_Time', 'Weekday_Name', 'Order_Day', 'Week_day'],
    axis='columns',
)


In [12]:
# Encode categorical columns using LabelEncoder
from sklearn.preprocessing import LabelEncoder

# Columns whose values are replaced by integer labels
categorical_columns = ['Weather', 'Traffic', 'Vehicle', 'Area', 'Category']

# A separate encoder is fitted for every column and kept in `encoders`.
# Refitting one shared encoder would discard each earlier column's mapping,
# making it impossible to translate those codes back to the original labels
# (e.g. encoders['Weather'].inverse_transform(...)).
encoders = {}
for column in categorical_columns:
    encoder = LabelEncoder()
    data[column] = encoder.fit_transform(data[column])
    encoders[column] = encoder

# After the loop `encoder` refers to the 'Category' encoder, the same state the
# name had when a single encoder was refitted column by column.
# NOTE(review): integer labels imply an ordering between categories — confirm
# this is acceptable for the downstream model.



In [13]:
# Standardise the continuous feature columns
from sklearn.preprocessing import StandardScaler

# Columns that are rescaled; all other columns are left untouched
columns_to_scale = [
    'Agent_Age',
    'Agent_Rating',
    'Distance_km',
    'Pickup_Time_minutes',
]

# Fit the scaler on these columns and overwrite them with the scaled values
scaler = StandardScaler()
scaled_values = scaler.fit_transform(data[columns_to_scale])
data[columns_to_scale] = scaled_values


In [14]:
# save scaled data

#data.to_csv("scaled_data.csv")