In [1]:
import os
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import *
from pyspark.ml.feature import VectorAssembler, StandardScaler, PCA, OneHotEncoder, StringIndexer
import pandas as pd
import numpy as np
from pyspark.ml.clustering import KMeans
import matplotlib.pyplot as plt
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark.sql.types import ArrayType, DoubleType
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import mutual_info_regression

### Read the delivery data

In [2]:
postgresql_jdbc_jar = r"C:/Program Files/PostgreSQL/17/postgresql-42.7.4.jar"
spark = SparkSession.builder.appName('FeatureEnginering')\
                            .config("spark.jars", postgresql_jdbc_jar) \
                            .config("spark.driver.extraClassPath", postgresql_jdbc_jar) \
                            .config("spark.driver.memory", "12g")\
                            .config("spark.executor.memory", "12g")\
                            .config("spark.executor.cores", "4")\
                            .getOrCreate()
                            
# delivery_file = r"C:\Users\Dusty\Downloads\Internship\Last-Mile-Delivery-Delays-and-Route-Optimization\data\cleaned_delivery_data.parquet"
# df_delivery = spark.read.parquet(delivery_file,inferSchema=True)

In [3]:
# Database connection parameters
url = "jdbc:postgresql://localhost:5432/postgres"
properties = {
    "user": "postgres",
    "password": "root",
    "driver": "org.postgresql.Driver"
}

# Query to filter cities Hangzhou and Shanghai
query = "(SELECT * FROM delivery_data WHERE city IN ('Chongqing', 'Shanghai', 'Yantai', 'Jilin')) AS filtered_data"

# Load the data into a PySpark DataFrame
df_delivery = spark.read.jdbc(url=url, table=query, properties=properties)

# Show the first few rows
row_count = df_delivery.count()
print(f"Number of rows in the DataFrame: {row_count}")

Number of rows in the DataFrame: 2653061


### Time-Based Features

In [4]:
# Extracting hour, day of the week, and month from the `accept_time` column
df_delivery = df_delivery.withColumn("hour_of_day", F.hour(df_delivery["accept_time"])) \
       .withColumn("day_of_week", F.dayofweek(df_delivery["accept_time"])) \
       .withColumn("month", F.month(df_delivery["accept_time"]))

# Calculate time taken for delivery (in minutes) which is already calculated in delivery_eta_minutes

# Calculate the time difference between expected and actual delivery time in minutes
# expected_delivery_time = time_window_start
# actual delivery time = delivery_time
# A positive value means the delivery happened after the expected time (late delivery), 
# while a negative value means it happened earlier than expected (early delivery).
df_delivery = df_delivery.withColumn("delivery_time_delay",
                  (df_delivery["delivery_time"].cast("long") - df_delivery["accept_time"].cast("long")) / 60)

In [5]:
df_delivery.select("hour_of_day", "day_of_week", "month", "delivery_time_delay").show(5)

+-----------+-----------+-----+-------------------+
|hour_of_day|day_of_week|month|delivery_time_delay|
+-----------+-----------+-----+-------------------+
|         18|          4|    6|              124.0|
|         18|          4|    6|              113.0|
|         18|          4|    6|               83.0|
|         18|          4|    6|              107.0|
|         18|          4|    6|              203.0|
+-----------+-----------+-----+-------------------+
only showing top 5 rows



In [6]:
# Filter rows where delivery_time_difference is negative
df_negative_deliverys = df_delivery.filter(df_delivery["delivery_time_delay"] < 0)
df_negative_deliverys.select("order_id","city", "delivery_time_delay").show(5)

+--------+---------+-------------------+
|order_id|     city|delivery_time_delay|
+--------+---------+-------------------+
| 3212380|Chongqing|          -418858.0|
| 3739029|Chongqing|          -421357.0|
| 3956975|Chongqing|          -397678.0|
+--------+---------+-------------------+



### Geospatial Features

#### Calculate the distance between delivery and delivery locations using Haversine Formula.

In [7]:
# Define the Haversine function to calculate distance between two points on the Earth
def haversine(lat1, lon1, lat2, lon2):
    R = 6371  # Radius of the Earth in kilometers
    
    dlat = radians(lat2 - lat1)  # Difference in latitudes (radians)
    dlon = radians(lon2 - lon1)  # Difference in longitudes (radians)
    
    # Apply Haversine formula to calculate the "a" value
    a = sin(dlat / 2)**2 + cos(radians(lat1)) * cos(radians(lat2)) * sin(dlon / 2)**2
    # calculate the central angle between two points
    c = 2 * atan2(sqrt(a), sqrt(1 - a))
    
    # Compute the distance in KM using the radius of Earth (R) and the central angle (c)
    distance = R * c 
    return distance

# Apply the Haversine function to the DataFrame to calculate the distance in KM
df_delivery = df_delivery.withColumn("delivery_distance_km", 
                                 haversine(df_delivery["delivery_gps_lat"], df_delivery["delivery_gps_lng"],
                                           df_delivery["accept_gps_lat"], df_delivery["accept_gps_lng"]))
# Round the 'delivery_distance_km' column to 2 decimal places
df_delivery = df_delivery.withColumn("delivery_distance_km", round(df_delivery["delivery_distance_km"], 2))

In [8]:
df_delivery.select("order_id","delivery_gps_lat","delivery_gps_lng","accept_gps_lat","accept_gps_lng","delivery_distance_km").show(10)

+--------+----------------+----------------+--------------+--------------+--------------------+
|order_id|delivery_gps_lat|delivery_gps_lng|accept_gps_lat|accept_gps_lng|delivery_distance_km|
+--------+----------------+----------------+--------------+--------------+--------------------+
| 2276923|        31.26975|       121.68544|      31.26371|     121.67363|                1.31|
| 1122939|        31.26982|       121.68659|      31.26382|     121.67371|                1.39|
| 4105928|        31.27092|       121.68459|      31.26388|     121.67365|                 1.3|
| 2328097|        31.26993|        121.6886|      31.26377|     121.67371|                1.57|
|  599171|        31.25661|       121.70587|      31.26373|     121.67375|                3.15|
| 1579494|        31.00484|       121.26835|      31.00086|     121.28744|                1.87|
| 1291123|         31.2899|       121.42525|       31.2915|     121.42332|                0.26|
| 3563974|        31.19635|       121.62

#### Cluster regions using K-Means or DBSCAN for better route mapping.

In [9]:
# observing above regions and number of orders they have we can see here is a large variation in the number of rows (packages) across regions, 
# DBSCAN would be a more appropriate clustering algorithm for this scenario as It can handle regions where some areas are densely packed with 
# packages (e.g., urban centers) while others have fewer packages (e.g., rural areas).
# Also it doesn't require the number of clusters to be predefined and can naturally identify clusters of different shapes and sizes.

In [10]:
# Step 1: Select the relevant geographic columns (Longitude and Latitude)
feature_columns = ["delivery_gps_lng", "delivery_gps_lat"]

# Step 2: Vectorize the geographic features into a single vector column 'features'
vector_assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
df_vectorized = vector_assembler.transform(df_delivery)

# Step 4: Apply KMeans with a specified number of clusters (let's say 5)
kmeans = KMeans(k=25, seed=1, featuresCol="features", predictionCol="cluster")
model = kmeans.fit(df_vectorized)

# Step 5: Make predictions (assign clusters to the data)
predictions = model.transform(df_vectorized)

In [11]:
# Step 7: Show some of the prediction results (assigned cluster labels)
predictions.select("order_id", "city", "cluster").show()

+--------+--------+-------+
|order_id|    city|cluster|
+--------+--------+-------+
| 2276923|Shanghai|     18|
| 1122939|Shanghai|     18|
| 4105928|Shanghai|     18|
| 2328097|Shanghai|     18|
|  599171|Shanghai|     18|
| 1579494|Shanghai|     13|
| 1291123|Shanghai|     11|
| 3563974|Shanghai|     18|
|  290278|Shanghai|     18|
| 3701861|Shanghai|     13|
| 2345163|Shanghai|     18|
|  814767|Shanghai|     11|
| 2031720|Shanghai|     12|
| 2645003|Shanghai|     18|
| 4305322|Shanghai|     12|
| 3886147|Shanghai|     18|
| 4126477|Shanghai|      0|
|  729432|Shanghai|     18|
| 2734734|Shanghai|      0|
|   26720|Shanghai|      0|
+--------+--------+-------+
only showing top 20 rows



In [12]:
predictions = predictions.drop('features')

### Courier & Package Features

#### Compute the average delivery time per courier

In [13]:
# Group by courier_id and calculate the average delivery time delay (in minutes)
average_delivery_time_per_courier = predictions.groupBy("courier_id").agg(
    F.avg("delivery_time_delay").alias("avg_delivery_time_minutes")
)

# Show the result
predictions.show(5)
predictions = predictions.join(
    average_delivery_time_per_courier, on="courier_id", how="left")

+--------+---------+--------+----------+------+--------+-------------------+-------------------+---+---------+--------+----------------+----------------+--------------+--------------+--------------------+-----------+-----------+-----+-------------------+--------------------+-------+
|order_id|region_id|    city|courier_id|aoi_id|aoi_type|        accept_time|      delivery_time| ds|      lng|     lat|delivery_gps_lng|delivery_gps_lat|accept_gps_lng|accept_gps_lat|delivery_eta_minutes|hour_of_day|day_of_week|month|delivery_time_delay|delivery_distance_km|cluster|
+--------+---------+--------+----------+------+--------+-------------------+-------------------+---+---------+--------+----------------+----------------+--------------+--------------+--------------------+-----------+-----------+-----+-------------------+--------------------+-------+
| 2276923|       71|Shanghai|      2812| 48945|       1|2025-06-11 18:03:00|2025-06-11 20:07:00|611| 121.6854|31.26977|       121.68544|        31.2

#### Extract package delivery patterns (e.g., peak hours, busiest locations).

#### Peak hours

In [14]:
# Step 1: Group by hour_of_day and count the number of deliverys (orders)
delivery_by_hour = predictions.groupBy("hour_of_day").agg(
    F.count("order_id").alias("delivery_order_count")
)

# Step 2: Sort by the delivery count in descending order to find the peak hours
delivery_by_hour_sorted = delivery_by_hour.orderBy(F.col("delivery_order_count").desc())

# Step 3: Join the delivery count by hour to the predictions DataFrame based on hour_of_day
predictions = predictions.join(
    delivery_by_hour_sorted, on="hour_of_day", how="left"
)

# Show the result: predictions with delivery count by hour
predictions.select("order_id", "hour_of_day", "delivery_order_count").show(5)

+--------+-----------+--------------------+
|order_id|hour_of_day|delivery_order_count|
+--------+-----------+--------------------+
| 2276923|         18|               73495|
| 1122939|         18|               73495|
| 4105928|         18|               73495|
| 2328097|         18|               73495|
|  599171|         18|               73495|
+--------+-----------+--------------------+
only showing top 5 rows



#### Busiest locations

In [15]:
df_busiest_locations = predictions.groupBy("city") \
    .agg(F.count("order_id").alias("city_order_count")) \
    .orderBy(F.desc("city_order_count"))
df_busiest_locations.show(5)

+---------+----------------+
|     city|city_order_count|
+---------+----------------+
| Shanghai|         1483864|
|Chongqing|          931351|
|   Yantai|          206431|
|    Jilin|           31415|
+---------+----------------+



In [16]:
# Step 3: Join the delivery count by hour to the predictions DataFrame based on hour_of_day
predictions = predictions.join(
    df_busiest_locations, on="city", how="left"
)

In [17]:
predictions.select("order_id", "city", "city_order_count").show(5)

+--------+--------+----------------+
|order_id|    city|city_order_count|
+--------+--------+----------------+
| 1262776|Shanghai|         1483864|
| 3208077|Shanghai|         1483864|
| 2393578|Shanghai|         1483864|
| 3821309|Shanghai|         1483864|
| 2548119|Shanghai|         1483864|
+--------+--------+----------------+
only showing top 5 rows



### Anomaly Detection Features

#### Identify delays based on time threshold deviations

In [18]:
# Step 1: Define the threshold for delay
time_threshold = 360  # Set the threshold in minutes

# Step 2: Identify delays based on the delivery time delay
# We create a new column `is_delayed` that indicates if the delay is above the threshold
predictions = predictions.withColumn(
    "is_delayed", 
    F.when(F.col("delivery_time_delay") > time_threshold, "Delayed").otherwise("On Time")
)

# Show the resulting DataFrame with the delay identification
predictions.select(
    "order_id", 
    "city", 
    "delivery_time_delay", 
    "delivery_time", 
    "accept_time", 
    "is_delayed"
).show()

+--------+---------+-------------------+-------------------+-------------------+----------+
|order_id|     city|delivery_time_delay|      delivery_time|        accept_time|is_delayed|
+--------+---------+-------------------+-------------------+-------------------+----------+
|  600887|Chongqing|              200.0|2025-07-22 12:07:00|2025-07-22 08:47:00|   On Time|
| 4129878| Shanghai|              212.0|2025-07-22 12:19:00|2025-07-22 08:47:00|   On Time|
| 1413255| Shanghai|              128.0|2025-07-22 10:55:00|2025-07-22 08:47:00|   On Time|
| 2117651| Shanghai|              171.0|2025-07-22 11:38:00|2025-07-22 08:47:00|   On Time|
| 4470226|Chongqing|               93.0|2025-07-22 10:20:00|2025-07-22 08:47:00|   On Time|
|  741493| Shanghai|              683.0|2025-07-22 20:10:00|2025-07-22 08:47:00|   Delayed|
| 4095657| Shanghai|                0.0|2025-10-20 10:56:00|2025-10-20 10:56:00|   On Time|
| 3959622| Shanghai|              160.0|2025-10-20 13:35:00|2025-10-20 10:55:00|

#### Flag inconsistent geospatial entries (e.g., unrealistic speed between locations).

In [19]:
# Step 1: Convert delivery_time_delay from minutes to hours
predictions = predictions.withColumn(
    "time_diff_hours", 
    F.col("delivery_time_delay") / 60  # Convert minutes to hours
)

# Step 2: Calculate the speed in km/h using the existing delivery_distance_km column
predictions = predictions.withColumn(
    "speed_kmh", 
    F.when(F.col("time_diff_hours") > 0, F.floor(F.col("delivery_distance_km") / F.col("time_diff_hours"))).otherwise(0)
)

# Step 3: Flag entries with unrealistic speed
speed_threshold = 100  # Define a reasonable speed threshold (in km/h)
predictions = predictions.withColumn(
    "speed_status", 
    F.when(F.col("speed_kmh") > speed_threshold, "Unrealistic Speed").otherwise("Realistic Speed")
)

predictions = predictions.drop("time_diff_hours")

# Show the resulting DataFrame with the speed status column
predictions.select(
    "order_id", 
    "city", 
    "delivery_distance_km", 
    "delivery_time_delay",  # Using existing column for time delay in minutes
    "speed_kmh", 
    "speed_status"
).show()

+--------+---------+--------------------+-------------------+---------+---------------+
|order_id|     city|delivery_distance_km|delivery_time_delay|speed_kmh|   speed_status|
+--------+---------+--------------------+-------------------+---------+---------------+
|  600887|Chongqing|                3.55|              200.0|        1|Realistic Speed|
| 4129878| Shanghai|                3.83|              212.0|        1|Realistic Speed|
| 1413255| Shanghai|                3.54|              128.0|        1|Realistic Speed|
| 2117651| Shanghai|                1.43|              171.0|        0|Realistic Speed|
| 4470226|Chongqing|                3.91|               93.0|        2|Realistic Speed|
|  741493| Shanghai|                4.44|              683.0|        0|Realistic Speed|
| 4095657| Shanghai|                1.15|                0.0|        0|Realistic Speed|
| 3959622| Shanghai|                1.81|              160.0|        0|Realistic Speed|
| 2212906| Shanghai|            

### Dimensionality Reduction & Feature Selection

### Save Data

In [21]:
selected_columns = [
    "order_id",
    "delivery_time",
    "accept_time",
    "hour_of_day",
    "day_of_week",
    "month",
    "delivery_eta_minutes",
    "delivery_time_delay",
    "delivery_distance_km",
    "cluster",
    "avg_delivery_time_minutes",
    "delivery_order_count",
     "city",
    "city_order_count",
    "is_delayed",
    "speed_kmh",
    "speed_status",
      # Ensure this is the correct column name for PCA features
]

# Create a new DataFrame with only the selected columns
final_predictions = predictions.select(*selected_columns)

# Define PostgreSQL connection details
jdbc_url = "jdbc:postgresql://localhost:5432/postgres"
properties = {
    "user": "postgres",
    "password": "root",
    "driver": "org.postgresql.Driver"
}

final_predictions.write.jdbc(
    url=jdbc_url,
    table="feature_engg_delivery_data",
    mode="overwrite",
    properties=properties,
)
print("Data Saved to Postgresql successfully")

Data Saved to Postgresql successfully


In [22]:
# Save the cleaned DataFrame to csv
output_path = r'C:\Users\Dusty\Downloads\Internship\Last-Mile-Delivery-Delays-and-Route-Optimization\data\feature_engg_delivery_data.csv'
final_predictions.coalesce(1).write.option("header", "true").csv(output_path, mode="overwrite")
print("Data saved to csv successfully")

Data saved to csv successfully
