In [1]:
import os
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import *
from pyspark.ml.feature import VectorAssembler, StandardScaler, PCA, OneHotEncoder, StringIndexer
import pandas as pd
import numpy as np
from pyspark.ml.clustering import KMeans
import matplotlib.pyplot as plt
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark.sql.types import ArrayType, DoubleType
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import mutual_info_regression

### Read the pickup data

In [2]:
postgresql_jdbc_jar = r"C:/Program Files/PostgreSQL/17/postgresql-42.7.4.jar"
spark = SparkSession.builder.appName('FeatureEngineering')\
                            .config("spark.jars", postgresql_jdbc_jar) \
                            .config("spark.driver.extraClassPath", postgresql_jdbc_jar) \
                            .config("spark.driver.memory", "8g")\
                            .config("spark.executor.memory", "8g")\
                            .config("spark.executor.cores", "4")\
                            .getOrCreate()

In [3]:
# Database connection parameters
url = "jdbc:postgresql://localhost:5432/postgres"
properties = {
    "user": "postgres",
    "password": "root",
    "driver": "org.postgresql.Driver"
}

# Query to filter cities Hangzhou and Shanghai
query = "(SELECT * FROM pickup_data WHERE city IN ('Chongqing', 'Shanghai', 'Yantai', 'Jilin')) AS filtered_data"

# Load the data into a PySpark DataFrame
df_pickup = spark.read.jdbc(url=url, table=query, properties=properties)

# Show the first few rows
row_count = df_pickup.count()
print(f"Number of rows in the DataFrame: {row_count}")

Number of rows in the DataFrame: 4005691


In [4]:
df_pickup.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- region_id: integer (nullable = true)
 |-- city: string (nullable = true)
 |-- courier_id: integer (nullable = true)
 |-- accept_time: timestamp (nullable = true)
 |-- time_window_start: timestamp (nullable = true)
 |-- time_window_end: timestamp (nullable = true)
 |-- aoi_id: integer (nullable = true)
 |-- aoi_type: integer (nullable = true)
 |-- pickup_time: timestamp (nullable = true)
 |-- ds: integer (nullable = true)
 |-- lng: double (nullable = true)
 |-- lat: double (nullable = true)
 |-- pickup_gps_lng: double (nullable = true)
 |-- pickup_gps_lat: double (nullable = true)
 |-- accept_gps_lng: double (nullable = true)
 |-- accept_gps_lat: double (nullable = true)
 |-- pickup_eta_minutes: double (nullable = true)



In [5]:
df_pickup = df_pickup.drop('pickup_eta_minutes')

### Time-Based Features

In [6]:
from pyspark.sql.functions import col, mean

# Extracting hour, day of the week, and month from the `accept_time` column
df_pickup = df_pickup.withColumn("hour_of_day", F.hour(df_pickup["accept_time"])) \
       .withColumn("day_of_week", F.dayofweek(df_pickup["accept_time"])) \
       .withColumn("month", F.month(df_pickup["accept_time"]))

# Calculate time taken for pickup (in minutes) which is already calculated in pickup_eta_minutes

# Calculate the time difference between expected and actual pickup time in minutes
# expected_pickup_time = time_window_start
# actual pickup time = pickup_time
# A positive value means the pickup happened after the expected time (late pickup), 
# while a negative value means it happened earlier than expected (early pickup).

# # Compute avg_pickup_time_minutes
# avg_pickup_time = df_pickup.select(expr("percentile_approx(pickup_eta_minutes, 0.50)")).collect()[0][0]

# # Compute pickup_time_delay
# df_pickup = df_pickup.withColumn("pickup_time_delay", col("pickup_eta_minutes") - avg_pickup_time)

df_pickup = df_pickup.withColumn("pickup_eta_minutes",
                  (df_pickup["pickup_time"].cast("long") - df_pickup["accept_time"].cast("long")) / 60)

df_pickup = df_pickup.withColumn("pickup_time_delay",
                  (df_pickup["pickup_time"].cast("long") - df_pickup["time_window_start"].cast("long")) / 60)

# Show result
df_pickup.select("pickup_eta_minutes", "pickup_time_delay").show(10)

+------------------+-----------------+
|pickup_eta_minutes|pickup_time_delay|
+------------------+-----------------+
|              74.0|             -6.0|
|            1526.0|              6.0|
|             157.0|             77.0|
|             180.0|            100.0|
|              13.0|            -67.0|
|             651.0|           -989.0|
|             164.0|             84.0|
|             174.0|             94.0|
|             367.0|             47.0|
|             200.0|              0.0|
+------------------+-----------------+
only showing top 10 rows



In [8]:
from pyspark.sql.functions import col

# Replace 'column_name' with your actual column name
negative_count = df_pickup.filter(col("pickup_time_delay") < 0).count()

print(f"Number of negative values in column 'pickup_time_delay': {negative_count}")

Number of negative values in column 'pickup_time_delay': 1043616


In [8]:
df_pickup.select("hour_of_day", "day_of_week", "month", "pickup_time_delay").show(5)

+-----------+-----------+-----+-----------------+
|hour_of_day|day_of_week|month|pickup_time_delay|
+-----------+-----------+-----+-----------------+
|          9|          4|   10|            -52.0|
|          9|          4|   10|             27.0|
|          9|          4|   10|            -81.0|
|          9|          4|   10|             24.0|
|          9|          4|   10|           -122.0|
+-----------+-----------+-----+-----------------+
only showing top 5 rows



In [9]:
# Filter rows where pickup_time_difference is negative
df_negative_pickups = df_pickup.filter(df_pickup["pickup_time_delay"] < 0)
df_negative_pickups.select("order_id","city", "pickup_time_delay").show(5)

+--------+---------+-----------------+
|order_id|     city|pickup_time_delay|
+--------+---------+-----------------+
|  998888|    Jilin|            -52.0|
| 5524389|   Yantai|            -81.0|
|  221410| Shanghai|           -122.0|
| 5452642| Shanghai|             -9.0|
| 4972100|Chongqing|            -31.0|
+--------+---------+-----------------+
only showing top 5 rows



### Geospatial Features

#### Calculate the distance between pickup and delivery locations using Haversine Formula.

In [10]:
# Define the Haversine function to calculate distance between two points on the Earth
def haversine(lat1, lon1, lat2, lon2):
    R = 6371  # Radius of the Earth in kilometers
    
    dlat = radians(lat2 - lat1)  # Difference in latitudes (radians)
    dlon = radians(lon2 - lon1)  # Difference in longitudes (radians)
    
    # Apply Haversine formula to calculate the "a" value
    a = sin(dlat / 2)**2 + cos(radians(lat1)) * cos(radians(lat2)) * sin(dlon / 2)**2
    # calculate the central angle between two points
    c = 2 * atan2(sqrt(a), sqrt(1 - a))
    
    # Compute the distance in KM using the radius of Earth (R) and the central angle (c)
    distance = R * c 
    return distance

# Apply the Haversine function to the DataFrame to calculate the distance in KM
df_pickup = df_pickup.withColumn("pickup_distance_km", 
                                 haversine(df_pickup["pickup_gps_lat"], df_pickup["pickup_gps_lng"],
                                           df_pickup["accept_gps_lat"], df_pickup["accept_gps_lng"]))
# Round the 'pickup_distance_km' column to 2 decimal places
df_pickup = df_pickup.withColumn("pickup_distance_km", round(df_pickup["pickup_distance_km"], 2))

In [11]:
df_pickup.select("order_id","pickup_gps_lat","pickup_gps_lng","accept_gps_lat","accept_gps_lng","pickup_distance_km").show(10)

+--------+--------------+--------------+--------------+--------------+------------------+
|order_id|pickup_gps_lat|pickup_gps_lng|accept_gps_lat|accept_gps_lng|pickup_distance_km|
+--------+--------------+--------------+--------------+--------------+------------------+
|  998888|      43.87979|     126.56965|      43.88011|     126.57201|              0.19|
| 1817267|      29.49215|     106.47098|      29.49172|     106.47445|              0.34|
| 5524389|       36.7588|     121.17365|      32.70376|     118.54136|            510.99|
| 3124312|      32.57753|     118.38133|      32.70376|     118.54136|             20.53|
|  221410|      31.19308|     121.70351|      31.19639|     121.70901|              0.64|
| 3287344|      29.70265|     107.37631|      29.70634|     107.37352|              0.49|
|  964270|      31.24391|     121.41594|      31.23992|     121.41514|              0.45|
| 3835946|      37.60493|     120.47227|      37.60347|     120.47835|              0.56|
| 3408841|

#### Cluster regions using K-Means or DBSCAN for better route mapping.

In [12]:
# observing above regions and number of orders they have we can see here is a large variation in the number of rows (packages) across regions, 
# DBSCAN would be a more appropriate clustering algorithm for this scenario as It can handle regions where some areas are densely packed with 
# packages (e.g., urban centers) while others have fewer packages (e.g., rural areas).
# Also it doesn't require the number of clusters to be predefined and can naturally identify clusters of different shapes and sizes.

In [13]:
# Step 1: Select the relevant geographic columns (Longitude and Latitude)
feature_columns = ["pickup_gps_lng", "pickup_gps_lat"]

# Step 2: Vectorize the geographic features into a single vector column 'features'
vector_assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
df_vectorized = vector_assembler.transform(df_pickup)

# Step 4: Apply KMeans with a specified number of clusters (let's say 5)
kmeans = KMeans(k=25, seed=1, featuresCol="features", predictionCol="cluster")
model = kmeans.fit(df_vectorized)

# Step 5: Make predictions (assign clusters to the data)
predictions = model.transform(df_vectorized)

In [14]:
# Step 7: Show some of the prediction results (assigned cluster labels)
predictions.select("order_id", "city", "cluster").show()

+--------+---------+-------+
|order_id|     city|cluster|
+--------+---------+-------+
|  998888|    Jilin|      2|
| 1817267|Chongqing|     12|
| 5524389|   Yantai|      8|
| 3124312|Chongqing|      4|
|  221410| Shanghai|      0|
| 3287344|Chongqing|      1|
|  964270| Shanghai|     24|
| 3835946|   Yantai|      9|
| 3408841|   Yantai|      5|
| 5452642| Shanghai|      0|
| 4972100|Chongqing|      6|
| 5950591| Shanghai|      0|
| 1084012|   Yantai|      5|
| 1669840|Chongqing|     12|
| 1649197|Chongqing|      6|
| 2324275|   Yantai|      5|
| 2805523|Chongqing|      6|
|  377878| Shanghai|      0|
| 3649709|   Yantai|      4|
| 3350698| Shanghai|      0|
+--------+---------+-------+
only showing top 20 rows



In [15]:
predictions = predictions.drop('features')

### Courier & Package Features

#### Compute the average pickup time per courier

In [16]:
# Group by courier_id and calculate the average pickup time delay (in minutes)
average_pickup_time_per_courier = predictions.groupBy("courier_id").agg(
    F.avg("pickup_time_delay").alias("avg_pickup_time_minutes")
)

# Show the result
predictions.show(5)
predictions = predictions.join(
    average_pickup_time_per_courier, on="courier_id", how="left")

+--------+---------+---------+----------+-------------------+-------------------+-------------------+------+--------+-------------------+----+---------+--------+--------------+--------------+--------------+--------------+------------------+-----------+-----------+-----+-----------------+------------------+-------+
|order_id|region_id|     city|courier_id|        accept_time|  time_window_start|    time_window_end|aoi_id|aoi_type|        pickup_time|  ds|      lng|     lat|pickup_gps_lng|pickup_gps_lat|accept_gps_lng|accept_gps_lat|pickup_eta_minutes|hour_of_day|day_of_week|month|pickup_time_delay|pickup_distance_km|cluster|
+--------+---------+---------+----------+-------------------+-------------------+-------------------+------+--------+-------------------+----+---------+--------+--------------+--------------+--------------+--------------+------------------+-----------+-----------+-----+-----------------+------------------+-------+
|  998888|       90|    Jilin|      7180|2025-10-22 

#### Extract package pickup patterns (e.g., peak hours, busiest locations).

#### Peak hours

In [17]:
# Step 1: Group by hour_of_day and count the number of pickups (orders)
pickup_by_hour = predictions.groupBy("hour_of_day").agg(
    F.count("order_id").alias("pickup_order_count")
)

# Step 2: Sort by the pickup count in descending order to find the peak hours
pickup_by_hour_sorted = pickup_by_hour.orderBy(F.col("pickup_order_count").desc())

# Step 3: Join the pickup count by hour to the predictions DataFrame based on hour_of_day
predictions = predictions.join(
    pickup_by_hour_sorted, on="hour_of_day", how="left"
)

# Show the result: predictions with pickup count by hour
predictions.select("order_id", "hour_of_day", "pickup_order_count").show(5)

+--------+-----------+------------------+
|order_id|hour_of_day|pickup_order_count|
+--------+-----------+------------------+
|  998888|          9|            516633|
| 1817267|          9|            516633|
| 5524389|          9|            516633|
| 3124312|          9|            516633|
|  221410|          9|            516633|
+--------+-----------+------------------+
only showing top 5 rows



#### Busiest locations

In [18]:
df_busiest_locations = predictions.groupBy("city") \
    .agg(F.count("order_id").alias("city_order_count")) \
    .orderBy(F.desc("city_order_count"))
df_busiest_locations.show(5)

+---------+----------------+
|     city|city_order_count|
+---------+----------------+
| Shanghai|         1424406|
|Chongqing|         1172703|
|   Yantai|         1146781|
|    Jilin|          261801|
+---------+----------------+



In [19]:
# Step 3: Join the pickup count by hour to the predictions DataFrame based on hour_of_day
predictions = predictions.join(
    df_busiest_locations, on="city", how="left"
)

In [20]:
predictions.select("order_id", "city", "city_order_count").show(5)

+--------+--------+----------------+
|order_id|    city|city_order_count|
+--------+--------+----------------+
| 3385131|   Jilin|          261801|
| 4933663|  Yantai|         1146781|
| 6137240|Shanghai|         1424406|
| 1226397|Shanghai|         1424406|
| 1793373|Shanghai|         1424406|
+--------+--------+----------------+
only showing top 5 rows



In [21]:
predictions.printSchema()

root
 |-- city: string (nullable = true)
 |-- hour_of_day: integer (nullable = true)
 |-- courier_id: integer (nullable = true)
 |-- order_id: integer (nullable = true)
 |-- region_id: integer (nullable = true)
 |-- accept_time: timestamp (nullable = true)
 |-- time_window_start: timestamp (nullable = true)
 |-- time_window_end: timestamp (nullable = true)
 |-- aoi_id: integer (nullable = true)
 |-- aoi_type: integer (nullable = true)
 |-- pickup_time: timestamp (nullable = true)
 |-- ds: integer (nullable = true)
 |-- lng: double (nullable = true)
 |-- lat: double (nullable = true)
 |-- pickup_gps_lng: double (nullable = true)
 |-- pickup_gps_lat: double (nullable = true)
 |-- accept_gps_lng: double (nullable = true)
 |-- accept_gps_lat: double (nullable = true)
 |-- pickup_eta_minutes: double (nullable = true)
 |-- day_of_week: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- pickup_time_delay: double (nullable = true)
 |-- pickup_distance_km: double (nullable = t

### Anomaly Detection Features

#### Identify delays based on time threshold deviations

In [None]:
# Step 1: Define the threshold for delay
# time_threshold = 360  # Set the threshold in minutes

# Step 2: Identify delays based on the pickup time delay
# We create a new column `is_delayed` that indicates if the delay is above the threshold
predictions = predictions.withColumn(
    "is_delayed", 
    F.when(F.col("pickup_time_delay") > 0, "Delayed").otherwise("On Time")
)

# Show the resulting DataFrame with the delay identification
predictions.select(
    "order_id", 
    "city", 
    "pickup_time_delay", 
    "pickup_time", 
    "accept_time", 
    "is_delayed"
).show()

+--------+---------+-----------------+-------------------+-------------------+----------+
|order_id|     city|pickup_time_delay|        pickup_time|        accept_time|is_delayed|
+--------+---------+-----------------+-------------------+-------------------+----------+
| 5883271|   Yantai|             51.0|2025-07-24 17:51:00|2025-07-24 16:56:00|   Delayed|
| 4303663|Chongqing|             37.0|2025-07-24 17:33:00|2025-07-24 16:56:00|   Delayed|
| 3716828| Shanghai|              4.0|2025-07-24 17:04:00|2025-07-24 16:56:00|   Delayed|
|  710918| Shanghai|             40.0|2025-07-24 17:40:00|2025-07-24 16:56:00|   Delayed|
|  602713|    Jilin|            101.0|2025-07-24 18:41:00|2025-07-24 16:56:00|   Delayed|
| 3739332|Chongqing|             10.0|2025-07-24 17:06:00|2025-07-24 16:56:00|   Delayed|
| 2303104|   Yantai|             35.0|2025-07-24 17:35:00|2025-07-24 16:56:00|   Delayed|
| 5105167| Shanghai|             76.0|2025-07-24 18:16:00|2025-07-24 16:56:00|   Delayed|
| 5385590|

#### Flag inconsistent geospatial entries (e.g., unrealistic speed between locations).

In [23]:
# Step 1: Convert pickup_time_delay from minutes to hours
predictions = predictions.withColumn(
    "time_diff_hours", 
    F.col("pickup_time_delay") / 60  # Convert minutes to hours
)

# Step 2: Calculate the speed in km/h using the existing pickup_distance_km column
predictions = predictions.withColumn(
    "speed_kmh", 
    F.when(F.col("time_diff_hours") > 0, F.floor(F.col("pickup_distance_km") / F.col("time_diff_hours"))).otherwise(0)
)

# Step 3: Flag entries with unrealistic speed
speed_threshold = 100  # Define a reasonable speed threshold (in km/h)
predictions = predictions.withColumn(
    "speed_status", 
    F.when(F.col("speed_kmh") > speed_threshold, "Unrealistic Speed").otherwise("Realistic Speed")
)

predictions = predictions.drop("time_diff_hours")

# Show the resulting DataFrame with the speed status column
predictions.select(
    "order_id", 
    "city", 
    "pickup_distance_km", 
    "pickup_time_delay",  # Using existing column for time delay in minutes
    "speed_kmh", 
    "speed_status"
).show()

+--------+---------+------------------+-----------------+---------+-----------------+
|order_id|     city|pickup_distance_km|pickup_time_delay|speed_kmh|     speed_status|
+--------+---------+------------------+-----------------+---------+-----------------+
| 5883271|   Yantai|               0.8|             51.0|        0|  Realistic Speed|
| 4303663|Chongqing|              0.61|             37.0|        0|  Realistic Speed|
| 3716828| Shanghai|             20.53|              4.0|      307|Unrealistic Speed|
|  710918| Shanghai|              0.42|             40.0|        0|  Realistic Speed|
|  602713|    Jilin|              1.51|            101.0|        0|  Realistic Speed|
| 3739332|Chongqing|             20.53|             10.0|      123|Unrealistic Speed|
| 2303104|   Yantai|              1.67|             35.0|        2|  Realistic Speed|
| 5105167| Shanghai|             20.53|             76.0|       16|  Realistic Speed|
| 5385590|Chongqing|           1202.73|          -1525

### Save Data

In [24]:
predictions = predictions.drop('features','scaled_features')

In [25]:
# from pyspark.sql.functions import udf

# # Convert vector column to array of doubles (e.g., 'features' or 'scaled_features')
# def vector_to_array(vec):
#     if isinstance(vec, Vector):
#         return vec.toArray().tolist()
#     else:
#         return []

# # Register the UDF
# vector_to_array_udf = udf(vector_to_array, ArrayType(DoubleType()))

# # Apply the UDF to your vector columns and convert them into array of doubles
# predictions = predictions.withColumn("pca_features_array", vector_to_array_udf("pca_features"))

# # Drop the original vector columns (if you no longer need them)
# predictions = predictions.drop("pca_features")  # Keep 'pca_features_array'

In [26]:
print(predictions.columns) 

['city', 'hour_of_day', 'courier_id', 'order_id', 'region_id', 'accept_time', 'time_window_start', 'time_window_end', 'aoi_id', 'aoi_type', 'pickup_time', 'ds', 'lng', 'lat', 'pickup_gps_lng', 'pickup_gps_lat', 'accept_gps_lng', 'accept_gps_lat', 'pickup_eta_minutes', 'day_of_week', 'month', 'pickup_time_delay', 'pickup_distance_km', 'cluster', 'avg_pickup_time_minutes', 'pickup_order_count', 'city_order_count', 'is_delayed', 'speed_kmh', 'speed_status']


In [29]:
selected_columns = [
    "order_id",
    "pickup_time",
    "accept_time",
    "hour_of_day",
    "day_of_week",
    "month",
    "pickup_eta_minutes",
    "pickup_time_delay",
    "pickup_distance_km",
    "cluster",
    "avg_pickup_time_minutes",
    "pickup_order_count",
    "city",
    "city_order_count",
    "is_delayed",
    "speed_kmh",
    "speed_status",
]

# Create a new DataFrame with only the selected columns
final_predictions = predictions.select(*selected_columns)

# Define PostgreSQL connection details
jdbc_url = "jdbc:postgresql://localhost:5432/postgres"
properties = {
    "user": "postgres",
    "password": "root",
    "driver": "org.postgresql.Driver"
}

final_predictions.write.jdbc(
    url=jdbc_url,
    table="feature_engg_pickup_data",
    mode="overwrite",
    properties=properties,
)
print("Data Saved to Postgresql successfully")

Data Saved to Postgresql successfully


In [28]:
# Save the cleaned DataFrame to csv
output_path = r'C:\Users\Dusty\Downloads\Internship\Last-Mile-Delivery-Delays-and-Route-Optimization\data\feature_engg_pickup_data.csv'
final_predictions.coalesce(1).write.option("header", "true").csv(output_path, mode="overwrite")
print("Data saved to csv successfully")

Data saved to csv successfully
