### Imports

In [1]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.window import Window
from pyspark.ml.feature import Imputer
# Create the Spark session used by every cell below (or reuse one that already exists).
spark = (
    SparkSession.builder
    .appName('PysparkTransformation')
    .getOrCreate()
)
import sys

### Read the delivery data

In [2]:
# Location of the raw delivery CSV.
# NOTE(review): this is an absolute, machine-specific path; the notebook will not
# run elsewhere until it is pointed at a local copy of the file.
delivery_file = r'C:\Users\Dusty\Downloads\Internship\Last-Mile-Delivery-Delays-and-Route-Optimization\data\delivery_data.csv'

# The first row holds the column names; column types are inferred from the data.
df_delivery = spark.read.options(header=True, inferSchema=True).csv(delivery_file)

In [3]:
# Preview four rows in random order (`rand` comes from pyspark.sql.functions).
df_delivery.sort(rand()).show(n=4)

+--------+---------+---------+----------+---------+--------+------+--------+--------------+---------------+--------------+--------------+--------------+-----------------+----------------+----------------+----+
|order_id|region_id|     city|courier_id|      lng|     lat|aoi_id|aoi_type|   accept_time|accept_gps_time|accept_gps_lng|accept_gps_lat| delivery_time|delivery_gps_time|delivery_gps_lng|delivery_gps_lat|  ds|
+--------+---------+---------+----------+---------+--------+------+--------+--------------+---------------+--------------+--------------+--------------+-----------------+----------------+----------------+----+
| 1034452|      158|Chongqing|      1014|106.52605| 29.6391| 20067|       1|07-26 13:38:00| 07-26 13:38:00|     106.52121|      29.62357|07-26 20:47:00|   07-26 20:47:00|       106.47765|        29.62664| 726|
| 1001537|       27| Hangzhou|      2947|120.11555|30.29957| 21312|       1|07-16 17:06:00| 07-16 17:06:00|     120.12521|      30.29694|07-16 19:42:00|   07-16

In [4]:
# Count the rows loaded from the CSV.
total_records = df_delivery.count()
print(f"Total number of records: {total_records}")

Total number of records: 4514661


### Convert String Timestamps to timestamp Format

In [5]:
# Inspect the inferred schema before converting the timestamp columns.
df_delivery.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- region_id: integer (nullable = true)
 |-- city: string (nullable = true)
 |-- courier_id: integer (nullable = true)
 |-- lng: double (nullable = true)
 |-- lat: double (nullable = true)
 |-- aoi_id: integer (nullable = true)
 |-- aoi_type: integer (nullable = true)
 |-- accept_time: string (nullable = true)
 |-- accept_gps_time: string (nullable = true)
 |-- accept_gps_lng: double (nullable = true)
 |-- accept_gps_lat: double (nullable = true)
 |-- delivery_time: string (nullable = true)
 |-- delivery_gps_time: string (nullable = true)
 |-- delivery_gps_lng: double (nullable = true)
 |-- delivery_gps_lat: double (nullable = true)
 |-- ds: integer (nullable = true)



In [6]:
# The CSV timestamps carry no year ("MM-dd HH:mm:ss"). Parsing them as-is falls
# back to 1970, so the year of the run date is prepended before parsing.
# `current_date()` is called explicitly instead of referencing a column named
# "current_date", which only worked through Spark's fallback resolution and would
# silently bind to a real column of that name if one existed.
# NOTE(review): the resulting year depends on when the notebook is run, and a
# "02-29" value presumably cannot be parsed in a non-leap year. Consider pinning
# the dataset's real year instead.
current_year = date_format(current_date(), "yyyy")

timestamp_cols = ["accept_time", "delivery_time", "delivery_gps_time", "accept_gps_time"]

# Convert every timestamp column by prepending the year and parsing the result.
# Replacing a column in place keeps its position in the schema.
for ts_col in timestamp_cols:
    df_delivery = df_delivery.withColumn(
        ts_col,
        to_timestamp(concat(current_year, lit("-"), trim(col(ts_col))), "yyyy-MM-dd HH:mm:ss"),
    )

In [7]:
# Confirm that the four *_time columns are now of timestamp type.
df_delivery.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- region_id: integer (nullable = true)
 |-- city: string (nullable = true)
 |-- courier_id: integer (nullable = true)
 |-- lng: double (nullable = true)
 |-- lat: double (nullable = true)
 |-- aoi_id: integer (nullable = true)
 |-- aoi_type: integer (nullable = true)
 |-- accept_time: timestamp (nullable = true)
 |-- accept_gps_time: timestamp (nullable = true)
 |-- accept_gps_lng: double (nullable = true)
 |-- accept_gps_lat: double (nullable = true)
 |-- delivery_time: timestamp (nullable = true)
 |-- delivery_gps_time: timestamp (nullable = true)
 |-- delivery_gps_lng: double (nullable = true)
 |-- delivery_gps_lat: double (nullable = true)
 |-- ds: integer (nullable = true)



In [8]:
# Spot-check the parsed delivery timestamps.
df_delivery.select(col("delivery_time")).show(n=5)

+-------------------+
|      delivery_time|
+-------------------+
|2025-10-30 10:30:00|
|2025-10-31 10:40:00|
|2025-10-22 15:03:00|
|2025-10-26 10:30:00|
|2025-10-31 16:41:00|
+-------------------+
only showing top 5 rows



### Dropping Duplicates

In [9]:
# Keep a single row per order_id.
df_delivery = df_delivery.drop_duplicates(subset=["order_id"])

### Handle missing values using dropna

In [10]:
# Rows without an order_id or a courier_id are dropped: both are essential identifiers.
required_cols = ["order_id", "courier_id"]
df_delivery = df_delivery.na.drop(subset=required_cols)

In [11]:
# Re-count after de-duplication and dropping rows with missing identifiers.
remaining_records = df_delivery.count()
print(f"Total number of records: {remaining_records}")

Total number of records: 4514661


### Handle Missing values using imputation strategies where necessary. 

#### Mean Imputation for Numeric Columns

In [12]:
numeric_cols = ["lng", "lat", "delivery_gps_lng", "delivery_gps_lat", "accept_gps_lng", "accept_gps_lat"]
imputed_cols = [f"{name}_imputed" for name in numeric_cols]

# Fill missing coordinates with the column mean, writing the result to new
# "<column>_imputed" columns so originals and imputed values can be compared.
imputer = Imputer(strategy="mean", inputCols=numeric_cols, outputCols=imputed_cols)
df_delivery = imputer.fit(df_delivery).transform(df_delivery)

# Round the imputed values to 5 decimal places.
# `round` is pyspark.sql.functions.round (star import), not the Python builtin.
for imputed_name in imputed_cols:
    df_delivery = df_delivery.withColumn(imputed_name, round(col(imputed_name), 5))

##### Check the imputed data

In [13]:
# Numeric columns that were imputed.
numeric_cols = ["lng", "lat","delivery_gps_lng", "delivery_gps_lat", "accept_gps_lng", "accept_gps_lat"]

# Show the original columns first, then their imputed counterparts, for comparison.
comparison_cols = numeric_cols + [f"{name}_imputed" for name in numeric_cols]
df_delivery.select(*comparison_cols).show(5)

+---------+--------+----------------+----------------+--------------+--------------+-----------+-----------+------------------------+------------------------+----------------------+----------------------+
|      lng|     lat|delivery_gps_lng|delivery_gps_lat|accept_gps_lng|accept_gps_lat|lng_imputed|lat_imputed|delivery_gps_lng_imputed|delivery_gps_lat_imputed|accept_gps_lng_imputed|accept_gps_lat_imputed|
+---------+--------+----------------+----------------+--------------+--------------+-----------+-----------+------------------------+------------------------+----------------------+----------------------+
|120.17887|30.26379|       120.17803|         30.2638|     120.20605|      30.28655|  120.17887|   30.26379|               120.17803|                 30.2638|             120.20605|              30.28655|
|120.17887|30.26417|       120.18027|          30.263|     120.18549|      30.28065|  120.17887|   30.26417|               120.18027|                  30.263|             120.18549

##### Drop original columns and rename imputed columns

In [14]:
# Swap each original column for its imputed version, which takes over the original name.
# NOTE(review): not safe to re-run on its own. A second run would drop the
# already-renamed columns because the "_imputed" columns no longer exist.
for original_name in numeric_cols:
    imputed_name = original_name + "_imputed"
    df_delivery = df_delivery.drop(original_name)
    df_delivery = df_delivery.withColumnRenamed(imputed_name, original_name)

In [15]:
# Preview all columns after the imputed columns replaced the originals.
df_delivery.show(5)

+--------+---------+---------+----------+------+--------+-------------------+-------------------+-------------------+-------------------+----+---------+--------+----------------+----------------+--------------+--------------+
|order_id|region_id|     city|courier_id|aoi_id|aoi_type|        accept_time|    accept_gps_time|      delivery_time|  delivery_gps_time|  ds|      lng|     lat|delivery_gps_lng|delivery_gps_lat|accept_gps_lng|accept_gps_lat|
+--------+---------+---------+----------+------+--------+-------------------+-------------------+-------------------+-------------------+----+---------+--------+----------------+----------------+--------------+--------------+
|      31|       98| Hangzhou|       300| 42283|      14|2025-07-23 07:54:00|2025-07-23 07:54:00|2025-07-23 09:26:00|2025-07-23 09:26:00| 723|120.44703|30.31246|       120.44639|        30.31075|     120.53647|      30.27606|
|      53|       90| Shanghai|      2144| 32101|       1|2025-09-13 13:43:00|2025-09-13 13:43:00

#### Mode Imputation for Categorical Fields

In [16]:
# For categorical fields (city, region_id, aoi_type), replace missing values
# with the most frequent non-null value (the mode).
categorical_cols = ["city", "region_id", "aoi_type"]

for cat_col in categorical_cols:
    # Exclude nulls first: groupBy counts the null group too, so "missing"
    # itself could otherwise be picked as the mode.
    mode_row = (
        df_delivery.filter(col(cat_col).isNotNull())
        .groupBy(cat_col)
        .count()
        .orderBy(col("count").desc())
        .first()
    )
    if mode_row is None:
        # The column holds no non-null value, so there is nothing to fill with.
        continue
    mode_value = mode_row[0]
    # fillna returns a new DataFrame; without the assignment the fill is discarded.
    df_delivery = df_delivery.fillna({cat_col: mode_value})

In [17]:
# Preview the categorical columns after mode imputation.
df_delivery.select(["city", "region_id", "aoi_type"]).show(n=5)

+---------+---------+--------+
|     city|region_id|aoi_type|
+---------+---------+--------+
| Hangzhou|       98|      14|
| Shanghai|       90|       1|
| Hangzhou|       47|       1|
|Chongqing|      109|       4|
| Hangzhou|      124|       7|
+---------+---------+--------+
only showing top 5 rows



#### Forward Fill (Last Known Value) for Timestamps

##### Removing unnecessary columns i.e. delivery_gps_time and accept_gps_time

In [18]:
# In the CSV, delivery_gps_time mirrors delivery_time and accept_gps_time mirrors
# accept_time, so the two GPS time columns are redundant and are removed before
# the timestamps are filled.
redundant_time_cols = ['delivery_gps_time', 'accept_gps_time']
df_delivery = df_delivery.drop(*redundant_time_cols)

In [19]:
# Confirm that delivery_gps_time and accept_gps_time are gone from the schema.
df_delivery.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- region_id: integer (nullable = true)
 |-- city: string (nullable = true)
 |-- courier_id: integer (nullable = true)
 |-- aoi_id: integer (nullable = true)
 |-- aoi_type: integer (nullable = true)
 |-- accept_time: timestamp (nullable = true)
 |-- delivery_time: timestamp (nullable = true)
 |-- ds: integer (nullable = true)
 |-- lng: double (nullable = true)
 |-- lat: double (nullable = true)
 |-- delivery_gps_lng: double (nullable = true)
 |-- delivery_gps_lat: double (nullable = true)
 |-- accept_gps_lng: double (nullable = true)
 |-- accept_gps_lat: double (nullable = true)



##### Forward/Backward Filling

In [20]:
# Fill missing accept/delivery timestamps from neighbouring rows, with rows
# ordered by accept_time:
#   1. forward fill  - carry the most recent earlier non-null value forward;
#   2. backward fill - for rows with nothing before them (nulls sort first in
#      ascending order), take the nearest later non-null value.
# The backward frame runs from the current row to the end of the ascending order.
# A descending cumulative frame combined with first() would instead return the
# farthest (latest) value, not the nearest one.
# NOTE(review): the windows are global (no partitionBy), so Spark processes all
# rows in a single partition and a value may be borrowed from another courier.
# Consider Window.partitionBy("courier_id") if per-courier filling is intended.
from pyspark.sql import functions as F

# Forward fill frame: everything up to and including the current row.
window_forward = Window.orderBy("accept_time").rowsBetween(Window.unboundedPreceding, Window.currentRow)
# Backward fill frame: the current row and everything after it.
window_backward = Window.orderBy("accept_time").rowsBetween(Window.currentRow, Window.unboundedFollowing)

time_cols = ["accept_time", "delivery_time"]

for time_col in time_cols:
    df_delivery = df_delivery.withColumn(time_col,
        F.coalesce(
            F.last(time_col, ignorenulls=True).over(window_forward),   # nearest earlier value
            F.first(time_col, ignorenulls=True).over(window_backward)  # nearest later value
        )
    )

In [21]:
# Spot-check the filled timestamps for one specific order.
order_match = col("order_id") == "583722"
df_delivery.where(order_match).select("delivery_time", "accept_time").show()

+-------------------+-------------------+
|      delivery_time|        accept_time|
+-------------------+-------------------+
|2025-10-30 10:30:00|2025-10-30 09:20:00|
+-------------------+-------------------+



In [22]:
# Count the remaining nulls per timestamp column.
null_count_exprs = [
    count(when(col(ts_name).isNull(), 1)).alias(ts_name)
    for ts_name in time_cols
]
df_delivery.select(null_count_exprs).show()

+-----------+-------------+
|accept_time|delivery_time|
+-----------+-------------+
|          0|            0|
+-----------+-------------+



### Convert categorical fields (e.g., aoi_type, city, region_id) to a consistent format.

In [23]:
# Normalise the categorical fields: title-case and trim the city name, and make
# sure both id-like fields are integers.
normalised_columns = {
    "city": trim(initcap(col("city"))),
    "region_id": col("region_id").cast("int"),
    "aoi_type": col("aoi_type").cast("int"),
}
for target_name, column_expr in normalised_columns.items():
    df_delivery = df_delivery.withColumn(target_name, column_expr)

In [24]:
# Preview the categorical columns after normalisation.
df_delivery.select(["city", "region_id", "aoi_type"]).show(n=5)

+---------+---------+--------+
|     city|region_id|aoi_type|
+---------+---------+--------+
| Hangzhou|       98|      14|
| Shanghai|       90|       1|
| Hangzhou|       47|       1|
|Chongqing|      109|       4|
| Hangzhou|      124|       7|
+---------+---------+--------+
only showing top 5 rows



### Removing or fixing inconsistent geospatial coordinates (lng/lat).

In [25]:
lat_cols = ["lat", "delivery_gps_lat", "accept_gps_lat"]
lng_cols = ["lng", "delivery_gps_lng", "accept_gps_lng"]

# Inclusive absolute limit per column: 90 for latitudes, 180 for longitudes.
coordinate_limits = {**{c: 90 for c in lat_cols}, **{c: 180 for c in lng_cols}}

# Keep values inside [-limit, limit]; anything else becomes null
# (`when` without `otherwise` yields null for unmatched rows).
for col_name, limit in coordinate_limits.items():
    in_range = col(col_name).between(-limit, limit)
    df_delivery = df_delivery.withColumn(col_name, when(in_range, col(col_name)))


In [26]:
# Preview the coordinate columns after validation.
coordinate_preview = ["lng", "lat","delivery_gps_lng", "delivery_gps_lat", "accept_gps_lng", "accept_gps_lat"]
df_delivery.select(*coordinate_preview).show(5)

+---------+--------+----------------+----------------+--------------+--------------+
|      lng|     lat|delivery_gps_lng|delivery_gps_lat|accept_gps_lng|accept_gps_lat|
+---------+--------+----------------+----------------+--------------+--------------+
|120.17887|30.26379|       120.17803|         30.2638|     120.20605|      30.28655|
|120.17887|30.26417|       120.18027|          30.263|     120.18549|      30.28065|
|120.18379|30.25993|       120.18393|        30.25991|     120.18544|      30.28074|
|120.18467|30.26117|       120.18461|        30.26102|     120.20603|      30.28662|
|120.18509| 30.2613|       120.18468|        30.26087|     120.18541|      30.28078|
+---------+--------+----------------+----------------+--------------+--------------+
only showing top 5 rows



In [27]:
# Check whether any coordinate column still contains nulls after validation.
# A dedicated name is used so the global `time_cols` list (the timestamp columns)
# is not overwritten with coordinate column names.
coord_cols = ["lng", "lat","delivery_gps_lng", "delivery_gps_lat", "accept_gps_lng", "accept_gps_lat"]
df_delivery.select([count(when(col(c).isNull(), 1)).alias(c) for c in coord_cols]).show()

+---+---+----------------+----------------+--------------+--------------+
|lng|lat|delivery_gps_lng|delivery_gps_lat|accept_gps_lng|accept_gps_lat|
+---+---+----------------+----------------+--------------+--------------+
|  0|  0|               0|               0|             0|             0|
+---+---+----------------+----------------+--------------+--------------+



### Identify any anomalies in timestamps (e.g., deliveries recorded before the order was accepted) and check for incorrect or negative time differences.

In [28]:
# Flag rows whose timestamps are out of order, i.e. the order was accepted AFTER
# it was delivered (accept_time > delivery_time). The flag keeps the column name
# "accept_before_delivery" because later cells refer to it.
accepted_after_delivered = col("accept_time") > col("delivery_time")
df_delivery = df_delivery.withColumn(
    "accept_before_delivery",
    when(accepted_after_delivered, True).otherwise(False),
)

# Seconds elapsed from acceptance to delivery; negative when the order is inverted.
seconds_elapsed = unix_timestamp("delivery_time") - unix_timestamp("accept_time")
df_delivery = df_delivery.withColumn("time_difference", seconds_elapsed)

# Flag negative time differences.
is_negative = col("time_difference") < 0
df_delivery = df_delivery.withColumn(
    "negative_time_difference",
    when(is_negative, True).otherwise(False),
)

In [29]:
# A row is anomalous when either flag is set (both flags are non-null booleans).
is_anomalous = col("accept_before_delivery") | col("negative_time_difference")
anomalies = df_delivery.where(is_anomalous)

# Display the flags of the anomalous rows.
anomalies.select("accept_before_delivery", "negative_time_difference").show()

+----------------------+------------------------+
|accept_before_delivery|negative_time_difference|
+----------------------+------------------------+
|                  true|                    true|
|                  true|                    true|
|                  true|                    true|
+----------------------+------------------------+



In [30]:
# The helper columns were only needed for the anomaly check; remove them again.
anomaly_helper_cols = ['accept_before_delivery', 'time_difference', 'negative_time_difference']
df_delivery = df_delivery.drop(*anomaly_helper_cols)

### Creating derived feature: Delivery ETA = delivery_time - accept_time

In [31]:
# Derived feature: minutes elapsed between accepting and delivering an order.
elapsed_seconds = F.unix_timestamp("delivery_time") - F.unix_timestamp("accept_time")
elapsed_minutes = (elapsed_seconds / 60).cast("double")
df_delivery = df_delivery.withColumn("delivery_eta_minutes", elapsed_minutes)

In [32]:
# Preview the derived ETA feature.
df_delivery.select(col('delivery_eta_minutes')).show(n=5)

+--------------------+
|delivery_eta_minutes|
+--------------------+
|             19188.0|
|                 2.0|
|                 0.0|
|                 3.0|
|                 1.0|
+--------------------+
only showing top 5 rows



### Saving the cleaned data in a csv file

In [33]:
time_cols = ["accept_time", "delivery_time"]

# Render the timestamps as "yyyy-MM-dd HH:mm:ss" strings before export, because
# writing the timestamp columns directly produced an unwanted date format.
# NOTE(review): this turns the timestamp columns of df_delivery itself into strings.
for ts_name in time_cols:
    formatted_ts = date_format(ts_name, "yyyy-MM-dd HH:mm:ss")
    df_delivery = df_delivery.withColumn(ts_name, formatted_ts)

output_path = r'C:\Users\Dusty\Downloads\Internship\Last-Mile-Delivery-Delays-and-Route-Optimization\data\cleaned_delivery_data.csv'

# Collapse to one partition and write the CSV with a header row, replacing any
# previous output at that path.
csv_writer = df_delivery.coalesce(1).write.mode("overwrite")
csv_writer.csv(output_path, header=True)
print("Cleaned Delivery data saved successfully")

Cleaned Delivery data saved successfully


In [35]:
# Final row count of the cleaned DataFrame, for comparison with the earlier counts.
row_count1 = df_delivery.count()
print(f"Number of rows in the DataFrame: {row_count1}")

Number of rows in the DataFrame: 4514661
