### Imports

In [1]:
# PySpark entry points used by the notebook.
import pyspark
from pyspark.sql import SparkSession
# NOTE: the wildcard import rebinds Python built-ins such as `round` to the Spark
# column functions of the same name (a later cell calls round() on a Column).
from pyspark.sql.functions import *
from pyspark.sql.window import Window
from pyspark.ml.feature import Imputer
# Create (or reuse) the Spark session that reads the pickup CSV.
spark = SparkSession.builder.appName('PysparkTransformation').getOrCreate()
import sys

### Read the pickup data

In [2]:
# Location of the raw pickup CSV on the local machine.
pickup_file = r'C:\Users\Dusty\Downloads\Internship\Last-Mile-Delivery-Delays-and-Route-Optimization\data\pickup_data.csv'

# Load the file, treating the first line as the header and letting Spark infer column types.
df_pickup = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv(pickup_file)
)

In [3]:
# Preview four randomly ordered rows (rand comes from pyspark.sql.functions).
# A fixed seed keeps the preview stable when the notebook is re-run on the same
# input and partitioning; the unseeded call showed different rows on every run.
df_pickup.orderBy(rand(seed=42)).show(4)

+--------+---------+--------+----------+--------------+-----------------+---------------+---------+--------+------+--------+--------------+---------------+--------------+--------------+---------------+--------------+--------------+----+
|order_id|region_id|    city|courier_id|   accept_time|time_window_start|time_window_end|      lng|     lat|aoi_id|aoi_type|   pickup_time|pickup_gps_time|pickup_gps_lng|pickup_gps_lat|accept_gps_time|accept_gps_lng|accept_gps_lat|  ds|
+--------+---------+--------+----------+--------------+-----------------+---------------+---------+--------+------+--------+--------------+---------------+--------------+--------------+---------------+--------------+--------------+----+
|  835595|      134|  Yantai|      6190|06-15 08:08:00|   06-15 09:00:00| 06-15 11:00:00|121.34875| 37.5794|  1619|       1|06-15 08:27:00| 06-15 08:27:00|     121.35092|      37.57914| 06-15 08:08:00|     121.37013|      37.55683| 615|
| 6042873|      116|  Yantai|      2332|10-12 13:42:

In [4]:
# Count the rows in the freshly loaded frame and report the total.
total_records = df_pickup.count()
print(f"Total number of records: {total_records}")

Total number of records: 6136147


### Convert String Timestamps to timestamp Format

In [5]:
# Print the inferred schema to see which columns were read as strings.
df_pickup.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- region_id: integer (nullable = true)
 |-- city: string (nullable = true)
 |-- courier_id: integer (nullable = true)
 |-- accept_time: string (nullable = true)
 |-- time_window_start: string (nullable = true)
 |-- time_window_end: string (nullable = true)
 |-- lng: double (nullable = true)
 |-- lat: double (nullable = true)
 |-- aoi_id: integer (nullable = true)
 |-- aoi_type: integer (nullable = true)
 |-- pickup_time: string (nullable = true)
 |-- pickup_gps_time: string (nullable = true)
 |-- pickup_gps_lng: double (nullable = true)
 |-- pickup_gps_lat: double (nullable = true)
 |-- accept_gps_time: string (nullable = true)
 |-- accept_gps_lng: double (nullable = true)
 |-- accept_gps_lat: double (nullable = true)
 |-- ds: integer (nullable = true)



In [6]:
# The CSV timestamps carry no year ("MM-dd HH:mm:ss"); per the original note, parsing
# them as-is falls back to 1970, so the current year is prepended before parsing.
# current_date() is called explicitly: col("current_date") only works through Spark's
# fallback name resolution and would silently bind to a real column with that name.
# NOTE(review): a "02-29" value presumably parses to null when the current year is
# not a leap year - confirm against the data.
current_year = date_format(current_date(), "yyyy")

# Every year-less timestamp column receives the same conversion.
raw_time_cols = [
    "accept_time",
    "pickup_time",
    "pickup_gps_time",
    "accept_gps_time",
    "time_window_start",
    "time_window_end",
]

for raw_time_col in raw_time_cols:
    df_pickup = df_pickup.withColumn(
        raw_time_col,
        to_timestamp(
            concat(current_year, lit("-"), trim(col(raw_time_col))),
            "yyyy-MM-dd HH:mm:ss",
        ),
    )

In [7]:
# Print the schema again to check the types of the converted time columns.
df_pickup.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- region_id: integer (nullable = true)
 |-- city: string (nullable = true)
 |-- courier_id: integer (nullable = true)
 |-- accept_time: timestamp (nullable = true)
 |-- time_window_start: timestamp (nullable = true)
 |-- time_window_end: timestamp (nullable = true)
 |-- lng: double (nullable = true)
 |-- lat: double (nullable = true)
 |-- aoi_id: integer (nullable = true)
 |-- aoi_type: integer (nullable = true)
 |-- pickup_time: timestamp (nullable = true)
 |-- pickup_gps_time: timestamp (nullable = true)
 |-- pickup_gps_lng: double (nullable = true)
 |-- pickup_gps_lat: double (nullable = true)
 |-- accept_gps_time: timestamp (nullable = true)
 |-- accept_gps_lng: double (nullable = true)
 |-- accept_gps_lat: double (nullable = true)
 |-- ds: integer (nullable = true)



In [8]:
# Show the first five parsed pickup timestamps.
df_pickup.select(col("pickup_time")).show(n=5)

+-------------------+
|        pickup_time|
+-------------------+
|2025-07-14 17:14:00|
|2025-07-19 08:47:00|
|2025-10-20 17:50:00|
|2025-10-29 17:00:00|
|2025-10-16 09:56:00|
+-------------------+
only showing top 5 rows



### Dropping Duplicates

In [9]:
# Keep a single row per order_id (drop_duplicates is an alias of dropDuplicates).
df_pickup = df_pickup.drop_duplicates(subset=["order_id"])

### Handle missing values using dropna

In [10]:
# Rows without an order_id or a courier_id are dropped: both identifiers are
# treated as essential.
required_id_cols = ["order_id", "courier_id"]
df_pickup = df_pickup.na.drop(subset=required_id_cols)

In [11]:
# Re-count the rows after de-duplication and the null-id filter.
remaining_records = df_pickup.count()
print(f"Total number of records: {remaining_records}")

Total number of records: 6136147


### Handle missing values using imputation strategies where necessary

#### Mean Imputation for Numeric Columns

In [12]:
# Coordinate columns whose missing values are replaced by the column mean.
numeric_cols = ["lng", "lat", "pickup_gps_lng", "pickup_gps_lat", "accept_gps_lng", "accept_gps_lat"]
imputed_cols = [name + "_imputed" for name in numeric_cols]

# Fit the mean imputer on the frame, then write the filled values into the
# parallel "<name>_imputed" columns.
imputer = Imputer(strategy="mean", inputCols=numeric_cols, outputCols=imputed_cols)
imputer_model = imputer.fit(df_pickup)
df_pickup = imputer_model.transform(df_pickup)

# Round the imputed values to 5 decimal places (round is the Spark column function here).
for col_name in numeric_cols:
    imputed_name = col_name + "_imputed"
    df_pickup = df_pickup.withColumn(imputed_name, round(col(imputed_name), 5))

##### To Check the imputed data

In [13]:
# Numeric columns that went through imputation.
numeric_cols = ["lng", "lat","pickup_gps_lng", "pickup_gps_lat", "accept_gps_lng", "accept_gps_lat"]

# Show the original columns first, then their "_imputed" counterparts, for comparison.
comparison_cols = numeric_cols + [name + "_imputed" for name in numeric_cols]
df_pickup.select(*comparison_cols).show(5)

+---------+--------+--------------+--------------+--------------+--------------+-----------+-----------+----------------------+----------------------+----------------------+----------------------+
|      lng|     lat|pickup_gps_lng|pickup_gps_lat|accept_gps_lng|accept_gps_lat|lng_imputed|lat_imputed|pickup_gps_lng_imputed|pickup_gps_lat_imputed|accept_gps_lng_imputed|accept_gps_lat_imputed|
+---------+--------+--------------+--------------+--------------+--------------+-----------+-----------+----------------------+----------------------+----------------------+----------------------+
|120.05093|30.37981|     120.04952|       30.3713|     120.05035|      30.36885|  120.05093|   30.37981|             120.04952|               30.3713|             120.05035|              30.36885|
|120.05162|30.37836|      120.0515|      30.37853|     120.05515|      30.38345|  120.05162|   30.37836|              120.0515|              30.37853|             120.05515|              30.38345|
|120.05046|30.3

##### Drop original columns and rename imputed columns

In [14]:
# Swap each original column for its imputed version, keeping the original name.
for col_name in numeric_cols:
    imputed_name = col_name + "_imputed"
    df_pickup = df_pickup.drop(col_name)
    df_pickup = df_pickup.withColumnRenamed(imputed_name, col_name)

In [15]:
# Show the first five rows with every column.
df_pickup.show(5)

+--------+---------+---------+----------+-------------------+-------------------+-------------------+------+--------+-------------------+-------------------+-------------------+---+---------+--------+--------------+--------------+--------------+--------------+
|order_id|region_id|     city|courier_id|        accept_time|  time_window_start|    time_window_end|aoi_id|aoi_type|        pickup_time|    pickup_gps_time|    accept_gps_time| ds|      lng|     lat|pickup_gps_lng|pickup_gps_lat|accept_gps_lng|accept_gps_lat|
+--------+---------+---------+----------+-------------------+-------------------+-------------------+------+--------+-------------------+-------------------+-------------------+---+---------+--------+--------------+--------------+--------------+--------------+
|      31|       43| Shanghai|      8956|2025-09-12 14:28:00|2025-09-12 15:00:00|2025-09-12 17:00:00| 14530|       1|2025-09-12 14:37:00|2025-09-12 14:37:00|2025-09-12 14:28:00|912|121.54796|31.23265|      121.5488|  

#### Mode Imputation for Categorical Fields

In [16]:
# For categorical fields (city, region_id, aoi_type), replace missing values with
# the most frequent value (mode).
categorical_cols = ["city", "region_id", "aoi_type"]

for cat_col in categorical_cols:
    # Rank only non-null values so that null itself can never be picked as the fill value.
    mode_row = (
        df_pickup.filter(col(cat_col).isNotNull())
        .groupBy(cat_col)
        .count()
        .orderBy(col("count").desc())
        .first()
    )
    if mode_row is None:
        # No non-null value exists for this column, so there is nothing to fill with.
        continue
    # fillna returns a new DataFrame; without the assignment the fill is silently lost.
    df_pickup = df_pickup.fillna({cat_col: mode_row[0]})

In [17]:
# Show the first five rows of the three categorical columns.
df_pickup.select(["city", "region_id", "aoi_type"]).show(n=5)

+---------+---------+--------+
|     city|region_id|aoi_type|
+---------+---------+--------+
| Shanghai|       43|       1|
| Shanghai|        6|       1|
| Hangzhou|       92|       1|
| Shanghai|       54|       1|
|Chongqing|       60|       4|
+---------+---------+--------+
only showing top 5 rows



#### Forward Fill (Last Known Value) for Timestamps

##### Removing unnecessary columns i.e. pickup_gps_time and accept_gps_time

In [18]:
# In the CSV, pickup_gps_time is reported to match pickup_time and accept_gps_time
# to match accept_time, so both GPS time columns are dropped before the timestamps
# are filled.
redundant_time_cols = ["pickup_gps_time", "accept_gps_time"]
df_pickup = df_pickup.drop(*redundant_time_cols)

In [19]:
# Print the schema to check which columns remain after the drop.
df_pickup.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- region_id: integer (nullable = true)
 |-- city: string (nullable = true)
 |-- courier_id: integer (nullable = true)
 |-- accept_time: timestamp (nullable = true)
 |-- time_window_start: timestamp (nullable = true)
 |-- time_window_end: timestamp (nullable = true)
 |-- aoi_id: integer (nullable = true)
 |-- aoi_type: integer (nullable = true)
 |-- pickup_time: timestamp (nullable = true)
 |-- ds: integer (nullable = true)
 |-- lng: double (nullable = true)
 |-- lat: double (nullable = true)
 |-- pickup_gps_lng: double (nullable = true)
 |-- pickup_gps_lat: double (nullable = true)
 |-- accept_gps_lng: double (nullable = true)
 |-- accept_gps_lat: double (nullable = true)



##### Forward/Backward Filling

In [20]:
# Fill missing timestamps from neighbouring rows. A forward fill (last known value)
# leaves leading rows NULL because nothing precedes them, so each column is
# coalesced with a backward fill that takes the nearest following value.
from pyspark.sql import functions as F

# Both windows are global (no partitionBy).
# NOTE(review): Spark processes an unpartitioned window in a single partition;
# partitioning by courier_id would scale better but changes which value is used.
# Forward fill: all rows from the start of the ordering up to the current row.
window_forward = Window.orderBy("accept_time").rowsBetween(Window.unboundedPreceding, Window.currentRow)
# Backward fill: same frame over the reversed ordering, i.e. the current row and every later one.
window_backward = Window.orderBy(F.col("accept_time").desc()).rowsBetween(Window.unboundedPreceding, Window.currentRow)

time_cols = ["accept_time", "pickup_time", "time_window_start", "time_window_end"]

for time_col in time_cols:
    df_pickup = df_pickup.withColumn(time_col,
        F.coalesce(
            # Most recent non-null value at or before the current row.
            F.last(time_col, ignorenulls=True).over(window_forward),
            # In the reversed frame the nearest following value comes last; first()
            # would instead return the value from the very end of the data set.
            F.last(time_col, ignorenulls=True).over(window_backward)
        )
    )

In [21]:
# Spot-check the filled timestamps for a single order. order_id is an integer column,
# so it is compared with an integer instead of relying on an implicit string cast.
df_pickup.filter(col("order_id") == 19).select("pickup_time", "accept_time", "time_window_start", "time_window_end").show()

+-------------------+-------------------+-------------------+-------------------+
|        pickup_time|        accept_time|  time_window_start|    time_window_end|
+-------------------+-------------------+-------------------+-------------------+
|2025-08-20 10:09:00|2025-08-20 07:37:00|2025-08-20 09:00:00|2025-08-20 11:00:00|
+-------------------+-------------------+-------------------+-------------------+



In [22]:
# Count the remaining NULLs in each timestamp column (one output column per input column).
null_counts = [count(when(col(name).isNull(), 1)).alias(name) for name in time_cols]
df_pickup.select(*null_counts).show()

+-----------+-----------+-----------------+---------------+
|accept_time|pickup_time|time_window_start|time_window_end|
+-----------+-----------+-----------------+---------------+
|          0|          0|                0|              0|
+-----------+-----------+-----------------+---------------+



### Convert categorical fields (e.g., aoi_type, city, region_id) to a consistent format.

In [23]:
# Target column -> normalised expression: city is title-cased and trimmed,
# the two id-like columns are cast to int.
normalised_values = {
    "city": trim(initcap(col("city"))),
    "region_id": col("region_id").cast("int"),
    "aoi_type": col("aoi_type").cast("int"),
}

for target_col, normalised_value in normalised_values.items():
    df_pickup = df_pickup.withColumn(target_col, normalised_value)

In [24]:
# Show the first five rows of the normalised categorical columns.
df_pickup.select(["city", "region_id", "aoi_type"]).show(n=5)

+---------+---------+--------+
|     city|region_id|aoi_type|
+---------+---------+--------+
| Shanghai|       43|       1|
| Shanghai|        6|       1|
| Hangzhou|       92|       1|
| Shanghai|       54|       1|
|Chongqing|       60|       4|
+---------+---------+--------+
only showing top 5 rows



### Removing or fixing inconsistent geospatial coordinates (lng/lat).

In [25]:
lat_cols = ["lat", "pickup_gps_lat", "accept_gps_lat"]
lng_cols = ["lng", "pickup_gps_lng", "accept_gps_lng"]

# (columns, smallest valid value, largest valid value): latitudes must lie in
# [-90, 90] and longitudes in [-180, 180].
coordinate_ranges = [
    (lat_cols, -90, 90),
    (lng_cols, -180, 180),
]

for bounded_cols, min_valid, max_valid in coordinate_ranges:
    for col_name in bounded_cols:
        # when() without otherwise() yields NULL for values outside the range.
        within_range = col(col_name).between(min_valid, max_valid)
        df_pickup = df_pickup.withColumn(col_name, when(within_range, col(col_name)))


In [26]:
# Show the first five rows of the six coordinate columns.
df_pickup.select(["lng", "lat", "pickup_gps_lng", "pickup_gps_lat", "accept_gps_lng", "accept_gps_lat"]).show(n=5)

+---------+--------+--------------+--------------+--------------+--------------+
|      lng|     lat|pickup_gps_lng|pickup_gps_lat|accept_gps_lng|accept_gps_lat|
+---------+--------+--------------+--------------+--------------+--------------+
|120.05093|30.37981|     120.04952|       30.3713|     120.05035|      30.36885|
|120.05162|30.37836|      120.0515|      30.37853|     120.05515|      30.38345|
|120.05046|30.37844|     118.38133|      32.57753|     118.54136|      32.70376|
|120.05242|30.38059|     120.05268|      30.38056|      120.0743|      30.38605|
|120.05038|30.37835|     120.05027|      30.37838|     120.05585|      30.37965|
+---------+--------+--------------+--------------+--------------+--------------+
only showing top 5 rows



In [27]:
# Check whether any coordinate column contains NULLs.
# The list gets its own name so that `time_cols` (the timestamp column list) is not
# overwritten with coordinate column names.
coord_cols = ["lng", "lat","pickup_gps_lng", "pickup_gps_lat", "accept_gps_lng", "accept_gps_lat"]
df_pickup.select([count(when(col(c).isNull(), 1)).alias(c) for c in coord_cols]).show()

+---+---+--------------+--------------+--------------+--------------+
|lng|lat|pickup_gps_lng|pickup_gps_lat|accept_gps_lng|accept_gps_lat|
+---+---+--------------+--------------+--------------+--------------+
|  0|  0|             0|             0|             0|             0|
+---+---+--------------+--------------+--------------+--------------+



### Identify anomalies in timestamps (e.g., deliveries before pickups) and check for incorrect or negative time differences

In [28]:
# Flag rows where accept_time is later than pickup_time.
# NOTE: despite its name, "accept_before_pickup" is True when acceptance comes AFTER pickup.
accepted_after_pickup = col("accept_time") > col("pickup_time")
df_pickup = df_pickup.withColumn(
    "accept_before_pickup",
    when(accepted_after_pickup, True).otherwise(False),
)

# Seconds from acceptance to pickup; a negative value means pickup precedes acceptance.
seconds_to_pickup = unix_timestamp("pickup_time") - unix_timestamp("accept_time")
df_pickup = df_pickup.withColumn("time_difference", seconds_to_pickup)

# Flag the rows whose difference is negative.
has_negative_difference = col("time_difference") < 0
df_pickup = df_pickup.withColumn(
    "negative_time_difference",
    when(has_negative_difference, True).otherwise(False),
)

In [29]:
# A row is anomalous when either boolean flag is set.
is_anomalous = col("accept_before_pickup") | col("negative_time_difference")
anomalies = df_pickup.filter(is_anomalous)

# Display the two flags for the anomalous rows.
anomalies.select(["accept_before_pickup", "negative_time_difference"]).show()

+--------------------+------------------------+
|accept_before_pickup|negative_time_difference|
+--------------------+------------------------+
+--------------------+------------------------+



In [30]:
# Remove the helper columns that were only needed for the anomaly check.
anomaly_helper_cols = ['accept_before_pickup', 'time_difference', 'negative_time_difference']
df_pickup = df_pickup.drop(*anomaly_helper_cols)

### Creating derived feature: Pickup ETA = pickup_time - accept_time

In [31]:
# Pickup ETA: time from acceptance to pickup, converted from seconds to minutes.
pickup_delay_seconds = F.unix_timestamp("pickup_time") - F.unix_timestamp("accept_time")
pickup_delay_minutes = (pickup_delay_seconds / 60).cast("double")
df_pickup = df_pickup.withColumn("pickup_eta_minutes", pickup_delay_minutes)

In [32]:
# Show the first five derived ETA values.
df_pickup.select(col('pickup_eta_minutes')).show(n=5)

+------------------+
|pickup_eta_minutes|
+------------------+
|              17.0|
|              22.0|
|              30.0|
|              52.0|
|              17.0|
+------------------+
only showing top 5 rows



### Saving the cleaned data in a csv file

In [33]:
time_cols = ["accept_time", "pickup_time", "time_window_start", "time_window_end"]

# Render the timestamp columns as formatted strings first; per the original note,
# writing them to CSV directly produced the wrong date format.
timestamp_pattern = "yyyy-MM-dd HH:mm:ss"
for time_col in time_cols:
    formatted_time = date_format(time_col, timestamp_pattern)
    df_pickup = df_pickup.withColumn(time_col, formatted_time)

output_path = r'C:\Users\Dusty\Downloads\Internship\Last-Mile-Delivery-Delays-and-Route-Optimization\data\cleaned_pickup_data.csv'

# Collapse to one partition and write with a header row, replacing any previous output.
csv_writer = df_pickup.coalesce(1).write.mode("overwrite").option("header", True)
csv_writer.csv(output_path)
print("Cleaned Pickup data saved successfully")

Cleaned Pickup data saved successfully


In [35]:
# Report how many rows the cleaned frame contains.
row_count = df_pickup.count()
print("Number of rows in the DataFrame:", row_count)

Number of rows in the DataFrame: 6136147
