# Capstone Project: Customer Order Insights & Delivery Tracker

# Week 3 – Order Analysis with PySpark

Tools: PySpark

In [26]:
# Import required PySpark modules
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_date, split, count, when
from pyspark.sql.types import TimestampType

In [27]:
# Create Spark session
spark = SparkSession.builder.appName("week-3-Order-Delay-Analysis").getOrCreate()
spark

Capstone Tasks:

        1.  Load order and delivery data into PySpark
        2. Join orders and customer tables
        3.  Group by region to count delays
        4.  Save results to a file

In [28]:
# 1. Load order data from CSV using PySpark.

# Upload CSV files from local to Colab
from google.colab import files
uploaded = files.upload()

Saving customers.csv to customers (1).csv
Saving delivery_status.csv to delivery_status (1).csv
Saving orders.csv to orders (1).csv


In [29]:
# Load CSV files into DataFrames
orders_df = spark.read.csv("orders.csv", header=True, inferSchema=True)
delivery_df = spark.read.csv("delivery_status.csv", header=True, inferSchema=True)
customers_df = spark.read.csv("customers.csv", header=True, inferSchema=True)

In [30]:
# Display Orders Schema
orders_df.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- customer_id: integer (nullable = true)
 |-- product_id: integer (nullable = true)
 |-- order_date: date (nullable = true)
 |-- delivery_date: date (nullable = true)
 |-- status: string (nullable = true)



In [31]:
# Display Delivery_Status Schema
delivery_df.printSchema()

root
 |-- status_id: integer (nullable = true)
 |-- order_id: integer (nullable = true)
 |-- current_status: string (nullable = true)
 |-- updated_at: timestamp (nullable = true)



In [33]:
# Display Customers Schema
customers_df.printSchema()

root
 |-- customer_id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- contact_info: string (nullable = true)
 |-- address: string (nullable = true)



In [34]:
# Clean and convert types
orders_df = orders_df.withColumn("order_date", to_date("order_date")) \
                     .withColumn("delivery_date", to_date("delivery_date"))

delivery_df = delivery_df.withColumn("updated_at", col("updated_at").cast(TimestampType()))

orders_df.select("order_id", "order_date", "delivery_date").show()
delivery_df.select("order_id", "updated_at").show()

+--------+----------+-------------+
|order_id|order_date|delivery_date|
+--------+----------+-------------+
|       1|2025-07-01|   2025-07-04|
|       2|2025-07-02|   2025-07-05|
|       3|2025-07-03|   2025-07-06|
|       4|2025-07-04|   2025-07-07|
|       5|2025-07-05|   2025-07-09|
|       6|2025-07-06|   2025-07-10|
|       7|2025-07-07|   2025-07-11|
|       8|2025-07-08|   2025-07-12|
|       9|2025-07-09|   2025-07-13|
|      10|2025-07-10|   2025-07-14|
+--------+----------+-------------+

+--------+-------------------+
|order_id|         updated_at|
+--------+-------------------+
|       1|2025-07-04 18:00:00|
|       2|2025-07-05 17:30:00|
|       3|2025-07-06 12:45:00|
|       4|2025-07-07 10:15:00|
|       5|2025-07-08 09:00:00|
|       6|2025-07-10 19:10:00|
|       7|2025-07-11 13:20:00|
|       8|2025-07-12 20:30:00|
|       9|2025-07-13 11:50:00|
|      10|2025-07-14 16:40:00|
+--------+-------------------+



In [35]:
# 2. Join orders + delivery_status + customers tables
joined_df = orders_df.join(delivery_df, orders_df["order_id"] == delivery_df["order_id"]) \
                     .join(customers_df, orders_df["customer_id"] == customers_df["customer_id"])

joined_df.show()
joined_df.printSchema()

+--------+-----------+----------+----------+-------------+----------+---------+--------+--------------+-------------------+-----------+------------+------------------+----------------+
|order_id|customer_id|product_id|order_date|delivery_date|    status|status_id|order_id|current_status|         updated_at|customer_id|        name|      contact_info|         address|
+--------+-----------+----------+----------+-------------+----------+---------+--------+--------------+-------------------+-----------+------------+------------------+----------------+
|       1|          1|       101|2025-07-01|   2025-07-04| Delivered|        1|       1|     Delivered|2025-07-04 18:00:00|          1|Anjali Mehta|anjali@example.com|   Mumbai, India|
|       2|          2|       102|2025-07-02|   2025-07-05| Delivered|        2|       2|     Delivered|2025-07-05 17:30:00|          2|Rohit Sharma| rohit@example.com|    Delhi, India|
|       3|          3|       103|2025-07-03|   2025-07-06|   Shipped|      

In [41]:
# Add delay column (based on delivery_date vs updated_at)
# Rename order_id in delivery_df before joining
delivery_df = delivery_df.withColumnRenamed("order_id", "delivery_order_id")

# Join safely
joined_df = orders_df.join(delivery_df, orders_df.order_id == delivery_df.delivery_order_id, "inner") \
                     .join(customers_df, "customer_id")

# Add the delay column
joined_df = joined_df.withColumn(
    "is_delayed",
    when(col("updated_at").cast("date") > col("delivery_date"), 1).otherwise(0)
)

# Select with renamed columns
joined_df.select("order_id", "delivery_date", "updated_at", "is_delayed").show()


+--------+-------------+-------------------+----------+
|order_id|delivery_date|         updated_at|is_delayed|
+--------+-------------+-------------------+----------+
|       1|   2025-07-04|2025-07-04 18:00:00|         0|
|       2|   2025-07-05|2025-07-05 17:30:00|         0|
|       3|   2025-07-06|2025-07-06 12:45:00|         0|
|       4|   2025-07-07|2025-07-07 10:15:00|         0|
|       5|   2025-07-09|2025-07-08 09:00:00|         0|
|       6|   2025-07-10|2025-07-10 19:10:00|         0|
|       7|   2025-07-11|2025-07-11 13:20:00|         0|
|       8|   2025-07-12|2025-07-12 20:30:00|         0|
|       9|   2025-07-13|2025-07-13 11:50:00|         0|
|      10|   2025-07-14|2025-07-14 16:40:00|         0|
+--------+-------------+-------------------+----------+



In [42]:
#  Extract region (e.g., city) from address
joined_df = joined_df.withColumn("region", split(col("address"), ",").getItem(0))

joined_df.select("customer_id", "address", "region").show()

+-----------+----------------+---------+
|customer_id|         address|   region|
+-----------+----------------+---------+
|          1|   Mumbai, India|   Mumbai|
|          2|    Delhi, India|    Delhi|
|          3|Hyderabad, India|Hyderabad|
|          4|Ahmedabad, India|Ahmedabad|
|          5|     Pune, India|     Pune|
|          6|  Chennai, India|  Chennai|
|          7|Bangalore, India|Bangalore|
|          8|  Kolkata, India|  Kolkata|
|          9|    Kochi, India|    Kochi|
|         10|   Jaipur, India|   Jaipur|
+-----------+----------------+---------+



In [43]:
#  3. Group by region to count delayed orders
delay_by_region = joined_df.groupBy("region").agg(count(when(col("is_delayed") == 1, True)).alias("delayed_orders"))

# Show result
delay_by_region.show()

+---------+--------------+
|   region|delayed_orders|
+---------+--------------+
|Bangalore|             0|
|    Kochi|             0|
|  Chennai|             0|
|   Mumbai|             0|
|Ahmedabad|             0|
|  Kolkata|             0|
|     Pune|             0|
|    Delhi|             0|
|Hyderabad|             0|
|   Jaipur|             0|
+---------+--------------+



In [45]:
# 4. Save to CSV
delay_by_region.write.mode("overwrite").csv("output/delayed_orders_by_region", header=True)

Deliverables:

      1. PySpark script with join, group, and aggregation
      2. Output file showing delayed orders by region

In [46]:
# Merge parts into one file
!cat output/delayed_orders_by_region/part-*.csv > delayed_orders_by_region.csv

# 2. Download if using Google Colab(Output file showing delayed orders by region)
from google.colab import files
files.download("delayed_orders_by_region.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>