# Capstone Project: Customer Order Insights & Delivery Tracker


# Week 4 – Simple ETL Pipeline in Azure Databricks

Tools: Azure Databricks

Capstone Tasks:

    1. Load cleaned order data into Databricks
    2. Create a pipeline to update latest delivery status
    3. Save the results as Delta or CSV
    4. Optional: run a SQL query to show top 5 delayed customers

In [2]:
# Install PySpark
!pip install pyspark -q

In [3]:
# Import the Requirements
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_date, current_date, datediff

In [4]:
# Install dependencies (only needed in Google Colab)
# In Azure Databricks, skip this cell
!pip install pyspark==3.5.1 delta-spark==3.1.0
from pyspark.sql import SparkSession
from delta import configure_spark_with_delta_pip

Collecting delta-spark==3.1.0
  Downloading delta_spark-3.1.0-py3-none-any.whl.metadata (1.9 kB)
Downloading delta_spark-3.1.0-py3-none-any.whl (21 kB)
Installing collected packages: delta-spark
Successfully installed delta-spark-3.1.0


In [5]:
# using DELTA in colab
!pip install delta-spark==3.2.0 -q
import pyspark
from delta import *
from pyspark.sql.functions import *

# Create a SparkSession with Delta Lake extensions
# The '.config(...)' lines are crucial for enabling Delta Lake's features
builder = pyspark.sql.SparkSession.builder.appName("DeltaDemo") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")

# Get or create the SparkSession
spark = configure_spark_with_delta_pip(builder).getOrCreate()

print("Spark and Delta Lake are ready!")

Spark and Delta Lake are ready!


In [6]:
# Simple ETL Pipeline in Azure Databricks
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

In [7]:
# Upload your CSV files manually (orders.csv, suppliers.csv, inventory.csv)
from google.colab import files

print("Upload customers.csv")
customers_upload = files.upload()

print("Upload delivery_status.csv")
delivery_status_upload = files.upload()

print("Upload orders.csv")
orders_upload = files.upload()

Upload customers.csv


Saving customers.csv to customers.csv
Upload delivery_status.csv


Saving delivery_status.csv to delivery_status.csv
Upload orders.csv


Saving orders.csv to orders.csv


In [8]:
# 1.  Load cleaned order data into Databricks
print("Upload cleaned_orders.csv")
cleaned_orders_upload = files.upload()

Upload cleaned_orders.csv


Saving cleaned_orders.csv to cleaned_orders.csv


In [13]:
# Load CSV into Spark DataFrame
# Get uploaded file name (dictionary key)
csv_filename = list(cleaned_orders_upload.keys())[0]

# Load into Spark DataFrame
cleaned_orders_df = spark.read.csv(csv_filename, header=True, inferSchema=True)

print("Cleaned Orders Data:")
cleaned_orders_df.show()

Cleaned Orders Data:
+--------+-----------+----------+----------+-------------+----------+---------+--------------+-------------------+----------+-------+------------+------------------+----------------+
|order_id|customer_id|product_id|order_date|delivery_date|    status|status_id|current_status|         updated_at|delay_days|delayed|        name|      contact_info|         address|
+--------+-----------+----------+----------+-------------+----------+---------+--------------+-------------------+----------+-------+------------+------------------+----------------+
|       1|          1|       101|2025-07-01|   2025-07-04| Delivered|        1|     Delivered|2025-07-04 18:00:00|         0|      0|Anjali Mehta|anjali@example.com|   Mumbai, India|
|       2|          2|       102|2025-07-02|   2025-07-05| Delivered|        2|     Delivered|2025-07-05 17:30:00|         0|      0|Rohit Sharma| rohit@example.com|    Delhi, India|
|       3|          3|       103|2025-07-03|   2025-07-06|   Shi

In [18]:
# 2. Create a pipeline to update latest delivery status
# Upload CSV files
print("Upload cleaned_orders.csv")
uploaded_orders = files.upload()
orders_file = list(uploaded_orders.keys())[0]
cleaned_orders_df = spark.read.csv(orders_file, header=True, inferSchema=True)

print("Upload delivery_status.csv")
uploaded_status = files.upload()
status_file = list(uploaded_status.keys())[0]
delivery_status_df = spark.read.csv(status_file, header=True, inferSchema=True)

# Rename updated_at to avoid ambiguity
delivery_status_df = delivery_status_df.withColumnRenamed("updated_at", "status_update_time")

# Join and select only needed columns
orders_updated_df = (
    cleaned_orders_df.alias("orders")
    .join(delivery_status_df.alias("status"), on="order_id", how="left")
    .select(
        "orders.*",
        F.col("status.current_status").alias("latest_status"),
        "status.status_update_time"
    )
)

# Calculate delay_days
orders_updated_df = orders_updated_df.withColumn(
    "delay_days",
    F.when(
        (F.col("delivery_date").isNotNull()) &
        (F.col("status_update_time").isNotNull()),
        F.datediff(F.to_date(F.col("status_update_time")), F.to_date(F.col("delivery_date")))
    ).otherwise(None)
)

# Show cleaned result
print("Updated Orders Data:")
orders_updated_df.show()

Upload cleaned_orders.csv


Saving cleaned_orders.csv to cleaned_orders (3).csv
Upload delivery_status.csv


Saving delivery_status.csv to delivery_status (3).csv
Updated Orders Data:
+--------+-----------+----------+----------+-------------+----------+---------+--------------+-------------------+----------+-------+------------+------------------+----------------+-------------+-------------------+
|order_id|customer_id|product_id|order_date|delivery_date|    status|status_id|current_status|         updated_at|delay_days|delayed|        name|      contact_info|         address|latest_status| status_update_time|
+--------+-----------+----------+----------+-------------+----------+---------+--------------+-------------------+----------+-------+------------+------------------+----------------+-------------+-------------------+
|       1|          1|       101|2025-07-01|   2025-07-04| Delivered|        1|     Delivered|2025-07-04 18:00:00|         0|      0|Anjali Mehta|anjali@example.com|   Mumbai, India|    Delivered|2025-07-04 18:00:00|
|       2|          2|       102|2025-07-02|   2025-07-05

In [19]:
# 3. Save the results as Delta or CSV
# Save as Delta
orders_updated_df.write.format("delta").mode("overwrite").save("/content/orders_with_status_delta")

# Save as CSV
orders_updated_df.write.mode("overwrite").option("header", "true").csv("/content/orders_with_status_csv")

In [20]:
# 4. Optional: run a SQL query to show top 5 delayed customers
orders_updated_df.createOrReplaceTempView("orders_status_view")
top_5_delayed = spark.sql("""
SELECT customer_id,
       SUM(CASE WHEN delay_days > 0 THEN 1 ELSE 0 END) AS delayed_orders
FROM orders_status_view
GROUP BY customer_id
ORDER BY delayed_orders DESC
LIMIT 5
""")

print("Top 5 Delayed Customers:")
top_5_delayed.show()

Top 5 Delayed Customers:
+-----------+--------------+
|customer_id|delayed_orders|
+-----------+--------------+
|          1|             0|
|          6|             0|
|          3|             0|
|          5|             0|
|          9|             0|
+-----------+--------------+



# Deliverables:

    1. Databricks notebook with ETL steps
    2. Output stored in Delta/CSV

In [21]:
# Save as single CSV
single_csv_path = "/content/orders_with_status_single_csv"
orders_updated_df.coalesce(1).write.mode("overwrite").option("header", "true").csv(single_csv_path)

# Find the part file Spark created
import glob
part_file = glob.glob(f"{single_csv_path}/*.csv")[0]

# Rename to a clean file name
final_csv_path = "/content/orders_with_status.csv"
import shutil
shutil.move(part_file, final_csv_path)

# Download the file in Colab
from google.colab import files
files.download(final_csv_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>