In [1]:
# from pyspark.sql import SparkSession
# spark = (
#     SparkSession.builder.appName("DeltaLakeNCRDemo")
#     .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
#     .getOrCreate()
# )
# spark
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("DeltaLakeDemo")
    .config("spark.jars.packages", "io.delta:delta-spark_2.12:3.2.0")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .getOrCreate()
)


In [2]:
# Load raw CSV
df = spark.read.csv("/opt/data/ncr_ride_bookings.csv", header=True, inferSchema=True)

# Clean column names for Delta compatibility
df_clean = df.toDF(*[
    c.strip()
     .replace(" ", "_")      # spaces → underscore
     .replace("-", "_")      # dashes → underscore
     .replace(".", "_")      # dots → underscore
     .replace("/", "_")      # slashes → underscore
     .lower()                # lowercase for consistency
    for c in df.columns
])

df_clean.printSchema()


root
 |-- date: date (nullable = true)
 |-- time: timestamp (nullable = true)
 |-- booking_id: string (nullable = true)
 |-- booking_status: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- vehicle_type: string (nullable = true)
 |-- pickup_location: string (nullable = true)
 |-- drop_location: string (nullable = true)
 |-- avg_vtat: string (nullable = true)
 |-- avg_ctat: string (nullable = true)
 |-- cancelled_rides_by_customer: string (nullable = true)
 |-- reason_for_cancelling_by_customer: string (nullable = true)
 |-- cancelled_rides_by_driver: string (nullable = true)
 |-- driver_cancellation_reason: string (nullable = true)
 |-- incomplete_rides: string (nullable = true)
 |-- incomplete_rides_reason: string (nullable = true)
 |-- booking_value: string (nullable = true)
 |-- ride_distance: string (nullable = true)
 |-- driver_ratings: string (nullable = true)
 |-- customer_rating: string (nullable = true)
 |-- payment_method: string (nullable = true)



In [3]:
from pyspark.sql.functions import col

# sanitize column names
delta_path = "/opt/output/delta/ncr_rides"

df_clean.write.format("delta").mode("overwrite").save(delta_path)

rides = spark.read.format("delta").load(delta_path)
rides.show(5)


+----------+-------------------+----------------+-------------------+----------------+-------------+---------------+-------------------+--------+--------+---------------------------+---------------------------------+-------------------------+--------------------------+----------------+-----------------------+-------------+-------------+--------------+---------------+--------------+
|      date|               time|      booking_id|     booking_status|     customer_id| vehicle_type|pickup_location|      drop_location|avg_vtat|avg_ctat|cancelled_rides_by_customer|reason_for_cancelling_by_customer|cancelled_rides_by_driver|driver_cancellation_reason|incomplete_rides|incomplete_rides_reason|booking_value|ride_distance|driver_ratings|customer_rating|payment_method|
+----------+-------------------+----------------+-------------------+----------------+-------------+---------------+-------------------+--------+--------+---------------------------+---------------------------------+--------------

In [4]:
# Total rides by status
rides.groupBy("booking_status").count().show()

# Average rating by vehicle type
rides.groupBy("vehicle_type").agg({"driver_ratings": "avg"}).show()


+--------------------+-----+
|      booking_status|count|
+--------------------+-----+
|           Completed|93000|
|     No Driver Found|10500|
| Cancelled by Driver|27000|
|Cancelled by Cust...|10500|
|          Incomplete| 9000|
+--------------------+-----+

+-------------+-------------------+
| vehicle_type|avg(driver_ratings)|
+-------------+-------------------+
|         Bike|  4.230055579307396|
|     Go Sedan|  4.231812185176316|
|      Go Mini|  4.227694215321595|
|      Uber XL|  4.238339920948616|
|         Auto|   4.23236881882964|
|Premier Sedan|  4.234864912904378|
|        eBike| 4.2256144100137325|
+-------------+-------------------+



In [5]:
from delta.tables import DeltaTable

delta_rides = DeltaTable.forPath(spark, delta_path)

delta_rides.update(
    condition=col("booking_status") == "Cancelled",
    set={"booking_status": col("booking_status") + "_LostRevenue"}
)


In [6]:
delta_rides.delete(condition=col("ride_distance").isNull())


In [8]:
from datetime import date, datetime
from pyspark.sql.functions import col

delta_path = "/opt/output/delta/ncr_rides"

# Overwrite existing data
df_clean.write.format("delta").mode("overwrite").save(delta_path)

# Append new rows
new_rows = spark.createDataFrame([
    (
        date(2025, 9, 29),                         # ✅ date
        datetime(2025, 9, 29, 14, 45, 0),          # ✅ timestamp
        "201", "Completed", "C010", "Bike", "LocX", "LocY",
        "3", "6", "0", None, "0", None, "0", None,
        "150", "5", "4.9", "4.5", "Cash"
    )
], schema=df_clean.schema)

new_rows.write.format("delta").mode("append").save(delta_path)


In [9]:
rides = spark.read.format("delta").load(delta_path)
rides.show(5)

# Example filter
rides.filter(col("booking_status") == "Completed").show()


+----------+-------------------+----------------+--------------------+----------------+------------+---------------+-------------+--------+--------+---------------------------+---------------------------------+-------------------------+--------------------------+----------------+-----------------------+-------------+-------------+--------------+---------------+--------------+
|      date|               time|      booking_id|      booking_status|     customer_id|vehicle_type|pickup_location|drop_location|avg_vtat|avg_ctat|cancelled_rides_by_customer|reason_for_cancelling_by_customer|cancelled_rides_by_driver|driver_cancellation_reason|incomplete_rides|incomplete_rides_reason|booking_value|ride_distance|driver_ratings|customer_rating|payment_method|
+----------+-------------------+----------------+--------------------+----------------+------------+---------------+-------------+--------+--------+---------------------------+---------------------------------+-------------------------+------

In [10]:
from delta.tables import DeltaTable

rides_delta = DeltaTable.forPath(spark, delta_path)

# Example: update rating of customer C010
rides_delta.update(
    condition=col("customer_id") == "C010",
    set={"customer_rating": "5.0"}
)


In [11]:
updates = spark.createDataFrame([
    (
        date(2025, 9, 29),                         # ✅ date
        datetime(2025, 9, 29, 16, 0, 0),           # ✅ timestamp
        "201", "Cancelled", "C010", "Bike", "LocX", "LocY",
        "3", "6", "1", "Changed mind", "0", None, "0", None,
        "150", "5", "4.9", "4.5", "Cash"
    )
], schema=rides.schema)

rides_delta.alias("tgt").merge(
    updates.alias("src"),
    "tgt.booking_id = src.booking_id"
).whenMatchedUpdateAll().whenNotMatchedInsertAll().execute()


In [12]:
from pyspark.sql.functions import lit

# Add metadata
rides_scd2 = rides.withColumn("is_current", lit(True)) \
                  .withColumn("start_date", lit(date.today())) \
                  .withColumn("end_date", lit(None).cast("date"))

rides_scd2.write.format("delta").mode("overwrite").save("/opt/output/delta/ncr_rides_scd2")

rides_scd2_delta = DeltaTable.forPath(spark, "/opt/output/delta/ncr_rides_scd2")

# New version of booking 201 (changed status)
new_version = spark.createDataFrame([
    (
        date(2025, 9, 29),
        datetime(2025, 9, 29, 18, 0, 0),
        "201", "Completed", "C010", "Bike", "LocX", "LocY",
        "3", "6", "0", None, "0", None, "0", None,
        "150", "5", "4.9", "4.9", "Cash",
        True, date(2025, 9, 29), None
    )
], schema=rides_scd2.schema)

# Expire old record + insert new one
rides_scd2_delta.alias("tgt").merge(
    new_version.alias("src"),
    "tgt.booking_id = src.booking_id AND tgt.is_current = True"
).whenMatchedUpdate(set={"is_current": "false", "end_date": "src.start_date"}) \
 .whenNotMatchedInsertAll() \
 .execute()


In [13]:
# Show history
rides_delta.history().show()

# Query an old version
rides_old = spark.read.format("delta").option("versionAsOf", 0).load(delta_path)
rides_old.show()


+-------+--------------------+------+--------+---------+--------------------+----+--------+---------+-----------+--------------+-------------+--------------------+------------+--------------------+
|version|           timestamp|userId|userName|operation| operationParameters| job|notebook|clusterId|readVersion|isolationLevel|isBlindAppend|    operationMetrics|userMetadata|          engineInfo|
+-------+--------------------+------+--------+---------+--------------------+----+--------+---------+-----------+--------------+-------------+--------------------+------------+--------------------+
|      4|2025-09-29 11:29:...|  NULL|    NULL|    MERGE|{predicate -> ["(...|NULL|    NULL|     NULL|          3|  Serializable|        false|{numTargetRowsCop...|        NULL|Apache-Spark/3.5....|
|      3|2025-09-29 11:29:...|  NULL|    NULL|   UPDATE|{predicate -> ["(...|NULL|    NULL|     NULL|          2|  Serializable|        false|{numRemovedFiles ...|        NULL|Apache-Spark/3.5....|
|      2|2