In [0]:

from pyspark.sql.functions import col,current_date,to_date, lit, current_timestamp, sum as _sum
from delta.tables import DeltaTable

In [0]:
import datetime

# Today's date, taken from the local clock of the Python process and rendered
# as YYYYMMDD; it selects which daily bookings file is loaded.
current_date_str = f"{datetime.datetime.now():%Y%m%d}"

# Optional Databricks widget override for the arrival date (currently disabled):
# dbutils.widgets.text("arrival_date", current_date_str)
# date_of_arrival = dbutils.widgets.get("arrival_date")

# Full path of that day's bookings JSON file inside the zoom_car volume.
booking_data = (
    "/Volumes/workspace/zoom_car/zoom_car_volume/"
    f"zoom_car_bookings_{current_date_str}.json"
)
print(booking_data)

/Volumes/workspace/zoom_car/zoom_car_volume/zoom_car_bookings_20250717.json


In [0]:
# Load the day's bookings file as JSON.
# - multiLine: a single JSON document/record may span several lines.
# - Column types are inferred by the JSON reader itself (see printSchema below).
# The CSV-only options previously set here (header / inferschema / quote) are
# not JSON reader options, so they had no effect and were removed.
booking_df = spark.read.option("multiLine", "true").json(booking_data)

# Print the inferred bookings schema, then show the loaded rows.
booking_df.printSchema()
display(booking_df)

root
 |-- booking_date: string (nullable = true)
 |-- booking_id: string (nullable = true)
 |-- car_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- end_time: string (nullable = true)
 |-- start_time: string (nullable = true)
 |-- status: string (nullable = true)
 |-- total_amount: double (nullable = true)



booking_date,booking_id,car_id,customer_id,end_time,start_time,status,total_amount
2025-07-11,B001,CAR125,C007,2025-07-11T23:09:04Z,2025-07-11T13:09:04Z,cancelled,255.02
2025-07-04,B002,CAR522,C003,2025-07-04T20:32:27Z,2025-07-04T09:32:27Z,cancelled,415.2
2025-07-10,B003,CAR386,C001,2025-07-10T18:35:18Z,2025-07-10T07:35:18Z,completed,286.15
2025-06-26,B004,CAR398,C003,2025-06-26T02:39:20Z,2025-06-26T00:39:20Z,pending,457.77
2025-07-10,B005,CAR146,C007,2025-07-11T03:09:23Z,2025-07-10T18:09:23Z,cancelled,375.99
2025-06-21,B006,CAR943,C004,2025-06-21T14:58:16Z,2025-06-21T06:58:16Z,pending,158.14
2025-07-12,B007,CAR271,C009,2025-07-13T04:00:38Z,2025-07-12T18:00:38Z,cancelled,411.82
2025-07-03,B008,CAR749,C004,2025-07-04T00:33:28Z,2025-07-03T13:33:28Z,cancelled,331.3
2025-06-30,B009,CAR764,C007,2025-06-30T19:50:13Z,2025-06-30T17:50:13Z,completed,224.73
2025-06-17,B010,CAR670,C009,2025-06-18T00:12:19Z,2025-06-17T20:12:19Z,cancelled,246.06


In [0]:
from pyspark.sql.functions import col, to_timestamp

# 2: Remove rows with null values in critical fields.
critical_fields = ["booking_id", "customer_id", "car_id", "booking_date"]
df_cleaned = booking_df.dropna(subset=critical_fields)

# 3: Validate date formats.
# The string columns are parsed into timestamps: booking_date with the explicit
# "yyyy-MM-dd" pattern, start_time / end_time with Spark's default parsing.
# NOTE(review): with ANSI mode off, a value that does not match is expected to
# parse to null rather than raise - confirm the cluster's ANSI setting.
df_validated = (
    df_cleaned
    .withColumn("booking_date", to_timestamp("booking_date", "yyyy-MM-dd"))
    .withColumn("start_time", to_timestamp("start_time"))
    .withColumn("end_time", to_timestamp("end_time"))
    # booking_date is a critical field: the null check in step 2 ran on the raw
    # string, so re-check after parsing to reject rows whose date did not parse.
    .filter(col("booking_date").isNotNull())
)

# 4: Validate booking status - keep only rows whose status is one of the known values.
valid_statuses = ["completed", "cancelled", "pending"]
df_filtered = df_validated.filter(col("status").isin(valid_statuses))

# # 5: Load into staging delta table
# df_filtered.write.format("delta").mode("overwrite").saveAsTable("workspace.zoom_car.staging_bookings")

# # Display for verification
# display(df_filtered)

booking_date,booking_id,car_id,customer_id,end_time,start_time,status,total_amount
2025-07-11T00:00:00.000Z,B001,CAR125,C007,2025-07-11T23:09:04.000Z,2025-07-11T13:09:04.000Z,cancelled,255.02
2025-07-04T00:00:00.000Z,B002,CAR522,C003,2025-07-04T20:32:27.000Z,2025-07-04T09:32:27.000Z,cancelled,415.2
2025-07-10T00:00:00.000Z,B003,CAR386,C001,2025-07-10T18:35:18.000Z,2025-07-10T07:35:18.000Z,completed,286.15
2025-06-26T00:00:00.000Z,B004,CAR398,C003,2025-06-26T02:39:20.000Z,2025-06-26T00:39:20.000Z,pending,457.77
2025-07-10T00:00:00.000Z,B005,CAR146,C007,2025-07-11T03:09:23.000Z,2025-07-10T18:09:23.000Z,cancelled,375.99
2025-06-21T00:00:00.000Z,B006,CAR943,C004,2025-06-21T14:58:16.000Z,2025-06-21T06:58:16.000Z,pending,158.14
2025-07-12T00:00:00.000Z,B007,CAR271,C009,2025-07-13T04:00:38.000Z,2025-07-12T18:00:38.000Z,cancelled,411.82
2025-07-03T00:00:00.000Z,B008,CAR749,C004,2025-07-04T00:33:28.000Z,2025-07-03T13:33:28.000Z,cancelled,331.3
2025-06-30T00:00:00.000Z,B009,CAR764,C007,2025-06-30T19:50:13.000Z,2025-06-30T17:50:13.000Z,completed,224.73
2025-06-17T00:00:00.000Z,B010,CAR670,C009,2025-06-18T00:12:19.000Z,2025-06-17T20:12:19.000Z,cancelled,246.06


In [0]:
from pyspark.sql.functions import col, to_timestamp, to_date, date_format, hour, minute, second

# Step 3: Make sure both booking boundaries are timestamp columns.
df_parsed = df_filtered
for ts_column in ("start_time", "end_time"):
    df_parsed = df_parsed.withColumn(ts_column, to_timestamp(ts_column))

# Step 4: Split each boundary into a calendar date and an HH:mm:ss clock string.
df_transformed = df_parsed
for date_column, clock_column, source_column in (
    ("start_date", "start_clock", "start_time"),
    ("end_date", "end_clock", "end_time"),
):
    df_transformed = (
        df_transformed
        .withColumn(date_column, to_date(source_column))
        .withColumn(clock_column, date_format(source_column, "HH:mm:ss"))
    )

# Step 5: Booking duration in hours - both timestamps are cast to long,
# subtracted, and the difference is divided by 3600.
elapsed_seconds = col("end_time").cast("long") - col("start_time").cast("long")
df_with_duration = df_transformed.withColumn("duration_hours", elapsed_seconds / 3600)

# Preview the derived date / clock / duration columns per booking.
preview_columns = [
    "booking_id",
    "start_date",
    "start_clock",
    "end_date",
    "end_clock",
    "duration_hours",
]
display(df_with_duration.select(*preview_columns))

# final_staging_bookings delta table with separately time and date
# df_with_duration.write.format("delta").mode("overwrite").saveAsTable("workspace.zoom_car.final_staging_bookings")

booking_id,start_date,start_clock,end_date,end_clock,duration_hours
B001,2025-07-11,13:09:04,2025-07-11,23:09:04,10.0
B002,2025-07-04,09:32:27,2025-07-04,20:32:27,11.0
B003,2025-07-10,07:35:18,2025-07-10,18:35:18,11.0
B004,2025-06-26,00:39:20,2025-06-26,02:39:20,2.0
B005,2025-07-10,18:09:23,2025-07-11,03:09:23,9.0
B006,2025-06-21,06:58:16,2025-06-21,14:58:16,8.0
B007,2025-07-12,18:00:38,2025-07-13,04:00:38,10.0
B008,2025-07-03,13:33:28,2025-07-04,00:33:28,11.0
B009,2025-06-30,17:50:13,2025-06-30,19:50:13,2.0
B010,2025-06-17,20:12:19,2025-06-18,00:12:19,4.0


In [0]:
# Audit columns: the date the batch is loaded and the timestamp of the load.
# NOTE(review): this starts from df_transformed, so the duration_hours column
# computed on df_with_duration is not part of df_final - confirm that is intended.
df_with_arrival = df_transformed.withColumn("arrival_date", current_date())
df_final = df_with_arrival.withColumn("load_timestamp", current_timestamp())

display(df_final)


booking_date,booking_id,car_id,customer_id,end_time,start_time,status,total_amount,start_date,start_clock,end_date,end_clock,arrival_date,load_timestamp
2025-07-11T00:00:00.000Z,B001,CAR125,C007,2025-07-11T23:09:04.000Z,2025-07-11T13:09:04.000Z,cancelled,255.02,2025-07-11,13:09:04,2025-07-11,23:09:04,2025-07-17,2025-07-17T16:51:07.489Z
2025-07-04T00:00:00.000Z,B002,CAR522,C003,2025-07-04T20:32:27.000Z,2025-07-04T09:32:27.000Z,cancelled,415.2,2025-07-04,09:32:27,2025-07-04,20:32:27,2025-07-17,2025-07-17T16:51:07.489Z
2025-07-10T00:00:00.000Z,B003,CAR386,C001,2025-07-10T18:35:18.000Z,2025-07-10T07:35:18.000Z,completed,286.15,2025-07-10,07:35:18,2025-07-10,18:35:18,2025-07-17,2025-07-17T16:51:07.489Z
2025-06-26T00:00:00.000Z,B004,CAR398,C003,2025-06-26T02:39:20.000Z,2025-06-26T00:39:20.000Z,pending,457.77,2025-06-26,00:39:20,2025-06-26,02:39:20,2025-07-17,2025-07-17T16:51:07.489Z
2025-07-10T00:00:00.000Z,B005,CAR146,C007,2025-07-11T03:09:23.000Z,2025-07-10T18:09:23.000Z,cancelled,375.99,2025-07-10,18:09:23,2025-07-11,03:09:23,2025-07-17,2025-07-17T16:51:07.489Z
2025-06-21T00:00:00.000Z,B006,CAR943,C004,2025-06-21T14:58:16.000Z,2025-06-21T06:58:16.000Z,pending,158.14,2025-06-21,06:58:16,2025-06-21,14:58:16,2025-07-17,2025-07-17T16:51:07.489Z
2025-07-12T00:00:00.000Z,B007,CAR271,C009,2025-07-13T04:00:38.000Z,2025-07-12T18:00:38.000Z,cancelled,411.82,2025-07-12,18:00:38,2025-07-13,04:00:38,2025-07-17,2025-07-17T16:51:07.489Z
2025-07-03T00:00:00.000Z,B008,CAR749,C004,2025-07-04T00:33:28.000Z,2025-07-03T13:33:28.000Z,cancelled,331.3,2025-07-03,13:33:28,2025-07-04,00:33:28,2025-07-17,2025-07-17T16:51:07.489Z
2025-06-30T00:00:00.000Z,B009,CAR764,C007,2025-06-30T19:50:13.000Z,2025-06-30T17:50:13.000Z,completed,224.73,2025-06-30,17:50:13,2025-06-30,19:50:13,2025-07-17,2025-07-17T16:51:07.489Z
2025-06-17T00:00:00.000Z,B010,CAR670,C009,2025-06-18T00:12:19.000Z,2025-06-17T20:12:19.000Z,cancelled,246.06,2025-06-17,20:12:19,2025-06-18,00:12:19,2025-07-17,2025-07-17T16:51:07.489Z


In [0]:
# Target Delta table (catalog.schema.table) for the staged bookings.
staging_booking_path = "workspace.zoom_car.staging_bookings_delta"

# Resolve today's date once through Spark. Despite the name (kept for
# compatibility with other cells), this is the date value returned by
# current_date(), not a formatted string.
current_date_str = spark.sql("SELECT current_date()").collect()[0][0]

# df_final.arrival_date is a lazy current_date() that is evaluated when the
# write runs, i.e. in a different query than the one above. If the two land on
# different sides of midnight, the written rows would not match the
# replaceWhere predicate. Pin the column to the resolved date so the data and
# the predicate always agree.
df_to_write = df_final.withColumn("arrival_date", lit(current_date_str))

# Overwrite only the rows of today's arrival_date, which makes re-runs on the
# same day replace that day's rows instead of the whole table.
df_to_write.write.format("delta") \
    .mode("overwrite") \
    .option("replaceWhere", f"arrival_date = '{current_date_str}'") \
    .saveAsTable(staging_booking_path)

display(df_final)

booking_date,booking_id,car_id,customer_id,end_time,start_time,status,total_amount,start_date,start_clock,end_date,end_clock,arrival_date,load_timestamp
2025-07-11T00:00:00.000Z,B001,CAR125,C007,2025-07-11T23:09:04.000Z,2025-07-11T13:09:04.000Z,cancelled,255.02,2025-07-11,13:09:04,2025-07-11,23:09:04,2025-07-17,2025-07-17T17:51:33.443Z
2025-07-04T00:00:00.000Z,B002,CAR522,C003,2025-07-04T20:32:27.000Z,2025-07-04T09:32:27.000Z,cancelled,415.2,2025-07-04,09:32:27,2025-07-04,20:32:27,2025-07-17,2025-07-17T17:51:33.443Z
2025-07-10T00:00:00.000Z,B003,CAR386,C001,2025-07-10T18:35:18.000Z,2025-07-10T07:35:18.000Z,completed,286.15,2025-07-10,07:35:18,2025-07-10,18:35:18,2025-07-17,2025-07-17T17:51:33.443Z
2025-06-26T00:00:00.000Z,B004,CAR398,C003,2025-06-26T02:39:20.000Z,2025-06-26T00:39:20.000Z,pending,457.77,2025-06-26,00:39:20,2025-06-26,02:39:20,2025-07-17,2025-07-17T17:51:33.443Z
2025-07-10T00:00:00.000Z,B005,CAR146,C007,2025-07-11T03:09:23.000Z,2025-07-10T18:09:23.000Z,cancelled,375.99,2025-07-10,18:09:23,2025-07-11,03:09:23,2025-07-17,2025-07-17T17:51:33.443Z
2025-06-21T00:00:00.000Z,B006,CAR943,C004,2025-06-21T14:58:16.000Z,2025-06-21T06:58:16.000Z,pending,158.14,2025-06-21,06:58:16,2025-06-21,14:58:16,2025-07-17,2025-07-17T17:51:33.443Z
2025-07-12T00:00:00.000Z,B007,CAR271,C009,2025-07-13T04:00:38.000Z,2025-07-12T18:00:38.000Z,cancelled,411.82,2025-07-12,18:00:38,2025-07-13,04:00:38,2025-07-17,2025-07-17T17:51:33.443Z
2025-07-03T00:00:00.000Z,B008,CAR749,C004,2025-07-04T00:33:28.000Z,2025-07-03T13:33:28.000Z,cancelled,331.3,2025-07-03,13:33:28,2025-07-04,00:33:28,2025-07-17,2025-07-17T17:51:33.443Z
2025-06-30T00:00:00.000Z,B009,CAR764,C007,2025-06-30T19:50:13.000Z,2025-06-30T17:50:13.000Z,completed,224.73,2025-06-30,17:50:13,2025-06-30,19:50:13,2025-07-17,2025-07-17T17:51:33.443Z
2025-06-17T00:00:00.000Z,B010,CAR670,C009,2025-06-18T00:12:19.000Z,2025-06-17T20:12:19.000Z,cancelled,246.06,2025-06-17,20:12:19,2025-06-18,00:12:19,2025-07-17,2025-07-17T17:51:33.443Z
