In [0]:
# Notebook parameter: the run date used to select the bookings file
# (default "20250317").
# NOTE(review): `dbutils` is normally a predefined global in Databricks
# notebooks; confirm this import is needed and resolves to the Databricks
# utilities object rather than an unrelated package.
import dbutils
dbutils.widgets.text("current_date", "20250317")
# Creating the widget does not define a Python variable; read its value back so
# later cells that reference `current_date` work on a fresh kernel.
current_date = dbutils.widgets.get("current_date")

In [0]:
# Path of the bookings JSON file for the selected `current_date`.
bookings_data_path = (
    f"/Volumes/zoom_cars_data/default/bookings_data/zoom_car_bookings_{current_date}.json"
)

In [0]:
import os

# Report the Spark version exposed through the environment. Use .get() with a
# placeholder so a missing SPARK_VERSION variable does not abort a full
# notebook run with KeyError.
spark_version = os.environ.get("SPARK_VERSION", "SPARK_VERSION is not set")
print(spark_version)

In [0]:
from pyspark.sql.types import StructType, StructField, StringType, DoubleType

# Explicit schema for the bookings JSON. Every field is nullable; the date/time
# fields are read as strings and total_amount as a double.
schema = (
    StructType()
    .add("booking_id", StringType(), True)
    .add("customer_id", StringType(), True)
    .add("car_id", StringType(), True)
    .add("booking_date", StringType(), True)
    .add("start_time", StringType(), True)
    .add("end_time", StringType(), True)
    .add("total_amount", DoubleType(), True)
    .add("status", StringType(), True)
)

In [0]:
# Read the bookings file with the explicit schema; the multiLine option is
# enabled on the JSON reader.
bookings_reader = spark.read.schema(schema).option("multiLine", True)
df = bookings_reader.json(bookings_data_path)


In [0]:
# Preview the bookings as loaded from JSON, before any type conversion.
df.show()

In [0]:
from pyspark.sql.functions import to_date, to_timestamp

# booking_date is a calendar date, so to_date is appropriate. start_time and
# end_time are parsed with a pattern that includes HH:mm:ss, so they are
# converted with to_timestamp; to_date would discard the time of day and make
# any end_time - start_time duration meaningless below one day.
# NOTE(review): the 'Z' in the pattern is a quoted literal, so values are
# interpreted in the session time zone rather than as UTC - confirm intent.
# NOTE(review): if the target Delta table already exists with DATE columns for
# start_time/end_time, the overwrite may need a schema overwrite - confirm.
df = df.withColumn("booking_date", to_date(df.booking_date, "yyyy-MM-dd"))
df = df.withColumn("start_time", to_timestamp(df.start_time, "yyyy-MM-dd'T'HH:mm:ss'Z'"))
df = df.withColumn("end_time", to_timestamp(df.end_time, "yyyy-MM-dd'T'HH:mm:ss'Z'"))

In [0]:
# Check the column types after converting booking_date, start_time and end_time.
df.printSchema()

In [0]:
# Preview the rows after the date/time columns have been converted.
df.show()

In [0]:
# Collect every row that has a null in at least one of the booking columns.
required_columns = [
    "booking_id",
    "customer_id",
    "car_id",
    "booking_date",
    "start_time",
    "end_time",
    "total_amount",
    "status",
]
# OR the per-column null checks together, left to right.
has_null = df[required_columns[0]].isNull()
for column_name in required_columns[1:]:
    has_null = has_null | df[column_name].isNull()
null_data = df.filter(has_null)

In [0]:
# Display the rows that contain a null in at least one column.
null_data.show()

In [0]:
# Preview df again; null_data is a separate filtered DataFrame and df itself
# has not been changed by the null check.
df.show()

In [0]:
# Bare expression as the last line of the cell, so the notebook displays `df`
# as the cell output.
df

In [0]:
from pyspark.sql.functions import sum as _sum

# Total booking amount per status. The aggregate column keeps Spark's
# generated name, which the select below references explicitly.
amount_by_status = df.groupBy("status").agg(_sum("total_amount"))
dff = amount_by_status.select("status", "sum(total_amount)")

In [0]:
# Show total_amount summed per booking status.
dff.show()

In [0]:
# Fully qualified name of the table the bookings DataFrame is saved to.
# NOTE(review): the table is named customers_data but receives bookings rows,
# and the variable is called "scd" although no SCD logic is visible here -
# confirm this is the intended target.
scd_table_path = "zoom_cars_data.default.customers_data"

In [0]:
# Save df as a Delta table, overwriting the table's existing contents.
# saveAsTable() returns None, so `dffff` does not hold a DataFrame.
bookings_writer = df.write.format("delta").mode("overwrite")
dffff = bookings_writer.saveAsTable(scd_table_path)

# ApplyTransformations


In [0]:
# List the column names of df.
df.columns

In [0]:
# Print the schema of df again before deriving order_duration from
# start_time and end_time.
df.printSchema()

In [0]:
# Duration of each booking: end_time minus start_time, cast to integer.
# NOTE(review): the unit of the result depends on the types of start_time and
# end_time and on how Spark casts their difference - confirm the intended unit.
duration_expr = (df.end_time - df.start_time).cast("integer")
bookings_with_duration = df.withColumn("order_duration", duration_expr)

# Sum the durations per booking_id (relies on `_sum` imported in an earlier cell).
df_transformed = bookings_with_duration.groupBy("booking_id").agg(
    _sum(col="order_duration")
)

In [0]:
# Show the summed order_duration per booking_id.
df_transformed.show()