In [0]:
from pyspark.sql.functions import col,current_date,to_date,datediff, lit, current_timestamp, sum as _sum, regexp_extract
from delta.tables import DeltaTable

In [0]:
import datetime

# Format the current date as YYYYMMDD
current_date_str = datetime.datetime.now().strftime("%Y%m%d")

# dbutils.widgets.text("arrival_date", current_date_str)
# date_of_arrival = dbutils.widgets.get("arrival_date")
customer_data = f"/Volumes/workspace/zoom_car/zoom_car_volume/zoom_car_customers_{current_date_str}.json"
print(customer_data)

/Volumes/workspace/zoom_car/zoom_car_volume/zoom_car_customers_20250717.json


In [0]:
customer_df = spark.read.format("json").option("header", "true").option("inferschema", "true").option("quote", "\"").option("multiline", "true").load(customer_data)
# Print customerSchema
customer_df.printSchema()
display(customer_df)

root
 |-- customer_id: string (nullable = true)
 |-- email: string (nullable = true)
 |-- name: string (nullable = true)
 |-- phone_number: string (nullable = true)
 |-- signup_date: string (nullable = true)
 |-- status: string (nullable = true)



customer_id,email,name,phone_number,signup_date,status
C001,joshuathomas@yahoo.com,Emma Hawkins,3138135510,2025-05-28,inactive
C002,lrosario@gmail.com,Brenda Galloway,3304466710,2025-03-28,active
C003,yanderson@gmail.com,Michele Branch,1895122579,2025-02-17,active
C004,rhale@gmail.com,Jessica Gray,7041080388,2024-10-28,active
C005,orobinson@gmail.com,Carolyn Martinez,1403054963,2025-06-25,active
C006,twilliams@simmons.com,Kathy Strickland,8385627187,2024-11-14,active
C007,aramirez@vazquez.com,Tabitha Smith,4917908181,2025-02-08,active
C008,gregory64@porter.com,Christopher Lopez,893704162,2024-03-30,active
C009,hcabrera@hotmail.com,Rebecca Fleming,5078290394,2024-03-13,active
C010,tgoodwin@cohen.biz,Amanda Bradley,7593852622,2024-06-04,inactive


In [0]:
# Data Cleaning Steps
customer_df = customer_df.na.drop(subset=["customer_id", "email", "name", "phone_number", "signup_date", "status"])

# Email Validation
email_pattern = r"^[\w\.-]+@[\w\.-]+\.\w+$"
customer_df = customer_df.withColumn("is_valid_email", regexp_extract(col("email"), email_pattern, 0) != "")
customer_df = customer_df.filter(col("is_valid_email")).drop("is_valid_email")

# Phone Validation
customer_df = customer_df.filter(col("phone_number").rlike(r"^[1-9]\d{9}$"))


# Date Validation
customer_df = customer_df.withColumn("signup_date", to_date(col("signup_date"), "yyyy-MM-dd"))
customer_df = customer_df.filter(col("signup_date").isNotNull())
customer_df = customer_df.filter(col("signup_date") <= current_date())

# Status Validation
customer_df = customer_df.filter(col("status").isin("active", "inactive"))

display(customer_df)

customer_id,email,name,phone_number,signup_date,status
C001,joshuathomas@yahoo.com,Emma Hawkins,3138135510,2025-05-28,inactive
C002,lrosario@gmail.com,Brenda Galloway,3304466710,2025-03-28,active
C003,yanderson@gmail.com,Michele Branch,1895122579,2025-02-17,active
C004,rhale@gmail.com,Jessica Gray,7041080388,2024-10-28,active
C005,orobinson@gmail.com,Carolyn Martinez,1403054963,2025-06-25,active
C006,twilliams@simmons.com,Kathy Strickland,8385627187,2024-11-14,active
C007,aramirez@vazquez.com,Tabitha Smith,4917908181,2025-02-08,active
C009,hcabrera@hotmail.com,Rebecca Fleming,5078290394,2024-03-13,active
C010,tgoodwin@cohen.biz,Amanda Bradley,7593852622,2024-06-04,inactive


In [0]:
# Tenure in days
customer_df = customer_df.withColumn("tenure_days", datediff(current_date(), col("signup_date")))
display(customer_df)

customer_id,email,name,phone_number,signup_date,status,tenure_days
C001,joshuathomas@yahoo.com,Emma Hawkins,3138135510,2025-05-28,inactive,50
C002,lrosario@gmail.com,Brenda Galloway,3304466710,2025-03-28,active,111
C003,yanderson@gmail.com,Michele Branch,1895122579,2025-02-17,active,150
C004,rhale@gmail.com,Jessica Gray,7041080388,2024-10-28,active,262
C005,orobinson@gmail.com,Carolyn Martinez,1403054963,2025-06-25,active,22
C006,twilliams@simmons.com,Kathy Strickland,8385627187,2024-11-14,active,245
C007,aramirez@vazquez.com,Tabitha Smith,4917908181,2025-02-08,active,159
C009,hcabrera@hotmail.com,Rebecca Fleming,5078290394,2024-03-13,active,491
C010,tgoodwin@cohen.biz,Amanda Bradley,7593852622,2024-06-04,inactive,408


In [0]:
# Add audit columns
customer_df = customer_df.withColumn("arrival_date", current_date())
customer_df = customer_df.withColumn("load_timestamp", current_timestamp())
display(customer_df)

customer_id,email,name,phone_number,signup_date,status,tenure_days,arrival_date,load_timestamp
C001,joshuathomas@yahoo.com,Emma Hawkins,3138135510,2025-05-28,inactive,50,2025-07-17,2025-07-17T16:41:32.123Z
C002,lrosario@gmail.com,Brenda Galloway,3304466710,2025-03-28,active,111,2025-07-17,2025-07-17T16:41:32.123Z
C003,yanderson@gmail.com,Michele Branch,1895122579,2025-02-17,active,150,2025-07-17,2025-07-17T16:41:32.123Z
C004,rhale@gmail.com,Jessica Gray,7041080388,2024-10-28,active,262,2025-07-17,2025-07-17T16:41:32.123Z
C005,orobinson@gmail.com,Carolyn Martinez,1403054963,2025-06-25,active,22,2025-07-17,2025-07-17T16:41:32.123Z
C006,twilliams@simmons.com,Kathy Strickland,8385627187,2024-11-14,active,245,2025-07-17,2025-07-17T16:41:32.123Z
C007,aramirez@vazquez.com,Tabitha Smith,4917908181,2025-02-08,active,159,2025-07-17,2025-07-17T16:41:32.123Z
C009,hcabrera@hotmail.com,Rebecca Fleming,5078290394,2024-03-13,active,491,2025-07-17,2025-07-17T16:41:32.123Z
C010,tgoodwin@cohen.biz,Amanda Bradley,7593852622,2024-06-04,inactive,408,2025-07-17,2025-07-17T16:41:32.123Z


In [0]:
from pyspark.sql.functions import current_date

staging_customer_path = "workspace.zoom_car.staging_customers_delta"

# Compute the current date
current_date_str = spark.sql("SELECT current_date()").collect()[0][0]

customer_df.write.format("delta") \
    .mode("overwrite") \
    .option("replaceWhere", f"arrival_date = '{current_date_str}'") \
    .saveAsTable(staging_customer_path)

display(customer_df)

customer_id,email,name,phone_number,signup_date,status,tenure_days,arrival_date,load_timestamp
C001,joshuathomas@yahoo.com,Emma Hawkins,3138135510,2025-05-28,inactive,50,2025-07-17,2025-07-17T16:41:34.791Z
C002,lrosario@gmail.com,Brenda Galloway,3304466710,2025-03-28,active,111,2025-07-17,2025-07-17T16:41:34.791Z
C003,yanderson@gmail.com,Michele Branch,1895122579,2025-02-17,active,150,2025-07-17,2025-07-17T16:41:34.791Z
C004,rhale@gmail.com,Jessica Gray,7041080388,2024-10-28,active,262,2025-07-17,2025-07-17T16:41:34.791Z
C005,orobinson@gmail.com,Carolyn Martinez,1403054963,2025-06-25,active,22,2025-07-17,2025-07-17T16:41:34.791Z
C006,twilliams@simmons.com,Kathy Strickland,8385627187,2024-11-14,active,245,2025-07-17,2025-07-17T16:41:34.791Z
C007,aramirez@vazquez.com,Tabitha Smith,4917908181,2025-02-08,active,159,2025-07-17,2025-07-17T16:41:34.791Z
C009,hcabrera@hotmail.com,Rebecca Fleming,5078290394,2024-03-13,active,491,2025-07-17,2025-07-17T16:41:34.791Z
C010,tgoodwin@cohen.biz,Amanda Bradley,7593852622,2024-06-04,inactive,408,2025-07-17,2025-07-17T16:41:34.791Z


In [0]:
# customer_df.write.format("delta") \
#     .mode("overwrite") \
#     .partitionBy("arrival_date") \
#     .option("mergeSchema", "true") \
#     .saveAsTable("workspace.zoom_car.staging_customers_delta")

In [0]:
cleaned_df = spark.table("workspace.zoom_car.staging_customers_delta")
display(cleaned_df)

customer_id,email,name,phone_number,signup_date,status,tenure_days,arrival_date,load_timestamp
C001,joshuathomas@yahoo.com,Emma Hawkins,3138135510,2025-05-28,inactive,50,2025-07-17,2025-07-17T16:41:33.688Z
C002,lrosario@gmail.com,Brenda Galloway,3304466710,2025-03-28,active,111,2025-07-17,2025-07-17T16:41:33.688Z
C003,yanderson@gmail.com,Michele Branch,1895122579,2025-02-17,active,150,2025-07-17,2025-07-17T16:41:33.688Z
C004,rhale@gmail.com,Jessica Gray,7041080388,2024-10-28,active,262,2025-07-17,2025-07-17T16:41:33.688Z
C005,orobinson@gmail.com,Carolyn Martinez,1403054963,2025-06-25,active,22,2025-07-17,2025-07-17T16:41:33.688Z
C006,twilliams@simmons.com,Kathy Strickland,8385627187,2024-11-14,active,245,2025-07-17,2025-07-17T16:41:33.688Z
C007,aramirez@vazquez.com,Tabitha Smith,4917908181,2025-02-08,active,159,2025-07-17,2025-07-17T16:41:33.688Z
C009,hcabrera@hotmail.com,Rebecca Fleming,5078290394,2024-03-13,active,491,2025-07-17,2025-07-17T16:41:33.688Z
C010,tgoodwin@cohen.biz,Amanda Bradley,7593852622,2024-06-04,inactive,408,2025-07-17,2025-07-17T16:41:33.688Z


In [0]:
# staging_customer_path = "workspace/zoom_car/staging_customers_delta/_delta_log"

# from delta.tables import DeltaTable

# if DeltaTable.isDeltaTable(spark, staging_customer_path):
#     delta_table = DeltaTable.forPath(spark, staging_customer_path)

#     (
#         delta_table.alias("t")
#         .merge(
#             customer_df.alias("s"),
#             "t.customer_id = s.customer_id"
#         )
#         .whenMatchedUpdateAll()
#         .whenNotMatchedInsertAll()
#         .execute()
#     )
# else:
#     customer_df.write.format("delta") \
#         .mode("overwrite") \
#         .partitionBy("arrival_date") \
#         .saveAsTable("workspace.zoom_car.staging_customers_delta")


In [0]:
# final_df = spark.table("workspace.zoom_car.staging_customers_delta")
# display(final_df)