In [0]:
%pip install pandas
%restart_python

[43mNote: you may need to restart the kernel using %restart_python or dbutils.library.restartPython() to use updated packages.[0m


In [0]:
import os
os.environ['SPARK_VERSION'] = "3.3"

In [0]:
from pyspark.sql.functions import col,current_date,to_date, lit, current_timestamp, sum as _sum
from delta.tables import DeltaTable
import pandas as pd

In [0]:
dbutils.widgets.text("arrival_date", "YYYY-MM-DD")
date_of_arrival = dbutils.widgets.get("arrival_date")
customer_data = f"/Volumes/workspace/booking_and_customers/customers_volume/customers_{date_of_arrival}.csv"
print(customer_data)
# Read customer data
customer_df = spark.read.format("csv").option("header", "true").option("inferschema", "true").option("quote", "\"").option("multiline", "true").load(customer_data)
# Print customerSchema
customer_df.printSchema()
display(customer_df)

/Volumes/workspace/booking_and_customers/customers_volume/customers_2024-07-27.csv
root
 |-- customer_id: integer (nullable = true)
 |-- customer_name: string (nullable = true)
 |-- customer_address: string (nullable = true)
 |-- phone_number: long (nullable = true)
 |-- email: string (nullable = true)
 |-- valid_from: date (nullable = true)
 |-- valid_to: date (nullable = true)



customer_id,customer_name,customer_address,phone_number,email,valid_from,valid_to
1026,Lori Odom,"66894 Pamela Ridge Apt. 701 Wilsonport, NV 55859",6864684198,santanachristopher@wilson-bailey.com,2024-01-30,9999-12-31
1030,David Odonnell,USNV Simmons FPO AE 08244,6864684148,kathleen41@hotmail.com,2024-08-30,9999-12-31


In [0]:
customer_df = customer_df.withColumn("start_date", to_date(lit("2025-07-01"))).withColumn("end_date", to_date(lit("2200-01-01"))).withColumn("current_flag", lit(True))
customer_df.printSchema()
display(customer_df)

root
 |-- customer_id: integer (nullable = true)
 |-- customer_name: string (nullable = true)
 |-- customer_address: string (nullable = true)
 |-- phone_number: string (nullable = true)
 |-- email: string (nullable = true)
 |-- valid_from: date (nullable = true)
 |-- valid_to: date (nullable = true)
 |-- start_date: date (nullable = true)
 |-- end_date: date (nullable = true)
 |-- current_flag: boolean (nullable = false)



customer_id,customer_name,customer_address,phone_number,email,valid_from,valid_to,start_date,end_date,current_flag
1007,Robert Johnson,"574 Patel Drive Apt. 043 Jonesville, TX 54199",001-927-927-7643x34711,danielbrown@west.com,2022-09-26,9999-12-31,2025-07-01,2200-01-01,True
1093,Eric Glenn,"2560 Christian Junctions North Shane, CA 88681",001-944-838-5681,tarabradley@jones.com,2023-06-21,9999-12-31,2025-07-01,2200-01-01,True
1081,Scott Jones,"591 Richard Lodge Suite 568 New John, HI 87997",239.669.1888x6136,nicolefisher@miller.com,2022-10-19,9999-12-31,2025-07-01,2200-01-01,True
1036,Kimberly Garza,USNS Reilly FPO AP 52950,8095911248,sdavis@gmail.com,2022-11-12,9999-12-31,2025-07-01,2200-01-01,True
1022,Thomas Mathis,"2386 Parker Mountain East Sharonmouth, IL 15940",+1-871-965-0054,jennifer65@gmail.com,2023-02-04,9999-12-31,2025-07-01,2200-01-01,True
1062,Steven Steele,"6203 Smith Forest Lake Crystalchester, CA 67075",(253)242-6269x1355,johnsonannette@wallace-griffith.com,2022-11-12,9999-12-31,2025-07-01,2200-01-01,True
1021,Nicholas Anderson,"578 Hill Village Suite 210 Evansbury, UT 95568",008-256-3946,mavery@baker-ayala.com,2023-01-09,9999-12-31,2025-07-01,2200-01-01,True
1037,Richard Owens,"2941 Nicole Extensions Jeffreyport, OH 42853",001-874-094-3134x8557,jfields@stone.com,2023-02-22,9999-12-31,2025-07-01,2200-01-01,True
1047,Nina Weeks,"4250 Burns Forges Suite 922 West Christopher, SC 88814",574-638-3211x5713,ohowell@berry.com,2023-03-31,9999-12-31,2025-07-01,2200-01-01,True
1099,Steven Mathews,"4536 Michele Village Suite 277 North Tracy, NV 18979",522-742-2881x05998,thomaserica@yahoo.com,2022-11-07,9999-12-31,2025-07-01,2200-01-01,True


In [0]:
customer_df.write.format("delta").mode("overwrite").saveAsTable("workspace.booking_and_customers.customer_table")

In [0]:
historical_df = spark.read.format("delta").table("workspace.booking_and_customers.customer_table")
display(historical_df)


customer_id,customer_name,customer_address,phone_number,email,valid_from,valid_to,start_date,end_date,current_flag
1007,Robert Johnson,"574 Patel Drive Apt. 043 Jonesville, TX 54199",001-927-927-7643x34711,danielbrown@west.com,2022-09-26,9999-12-31,2025-07-01,2200-01-01,True
1093,Eric Glenn,"2560 Christian Junctions North Shane, CA 88681",001-944-838-5681,tarabradley@jones.com,2023-06-21,9999-12-31,2025-07-01,2200-01-01,True
1081,Scott Jones,"591 Richard Lodge Suite 568 New John, HI 87997",239.669.1888x6136,nicolefisher@miller.com,2022-10-19,9999-12-31,2025-07-01,2200-01-01,True
1036,Kimberly Garza,USNS Reilly FPO AP 52950,8095911248,sdavis@gmail.com,2022-11-12,9999-12-31,2025-07-01,2200-01-01,True
1022,Thomas Mathis,"2386 Parker Mountain East Sharonmouth, IL 15940",+1-871-965-0054,jennifer65@gmail.com,2023-02-04,9999-12-31,2025-07-01,2200-01-01,True
1062,Steven Steele,"6203 Smith Forest Lake Crystalchester, CA 67075",(253)242-6269x1355,johnsonannette@wallace-griffith.com,2022-11-12,9999-12-31,2025-07-01,2200-01-01,True
1021,Nicholas Anderson,"578 Hill Village Suite 210 Evansbury, UT 95568",008-256-3946,mavery@baker-ayala.com,2023-01-09,9999-12-31,2025-07-01,2200-01-01,True
1037,Richard Owens,"2941 Nicole Extensions Jeffreyport, OH 42853",001-874-094-3134x8557,jfields@stone.com,2023-02-22,9999-12-31,2025-07-01,2200-01-01,True
1047,Nina Weeks,"4250 Burns Forges Suite 922 West Christopher, SC 88814",574-638-3211x5713,ohowell@berry.com,2023-03-31,9999-12-31,2025-07-01,2200-01-01,True
1099,Steven Mathews,"4536 Michele Village Suite 277 North Tracy, NV 18979",522-742-2881x05998,thomaserica@yahoo.com,2022-11-07,9999-12-31,2025-07-01,2200-01-01,True


In [0]:
customer_df = customer_df.withColumn("start_date", current_date()).withColumn("end_date", to_date(lit("2200-01-01"))).withColumn("current_flag", lit(True))
customer_df.printSchema()
current_df = customer_df
display(current_df)

root
 |-- customer_id: integer (nullable = true)
 |-- customer_name: string (nullable = true)
 |-- customer_address: string (nullable = true)
 |-- phone_number: string (nullable = true)
 |-- email: string (nullable = true)
 |-- valid_from: date (nullable = true)
 |-- valid_to: date (nullable = true)
 |-- start_date: date (nullable = false)
 |-- end_date: date (nullable = true)
 |-- current_flag: boolean (nullable = false)



customer_id,customer_name,customer_address,phone_number,email,valid_from,valid_to,start_date,end_date,current_flag
1026,Lori Odom,"66894 Pamela Ridge Apt. 701 Wilsonport, NV 55859",,santanachristopher@wilson-bailey.com,2024-01-30,9999-12-31,2025-07-16,2200-01-01,True
1030,David Odonnell,USNV Simmons FPO AE 08244,6864684148,kathleen41@hotmail.com,2024-08-30,9999-12-31,2025-07-16,2200-01-01,True
1035,Cheryl Weaver,"3716 Cunningham Station Apt. 567 Davidborough, TX 41021",893.223.0773x3326,smithcatherine@yahoo.com,2024-03-07,9999-12-31,2025-07-16,2200-01-01,True
1036,Rebecca Johnson,"674 Bishop Mission Suzannebury, NY 90306",018.713.0054x360,carol33@holt-higgins.info,2024-05-25,9999-12-31,2025-07-16,2200-01-01,True
1037,Lisa Hill,"6720 Brittany Streets Lake Sabrinaview, IN 22990",(646)830-3919x64651,cory15@hotmail.com,2024-11-01,9999-12-31,2025-07-16,2200-01-01,True
1038,Aaron Cooper,"37842 Haynes Isle Suite 421 South Marisa, PA 75690",249-334-3781x7626,crystal91@henderson-lane.net,2024-11-26,9999-12-31,2025-07-16,2200-01-01,True
1039,Betty Andrews,Unit 9441 Box 7301 DPO AA 92892,(055)647-0735,avilacody@yahoo.com,2024-04-10,9999-12-31,2025-07-16,2200-01-01,True
1047,Edward Stone,"31740 Martinez Trace Jonesview, NC 49949",884.266.5166x7808,zwhite@hotmail.com,2024-03-31,9999-12-31,2025-07-16,2200-01-01,True
1048,James Myers,"78527 Kelly Corner Powellbury, FL 03544",001-403-398-8094,vyoder@wiley-jones.com,2024-08-06,9999-12-31,2025-07-16,2200-01-01,True
1050,Scott Freeman,"528 John Hollow Theresabury, SC 37328",0970621868,emily78@gmail.com,2024-01-24,9999-12-31,2025-07-16,2200-01-01,True


In [0]:
customer_df = customer_df.withColumn("start_date", to_date(lit("2025-07-18"))).withColumn("end_date", to_date(lit("2200-01-01"))).withColumn("current_flag", lit(True))
customer_df.printSchema()
current_df = customer_df
display(current_df)

root
 |-- customer_id: integer (nullable = true)
 |-- customer_name: string (nullable = true)
 |-- customer_address: string (nullable = true)
 |-- phone_number: long (nullable = true)
 |-- email: string (nullable = true)
 |-- valid_from: date (nullable = true)
 |-- valid_to: date (nullable = true)
 |-- start_date: date (nullable = true)
 |-- end_date: date (nullable = true)
 |-- current_flag: boolean (nullable = false)



customer_id,customer_name,customer_address,phone_number,email,valid_from,valid_to,start_date,end_date,current_flag
1026,Lori Odom,"66894 Pamela Ridge Apt. 701 Wilsonport, NV 55859",6864684198,santanachristopher@wilson-bailey.com,2024-01-30,9999-12-31,2025-07-18,2200-01-01,True
1030,David Odonnell,USNV Simmons FPO AE 08244,6864684148,kathleen41@hotmail.com,2024-08-30,9999-12-31,2025-07-18,2200-01-01,True


In [0]:
current_df = current_df.withColumn("phone_number", col("phone_number").cast("string"))

current_df.printSchema()
historical_df.printSchema()

root
 |-- customer_id: integer (nullable = true)
 |-- customer_name: string (nullable = true)
 |-- customer_address: string (nullable = true)
 |-- phone_number: string (nullable = true)
 |-- email: string (nullable = true)
 |-- valid_from: date (nullable = true)
 |-- valid_to: date (nullable = true)
 |-- start_date: date (nullable = true)
 |-- end_date: date (nullable = true)
 |-- current_flag: boolean (nullable = false)

root
 |-- customer_id: integer (nullable = true)
 |-- customer_name: string (nullable = true)
 |-- customer_address: string (nullable = true)
 |-- phone_number: string (nullable = true)
 |-- email: string (nullable = true)
 |-- valid_from: date (nullable = true)
 |-- valid_to: date (nullable = true)
 |-- start_date: date (nullable = true)
 |-- end_date: date (nullable = true)
 |-- current_flag: boolean (nullable = true)



In [0]:
from delta.tables import DeltaTable
from pyspark.sql.functions import col, lit

# Load the historical data delta table
historical_delta = DeltaTable.forName(spark, "workspace.booking_and_customers.customer_table")

# Step 1: Expire existing records that have the same customer_id and are still marked as current
historical_delta.alias("target").merge(
    current_df.alias("source"),
    "target.customer_id = source.customer_id AND target.current_flag = true AND target.start_date < source.start_date"
).whenMatchedUpdate(
    condition="target.start_date < source.start_date",  # Ensure we only update records with a start_date before the new one
    set={
        "end_date": col("source.start_date"),  # Set the end date of the previous record to the start date of the new record
        "current_flag": lit(False)  # Mark the previous record as expired
    }
).execute()


current_df.write.format("delta").mode("append").saveAsTable("workspace.booking_and_customers.customer_table")

# Verifying the result
historical_df = spark.read.format("delta").table("workspace.booking_and_customers.customer_table")
display(historical_df)


customer_id,customer_name,customer_address,phone_number,email,valid_from,valid_to,start_date,end_date,current_flag
1035,Cheryl Weaver,"3716 Cunningham Station Apt. 567 Davidborough, TX 41021",893.223.0773x3326,smithcatherine@yahoo.com,2024-03-07,9999-12-31,2025-07-16,2200-01-01,True
1036,Rebecca Johnson,"674 Bishop Mission Suzannebury, NY 90306",018.713.0054x360,carol33@holt-higgins.info,2024-05-25,9999-12-31,2025-07-16,2200-01-01,True
1037,Lisa Hill,"6720 Brittany Streets Lake Sabrinaview, IN 22990",(646)830-3919x64651,cory15@hotmail.com,2024-11-01,9999-12-31,2025-07-16,2200-01-01,True
1038,Aaron Cooper,"37842 Haynes Isle Suite 421 South Marisa, PA 75690",249-334-3781x7626,crystal91@henderson-lane.net,2024-11-26,9999-12-31,2025-07-16,2200-01-01,True
1039,Betty Andrews,Unit 9441 Box 7301 DPO AA 92892,(055)647-0735,avilacody@yahoo.com,2024-04-10,9999-12-31,2025-07-16,2200-01-01,True
1047,Edward Stone,"31740 Martinez Trace Jonesview, NC 49949",884.266.5166x7808,zwhite@hotmail.com,2024-03-31,9999-12-31,2025-07-16,2200-01-01,True
1048,James Myers,"78527 Kelly Corner Powellbury, FL 03544",001-403-398-8094,vyoder@wiley-jones.com,2024-08-06,9999-12-31,2025-07-16,2200-01-01,True
1050,Scott Freeman,"528 John Hollow Theresabury, SC 37328",0970621868,emily78@gmail.com,2024-01-24,9999-12-31,2025-07-16,2200-01-01,True
1053,Robert Pratt,"9189 Christine Divide Port Edward, OH 51538",020.465.4390,anthony28@hotmail.com,2024-11-06,9999-12-31,2025-07-16,2200-01-01,True
1054,Ronald Sharp,"027 King Forest East Megan, WV 86035",665.459.2000x4188,christophergarcia@yahoo.com,2024-01-29,9999-12-31,2025-07-16,2200-01-01,True


In [0]:
final_df = spark.read.format("delta").table("workspace.booking_and_customers.customer_table")
display(final_df.orderBy("customer_id"))

customer_id,customer_name,customer_address,phone_number,email,valid_from,valid_to,start_date,end_date,current_flag
1001,Michelle Mcbride,"0613 Sanders Islands Suite 871 Catherineport, MN 20654",(220)633-2698x716,millergabriella@greene.com,2024-09-10,9999-12-31,2025-07-16,2200-01-01,True
1001,Elizabeth Olson,"05736 Trevor Ville Suite 638 Robertton, OR 38207",260.694.4065x215,tammy97@cobb-clarke.com,2022-09-22,9999-12-31,2025-07-01,2025-07-16,False
1002,Belinda Walsh,"811 Daniel Crescent Vaughnland, DE 12902",(124)398-7985x7797,theresa38@murphy.com,2022-11-22,9999-12-31,2025-07-01,2025-07-16,False
1002,Caleb Carter,USNS Young FPO AP 70550,(838)696-0925x780,wardjennifer@hotmail.com,2024-03-18,9999-12-31,2025-07-16,2200-01-01,True
1003,Marissa Blair,"370 Rodriguez Station Suite 432 Jenkinsmouth, NY 12919",+1-009-154-3329x478,dschwartz@gmail.com,2022-11-04,9999-12-31,2025-07-01,2200-01-01,True
1005,Lisa Wilson,"324 Smith Springs Lake Kathleen, MI 46779",001-398-400-0662x13884,plynn@singh.com,2022-11-13,9999-12-31,2025-07-01,2200-01-01,True
1006,Jennifer Adams,"27070 Young Views Apt. 973 Russellside, TX 46865",(860)022-1416,todd23@hotmail.com,2024-08-21,9999-12-31,2025-07-16,2200-01-01,True
1006,Kathleen Thompson,"8405 Rodriguez Viaduct Apt. 520 Carterton, IL 81631",(102)681-5777x90986,klevy@williamson.com,2023-01-11,9999-12-31,2025-07-01,2025-07-16,False
1007,Robert Johnson,"574 Patel Drive Apt. 043 Jonesville, TX 54199",001-927-927-7643x34711,danielbrown@west.com,2022-09-26,9999-12-31,2025-07-01,2200-01-01,True
1008,Cameron Patrick,"76801 Ruiz Roads Lisaview, ND 46964",527.433.0454x28798,kathy09@gmail.com,2024-07-15,9999-12-31,2025-07-16,2200-01-01,True


In [0]:
# union_df = historical_df.union(current_df)
# display(union_df)
# union_df.write.format("delta").mode("overwrite").saveAsTable("workspace.booking_and_customers.union_delta")
# temp_df = spark.read.format("delta").table("workspace.booking_and_customers.union_delta")
# from pyspark.sql.functions import current_date, lit
# from delta.tables import DeltaTable

# temp_delta = DeltaTable.forName(spark, "workspace.booking_and_customers.union_delta")
# historical_delta_df = DeltaTable.forName(spark, "workspace.booking_and_customers.customer_table").toDF()

# temp_alias = "t"
# historical_alias = "h"

# temp_delta.alias(temp_alias).merge(
#     historical_delta_df.alias(historical_alias),
#     f"{historical_alias}.customer_id = {temp_alias}.customer_id AND " +
#     f"{historical_alias}.start_date < current_date() AND " +
#     f"{historical_alias}.current_flag = 'true'"
# ).whenMatchedUpdate(set = {
#     "end_date": current_date(),
#     "current_flag": lit('false')
# }).execute()
# display(temp_delta.toDF())
