In [0]:
from pyspark.sql import Row
from datetime import date
from pyspark.sql.types import (
    StructType,
    StructField,
    IntegerType,
    StringType,
    DateType,
    BooleanType
)

# Column layout of the versioned customer dimension. Only valid_to is nullable;
# the seed rows below leave it as None while is_current is True.
schema = (
    StructType()
    .add("id", IntegerType(), False)
    .add("name", StringType(), False)
    .add("region", StringType(), False)
    .add("valid_from", DateType(), False)
    .add("valid_to", DateType(), True)
    .add("is_current", BooleanType(), False)
)

# Seed rows: one version per customer, all starting on 2020-01-01 with no end
# date and flagged as current.
initial_data = [
    Row(
        id=cust_id,
        name=cust_name,
        region=cust_region,
        valid_from=date(2020, 1, 1),
        valid_to=None,
        is_current=True,
    )
    for cust_id, cust_name, cust_region in [
        (1, "Alice", "East"),
        (2, "Bob", "West"),
        (3, "Carol", "South"),
    ]
]

df_customers = spark.createDataFrame(initial_data, schema=schema)

# "overwrite" replaces any existing customer_dim contents, so this cell resets
# the table to the three seed rows each time it runs.
(
    df_customers.write
    .mode("overwrite")
    .saveAsTable("customer_dim")
)

display(spark.sql("SELECT * FROM customer_dim"))

id,name,region,valid_from,valid_to,is_current
1,Alice,East,2020-01-01,,True
2,Bob,West,2020-01-01,,True
3,Carol,South,2020-01-01,,True


In [0]:
# Step 2: record Carol's move to the North region as a new version of her row.
# Both sub-steps are guarded so that re-running this cell leaves the table as it
# was after the first run. Without the guards, a second run would close the
# open North row with a valid_to earlier than its valid_from and then append a
# duplicate current row.

# Step 2a: Close old record (set valid_to + is_current=False).
# The extra region filter makes this a no-op when the open version is already
# the North row, i.e. when the change has been applied before.
spark.sql("""
UPDATE customer_dim
SET valid_to = DATE('2020-12-31'), is_current = false
WHERE id = 3 AND is_current = true AND region <> 'North'
""")

# Step 2b: Insert new record for Carol in North
new_data = [
    Row(id=3, name="Carol", region="North",
        valid_from=date(2021,1,1), valid_to=None, is_current=True)
]
df_update = spark.createDataFrame(new_data, schema = schema)

# Append only when customer 3 has no open version left. After step 2a that is
# the case on the first run (the South row was just closed) but not on a
# re-run (the North row is still open).
open_versions = spark.sql(
    "SELECT 1 FROM customer_dim WHERE id = 3 AND is_current = true"
).count()
if open_versions == 0:
    df_update.write.mode("append").saveAsTable("customer_dim")

# Ordered by id then valid_from so each customer's versions appear oldest first.
spark.sql("SELECT * FROM customer_dim ORDER BY id, valid_from").show()


+---+-----+------+----------+----------+----------+
| id| name|region|valid_from|  valid_to|is_current|
+---+-----+------+----------+----------+----------+
|  1|Alice|  East|2020-01-01|      NULL|      true|
|  2|  Bob|  West|2020-01-01|      NULL|      true|
|  3|Carol| South|2020-01-01|2020-12-31|     false|
|  3|Carol| North|2021-01-01|      NULL|      true|
+---+-----+------+----------+----------+----------+



In [0]:
%sql
-- Every stored version of customer 3, oldest valid_from first
SELECT *
FROM customer_dim
WHERE id = 3
ORDER BY valid_from ASC;


id,name,region,valid_from,valid_to,is_current
3,Carol,South,2020-01-01,2020-12-31,False
3,Carol,North,2021-01-01,,True
