# Config

In [1]:
from delta import DeltaTable
from datetime import datetime

import ConnectionConfig as cc
cc.setupEnvironment()

# Start local cluster

In [2]:
spark = cc.startLocalCluster("DIM_CUSTOMER")
spark.getActiveSession()

# Ophalen van gegevens (en oplaan in data warehouse)

In [21]:
#Extract
cc.set_connectionProfile("velodb")

df_user = spark.read \
    .format("jdbc") \
    .option("driver" , cc.get_Property("driver")) \
    .option("url", cc.create_jdbc()) \
    .option("dbtable", "velo_users") \
    .option("user", cc.get_Property("username")) \
    .option("password", cc.get_Property("password")) \
    .option("partitionColumn", "userid") \
    .option("numPartitions", 4) \
    .option("lowerBound", 0) \
    .option("upperBound", 60000) \
    .load()

df_user.show(5)

+------+-----------------+--------------------+--------------------+--------+-------+-------------------+------------+
|userid|             name|               email|              street|  number|zipcode|               city|country_code|
+------+-----------------+--------------------+--------------------+--------+-------+-------------------+------------+
|     2|van der Zee Julia|Julia.van.der.Zee...|          Europalaan|     43 |   2610|Wilrijk (Antwerpen)|          BE|
|     3|  de Boer Ricardo|Ricardo.de.Boer@g...|   Maria Clarastraat|     80 |   2160|          Wommelgem|          BE|
|     4|   Willems Angela|Angela.Willems@te...|Graaf Joseph de P...|     15 |   2900|            Schoten|          BE|
|     5| Heijnen Patricia|Patricia.Heijnen@...|          Meylstraat|    111 |   2540|               Hove|          BE|
|     6|   Driessen Anouk|Anouk.Driessen@sc...|   Jan Ockegemstraat|168 0107|   2650|             Edegem|          BE|
+------+-----------------+--------------------+-

In [22]:
#Transform
df_user.createOrReplaceTempView("source_User")

#df_dim_user= spark.sql("select * from source_User")
#df_dim_user.show(5)
#df_user.printSchema()

df_dim_user= spark.sql(f"select uuid() as user_SK,*,\
                        to_timestamp('1900-01-01','yyyy-MM-dd') as scd_start,\
                        to_timestamp('2100-12-12','yyyy-MM-dd') as scd_end,\
                        md5(concat(name, country_code, city, street, number)) as md5, \
                        True as current\
                        from source_User")
df_dim_user.show(5)


+--------------------+------+-----------------+--------------------+--------------------+--------+-------+-------------------+------------+-------------------+-------------------+--------------------+-------+
|              userSK|userid|             name|               email|              street|  number|zipcode|               city|country_code|          scd_start|            scd_end|                 md5|current|
+--------------------+------+-----------------+--------------------+--------------------+--------+-------+-------------------+------------+-------------------+-------------------+--------------------+-------+
|fcacef4b-58d9-4f3...|     2|van der Zee Julia|Julia.van.der.Zee...|          Europalaan|     43 |   2610|Wilrijk (Antwerpen)|          BE|1900-01-01 00:00:00|2100-12-12 00:00:00|1fac8d077aca6b0ad...|   true|
|73e9dd3d-228b-4fe...|     3|  de Boer Ricardo|Ricardo.de.Boer@g...|   Maria Clarastraat|     80 |   2160|          Wommelgem|          BE|1900-01-01 00:00:00|2100-

In [23]:
#Load
df_dim_user.write.format("delta").mode("overwrite").saveAsTable("dimUser")

# Updaten van SCD 2 type (adres)

In [24]:
# Extract (from data Warehouse)
dt_dimUser = DeltaTable.forPath(spark, ".\\spark-warehouse\\dimuser")
dt_dimUser.toDF().createOrReplaceTempView("dimUser_current")

spark.sql("select * from dimUser_current").show()

+--------------------+------+-------------------+--------------------+--------------------+--------+-------+--------------------+------------+-------------------+-------------------+--------------------+-------+
|              userSK|userid|               name|               email|              street|  number|zipcode|                city|country_code|          scd_start|            scd_end|                 md5|current|
+--------------------+------+-------------------+--------------------+--------------------+--------+-------+--------------------+------------+-------------------+-------------------+--------------------+-------+
|c8ed5dbb-e15f-434...| 30000|  Brouwers Patricia|Patricia.Brouwers...|   Joe Englishstraat|    590 |   2140|Borgerhout (Antwe...|          BE|1900-01-01 00:00:00|2100-12-12 00:00:00|a6909f0b0b1679a46...|   true|
|f02b90fa-cdd9-4ce...| 30001|   de Ruijter Maria|Maria.de.Ruijter@...|            Kallodam| 50 0802|   9120|Beveren-Waas/Haas...|          BE|1900-01-01

In [25]:
#Extract (from source database)
df_user_newRead = spark.read \
    .format("jdbc") \
    .option("driver" , cc.get_Property("driver")) \
    .option("url", cc.create_jdbc()) \
    .option("dbtable", "velo_users") \
    .option("user", cc.get_Property("username")) \
    .option("password", cc.get_Property("password")) \
    .option("partitionColumn", "userid") \
    .option("numPartitions", 4) \
    .option("lowerBound", 0) \
    .option("upperBound", 60000) \
    .load()

df_user_newRead.createOrReplaceTempView("source_user_newRead")

df_dim_user_newRead= spark.sql(f"select uuid() as source_user_SK,\
                                userid as source_userid, name as source_name, email as source_email, street as source_street, number as source_number, zipcode as source_zipcode, \
                                city as source_city, country_code as source_country_code,\
                                md5(concat(name, country_code, city, street, number)) as source_md5 \
                                from source_user_newRead")

df_dim_user_newRead.createOrReplaceTempView("dimUser_new")

spark.sql("select * from dimUser_new").show()


+--------------------+-------------+--------------------+--------------------+--------------------+-------------+--------------+--------------------+-------------------+--------------------+
|       source_userSK|source_userid|         source_name|        source_email|       source_street|source_number|source_zipcode|         source_city|source_country_code|          source_md5|
+--------------------+-------------+--------------------+--------------------+--------------------+-------------+--------------+--------------------+-------------------+--------------------+
|dedcc000-b7b7-47e...|            2|   van der Zee Julia|Julia.van.der.Zee...|          Europalaan|          43 |          2610| Wilrijk (Antwerpen)|                 BE|1fac8d077aca6b0ad...|
|98778f5f-2b6a-487...|            3|     de Boer Ricardo|Ricardo.de.Boer@g...|   Maria Clarastraat|          80 |          2160|           Wommelgem|                 BE|183312416b7c7fd51...|
|2cfad65c-5f03-491...|            4|      Wil

In [37]:
##Transform
#Detect changes
detectedChanges = spark.sql(f"select * from dimUser_new source\
                            left outer join dimUser_current dwh on dwh.userid = source.source_userid and dwh.current == TRUE \
                            where dwh.userid is null or dwh.md5 <> source.source_md5")

detectedChanges.createOrReplaceTempView("detectedChanges")

#Note: zelf data aanpassen in db om dit te kunnen zien(update). Je kan ook nieuwe rij in de db toevoegen. (insert)
detectedChanges.show()



+--------------------+-------------+-----------------+--------------------+-----------------+-------------+--------------+-------------------+-------------------+--------------------+--------------------+------+-----------------+--------------------+-----------------+------+-------+-------------------+------------+-------------------+-------------------+--------------------+-------+
|       source_userSK|source_userid|      source_name|        source_email|    source_street|source_number|source_zipcode|        source_city|source_country_code|          source_md5|              userSK|userid|             name|               email|           street|number|zipcode|               city|country_code|          scd_start|            scd_end|                 md5|current|
+--------------------+-------------+-----------------+--------------------+-----------------+-------------+--------------+-------------------+-------------------+--------------------+--------------------+------+-----------------

In [38]:
#Transform
#upserts (updates and inserts)
run_timestamp =datetime.now()

df_upserts = spark.sql(f"select source_user_SK as user_SK, \
                                source_userid as userid, source_name as name, source_email as email , source_street as street,source_number as number, source_zipcode as zipcode, \
                                source_city as city, source_country_code as country_code, \
                                to_timestamp('{run_timestamp}') as scd_start, \
                                to_timestamp('2100-12-12','yyyy-MM-dd') as scd_end, \
                                source_md5 as md5, \
                                True as current \
                        from detectedChanges \
                        union \
                        select user_SK, \
                                userid, name, email, street, number, zipcode, city, country_code, \
                                scd_start, \
                                to_timestamp('{run_timestamp}') as scd_end, \
                                md5, \
                                false \
                        from detectedChanges \
                        where current is not null")


df_upserts.createOrReplaceTempView("upserts")
spark.sql("select * from upserts").show()

+--------------------+------+-----------------+--------------------+-----------------+------+-------+-------------------+------------+--------------------+--------------------+--------------------+-------+
|              userSK|userid|             name|               email|           street|number|zipcode|               city|country_code|           scd_start|             scd_end|                 md5|current|
+--------------------+------+-----------------+--------------------+-----------------+------+-------+-------------------+------------+--------------------+--------------------+--------------------+-------+
|dedcc000-b7b7-47e...|     2|van der Zee Julia|Julia.van.der.Zee...|       Europalaan|   42 |   2610|Wilrijk (Antwerpen)|          BE|2024-12-15 15:53:...| 2100-12-12 00:00:00|7d9d7dfede494c602...|   true|
|98778f5f-2b6a-487...|     3|  de Boer Ricardo|Ricardo.de.Boer@g...|Maria Clarastraat|  81  |   2160|          Wommelgem|          BE|2024-12-15 15:53:...| 2100-12-12 00:00:00|

In [41]:
#Load
##Merge (changes into current data warehouse)
spark.sql(f"MERGE INTO dimUser_current as target \
          using upserts as source ON target.userid = source.userid and source.current = false and target.current = true \
          WHEN MATCHED THEN UPDATE SET scd_end = source.scd_end, current = source.current \
          WHEN NOT MATCHED THEN INSERT (user_SK, userid, name, email, street, number, zipcode, city, country_code, scd_start, scd_end, md5, current) values \
          (source.user_SK, source.userid, source.name, source.email, source.street, source.number, source.zipcode, source.city, source.country_code, source.scd_start, source.scd_end, source.md5, source.current)")
#Matched is when the current is set to false (new record)
#Not matched is for the new records

dt_dimUser.toDF().sort("userId", "scd_start").show(100)

+--------------------+------+--------------------+--------------------+--------------------+--------+-------+--------------------+------------+--------------------+--------------------+--------------------+-------+
|              userSK|userid|                name|               email|              street|  number|zipcode|                city|country_code|           scd_start|             scd_end|                 md5|current|
+--------------------+------+--------------------+--------------------+--------------------+--------+-------+--------------------+------------+--------------------+--------------------+--------------------+-------+
|24fe7a37-cf88-446...|     1|         Bouman Lars|Lars.Bouman@gmail...|         Somméstraat|    155 |   2060|           Antwerpen|          BE| 1900-01-01 00:00:00| 2100-12-12 00:00:00|ba6e8c6ad4554da28...|   true|
|fcacef4b-58d9-4f3...|     2|   van der Zee Julia|Julia.van.der.Zee...|          Europalaan|     43 |   2610| Wilrijk (Antwerpen)|          

# Stoppen van Local Cluster

In [42]:
spark.stop();

# Klad hieronder

In [1]:
from pyspark.sql import SparkSession

def startSimpleCluster(appName):
    spark = SparkSession.builder \
        .appName(appName) \
        .config("spark.master", "local[1]") \
        .config("spark.driver.host", "127.0.0.1") \
        .getOrCreate()
    return spark

if __name__ == "__main__":
    spark = startSimpleCluster("TestApp")
    print("Spark session created successfully!")
    spark.stop()

Spark session created successfully!


In [2]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("Check Temp Dir") \
    .master("local[*]") \
    .getOrCreate()

print("Temporary directory Spark is using:")
print(spark.sparkContext.getConf().get("spark.local.dir"))

spark.stop()

Temporary directory Spark is using:
None


In [30]:
users_df = spark.read \
    .format("jdbc") \
    .option("driver" , cc.get_Property("driver")) \
    .option("url", cc.create_jdbc()) \
    .option("dbtable", "velo_users") \
    .option("user", cc.get_Property("username")) \
    .option("password", cc.get_Property("password")) \
    .option("partitionColumn", "userid") \
    .option("numPartitions", 4) \
    .option("lowerBound", 0) \
    .option("upperBound", 20) \
    .load()

subscriptions_df = spark.read \
    .format("jdbc") \
    .option("driver" , cc.get_Property("driver")) \
    .option("url", cc.create_jdbc()) \
    .option("dbtable", "Subscriptions") \
    .option("user", cc.get_Property("username")) \
    .option("password", cc.get_Property("password")) \
    .load()

subscription_types_df = spark.read \
    .format("jdbc") \
    .option("driver" , cc.get_Property("driver")) \
    .option("url", cc.create_jdbc()) \
    .option("dbtable", "Subscription_types") \
    .option("user", cc.get_Property("username")) \
    .option("password", cc.get_Property("password")) \
    .load()




In [31]:
from pyspark.sql.window import Window
from pyspark.sql.functions import col, lead

# Voeg de data samen
combined_df = subscriptions_df.join(users_df, "UserId") \
    .join(subscription_types_df, "SubscriptionTypeId")

# Bereken de einddatum met een window-functie
window_spec = Window.partitionBy("UserId").orderBy("ValidFrom")
combined_df = combined_df.withColumn("end_date", lead("ValidFrom", 1).over(window_spec))


In [32]:
from pyspark.sql.functions import lit, when

scd_df = combined_df.withColumn("is_current", when(col("end_date").isNull(), lit(True)).otherwise(lit(False)))


In [33]:
scd_df.write.format("delta").mode("overwrite").save("/path/to/customer_dimension")


In [36]:
from pyspark.sql.functions import current_date, lit

# Lees bestaande klantdimensie in
existing_data = spark.read.format("delta").load("/path/to/customer_dimension")

# Lees nieuwe gegevens in (vervang 'jdbc-url' en tabellen met jouw configuratie)
new_data =   spark.read \
    .format("jdbc") \
    .option("driver" , cc.get_Property("driver")) \
    .option("url", cc.create_jdbc()) \
    .option("dbtable", "velo_users") \
    .option("user", cc.get_Property("username")) \
    .option("password", cc.get_Property("password")) \
    .option("partitionColumn", "userid") \
    .option("numPartitions", 4) \
    .option("lowerBound", 0) \
    .option("upperBound", 20) \
    .load()

subscriptions = spark.read \
    .format("jdbc") \
    .option("driver" , cc.get_Property("driver")) \
    .option("url", cc.create_jdbc()) \
    .option("dbtable", "Subscriptions") \
    .option("user", cc.get_Property("username")) \
    .option("password", cc.get_Property("password")) \
    .load()

subscription_types = spark.read \
    .format("jdbc") \
    .option("driver" , cc.get_Property("driver")) \
    .option("url", cc.create_jdbc()) \
    .option("dbtable", "Subscription_types") \
    .option("user", cc.get_Property("username")) \
    .option("password", cc.get_Property("password")) \
    .load()

# Combineer de tabellen
new_data = new_data.join(subscriptions, "UserId") \
    .join(subscription_types, "SubscriptionTypeId")

# Controleer de kolomnamen
print("Kolommen in nieuwe data:")
new_data.printSchema()

print("Kolommen in bestaande dimensie:")
existing_data.printSchema()

# Gebruik de juiste kolom voor de join (bijv. `UserId`)
updates = new_data.join(existing_data, "UserId", "left_anti")  # Alleen nieuwe of gewijzigde data

# Update bestaande rijen
updated_existing = existing_data.join(updates, "UserId") \
    .withColumn("end_date", current_date()) \
    .withColumn("is_current", lit(False))

# Voeg nieuwe rijen in
new_rows = updates.withColumn("start_date", current_date()) \
    .withColumn("end_date", lit(None).cast("date")) \
    .withColumn("is_current", lit(True))

# Combineer en sla op
final_data = updated_existing.union(new_rows)
final_data.write.format("delta").mode("overwrite").save("/path/to/customer_dimension")

Kolommen in nieuwe data:
root
 |-- subscriptiontypeid: integer (nullable = true)
 |-- userid: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- email: string (nullable = true)
 |-- street: string (nullable = true)
 |-- number: string (nullable = true)
 |-- zipcode: string (nullable = true)
 |-- city: string (nullable = true)
 |-- country_code: string (nullable = true)
 |-- subscriptionid: integer (nullable = true)
 |-- validfrom: date (nullable = true)
 |-- description: string (nullable = true)

Kolommen in bestaande dimensie:
root
 |-- subscriptiontypeid: integer (nullable = true)
 |-- userid: integer (nullable = true)
 |-- subscriptionid: integer (nullable = true)
 |-- validfrom: date (nullable = true)
 |-- name: string (nullable = true)
 |-- email: string (nullable = true)
 |-- street: string (nullable = true)
 |-- number: string (nullable = true)
 |-- zipcode: string (nullable = true)
 |-- city: string (nullable = true)
 |-- country_code: string (nullable = true)
 

AnalysisException: [NUM_COLUMNS_MISMATCH] UNION can only be performed on inputs with the same number of columns, but the first input has 25 columns and the second input has 15 columns.;
'Union false, false
:- Project [userid#1396, subscriptiontypeid#1395, subscriptionid#1397, validfrom#1398, name#1399, email#1400, street#1401, number#1402, zipcode#1403, city#1404, country_code#1405, description#1406, end_date#1527, false AS is_current#1553, subscriptiontypeid#1441, name#1424, email#1425, street#1426, number#1427, zipcode#1428, city#1429, country_code#1430, subscriptionid#1439, validfrom#1440, description#1448]
:  +- Project [userid#1396, subscriptiontypeid#1395, subscriptionid#1397, validfrom#1398, name#1399, email#1400, street#1401, number#1402, zipcode#1403, city#1404, country_code#1405, description#1406, current_date(Some(Europe/Brussels)) AS end_date#1527, is_current#1408, subscriptiontypeid#1441, name#1424, email#1425, street#1426, number#1427, zipcode#1428, city#1429, country_code#1430, subscriptionid#1439, validfrom#1440, description#1448]
:     +- Project [userid#1396, subscriptiontypeid#1395, subscriptionid#1397, validfrom#1398, name#1399, email#1400, street#1401, number#1402, zipcode#1403, city#1404, country_code#1405, description#1406, end_date#1407, is_current#1408, subscriptiontypeid#1441, name#1424, email#1425, street#1426, number#1427, zipcode#1428, city#1429, country_code#1430, subscriptionid#1439, validfrom#1440, description#1448]
:        +- Join Inner, (userid#1396 = userid#1423)
:           :- Relation [subscriptiontypeid#1395,userid#1396,subscriptionid#1397,validfrom#1398,name#1399,email#1400,street#1401,number#1402,zipcode#1403,city#1404,country_code#1405,description#1406,end_date#1407,is_current#1408] parquet
:           +- Project [userid#1423, subscriptiontypeid#1441, name#1424, email#1425, street#1426, number#1427, zipcode#1428, city#1429, country_code#1430, subscriptionid#1439, validfrom#1440, description#1448]
:              +- Join LeftAnti, (userid#1423 = userid#1488)
:                 :- Project [subscriptiontypeid#1441, userid#1423, name#1424, email#1425, street#1426, number#1427, zipcode#1428, city#1429, country_code#1430, subscriptionid#1439, validfrom#1440, description#1448]
:                 :  +- Join Inner, (subscriptiontypeid#1441 = subscriptiontypeid#1447)
:                 :     :- Project [userid#1423, name#1424, email#1425, street#1426, number#1427, zipcode#1428, city#1429, country_code#1430, subscriptionid#1439, validfrom#1440, subscriptiontypeid#1441]
:                 :     :  +- Join Inner, (userid#1423 = userid#1442)
:                 :     :     :- Relation [userid#1423,name#1424,email#1425,street#1426,number#1427,zipcode#1428,city#1429,country_code#1430] JDBCRelation(velo_users) [numPartitions=4]
:                 :     :     +- Relation [subscriptionid#1439,validfrom#1440,subscriptiontypeid#1441,userid#1442] JDBCRelation(Subscriptions) [numPartitions=1]
:                 :     +- Relation [subscriptiontypeid#1447,description#1448] JDBCRelation(Subscription_types) [numPartitions=1]
:                 +- Relation [subscriptiontypeid#1487,userid#1488,subscriptionid#1489,validfrom#1490,name#1491,email#1492,street#1493,number#1494,zipcode#1495,city#1496,country_code#1497,description#1498,end_date#1499,is_current#1500] parquet
+- Project [userid#1624, subscriptiontypeid#1634, name#1625, email#1626, street#1627, number#1628, zipcode#1629, city#1630, country_code#1631, subscriptionid#1632, validfrom#1633, description#1637, start_date#1579, end_date#1593, true AS is_current#1608]
   +- Project [userid#1624, subscriptiontypeid#1634, name#1625, email#1626, street#1627, number#1628, zipcode#1629, city#1630, country_code#1631, subscriptionid#1632, validfrom#1633, description#1637, start_date#1579, cast(null as date) AS end_date#1593]
      +- Project [userid#1624, subscriptiontypeid#1634, name#1625, email#1626, street#1627, number#1628, zipcode#1629, city#1630, country_code#1631, subscriptionid#1632, validfrom#1633, description#1637, current_date(Some(Europe/Brussels)) AS start_date#1579]
         +- Project [userid#1624, subscriptiontypeid#1634, name#1625, email#1626, street#1627, number#1628, zipcode#1629, city#1630, country_code#1631, subscriptionid#1632, validfrom#1633, description#1637]
            +- Join LeftAnti, (userid#1624 = userid#1639)
               :- Project [subscriptiontypeid#1634, userid#1624, name#1625, email#1626, street#1627, number#1628, zipcode#1629, city#1630, country_code#1631, subscriptionid#1632, validfrom#1633, description#1637]
               :  +- Join Inner, (subscriptiontypeid#1634 = subscriptiontypeid#1636)
               :     :- Project [userid#1624, name#1625, email#1626, street#1627, number#1628, zipcode#1629, city#1630, country_code#1631, subscriptionid#1632, validfrom#1633, subscriptiontypeid#1634]
               :     :  +- Join Inner, (userid#1624 = userid#1635)
               :     :     :- Relation [userid#1624,name#1625,email#1626,street#1627,number#1628,zipcode#1629,city#1630,country_code#1631] JDBCRelation(velo_users) [numPartitions=4]
               :     :     +- Relation [subscriptionid#1632,validfrom#1633,subscriptiontypeid#1634,userid#1635] JDBCRelation(Subscriptions) [numPartitions=1]
               :     +- Relation [subscriptiontypeid#1636,description#1637] JDBCRelation(Subscription_types) [numPartitions=1]
               +- Relation [subscriptiontypeid#1638,userid#1639,subscriptionid#1640,validfrom#1641,name#1642,email#1643,street#1644,number#1645,zipcode#1646,city#1647,country_code#1648,description#1649,end_date#1650,is_current#1651] parquet
