# Config

In [1]:
from delta import DeltaTable
from datetime import datetime

import ConnectionConfig as cc
# Project-local helper from ConnectionConfig; presumably prepares the local
# environment that Spark needs before a session is started — TODO confirm.
cc.setupEnvironment()

# Start local cluster

In [2]:
# Name handed to the project helper that starts the local cluster
# (presumably used as the Spark application name — TODO confirm).
app_name = "DIM_CUSTOMER"
spark = cc.startLocalCluster(app_name)

# Last expression of the cell, so the notebook displays the active session.
spark.getActiveSession()

# Fetching the data (and storing it in the data warehouse)

In [3]:
# Extract: read the three source tables over JDBC.
cc.set_connectionProfile("VeloDB")

# Connection settings that every read in this cell has in common.
jdbc_connection = {
    "driver": cc.get_Property("driver"),
    "url": cc.create_jdbc(),
    "user": cc.get_Property("username"),
    "password": cc.get_Property("password"),
}

# velo_users and Subscriptions are read in 4 partitions split on userid;
# Subscription_types is read without partitioning options.
df_user = (
    spark.read.format("jdbc")
    .options(**jdbc_connection)
    .options(
        dbtable="velo_users",
        partitionColumn="userid",
        numPartitions=4,
        lowerBound=0,
        upperBound=60000,
    )
    .load()
)

df_subscription = (
    spark.read.format("jdbc")
    .options(**jdbc_connection)
    .options(
        dbtable="Subscriptions",
        partitionColumn="userid",
        numPartitions=4,
        lowerBound=0,
        upperBound=80000,
    )
    .load()
)

df_subscription_type = (
    spark.read.format("jdbc")
    .options(**jdbc_connection)
    .options(dbtable="Subscription_types")
    .load()
)

# Preview the first five rows of each extracted frame.
for extracted_frame in (df_user, df_subscription, df_subscription_type):
    extracted_frame.show(5)


+------+-----------------+--------------------+--------------------+--------+-------+-------------------+------------+
|userid|             name|               email|              street|  number|zipcode|               city|country_code|
+------+-----------------+--------------------+--------------------+--------+-------+-------------------+------------+
|     2|van der Zee Julia|Julia.van.der.Zee...|          Europalaan|     42 |   2610|Wilrijk (Antwerpen)|          BE|
|     3|  de Boer Ricardo|Ricardo.de.Boer@g...|   Maria Clarastraat|    81  |   2160|          Wommelgem|          BE|
|     4|   Willems Angela|Angela.Willems@te...|Graaf Joseph de P...|     16 |   2900|            Schoten|          BE|
|     5| Heijnen Patricia|Patricia.Heijnen@...|          Meylstraat|    112 |   2540|               Hove|          BE|
|     6|   Driessen Anouk|Anouk.Driessen@sc...|   Jan Ockegemstraat|168 0107|   2650|             Edegem|          BE|
+------+-----------------+--------------------+-

In [4]:
# Transform: expose the extracted frames to Spark SQL under fixed view names.
source_views = {
    "source_User": df_user,
    "source_Subscription": df_subscription,
    "source_Subscription_type": df_subscription_type,
}
for view_name, source_frame in source_views.items():
    source_frame.createOrReplaceTempView(view_name)

# Build the initial dimension rows:
#  * last_subscription picks each user's most recent validfrom; joining back on
#    (userid, validfrom) attaches that subscription and its type description.
#  * every row gets a generated surrogate key, a fixed validity window
#    (1900-01-01 .. 2100-12-12) and current = True.
#  * md5 hashes the address columns (country_code, city, street, number).
#    NOTE(review): the incremental cell builds source_md5 from the same
#    expression; the two must stay identical or every row is seen as changed.
#  * email and zipcode are deliberately not carried into the dimension.
dim_user_query = "select uuid() as user_SK, user.userid, user.name, subscription.subscriptionid as subscriptionid , subscription_type.description as subscription_type, subscription.validfrom as valid_from,\
                            user.street, user.number, user.city, user.country_code,\
                            to_timestamp('1900-01-01','yyyy-MM-dd') as scd_start,\
                            to_timestamp('2100-12-12','yyyy-MM-dd') as scd_end,\
                            md5(concat(country_code, city, street, number)) as md5,\
                            True as current\
                        from source_User as user\
                        join (select userid, max(validfrom) as last_validfrom from source_Subscription group by userid) as last_subscription on user.userid = last_subscription.userid\
                        join source_Subscription as subscription on subscription.userid = last_subscription.userid AND subscription.validfrom = last_subscription.last_validfrom\
                        join source_Subscription_type as subscription_type on subscription_type.subscriptiontypeid = subscription.subscriptiontypeid"

df_dim_user = spark.sql(dim_user_query)

df_dim_user.show()


+--------------------+------+-----------------+--------------+-----------------+----------+--------------------+-------+--------------------+------------+-------------------+-------------------+--------------------+-------+
|             user_SK|userid|             name|subscriptionid|subscription_type|valid_from|              street| number|                city|country_code|          scd_start|            scd_end|                 md5|current|
+--------------------+------+-----------------+--------------+-----------------+----------+--------------------+-------+--------------------+------------+-------------------+-------------------+--------------------+-------+
|92fc2679-adc9-431...|    12|     Simons Thijs|            23|             JAAR|2023-10-20|         Bergenhoeve|81 0302|Antwerpen/Berendr...|          BE|1900-01-01 00:00:00|2100-12-12 00:00:00|185076a17fcae9054...|   true|
|adf38d31-eb12-42a...|    13|       Groen Rens|            24|             JAAR|2020-02-28|  Trompetvoge

In [5]:
# Load: persist the initial dimension as the Delta table "dimUser".
# mode("overwrite") replaces the table's current contents on every run.
(
    df_dim_user.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("dimUser")
)

# Incremental load (updating the SCD type 2 dimension)

In [6]:
# Extract (from data warehouse)
# Open the Delta table written by the initial load. The relative path uses
# forward slashes so it resolves on both Windows and POSIX hosts.
# Assumes the table lives under ./spark-warehouse relative to the working
# directory — TODO confirm against the cluster's warehouse setting.
dt_dimUser = DeltaTable.forPath(spark, "./spark-warehouse/dimuser")

# Expose the warehouse contents to Spark SQL as "dimUser_current".
dt_dimUser.toDF().createOrReplaceTempView("dimUser_current")

spark.sql("select * from dimUser_current").show()

+--------------------+------+-----------------+--------------+-----------------+----------+--------------------+-------+--------------------+------------+-------------------+-------------------+--------------------+-------+
|             user_SK|userid|             name|subscriptionid|subscription_type|valid_from|              street| number|                city|country_code|          scd_start|            scd_end|                 md5|current|
+--------------------+------+-----------------+--------------+-----------------+----------+--------------------+-------+--------------------+------------+-------------------+-------------------+--------------------+-------+
|92fc2679-adc9-431...|    12|     Simons Thijs|            23|             JAAR|2023-10-20|         Bergenhoeve|81 0302|Antwerpen/Berendr...|          BE|1900-01-01 00:00:00|2100-12-12 00:00:00|185076a17fcae9054...|   true|
|adf38d31-eb12-42a...|    13|       Groen Rens|            24|             JAAR|2020-02-28|  Trompetvoge

In [9]:
# Extract (from source database): read the three source tables again.
# No connection profile is selected here; the reads use whatever profile is
# already active in cc (the initial extract cell selects "VeloDB").
jdbc_connection_newRead = {
    "driver": cc.get_Property("driver"),
    "url": cc.create_jdbc(),
    "user": cc.get_Property("username"),
    "password": cc.get_Property("password"),
}

df_user_newRead = (
    spark.read.format("jdbc")
    .options(**jdbc_connection_newRead)
    .options(
        dbtable="velo_users",
        partitionColumn="userid",
        numPartitions=4,
        lowerBound=0,
        upperBound=60000,
    )
    .load()
)

df_subscription_newRead = (
    spark.read.format("jdbc")
    .options(**jdbc_connection_newRead)
    .options(
        dbtable="Subscriptions",
        partitionColumn="userid",
        numPartitions=4,
        lowerBound=0,
        upperBound=80000,
    )
    .load()
)

df_subscription_type_newRead = (
    spark.read.format("jdbc")
    .options(**jdbc_connection_newRead)
    .options(dbtable="Subscription_types")
    .load()
)

# Register the fresh reads as SQL views.
new_read_views = {
    "source_user_newRead": df_user_newRead,
    "source_subscription_newRead": df_subscription_newRead,
    "source_subscription_type_newRead": df_subscription_type_newRead,
}
for view_name, new_read_frame in new_read_views.items():
    new_read_frame.createOrReplaceTempView(view_name)

# Same shape as the initial-load query, with every column prefixed "source_"
# and without the scd_start / scd_end / current columns. source_md5 hashes the
# same address columns as the md5 column written by the initial load.
dim_user_new_query = "select uuid() as source_user_SK,\
                                user.userid as source_userid, user.name as source_name, \
                                subscription.subscriptionid as source_subscriptionid, subscription_type.description as source_subscription_type, subscription.validfrom as source_valid_from,\
                                user.street as source_street, user.number as source_number, \
                                city as source_city, country_code as source_country_code,\
                                md5(concat(country_code, city, street, number)) as source_md5 \
                                from source_user_newRead as user\
                                join (select userid, max(validfrom) as last_validfrom from source_Subscription_newRead group by userid) as last_subscription on user.userid = last_subscription.userid\
                                join source_Subscription_newRead as subscription on subscription.userid = last_subscription.userid AND subscription.validfrom = last_subscription.last_validfrom\
                                join source_Subscription_type_newRead as subscription_type on subscription_type.subscriptiontypeid = subscription.subscriptiontypeid"

df_dim_user_newRead = spark.sql(dim_user_new_query)

df_dim_user_newRead.createOrReplaceTempView("dimUser_new")

spark.sql("select * from dimUser_new").show()


+--------------------+-------------+-----------------+---------------------+------------------------+-----------------+--------------------+-------------+--------------------+-------------------+--------------------+
|      source_user_SK|source_userid|      source_name|source_subscriptionid|source_subscription_type|source_valid_from|       source_street|source_number|         source_city|source_country_code|          source_md5|
+--------------------+-------------+-----------------+---------------------+------------------------+-----------------+--------------------+-------------+--------------------+-------------------+--------------------+
|f3cf547e-099d-437...|           12|     Simons Thijs|                   23|                    JAAR|       2023-10-20|         Bergenhoeve|      81 0302|Antwerpen/Berendr...|                 BE|185076a17fcae9054...|
|cd6daae1-a505-4f6...|           13|       Groen Rens|                   24|                    JAAR|       2020-02-28|  Trompetvoge

In [10]:
## Transform
# Detect changes: left-join the fresh source rows to the current warehouse
# rows (dwh.current == TRUE) and keep a source row when
#   * no current warehouse row exists for that userid (new user), or
#   * the address hash differs.
# The hash comparison uses the null-safe operator <=>: a plain <> evaluates to
# NULL when either hash is NULL, and the where clause would then drop the row
# so the change would go unnoticed.
detectedChanges = spark.sql("select * from dimUser_new source\
                            left outer join dimUser_current dwh on dwh.userid = source.source_userid and dwh.current == TRUE  \
                            where dwh.userid is null\
                            or not (dwh.md5 <=> source.source_md5)")

detectedChanges.createOrReplaceTempView("detectedChanges")

# Note: to see rows here, change data in the source database yourself (update),
# or add a new row to the source database (insert).
detectedChanges.show()



+--------------------+-------------+-----------+---------------------+------------------------+-----------------+-------------+-------------+-------------------+-------------------+--------------------+--------------------+------+--------+--------------+-----------------+----------+-------------+------+-------------------+------------+-------------------+-------------------+--------------------+-------+
|      source_user_SK|source_userid|source_name|source_subscriptionid|source_subscription_type|source_valid_from|source_street|source_number|        source_city|source_country_code|          source_md5|             user_SK|userid|    name|subscriptionid|subscription_type|valid_from|       street|number|               city|country_code|          scd_start|            scd_end|                 md5|current|
+--------------------+-------------+-----------+---------------------+------------------------+-----------------+-------------+-------------+-------------------+-------------------+-----

In [11]:
# Transform: build the upserts (updates and inserts).
# One timestamp per run: it closes the old version and opens the new one.
run_timestamp = datetime.now()

# First select: the new version of every detected change, taken from the
# source_* columns, valid from run_timestamp and flagged current.
# Second select: the existing warehouse version of changed rows (those where
# the joined warehouse column `current` is not null), with scd_end set to
# run_timestamp and current set to false.
upsert_query = f"select source_user_SK as user_SK,\
                                source_userid as userid, source_name as name,\
                                source_subscriptionid as subscriptionid, source_subscription_type as subscription_type, source_valid_from as valid_from, \
                                source_street as street,source_number as number, source_city as city, source_country_code as country_code,\
                                to_timestamp('{run_timestamp}') as scd_start,\
                                to_timestamp('2100-12-12','yyyy-MM-dd') as scd_end,\
                                source_md5 as md5,\
                                True as current\
                        from detectedChanges\
                        union\
                        select user_SK,\
                                userid, name,\
                                subscriptionid, subscription_type, valid_from,\
                                street, number, city, country_code,\
                                scd_start,\
                                to_timestamp('{run_timestamp}') as scd_end,\
                                md5,\
                                false\
                        from detectedChanges\
                        where current is not null"

df_upserts = spark.sql(upsert_query)

df_upserts.createOrReplaceTempView("upserts")
spark.sql("select * from upserts").show()

+--------------------+------+--------+--------------+-----------------+----------+-------------+------+-------------------+------------+--------------------+--------------------+--------------------+-------+
|             user_SK|userid|    name|subscriptionid|subscription_type|valid_from|       street|number|               city|country_code|           scd_start|             scd_end|                 md5|current|
+--------------------+------+--------+--------------+-----------------+----------+-------------+------+-------------------+------------+--------------------+--------------------+--------------------+-------+
|f9b39b3c-0f68-40b...|     9|Smit Tim|            16|            MAAND|2023-11-29|Bikschotelaan|    59|Berchem (Antwerpen)|          BE|2025-01-12 18:09:...| 2100-12-12 00:00:00|d8b0e23a5a49503b2...|   true|
|0ec458bc-9bab-4b9...|     9|Smit Tim|            16|            MAAND|2023-11-29|Bikschotelaan|   60 |Berchem (Antwerpen)|          BE| 1900-01-01 00:00:00|2025-01-12 

In [12]:
# Load: merge the upserts into the current data warehouse table.
# The ON clause only pairs a source row flagged current = false with the
# target row of the same userid that is still current = true, so:
#  * WHEN MATCHED     -> the existing current version is closed: its scd_end
#                        and current flag are taken from the source row.
#  * WHEN NOT MATCHED -> every other source row (the current = true rows, i.e.
#                        new versions and brand-new users) is inserted.
merge_statement = "MERGE INTO dimUser_current as target \
          using upserts as source ON target.userid = source.userid and source.current = false and target.current = true \
          WHEN MATCHED THEN UPDATE SET scd_end = source.scd_end, current = source.current \
          WHEN NOT MATCHED THEN INSERT (user_SK, userid, name, subscriptionid, subscription_type, valid_from, street, number, city, country_code, scd_start, scd_end, md5, current) values \
          (source.user_SK, source.userid, source.name, source.subscriptionid, source.subscription_type, source.valid_from,source.street, source.number, source.city, source.country_code, source.scd_start, source.scd_end, source.md5, source.current)"
spark.sql(merge_statement)

# Show the merged dimension ordered by user and version start.
merged_dimension = dt_dimUser.toDF().sort("userId", "scd_start")
merged_dimension.show(100)

+--------------------+------+--------------------+--------------+-----------------+----------+--------------------+--------+--------------------+------------+--------------------+--------------------+--------------------+-------+
|             user_SK|userid|                name|subscriptionid|subscription_type|valid_from|              street|  number|                city|country_code|           scd_start|             scd_end|                 md5|current|
+--------------------+------+--------------------+--------------+-----------------+----------+--------------------+--------+--------------------+------------+--------------------+--------------------+--------------------+-------+
|0e1f50c7-69a1-42b...|     1|         Bouman Lars|             3|              DAG|2020-12-14|         Somméstraat|    155 |           Antwerpen|          BE| 1900-01-01 00:00:00| 2100-12-12 00:00:00|6ad982e1b845c5b41...|   true|
|6e5081b9-7ec9-459...|     2|   van der Zee Julia|             4|            MAA

# Stop local cluster

In [13]:
# Shut down the Spark session started at the top of the notebook.
spark.stop()