In [14]:
#SETUP
from datetime import datetime
from delta import DeltaTable
import ConnectionConfig as cc
cc.setupEnvironment()

In [15]:
#SETUP
spark = cc.startLocalCluster("dimCustomerIncrementalLoad")
spark.getActiveSession()

## Incremental load 
* Setting the parameters

In [16]:

run_timestamp = datetime.now()

In [17]:
cc.set_connectionProfile("VeloDB")

#### Read source table


In [18]:
#EXTRACT
df_velo_users =spark.read \
    .format("jdbc") \
    .option("driver" , cc.get_Property("driver")) \
    .option("url", cc.create_jdbc()) \
    .option("dbtable", "velo_users") \
    .option("user", cc.get_Property("username")) \
    .option("password", cc.get_Property("password")) \
    .option("partitionColumn", "userid") \
    .option("numPartitions", 4) \
    .option("lowerBound", 0) \
    .option("upperBound", 20) \
    .load()

df_subscriptions = spark.read \
    .format("jdbc") \
    .option("driver" , cc.get_Property("driver")) \
    .option("url", cc.create_jdbc()) \
    .option("dbtable", "subscriptions") \
    .option("user", cc.get_Property("username")) \
    .option("password", cc.get_Property("password")) \
    .option("partitionColumn", "subscriptionid") \
    .option("numPartitions", 4) \
    .option("lowerBound", 0) \
    .option("upperBound", 20) \
    .load()

df_subscription_types = spark.read \
    .format("jdbc") \
    .option("driver" , cc.get_Property("driver")) \
    .option("url", cc.create_jdbc()) \
    .option("dbtable", "subscription_types") \
    .option("user", cc.get_Property("username")) \
    .option("password", cc.get_Property("password")) \
    .option("partitionColumn", "subscriptiontypeid") \
    .option("numPartitions", 4) \
    .option("lowerBound", 0) \
    .option("upperBound", 20) \
    .load()

In [19]:
#TRANSFORM
df_velo_users.createOrReplaceTempView("operational_velo_users")
df_subscriptions.createOrReplaceTempView("operational_subscriptions")
df_subscription_types.createOrReplaceTempView("operational_subscription_types")

In [20]:
#TRANSFORM
df_dim_customers = spark.sql("""
    SELECT 
        monotonically_increasing_id() as source_customer_SK,
        u.userid as source_customer_id,
        u.name as source_customer,
        u.street as source_street, 
        u.number source_number,
        u.zipcode as source_zipcode,
        u.city as source_city, 
        u.country_code as source_country_code,
        st.description as source_subscription_type,
        s.validfrom as source_validform,
        md5(concat(u.name, street,number,zipcode,city,country_code )) as source_md5
    FROM operational_velo_users u
        JOIN operational_subscriptions s 
                ON u.userid = s.userid
        JOIN operational_subscription_types st 
                ON s.subscriptiontypeid = st.subscriptiontypeid
    WHERE s.validfrom = (
        SELECT MAX(validfrom) 
        FROM operational_subscriptions 
        WHERE userid = u.userid)
""")

df_dim_customers.createOrReplaceTempView("dimCustomer_new")

In [21]:
#TRANSFORM
dt_dimCustomer = DeltaTable.forPath(spark, "./spark-warehouse/dimCustomer")
dt_dimCustomer.toDF().createOrReplaceTempView("dimCustomer_current")

#DEBUG CODE TO SHOW CONTENT OF DIMENSION
spark.sql("""
   select * from dimCustomer_new
""").show()

+------------------+------------------+--------------------+--------------------+-------------+--------------+--------------------+-------------------+------------------------+----------------+--------------------+
|source_customer_SK|source_customer_id|     source_customer|       source_street|source_number|source_zipcode|         source_city|source_country_code|source_subscription_type|source_validform|          source_md5|
+------------------+------------------+--------------------+--------------------+-------------+--------------+--------------------+-------------------+------------------------+----------------+--------------------+
|                 0|                 2|   van der Zee Julia|     Steenbergstraat|          43 |          2610| Wilrijk (Antwerpen)|                 BE|                   MAAND|      2021-10-05|73d9101519509eb55...|
|                 1|                 4|      Willems Angela|Graaf Joseph de P...|          15 |          2900|             Schoten|         

##### 2 DETECT CHANGES


In [22]:
#TRANSFORM
detectedChanges = spark.sql("""
    select * 
    from dimCustomer_new source
        left outer join dimCustomer_current dwh 
            on dwh.customer_id = source.source_customer_id
            and dwh.current == true
    where dwh.customer_id is null or dwh.md5 <> source.source_md5
""")
detectedChanges.createOrReplaceTempView("detectedChanges")

#DEBUG CODE TO SHOW CONTENT OF DETECTED CHANGES
detectedChanges.show()

+------------------+------------------+-----------------+---------------+-------------+--------------+-------------------+-------------------+------------------------+----------------+--------------------+-----------+-----------+-----------------+----------+------+-------+-------------------+------------+-----------------+-------------------+-------------------+----------+---------------+--------------------+-------+
|source_customer_SK|source_customer_id|  source_customer|  source_street|source_number|source_zipcode|        source_city|source_country_code|source_subscription_type|source_validform|          source_md5|customer_SK|customer_id|             name|    street|number|zipcode|               city|country_code|subscription_type|          scd_start|            scd_end| validform|subscription_id|                 md5|current|
+------------------+------------------+-----------------+---------------+-------------+--------------+-------------------+-------------------+----------------

##### 3 TRANSOFRM TO UPSERTS


In [23]:
#TRANSFORM 
#zoekt de grootste waarde, met de max functie en die doe je +1 en dan zet je die als je nieuwe sk
max_customer_SK = spark.sql("SELECT COALESCE(MAX(customer_SK), 0) AS max_customer_SK FROM dimCustomer_current").collect()[0]['max_customer_SK']


df_upsert = spark.sql(f"""
    select  
       {max_customer_SK} as customer_SK,
        source_customer_id as customer_id,
        source_customer as name,
        source_street, 
        source_number,
        source_zipcode,
        source_city, 
        source_country_code,
        source_subscription_type as subscription_type,
        to_timestamp('{run_timestamp}') as scd_start,
        to_timestamp('2600-01-01', 'yyyy-MM-dd') as scd_end,
        true as current,
        source_md5 as md5
    FROM detectedChanges dc1
    union all
        select  
            customer_SK,
            customer_id,
            name,
            street, 
            number,
            zipcode,
            city, 
            country_code,
            subscription_type,
            scd_start,
            to_timestamp('{run_timestamp}') as scd_end,
            false as current,
            md5
        FROM detectedChanges dc2
        where current is not null
""")
df_upsert.createOrReplaceTempView("upsert")

In [24]:
#DEBUG CODE TO SHOW CONTENT OF UPSERT
spark.sql("""
    select * from upsert
""").show()

+-----------+-----------+-----------------+---------------+-------------+--------------+-------------------+-------------------+-----------------+--------------------+--------------------+-------+--------------------+
|customer_SK|customer_id|             name|  source_street|source_number|source_zipcode|        source_city|source_country_code|subscription_type|           scd_start|             scd_end|current|                 md5|
+-----------+-----------+-----------------+---------------+-------------+--------------+-------------------+-------------------+-----------------+--------------------+--------------------+-------+--------------------+
| 8589994849|          2|van der Zee Julia|Steenbergstraat|          43 |          2610|Wilrijk (Antwerpen)|                 BE|            MAAND|2024-11-03 17:19:...| 2600-01-01 00:00:00|   true|73d9101519509eb55...|
|          0|          2|van der Zee Julia|     Europalaan|          43 |          2610|Wilrijk (Antwerpen)|                 BE|

In [25]:
#CREATE MERGE
spark.sql("""
    MERGE INTO dimCustomer_current as target
    using upsert as source on target.customer_id = source.customer_id 
        and source.current = false 
        and target.current = true
    WHEN MATCHED THEN UPDATE SET scd_end = source.scd_end, current = source.current
    WHEN NOT MATCHED THEN INSERT (customer_SK, customer_id, name, street, number, zipcode, city, country_code, subscription_type, scd_start, scd_end, current, md5) values (source.customer_SK, source.customer_id, source.name, source.source_street, source.source_number, source.source_zipcode, source.source_city, source.source_country_code, source.subscription_type, source.scd_start, source.scd_end, source.current, source.md5)
""")

#DEBUG CODE TO SHOW CONTENT OF DIMENSION
dt_dimCustomer.toDF().sort("customer_id", "scd_start").show(100)

+-----------+-----------+--------------------+--------------------+--------+-------+--------------------+------------+-----------------+--------------------+--------------------+----------+---------------+--------------------+-------+
|customer_SK|customer_id|                name|              street|  number|zipcode|                city|country_code|subscription_type|           scd_start|             scd_end| validform|subscription_id|                 md5|current|
+-----------+-----------+--------------------+--------------------+--------+-------+--------------------+------------+-----------------+--------------------+--------------------+----------+---------------+--------------------+-------+
|          3|          1|         Bouman Lars|    Somméstraat Said|    156 |   2060|           Antwerpen|          BE|              DAG| 1999-01-01 00:00:00| 2100-12-12 00:00:00|2020-12-14|              3|06a6b6950726fcfb4...|   true|
|          0|          2|   van der Zee Julia|          Euro

In [26]:
spark.stop()