In [1]:
# SparkSession type and the Delta Lake pip-configuration helper.
# NOTE(review): configure_spark_with_delta_pip is not referenced in the visible cells.
from pyspark.sql import SparkSession
from delta import configure_spark_with_delta_pip
# Project-local connection helper; every later cell reaches it through the alias `cc`.
import ConnectionConfig as cc
# Project-specific environment preparation, run before any Spark session is started.
# What it configures is not visible from this notebook -- see ConnectionConfig.
cc.setupEnvironment()

In [2]:
# Start the Spark session for this job through the project helper. The string is the
# helper's first argument (presumably the application name -- TODO confirm); the
# trailing "default 4" note refers to an optional argument left at its default.
spark = cc.startLocalCluster("DIM_CUSTOMER") # default 4
# Last expression of the cell, so the active session is rendered as the cell output.
spark.getActiveSession()

In [3]:
# EXTRACT
# Pull the operational tables from the JDBC source configured under the "VeloDB" profile.
cc.set_connectionProfile("VeloDB")


def read_jdbc_table(table, partition_column, num_partitions=4, lower_bound=0, upper_bound=20):
    """Read one table from the active JDBC connection profile as a partitioned DataFrame.

    Connection details (driver, URL, user, password) come from the currently
    selected ``cc`` connection profile.

    Args:
        table: Name of the source table, passed as the ``dbtable`` option.
        partition_column: Numeric column Spark uses to split the read into
            parallel range queries.
        num_partitions: Maximum number of parallel partitions/connections.
        lower_bound: Lower end of the range used to compute the partition stride.
        upper_bound: Upper end of the range used to compute the partition stride.

    Returns:
        A lazily evaluated DataFrame over the whole table.

    Note:
        ``lower_bound``/``upper_bound`` only decide the partition stride; they do
        not filter rows. Keys outside the range still load, but they all end up in
        the first or last partition, so bounds far from the real key range give a
        skewed read. NOTE(review): 0..20 looks small for real ids -- confirm.
    """
    return (
        spark.read
        .format("jdbc")
        .option("driver", cc.get_Property("driver"))
        .option("url", cc.create_jdbc())
        .option("dbtable", table)
        .option("user", cc.get_Property("username"))
        .option("password", cc.get_Property("password"))
        .option("partitionColumn", partition_column)
        .option("numPartitions", num_partitions)
        .option("lowerBound", lower_bound)
        .option("upperBound", upper_bound)
        .load()
    )


df_operational_users = read_jdbc_table("velo_users", "userid")
df_operational_subscription_type = read_jdbc_table("subscription_types", "subscriptiontypeid")
df_operational_subscriptions = read_jdbc_table("subscriptions", "subscriptionid")

In [4]:
# TRANSFORM
# Expose each extracted DataFrame to Spark SQL under its temp view name.
source_views = {
    "dimCustomer": df_operational_users,
    "subscriptionTypes": df_operational_subscription_type,
    "subscriptions": df_operational_subscriptions,
}
for view_name, source_df in source_views.items():
    source_df.createOrReplaceTempView(view_name)

In [5]:

# TRANSFORM: build the customer dimension.

# Register the raw users under a view name of their own. The "dimCustomer" view is
# overwritten with the finished dimension at the end of this cell, so reading the
# source rows from "dimCustomer" would make a second run of this cell fail
# (the dimension has no `userid` column).
df_operational_users.createOrReplaceTempView("operationalUsers")

# Temp view holding the most recent subscription per user.
# NOTE(review): this view is not referenced by the dimension query below, which
# selects the latest subscription with a correlated MAX(validfrom) instead.
spark.sql("""

SELECT userId, validfrom, description
FROM (
SELECT 
        s.userId, 
        s.validfrom, 
        st.description,
        ROW_NUMBER() OVER (PARTITION BY s.userId ORDER BY s.validfrom DESC) AS rank
    FROM subscriptions AS s
    LEFT JOIN subscriptionTypes AS st ON s.subscriptiontypeid = st.subscriptiontypeid
)
WHERE rank = 1
""").createOrReplaceTempView("mostRecentSubscription")

# One row per user joined to the subscription with the latest `validfrom`.
# - customer_SK: generated ids are unique but not consecutive.
# - scd_start / scd_end / current: fixed initial-load values for every row.
# - md5: change-detection hash over the address attributes. Each field is cast to
#   string and NULL is replaced by '' because concat() returns NULL as soon as one
#   argument is NULL, which would leave the hash empty for incomplete addresses.
#   Rows without NULLs hash exactly as before.
# NOTE(review): if a user has two subscriptions sharing the same latest `validfrom`,
# the MAX() filter keeps both and the customer appears twice -- confirm against data.
# NOTE(review): the output column `validform` looks like a typo of `validfrom`; the
# name is kept because it is part of the written table schema.
df_dim_customer = spark.sql("""
    SELECT 
        monotonically_increasing_id() as customer_SK, 
        u.userid as customer_id,
        u.name as name,
        u.street,  
        u.number,  
        u.zipcode,
        u.city, 
        u.country_code,
        st.description as subscription_type,
        to_timestamp('1999-01-01','yyyy-MM-dd') AS scd_start, 
        to_timestamp('2100-12-12','yyyy-MM-dd') AS scd_end,
        s.validfrom as validform,
        s.subscriptionid as subscription_id,
        md5(concat(
            coalesce(cast(u.name AS STRING), ''),
            coalesce(cast(u.street AS STRING), ''),
            coalesce(cast(u.number AS STRING), ''),
            coalesce(cast(u.zipcode AS STRING), ''),
            coalesce(cast(u.city AS STRING), ''),
            coalesce(cast(u.country_code AS STRING), '')
        )) as md5,
        True as current
    FROM operationalUsers u
        JOIN subscriptions s 
                ON u.userid = s.userid
        JOIN subscriptionTypes st 
                ON s.subscriptiontypeid = st.subscriptiontypeid
    WHERE s.validfrom = (
        SELECT MAX(validfrom) 
        FROM subscriptions 
        WHERE userid = u.userid)
""")

# Create temp view for the final dimension
df_dim_customer.createOrReplaceTempView("dimCustomer")

# Display the final dimension
spark.sql("SELECT * FROM dimCustomer").show()

+-----------+-----------+--------------------+--------------------+--------+-------+--------------------+------------+-----------------+-------------------+-------------------+----------+---------------+--------------------+-------+
|customer_SK|customer_id|                name|              street|  number|zipcode|                city|country_code|subscription_type|          scd_start|            scd_end| validform|subscription_id|                 md5|current|
+-----------+-----------+--------------------+--------------------+--------+-------+--------------------+------------+-----------------+-------------------+-------------------+----------+---------------+--------------------+-------+
|          0|          2|   van der Zee Julia|          Europalaan|     43 |   2610| Wilrijk (Antwerpen)|          BE|            MAAND|1999-01-01 00:00:00|2100-12-12 00:00:00|2021-10-05|              4|9d3359c2cb9800783...|   true|
|          1|          4|      Willems Angela|Graaf Joseph de P...| 

In [6]:
# LOAD: persist the customer dimension in Delta format, replacing any previous contents.
delta_table_path = "spark-warehouse/dimCustomer"
(
    df_dim_customer.write
    .format("delta")
    .mode("overwrite")
    .save(delta_table_path)
)

In [7]:
# Re-register the in-memory dimension under its view name and preview the first rows.
df_dim_customer.createOrReplaceTempView("dimCustomer")
dim_customer_preview = spark.sql("select * from dimCustomer")
dim_customer_preview.show()

+-----------+-----------+--------------------+--------------------+--------+-------+--------------------+------------+-----------------+-------------------+-------------------+----------+---------------+--------------------+-------+
|customer_SK|customer_id|                name|              street|  number|zipcode|                city|country_code|subscription_type|          scd_start|            scd_end| validform|subscription_id|                 md5|current|
+-----------+-----------+--------------------+--------------------+--------+-------+--------------------+------------+-----------------+-------------------+-------------------+----------+---------------+--------------------+-------+
|          0|          2|   van der Zee Julia|          Europalaan|     43 |   2610| Wilrijk (Antwerpen)|          BE|            MAAND|1999-01-01 00:00:00|2100-12-12 00:00:00|2021-10-05|              4|9d3359c2cb9800783...|   true|
|          1|          4|      Willems Angela|Graaf Joseph de P...| 

In [8]:
# Shut down the Spark session; `spark` cannot be used after this without starting a new one.
spark.stop()