### Config stuff

In [15]:
import random
import pyspark
from pyspark.sql import SparkSession, functions
import ConnectionConfig as cc
from pyspark.sql.functions import *
cc.setupEnvironment()
cc.listEnvironment()

ALLUSERSPROFILE: C:\ProgramData
APPDATA: C:\Users\saidk\AppData\Roaming
ASL.LOG: Destination=file
COMMONPROGRAMFILES: C:\Program Files\Common Files
COMMONPROGRAMFILES(X86): C:\Program Files (x86)\Common Files
COMMONPROGRAMW6432: C:\Program Files\Common Files
COMPUTERNAME: SAID-KHALAF
COMSPEC: C:\Windows\system32\cmd.exe
DRIVERDATA: C:\Windows\System32\Drivers\DriverData
EFC_7508: 1
FPS_BROWSER_APP_PROFILE_STRING: Internet Explorer
FPS_BROWSER_USER_PROFILE_STRING: Default
HOMEDRIVE: C:
HOMEPATH: \Users\saidk
IGCCSVC_DB: AQAAANCMnd8BFdERjHoAwE/Cl+sBAAAAyWHQQ0UE1EybUO2X28QUdAQAAAACAAAAAAAQZgAAAAEAACAAAACnQfPzRgQfLDsFfSA8yUOlGDgWhclc8+UiXNzfZ6vyfQAAAAAOgAAAAAIAACAAAACmsL61oyMd90tFxJ2o07U7jFSVs8QBAEYbW8GGxh0lQGAAAADZW7PuVVN1xrB8XMybUg4VuohhEvg2y7EtyRkL0ixCJEWB6mIJ63wAZJ1d56BcTP5ivzGYlSMvrOPEJ46RadHt7Ga+sTruwrK7noPoYjO4Pe2H8VmOQKVitCxiLxrnKqtAAAAAXUJVbtr76zut+OVP458xzpZXR4dNPMe+X/u1GgtWk+juReuOTvOYyxZJqJZK0PoXS+58slyVeh91UC/+DZONBw==
INTELLIJ IDEA: C:\Program Files\JetBrains\IntelliJ IDEA 20

In [16]:
spark = cc.startLocalCluster("dimSalesRepInit")
spark.getActiveSession()


# Initial load
We will create a slowly changing dimension type 2 called dimSalesRep based on a sourceTable in our operational database called dbo.salesrep. SCD2  tables demand extra care and  because we will store hirstorical values of the dimension with the help of ranges.
This notebook will create the table and fill it with the initial data. A second notebook will be used for increments of new and changed data.

This is an example of the expected output (salesRepSK is different
```
+----------+-------------+-------------+-----------+-------------------+-------------------+--------------------+-------+
|salesRepID|         name|       office| salesRepSK|          scd_start|            scd_end|                 md5|current|
+----------+-------------+-------------+-----------+-------------------+-------------------+--------------------+-------+
|a46add1...|      Z. Jane|     New York|          0|1990-01-01 00:00:00|2100-12-12 00:00:00|303db545462092a92...|   true|
|s1fedf1...|   P. Chapman|       Berlin|          1|1990-01-01 00:00:00|2100-12-12 00:00:00|14b094c31bf9e4149...|   true|
|d5e6f77...|     T. Crane|     New York|          2|1990-01-01 00:00:00|2100-12-12 00:00:00|6c062f95defda9dc3...|   true|
```




In [17]:
cc.set_connectionProfile("tutorial_op")

df_operational_sales_rep = spark.read \
    .format("jdbc") \
    .option("driver" , cc.get_Property("driver")) \
    .option("url", cc.create_jdbc()) \
    .option("dbtable", "salesrep") \
    .option("user", cc.get_Property("username")) \
    .option("password", cc.get_Property("password")) \
    .option("partitionColumn", "salesRepID") \
    .option("numPartitions", 4) \
    .option("lowerBound", 0) \
    .option("upperBound", 20) \
    .load()

#Method 1 : Use the DataFrame API
#df_dim_sales_rep = df_sales_rep.withColumn("salesRepSK", expr("uuid()")) \
#    .withColumn("scd_start", lit("1990-01-01").cast("timestamp")) \
#    .withColumn("scd_end", lit("2100-12-12").cast("timestamp")) \
#    .withColumn("md5", md5(concat( col('name'), col('office')))) \
#    .withColumn("current", lit(True))

#Method 2 : Use SQL
df_operational_sales_rep.createOrReplaceTempView("dimSalesRep")
df_dim_sales_rep = spark.sql("select uuid() as salesRepSK, *, to_timestamp('1999-01-01','yyyy-MM-dd') as scd_start, to_timestamp('2100-12-12','yyyy-MM-dd') as scd_end, md5(concat( name, office)) as md5, True as current  from dimSalesRep")

df_dim_sales_rep.printSchema()
df_dim_sales_rep.show()

# Define the path to save the Delta table
delta_salesrep_table_path = "./spark-warehouse/dimSalesRep"

# Save df_dim_sales_rep DataFrame as a Delta table
df_dim_sales_rep.write.format("delta").mode("overwrite").save(delta_salesrep_table_path)

#spark.sql("ALTER TABLE dimSalesRep  ADD columns (salesRepSK long GENERATED ALWAYS AS IDENTITY (START WITH 0 INCREMENT BY 1)")

root
 |-- salesRepSK: string (nullable = false)
 |-- salesrepid: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- office: string (nullable = true)
 |-- scd_start: timestamp (nullable = true)
 |-- scd_end: timestamp (nullable = true)
 |-- md5: string (nullable = true)
 |-- current: boolean (nullable = false)

+--------------------+----------+-------------+-------------+-------------------+-------------------+--------------------+-------+
|          salesRepSK|salesrepid|         name|       office|          scd_start|            scd_end|                 md5|current|
+--------------------+----------+-------------+-------------+-------------------+-------------------+--------------------+-------+
|66a4faaf-1655-4bb...|         1|      R. Zane|       Berlin|1999-01-01 00:00:00|2100-12-12 00:00:00|1f8cbbc272a33dcc1...|   true|
|67f698be-a1e7-46f...|         2|   P. Chapman|       Berlin|1999-01-01 00:00:00|2100-12-12 00:00:00|14b094c31bf9e4149...|   true|
|daedb4af-6a2c-4a

* The function lit() is used when you want a fixed column value for every row. In this case scd_start, scd_end and current.
* The function md5() performs a md5-hash function on the preferred columns. This is needed to detect scd2 changes. When one of the included columns changes, the md5-hash will change. Include all SCD2 columns in the md5-hash function.

## Delete the spark session

In [18]:
spark.stop()