In [8]:
from pyspark.sql import SparkSession
from delta import configure_spark_with_delta_pip
from pyspark.sql import Row
import ConnectionConfig as cc
# Project-local setup hook from ConnectionConfig, run once right after the imports.
# NOTE(review): presumably prepares environment variables/paths that Spark needs — confirm in ConnectionConfig.
cc.setupEnvironment()


In [9]:
# Obtain the Spark session for this notebook from the project helper.
# NOTE(review): "DIM_SLOT" is presumably the application name, and "default 4" presumably
# refers to a default parallelism argument of startLocalCluster — confirm in ConnectionConfig.
spark = cc.startLocalCluster("DIM_SLOT") # default 4
# Last expression of the cell, so the notebook displays the session that is active in this thread.
spark.getActiveSession()

In [10]:
# EXTRACT
# Select the connection profile that cc.get_Property / cc.create_jdbc read from.
cc.set_connectionProfile("VeloDB")


def read_operational_table(table, partition_column, num_partitions=4, lower_bound=0, upper_bound=20):
    """Read one operational table over JDBC as a partitioned DataFrame.

    Args:
        table: Name of the source table (passed as the JDBC ``dbtable`` option).
        partition_column: Numeric column Spark uses to split the read into
            parallel range queries.
        num_partitions: Number of parallel range queries / result partitions.
        lower_bound: Lower bound used to compute the partition stride.
        upper_bound: Upper bound used to compute the partition stride.

    Returns:
        A lazily evaluated DataFrame over the whole table.

    Note:
        The bounds only determine the partition stride; they do not filter
        rows. Rows outside [lower_bound, upper_bound] all land in the first or
        last partition.
        NOTE(review): with the defaults 0..20 and ids that run into the
        thousands, almost every row ends up in the last partition — consider
        passing the real min/max of the partition column.
    """
    return (
        spark.read
        .format("jdbc")
        .option("driver", cc.get_Property("driver"))
        .option("url", cc.create_jdbc())
        .option("dbtable", table)
        .option("user", cc.get_Property("username"))
        .option("password", cc.get_Property("password"))
        .option("partitionColumn", partition_column)
        .option("numPartitions", num_partitions)
        .option("lowerBound", lower_bound)
        .option("upperBound", upper_bound)
        .load()
    )


df_operational_slots = read_operational_table("locks", "lockid")
df_operational_stations = read_operational_table("stations", "stationid")


In [11]:
# Expose the operational tables to Spark SQL under the view names
# "dimLocks" (locks) and "dimStations" (stations).
for view_name, source_frame in (
    ("dimLocks", df_operational_slots),
    ("dimStations", df_operational_stations),
):
    source_frame.createOrReplaceTempView(view_name)

In [12]:
# TRANSFORM
# Build the lock dimension: every lock enriched with the attributes of its station.
# The join reads the source DataFrames directly instead of the "dimLocks" temp view,
# because this cell re-registers "dimLocks" further down. Going through the view would
# make a second run of this cell resolve l.stationId against the dimension itself
# (which has no stationId column) and fail.
locks = df_operational_slots.alias("l")
stations = df_operational_stations.alias("s")
df_dim_locks = (
    locks
    .join(stations, on=locks["stationId"] == stations["stationId"], how="left")
    .selectExpr(
        # Surrogate key: unique and increasing, but not guaranteed to be consecutive.
        "monotonically_increasing_id() as lockSK",
        "l.lockId",
        "l.stationlocknr",
        "s.stationnr",
        "s.type",
        "s.street",
        "s.number",
        "s.zipcode",
        "s.district",
        "s.labelid",
    )
)

# TRANSFORM
# Add one placeholder ("geen" = none) member so facts without a lock can still
# reference the dimension. Its surrogate key is one past the current maximum.
max_lock_SK = df_dim_locks.agg({"lockSK": "max"}).collect()[0][0]
# max() over an empty dimension yields None; start the key range at 0 in that case.
geen_slot_SK = 0 if max_lock_SK is None else max_lock_SK + 1

dim_columns = (
    "lockSK", "lockId", "stationlocknr", "stationnr", "type",
    "street", "number", "zipcode", "district", "labelid",
)
geen_values = (
    geen_slot_SK, "geen lockId", "geen stationlocknr", "geen station", "geen type",
    "geen straat", "geen nr", "geen zipcd", "geen district", "geen labelid",
)
# Row(*names)(*values) keeps the column order, which matters because union() is positional.
row = Row(*dim_columns)(*geen_values)

df_geen_slot = spark.createDataFrame([row])

# Append the placeholder exactly once; a second union would write a duplicate
# placeholder row (same surrogate key) into the persisted dimension.
df_dim_locks = df_dim_locks.union(df_geen_slot)
df_dim_locks.createOrReplaceTempView("dimLocks")

# Sanity check: the placeholder row must show up exactly once.
spark.sql("select * from dimLocks where stationnr=='geen station'").show()

+------+-----------+------------------+------------+---------+-----------+-------+----------+-------------+------------+
|lockSK|     lockId|     stationlocknr|   stationnr|     type|     street| number|   zipcode|     district|     labelid|
+------+-----------+------------------+------------+---------+-----------+-------+----------+-------------+------------+
|  7542|geen lockId|geen stationlocknr|geen station|geen type|geen straat|geen nr|geen zipcd|geen district|geen labelid|
+------+-----------+------------------+------------+---------+-----------+-------+----------+-------------+------------+



In [13]:
# LOAD
# Persist the lock dimension in Delta format, overwriting what is already stored at the path.
delta_table_path = "spark-warehouse/dimLocks"
(
    df_dim_locks.write
    .format("delta")
    .mode("overwrite")
    .save(delta_table_path)
)

In [14]:
# Preview the dimension: first 20 rows, long cell values truncated (the show() defaults).
df_dim_locks.show(n=20, truncate=True)

+------+------+-------------+---------+------------+--------------------+------+-------+---------+-------+
|lockSK|lockId|stationlocknr|stationnr|        type|              street|number|zipcode| district|labelid|
+------+------+-------------+---------+------------+--------------------+------+-------+---------+-------+
|     0|     1|            1|      026|DUBBELZIJDIG|         Meir (2000)|    84|   2000|ANTWERPEN|   NULL|
|     1|     2|            2|      026|DUBBELZIJDIG|         Meir (2000)|    84|   2000|ANTWERPEN|   NULL|
|     2|     3|            3|      026|DUBBELZIJDIG|         Meir (2000)|    84|   2000|ANTWERPEN|   NULL|
|     3|     4|            4|      026|DUBBELZIJDIG|         Meir (2000)|    84|   2000|ANTWERPEN|   NULL|
|     4|     5|            5|      026|DUBBELZIJDIG|         Meir (2000)|    84|   2000|ANTWERPEN|   NULL|
|     5|     6|            6|      026|DUBBELZIJDIG|         Meir (2000)|    84|   2000|ANTWERPEN|   NULL|
|     6|     7|            7|      02

In [15]:
# Shut down the Spark session; cells above must be re-run from the session start to use Spark again.
spark.stop()