# Config stuff

In [1]:
import ConnectionConfig as cc
from delta import DeltaTable
cc.setupEnvironment()

# Start local cluster

In [2]:
spark = cc.startLocalCluster("FACT_RIDES")
spark.getActiveSession()

# Create facts table: rides

## Read from sources

### Read from VeloDB database

In [3]:
#EXTRACT

cc.set_connectionProfile("VeloDB")

# Read rides table from source VeloDB database
# Only rows after 2019-01-01 will be read because older rows are corrupt
rides_source_df = spark.read \
    .format("jdbc") \
    .option("url", cc.create_jdbc()) \
    .option("driver" , cc.get_Property("driver")) \
    .option("dbtable", "(select rideid, starttime, endtime, subscriptionid, startlockid, endlockid, round(haversine_km(startpoint[0] :: numeric, startpoint[1]:: numeric, endpoint[0]:: numeric, endpoint[1]:: numeric),3) as distance_km_MV from rides where starttime > '2019-01-01') as subq") \
    .option("user", cc.get_Property("username")) \
    .option("password", cc.get_Property("password")) \
    .option("partitionColumn", "rideid") \
    .option("numPartitions", 4) \
    .option("lowerBound", 0) \
    .option("upperBound", 4140000) \
    .load()

rides_source_df.show(10)

### Read from deltatables

In [5]:
#EXTRACT
# Dimension date
dim_date = spark.read.format("delta").load("spark-warehouse/dimdate")

# Dimension weather
dim_weather = spark.read.format("delta").load("spark-warehouse/dimweather")

# Dimension customer
dim_customer = spark.read.format("delta").load("spark-warehouse/dimuser")

# Dimension lock
dim_lock = spark.read.format("delta").load("spark-warehouse/dimlock")

### Read from weather data source

In [6]:
#EXTRACT
weather_responses = spark.read.format("json").option("multiLine",True).load("weather")
weather_responses.show(10)

+--------+------+---+--------------+----------+-------+--------------------+-----+------+--------------------+--------+----------+--------------------+-----------------+-------+
|    base|clouds|cod|         coord|        dt|     id|                main| name|  rain|                 sys|timezone|visibility|             weather|             wind|zipCode|
+--------+------+---+--------------+----------+-------+--------------------+-----+------+--------------------+--------+----------+--------------------+-----------------+-------+
|stations| {100}|200|{44.34, 10.99}|1583593967|3163858|{298.74, 933, 64,...|Zocca|{3.16}|{IT, 2075663, 166...|    7200|     10000|[{overcast clouds...|{349, 1.18, 0.62}|   2000|
|stations| {100}|200|{44.34, 10.99}|1583132121|3163858|{298.74, 933, 64,...|Zocca|{3.16}|{IT, 2075663, 166...|    7200|     10000|[{overcast clouds...|{349, 1.18, 0.62}|   2018|
|stations| {100}|200|{44.34, 10.99}|1583134645|3163858|{298.74, 933, 64,...|Zocca|{3.16}|{IT, 2075663, 166...|

## Create tempviews

In [7]:
# Rides source table
rides_source_df.createOrReplaceTempView("ridesSource")

# Dimension date
dim_date.createOrReplaceTempView("dimDate")

# Dimension weather
dim_weather.createOrReplaceTempView("dimWeather")

# Weather responses
weather_responses.createOrReplaceTempView("weatherResponses")

# Dimension customer
dim_customer.createOrReplaceTempView("dimCustomer")

# Dimension lock
dim_lock.createOrReplaceTempView("dimLock")

## Transform weather responses table

Add weather_ID based on the weather type (condition_id)

Condition id:
- < 800: All codes with a number smaller than 800 means rain in some form. (=onaangenaam code 2)
- = 800: This code means clear sky and sunshine (=aangenaam code 1 if temperature is higher than 15 degrees Celsius)
- \> 800: All other weather conditions (Neutraal code 3)

In [8]:
#TRANSFORM
short_weather_responses = spark.sql("select zipCode as zip_code, dt as timestamp, weather.id[0] as condition_id, main.temp as temperature, \
                                    case \
                                        when condition_id < 800 then 2 \
                                        when condition_id = 800 and main.temp > (273 + 15) then 1 \
                                        when condition_id = 800 and main.temp < (273 + 15) then 3 \
                                        when condition_id > 800 then 3 \
                                    else 4 \
                                    end as weather_ID \
                                    from weatherResponses")
short_weather_responses.show(10)

+--------+----------+------------+-----------+----------+
|zip_code| timestamp|condition_id|temperature|weather_ID|
+--------+----------+------------+-----------+----------+
|    2000|1583593967|         804|     298.48|         3|
|    2018|1583132121|         804|     298.48|         3|
|    2020|1583134645|         804|     298.48|         3|
|    2030|1585973548|         804|     298.48|         3|
|    2050|1569163778|         804|     298.48|         3|
|    2060|1569968446|         804|     298.48|         3|
|    2100|1575264035|         804|     298.48|         3|
|    2140|1619936979|         804|     298.48|         3|
|    2170|1656838375|         804|     298.48|         3|
|    2600|1659778730|         804|     298.48|         3|
+--------+----------+------------+-----------+----------+
only showing top 10 rows



In [9]:
short_weather_responses.createOrReplaceTempView("shortWeatherResponses")

## Build facts table

In [16]:
#TRANSFORM
rides_fact_df = spark.sql("select src.rideid as ride_ID, dd.date_SK, \
                          coalesce(dw.weather_SK, 3) as weather_SK, \
                          dls.lock_SK as start_lock_SK, dle.lock_SK as end_lock_SK, \
                          1 as count_MV, \
                          (unix_timestamp(endtime) - unix_timestamp(starttime)) as rideDuration_MV, \
                          src.distance_km_MV as distance_km_MV,\
                          md5(concat(src.rideid, dd.date_SK, start_lock_SK, end_lock_SK, coalesce(dw.weather_SK, 3), 1, rideDuration_MV, distance_km_MV)) as md5 \
                          from ridesSource as src \
                          left outer join dimDate as dd \
                          on cast(src.starttime as DATE) = cast(dd.CalendarDate as DATE) \
                          left outer join dimLock dls on src.startlockid = dls.lockid \
                          left outer join dimLock dle on src.endlockid = dle.lockid \
                          left outer join shortWeatherResponses wr on dls.zipcode = string(wr.zip_code) \
                          and date_format(src.starttime, 'yyyy-MM-dd-HH') = date_format(from_unixtime(wr.timestamp),'yyyy-MM-dd-HH') \
                          left outer join dimWeather dw on dw.weather_id = wr.weather_ID \
                          left outer join dimCustomer dc on src.subscriptionid = dc.subscriptionid \
                          where src.subscriptionid is not null and dc.userid is not null")

rides_fact_df.show(10)

+-------+-------+----------+--------------------+--------------------+--------+---------------+--------------------+--------------------+
|ride_ID|date_SK|weather_SK|       start_lock_SK|         end_lock_SK|count_MV|rideDuration_MV|      distance_km_MV|                 md5|
+-------+-------+----------+--------------------+--------------------+--------+---------------+--------------------+--------------------+
|     15|     21|         3|64507040-7b6b-42d...|1f447769-441f-4e8...|       1|            893|3.443000000000000000|2290290ec6874a54b...|
|     17|     21|         3|f1ea858c-14ea-4e6...|959ed69c-1e67-40b...|       1|            167|0.699000000000000000|494b0a0f426b5cd20...|
|     19|     21|         3|f702d617-2669-457...|12e45297-054d-43c...|       1|           1134|4.884000000000000000|5da055d4ec0db7d91...|
|     22|     21|         3|a0d504f4-0da9-49f...|11ce321f-29d9-489...|       1|            335|1.402000000000000000|13f5f3662abd640fd...|
|     24|     21|         3|      

### Create temview from facts table

In [12]:
rides_fact_df.createOrReplaceTempView("factRides_new")

## Write facts table to delta table: Initial load

In [13]:
# LOAD
rides_fact_df.write.format("delta").mode("overwrite").saveAsTable("factRides")

## Incremental load

In [17]:
#LOAD
dt_factRides = DeltaTable.forPath(spark,".\spark-warehouse\\factrides")
dt_factRides.toDF().createOrReplaceTempView("factRides_current")

result = spark.sql("merge into factRides_current as target \
                   using factRides_new as source on target.ride_ID = source.ride_ID \
                   when matched and (target.md5 <> source.md5) then update set * \
                   when not matched then insert *")

result.show()

+-----------------+----------------+----------------+-----------------+
|num_affected_rows|num_updated_rows|num_deleted_rows|num_inserted_rows|
+-----------------+----------------+----------------+-----------------+
|                0|               0|               0|                0|
+-----------------+----------------+----------------+-----------------+



In [13]:
spark.stop()

ConnectionRefusedError: [WinError 10061] Kan geen verbinding maken omdat de doelcomputer de verbinding actief heeft geweigerd