# Configuration

In [1]:
# ConnectionConfig is a project-local helper module; in this notebook it supplies
# the connection profile, the JDBC URL/properties and the local cluster start-up.
import ConnectionConfig as cc
# NOTE(review): DeltaTable is not referenced in the cells shown in this notebook.
from delta import DeltaTable
# Project helper; presumably prepares the local environment for Spark -- TODO confirm in ConnectionConfig.
cc.setupEnvironment()

# Start local cluster

In [2]:
# Start the Spark session through the project helper. "FACT_RIDES" is the argument
# handed to the helper (presumably the application name -- TODO confirm).
spark = cc.startLocalCluster("FACT_RIDES")
# Last expression of the cell: displays the currently active session.
spark.getActiveSession()

# Create facts table: rides

## Read from sources

### Read from VeloDB database

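Should filtering and joining happen in the source database or in the Spark environment? Current choice: the rides table is read on its own, while locks and stations are joined in the source query (to get a zipcode per lock); that result is joined to the rides later in the Spark environment. This keeps the number of database operations low.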

In [9]:
#EXTRACT

cc.set_connectionProfile("VeloDB")

# Partitioning settings for the JDBC reads. Spark uses lowerBound/upperBound only to
# compute the partition stride on the partition column; they do not filter rows.
JDBC_NUM_PARTITIONS = 4
RIDES_RIDEID_UPPER_BOUND = 4140000
LOCKS_LOCKID_UPPER_BOUND = 8000


def read_velodb_query(query, partition_column, upper_bound, lower_bound=0,
                      num_partitions=JDBC_NUM_PARTITIONS):
    """Read the result of a SQL query over JDBC as a partitioned DataFrame.

    URL, driver, user and password are taken from ConnectionConfig (``cc``), so the
    connection profile must have been selected before calling this function.

    Args:
        query: SELECT statement to run in the source database (without the
            surrounding parentheses or alias; those are added here).
        partition_column: Numeric column of the query result used to split the read.
        upper_bound: Upper end of the range used to compute the partition stride.
        lower_bound: Lower end of the range used to compute the partition stride.
        num_partitions: Number of partitions (parallel JDBC reads).

    Returns:
        The DataFrame produced by the JDBC reader for the query.
    """
    return (
        spark.read
        .format("jdbc")
        .option("url", cc.create_jdbc())
        .option("driver", cc.get_Property("driver"))
        # The JDBC source accepts a parenthesised subquery with an alias as "dbtable".
        .option("dbtable", f"({query}) as subq")
        .option("user", cc.get_Property("username"))
        .option("password", cc.get_Property("password"))
        .option("partitionColumn", partition_column)
        .option("numPartitions", num_partitions)
        .option("lowerBound", lower_bound)
        .option("upperBound", upper_bound)
        .load()
    )


# Read rides table from source VeloDB database
rides_source_df = read_velodb_query(
    "select rideid, starttime, endtime, subscriptionid, startlockid, endlockid from rides",
    partition_column="rideid",
    upper_bound=RIDES_RIDEID_UPPER_BOUND,
)

# Next read operation performs a join between the locks and stations source tables.
# The goal is to retrieve a zipcode for each lockid. This zipcode is used to link the
# weather dimension with the facts table.
locks_with_zip = read_velodb_query(
    "select l.lockid, s.zipcode from locks l "
    "left outer join stations s on l.stationid = s.stationid",
    partition_column="lockid",
    upper_bound=LOCKS_LOCKID_UPPER_BOUND,
)

locks_with_zip.show(10)

+------+-------+
|lockid|zipcode|
+------+-------+
|     1|   2000|
|     2|   2000|
|     3|   2000|
|     4|   2000|
|     5|   2000|
|     6|   2000|
|     7|   2000|
|     8|   2000|
|     9|   2000|
|    10|   2000|
+------+-------+
only showing top 10 rows



In [10]:
# Preview the first rows of the rides extract.
# NOTE(review): the recorded output shows endtime (2012) earlier than starttime (2015) -- verify the source data.
rides_source_df.show(10)

+------+-------------------+-------------------+--------------+-----------+---------+
|rideid|          starttime|            endtime|subscriptionid|startlockid|endlockid|
+------+-------------------+-------------------+--------------+-----------+---------+
|     1|2015-09-22 00:00:00|2012-09-22 00:00:00|         13296|       4849|     3188|
|     2|2015-09-22 00:00:00|2012-09-22 00:00:00|         45924|       NULL|     NULL|
|     3|2015-09-22 00:00:00|2012-09-22 00:00:00|         25722|       2046|     1951|
|     4|2015-09-22 00:00:00|2012-09-22 00:00:00|         31000|       1821|     2186|
|     5|2015-09-22 00:00:00|2012-09-22 00:00:00|         59732|       6382|     2700|
|     6|2015-09-22 00:00:00|2012-09-22 00:00:00|          NULL|       NULL|     NULL|
|     7|2015-09-22 00:00:00|2012-09-22 00:00:00|         31055|       1388|     3401|
|     8|2015-09-22 00:00:00|2012-09-22 00:00:00|         65164|       2572|       13|
|     9|2015-09-22 00:00:00|2012-09-22 00:00:00|      

### Read from deltatables

In [4]:
#EXTRACT
# Load both dimension tables from their Delta locations under spark-warehouse/.
dim_date = spark.read.load("spark-warehouse/dimdate", format="delta")        # date dimension
dim_weather = spark.read.load("spark-warehouse/dimweather", format="delta")  # weather dimension

### Read from weather data source

In [5]:
#EXTRACT
# Read the JSON files in the "weather" folder; multiLine allows one JSON record
# to span several lines of a file.
weather_responses = spark.read.json("weather", multiLine=True)
weather_responses.show(10)

+--------+------+---+--------------+----------+-------+--------------------+-----+------+--------------------+--------+----------+--------------------+-----------------+-------+
|    base|clouds|cod|         coord|        dt|     id|                main| name|  rain|                 sys|timezone|visibility|             weather|             wind|zipCode|
+--------+------+---+--------------+----------+-------+--------------------+-----+------+--------------------+--------+----------+--------------------+-----------------+-------+
|stations| {100}|200|{44.34, 10.99}|1730638800|3163858|{298.74, 933, 64,...|Zocca|{3.16}|{IT, 2075663, 166...|    7200|     10000|[{overcast clouds...|{349, 1.18, 0.62}|   2000|
|stations| {100}|200|{44.34, 10.99}|1730912400|3163858|{298.74, 933, 64,...|Zocca|{3.16}|{IT, 2075663, 166...|    7200|     10000|[{overcast clouds...|{349, 1.18, 0.62}|   2018|
|stations| {100}|200|{44.34, 10.99}|1731160800|3163858|{298.74, 933, 64,...|Zocca|{3.16}|{IT, 2075663, 166...|

## Create tempviews

In [11]:
# Register each DataFrame under the view name used by the Spark SQL queries.
temp_views = {
    "ridesSource": rides_source_df,         # rides source table
    "locksZip": locks_with_zip,             # table with lockid's and zipcodes
    "dimDate": dim_date,                    # dimension date
    "dimWeather": dim_weather,              # dimension weather
    "weatherResponses": weather_responses,  # weather responses
}
for view_name, frame in temp_views.items():
    frame.createOrReplaceTempView(view_name)

## Add zipcodes to rides table

Join `ridesSource` with `locksZip` on `startlockid = lockid` to add the zipcode of the start lock to every ride.

In [15]:
# TRANSFORM
# Add the zipcode of the start lock to every ride.
# - Left outer join: rides whose startlockid has no match (or is NULL) are kept
#   with a NULL startlockZipcode.
# - Only the start lock is resolved to a zipcode; endlockid is passed through as-is.
# NOTE(review): rides_with_zipcodes is not referenced by the later cells shown in this notebook.
rides_with_zipcodes = spark.sql("select src.rideid, src.starttime, src.endtime, src.subscriptionid, src.startlockid, \
                                 src.endlockid, lz.zipcode as startlockZipcode \
                                 from ridesSource as src \
                                 left outer join locksZip as lz on src.startlockid = lz.lockid")
# Preview the joined result.
rides_with_zipcodes.show(10)

+------+-------------------+-------------------+--------------+-----------+---------+----------------+
|rideid|          starttime|            endtime|subscriptionid|startlockid|endlockid|startlockZipcode|
+------+-------------------+-------------------+--------------+-----------+---------+----------------+
|     4|2015-09-22 00:00:00|2012-09-22 00:00:00|         31000|       1821|     2186|            2018|
|     9|2015-09-22 00:00:00|2012-09-22 00:00:00|         71164|         50|     2067|            2000|
|     1|2015-09-22 00:00:00|2012-09-22 00:00:00|         13296|       4849|     3188|            2140|
|     5|2015-09-22 00:00:00|2012-09-22 00:00:00|         59732|       6382|     2700|            2660|
|    11|2015-09-22 00:00:00|2012-09-22 00:00:00|           999|        985|     2148|            2018|
|     2|2015-09-22 00:00:00|2012-09-22 00:00:00|         45924|       NULL|     NULL|            NULL|
|     3|2015-09-22 00:00:00|2012-09-22 00:00:00|         25722|       204

## Transform weather responses table

In [8]:
#TRANSFORM
# Project every weather response down to four columns:
#   zipCode              -> zip_code
#   dt                   -> timestamp
#   weather.id[0]        -> condition_id (id of the first entry in the weather array)
#   main.temp            -> temperature
# NOTE(review): in the sample output dt looks like epoch seconds and temperature like Kelvin -- confirm.
short_weather_responses = spark.sql("select zipCode as zip_code, dt as timestamp, weather.id[0] as condition_id, main.temp as temperature from weatherResponses")
short_weather_responses.show(10)

+--------+----------+------------+-----------+
|zip_code| timestamp|condition_id|temperature|
+--------+----------+------------+-----------+
|    2000|1730638800|         804|     298.48|
|    2018|1730912400|         804|     298.48|
|    2020|1731160800|         804|     298.48|
|    2030|1730638800|         804|     298.48|
|    2050|1730912400|         804|     298.48|
|    2060|1731160800|         804|     298.48|
|    2100|1730638800|         804|     298.48|
|    2140|1730912400|         804|     298.48|
|    2170|1731160800|         804|     298.48|
|    2600|1730638800|         804|     298.48|
+--------+----------+------------+-----------+
only showing top 10 rows



In [10]:
# Expose the trimmed weather data to Spark SQL under the name "shortWeatherResponses".
short_weather_responses.createOrReplaceTempView("shortWeatherResponses")

## Build facts table

Link to the date dimension:
- Should endtime also get a surrogate key (SK) to the date dimension? That adds an extra column --> not good in a fact table!
- Possibly combine the link to starttime and endtime in the single date_SK column?

In [9]:
#TRANSFORM
# Build the fact rows: ride id plus the surrogate key of the ride's start date.
# - Both sides of the join are cast to DATE, so only the calendar day of starttime is matched.
# - Left outer join: a ride whose start date has no match in dimDate keeps a NULL date_SK.
# - Rides without a subscriptionid are excluded.
# NOTE(review): yields one row per ride only if CalendarDate is unique in dimDate -- confirm.
rides_fact_df = spark.sql("select src.rideid as ride_ID, dd.date_SK \
                          from ridesSource as src \
                          left outer join dimDate as dd on cast(src.starttime as DATE) = cast(dd.CalendarDate as DATE) \
                          where src.subscriptionid is not null ")

# Preview the first 50 fact rows.
rides_fact_df.show(50)

+-------+-------+
|ride_ID|date_SK|
+-------+-------+
|      1|   1096|
|      2|   1096|
|      3|   1096|
|      4|   1096|
|      5|   1096|
|      7|   1096|
|      8|   1096|
|      9|   1096|
|     10|   1096|
|     11|   1096|
|     12|   1096|
|     13|   1096|
|     14|   1096|
|     15|   2557|
|     16|   2557|
|     17|   2557|
|     18|   2557|
|     19|   2557|
|     21|   2557|
|     22|   2557|
|     23|   2557|
|     24|   2557|
|     25|   2557|
|     26|   2557|
|     27|   2557|
|     28|   2557|
|     29|   2557|
|     31|   2557|
|     32|   2557|
|     33|   2557|
|     34|   2557|
|     35|   2557|
|     36|   2557|
|     37|   2557|
|     38|   2557|
|     39|   2557|
|     40|   2557|
|     41|   2557|
|     42|   2557|
|     43|   2557|
|     44|   2557|
|     45|   2557|
|     46|   2557|
|     47|   2557|
|     48|   2557|
|     49|   2557|
|     50|   2557|
|     51|   2557|
|     53|   2557|
|     54|   2557|
+-------+-------+
only showing top 50 rows



In [9]:
# Shut down the Spark session.
# NOTE(review): the recorded run of this cell ended in ConnectionRefusedError; possibly the
# session/JVM was already gone when the cell was (re-)run -- confirm before relying on this cell.
spark.stop()

ConnectionRefusedError: [WinError 10061] Kan geen verbinding maken omdat de doelcomputer de verbinding actief heeft geweigerd