# Config stuff

In [1]:
import ConnectionConfig as cc
from delta import DeltaTable
# Project helper from ConnectionConfig; called here before the Spark session is started in the next cell.
# NOTE(review): what exactly it configures is not visible in this notebook — confirm in ConnectionConfig.
cc.setupEnvironment()

# Start local cluster

In [2]:
# Start the local cluster through the ConnectionConfig helper.
# "FACT_RIDES" is the name handed to the helper (presumably the Spark app name — verify in ConnectionConfig).
cluster_name = "FACT_RIDES"
spark = cc.startLocalCluster(cluster_name)

# Trailing expression so the cell renders the active session.
spark.getActiveSession()

# Create facts table: rides

## Read from sources

### Read from VeloDB database

In [3]:
#EXTRACT
# Point the ConnectionConfig helpers at the VeloDB profile before reading any property from it.
cc.set_connectionProfile("VeloDB")

# All JDBC reader settings in one place.
# partitionColumn / numPartitions / lowerBound / upperBound split the read into
# parallel range queries on rideid; the bounds only set the partition stride,
# they do not filter rows.
jdbc_options = {
    "url": cc.create_jdbc(),
    "driver": cc.get_Property("driver"),
    "dbtable": "rides",
    "user": cc.get_Property("username"),
    "password": cc.get_Property("password"),
    "partitionColumn": "rideid",
    "numPartitions": 4,
    "lowerBound": 0,
    "upperBound": 4140000,
}
rides_source_df = spark.read.format("jdbc").options(**jdbc_options).load()

# Preview the first rows of the source table.
rides_source_df.show(10)

+------+-----------------+-----------------+-------------------+-------------------+---------+--------------+-----------+---------+
|rideid|       startpoint|         endpoint|          starttime|            endtime|vehicleid|subscriptionid|startlockid|endlockid|
+------+-----------------+-----------------+-------------------+-------------------+---------+--------------+-----------+---------+
|     1|(51.2083,4.44595)|(51.1938,4.40228)|2015-09-22 00:00:00|2012-09-22 00:00:00|      844|         13296|       4849|     3188|
|     2|(51.2174,4.41597)|(51.2188,4.40935)|2015-09-22 00:00:00|2012-09-22 00:00:00|     4545|         45924|       NULL|     NULL|
|     3|(51.2088,4.40834)|(51.2077,4.39846)|2015-09-22 00:00:00|2012-09-22 00:00:00|     3419|         25722|       2046|     1951|
|     4|(51.2023,4.41208)|(51.2119,4.39894)|2015-09-22 00:00:00|2012-09-22 00:00:00|     1208|         31000|       1821|     2186|
|     5|(51.1888,4.45039)|(51.2221,4.40467)|2015-09-22 00:00:00|2012-09-22 0

### Read from deltatables

In [4]:
#EXTRACT
# Load the dimension tables from the local Delta warehouse directory.

# Dimension date
dim_date = spark.read.load("spark-warehouse/dimdate", format="delta")

# Dimension weather
dim_weather = spark.read.load("spark-warehouse/dimweather", format="delta")

### Read from weather data source

In [16]:
#EXTRACT
# Read the JSON files under "weather"; multiLine lets a single JSON document span several lines.
weather_reader = spark.read.option("multiLine", True)
weather_responses = weather_reader.json("weather")

# Preview the first rows of the raw responses.
weather_responses.show(10)

+--------+------+---+--------------+----------+-------+--------------------+-----+------+--------------------+--------+----------+--------------------+-----------------+-------+
|    base|clouds|cod|         coord|        dt|     id|                main| name|  rain|                 sys|timezone|visibility|             weather|             wind|zipCode|
+--------+------+---+--------------+----------+-------+--------------------+-----+------+--------------------+--------+----------+--------------------+-----------------+-------+
|stations| {100}|200|{44.34, 10.99}|1730638800|3163858|{298.74, 933, 64,...|Zocca|{3.16}|{IT, 2075663, 166...|    7200|     10000|[{overcast clouds...|{349, 1.18, 0.62}|   2000|
|stations| {100}|200|{44.34, 10.99}|1730912400|3163858|{298.74, 933, 64,...|Zocca|{3.16}|{IT, 2075663, 166...|    7200|     10000|[{overcast clouds...|{349, 1.18, 0.62}|   2018|
|stations| {100}|200|{44.34, 10.99}|1731160800|3163858|{298.74, 933, 64,...|Zocca|{3.16}|{IT, 2075663, 166...|

## Create tempviews

In [7]:
# Register every input frame as a temp view so the SQL cells below can refer to it by name.
temp_views = {
    "ridesSource": rides_source_df,        # rides source table
    "dimDate": dim_date,                   # dimension date
    "dimWeather": dim_weather,             # dimension weather
    "weatherResponses": weather_responses, # weather responses
}
for view_name, frame in temp_views.items():
    frame.createOrReplaceTempView(view_name)

## Transform weather responses table

In [19]:
#TRANSFORM
# Keep only the fields needed downstream: zip code, timestamp,
# the id of the first weather entry, and the temperature from the main struct.
weather_projection_sql = (
    "select zipCode as zip_code, dt as timestamp, "
    "weather.id[0] as condition_id, main.temp as temperature "
    "from weatherResponses"
)
short_weather_responses = spark.sql(weather_projection_sql)

# Preview the trimmed result.
short_weather_responses.show(10)

+--------+----------+------------+-----------+
|zip_code| timestamp|condition_id|temperature|
+--------+----------+------------+-----------+
|    2000|1730638800|         804|     298.48|
|    2018|1730912400|         804|     298.48|
|    2020|1731160800|         804|     298.48|
|    2030|1730638800|         804|     298.48|
|    2050|1730912400|         804|     298.48|
|    2060|1731160800|         804|     298.48|
|    2100|1730638800|         804|     298.48|
|    2140|1730912400|         804|     298.48|
|    2170|1731160800|         804|     298.48|
|    2600|1730638800|         804|     298.48|
+--------+----------+------------+-----------+
only showing top 10 rows



In [20]:
# Expose the trimmed weather frame to Spark SQL under the view name "shortWeatherResponses".
short_weather_responses.createOrReplaceTempView("shortWeatherResponses")

## Build facts table

Koppeling met date dimensie:
- Moet endtime ook een SK hebben naar de date dimensie? Want zorgt voor extra kolom --> Niet goed in feitentabel!
- Eventueel van de kolom date_SK de koppeling naar starttime en endtime samenvoegen?

In [22]:
#TRANSFORM
# Build the rides fact rows: every ride that has a subscription is matched to the
# date dimension on the calendar day of its start time.
# The left outer join keeps rides whose start date has no dimDate row (date_SK is then NULL).
# The query is a triple-quoted string, so it needs no backslash line continuations
# inside the literal; a stray space after such a backslash would break the cell.
rides_fact_sql = """
    select src.rideid as ride_ID, dd.date_SK
    from ridesSource as src
    left outer join dimDate as dd
        on cast(src.starttime as DATE) = cast(dd.CalendarDate as DATE)
    where src.subscriptionid is not null
"""
rides_fact_df = spark.sql(rides_fact_sql)

# Preview the fact rows.
rides_fact_df.show(50)

+-------+-------+
|ride_ID|date_SK|
+-------+-------+
|      1|   1096|
|      2|   1096|
|      3|   1096|
|      4|   1096|
|      5|   1096|
|      7|   1096|
|      8|   1096|
|      9|   1096|
|     10|   1096|
|     11|   1096|
|     12|   1096|
|     13|   1096|
|     14|   1096|
|     15|   2557|
|     16|   2557|
|     17|   2557|
|     18|   2557|
|     19|   2557|
|     21|   2557|
|     22|   2557|
|     23|   2557|
|     24|   2557|
|     25|   2557|
|     26|   2557|
|     27|   2557|
|     28|   2557|
|     29|   2557|
|     31|   2557|
|     32|   2557|
|     33|   2557|
|     34|   2557|
|     35|   2557|
|     36|   2557|
|     37|   2557|
|     38|   2557|
|     39|   2557|
|     40|   2557|
|     41|   2557|
|     42|   2557|
|     43|   2557|
|     44|   2557|
|     45|   2557|
|     46|   2557|
|     47|   2557|
|     48|   2557|
|     49|   2557|
|     50|   2557|
|     51|   2557|
|     53|   2557|
|     54|   2557|
+-------+-------+
only showing top 50 rows

