## Het feit rit S2

In [1]:
#SETUP
# Import the project-local connection helper and let it prepare the environment.
# NOTE(review): setupEnvironment() is called before pyspark is imported, presumably
# because it configures what pyspark needs at import/start time — confirm before reordering.
import ConnectionConfig as cc
cc.setupEnvironment()
from pyspark.sql import SparkSession

In [2]:
#SETUP
# Start Spark through the project helper; "factRidesSaif" is the name handed to it
# (presumably the Spark application name — confirm in ConnectionConfig).
spark = cc.startLocalCluster("factRidesSaif")
# Last expression of the cell, so its value is rendered as the cell output.
spark.getActiveSession()


In [3]:
#EXTRACT
# Read the three operational source tables over JDBC using the "VeloDB" connection profile.
cc.set_connectionProfile("VeloDB")

# Partitioning settings shared by the plain table reads below.
# Spark uses the bounds only to compute the partition stride; they do not filter rows.
# NOTE(review): if the real id range is far larger than JDBC_UPPER_BOUND, most rows
# will fall into the last partition — consider deriving the bounds from MIN/MAX
# of the partition column.
JDBC_NUM_PARTITIONS = 4
JDBC_LOWER_BOUND = 0
JDBC_UPPER_BOUND = 1000


def _jdbc_reader():
    """Return a new JDBC DataFrameReader configured with url, driver and credentials.

    The connection values are looked up through ``cc`` at call time, so the
    connection profile must have been selected before calling this.
    """
    return (
        spark.read
        .format("jdbc")
        .option("url", cc.create_jdbc())
        .option("driver", cc.get_Property("driver"))
        .option("user", cc.get_Property("username"))
        .option("password", cc.get_Property("password"))
    )


def _read_partitioned_table(table, partition_column):
    """Load a whole table, split into JDBC_NUM_PARTITIONS reads on ``partition_column``.

    Args:
        table: Name of the source table passed as the ``dbtable`` option.
        partition_column: Column Spark uses to split the read into partitions.

    Returns:
        The DataFrame produced by the JDBC load.
    """
    return (
        _jdbc_reader()
        .option("dbtable", table)
        .option("partitionColumn", partition_column)
        .option("numPartitions", JDBC_NUM_PARTITIONS)
        .option("lowerBound", JDBC_LOWER_BOUND)
        .option("upperBound", JDBC_UPPER_BOUND)
        .load()
    )


# Rides are read through a query (not a table name) so the ride distance is computed
# by the database via haversine_km from the start and end point coordinates.
# The `query` option cannot be combined with `partitionColumn`, so this read is unpartitioned.
ride_source_df = (
    _jdbc_reader()
    .option("query", """
        SELECT *,
            haversine_km(
                CAST(r.startpoint[0] AS NUMERIC),
                CAST(r.startpoint[1] AS NUMERIC),
                CAST(r.endpoint[0] AS NUMERIC),
                CAST(r.endpoint[1] AS NUMERIC)
            ) AS distance
        FROM rides r
    """)
    .load()
)

subscription_source_df = _read_partitioned_table("subscriptions", "subscriptionid")

customers_source_df = _read_partitioned_table("velo_users", "userid")


In [4]:
# Make every source DataFrame queryable from Spark SQL under its view name.
source_views = {
    "rides_source_saif": ride_source_df,
    "subscription_source": subscription_source_df,
    "customers_source": customers_source_df,
}
for source_view_name, source_frame in source_views.items():
    source_frame.createOrReplaceTempView(source_view_name)

In [5]:

# Attach a user id to every ride by walking ride -> subscription (subscriptionid)
# -> user (userid). Both joins are LEFT joins, so rides without a matching
# subscription or user are kept and get a NULL userid.
rides_with_users = spark.sql("""
SELECT r.*, c.userid FROM rides_source_saif r
LEFT JOIN subscription_source s ON r.subscriptionid = s.subscriptionid
left join customers_source c on s.userid = c.userid""")


In [6]:
#EXTRACT
# Read the customer and lock dimension tables stored in Delta format.
dim_customer = spark.read.load("spark-warehouse/dimCustomer", format="delta")
dim_locks = spark.read.load("spark-warehouse/dimLocks", format="delta")

In [7]:
#TRANSFORM
# Register the dimensions and the user-enriched rides as temp views for the SQL below.
for transform_view_name, transform_frame in (
    ("dimCustomer", dim_customer),
    ("dimLocks", dim_locks),
    ("rides_with_users", rides_with_users),
):
    transform_frame.createOrReplaceTempView(transform_view_name)
rides_with_users.createOrReplaceTempView("rides_with_users")


In [8]:
#TRANSFORM
# Build the ride fact rows by resolving surrogate keys from the dimensions:
#   - start and end lock: dimLocks is joined twice, once per lock id;
#   - customer: the dimCustomer version whose [scd_start, scd_end) interval contains
#     the ride's start time (a NULL scd_end is treated as an open-ended interval).
# INNER JOINs are used because only rides matching both locks and a customer version
# are kept; this returns the same rows as LEFT OUTER JOINs filtered on
# "<dimension key> IS NOT NULL", but states the intent directly.
# rideSK comes from monotonically_increasing_id(): unique, but neither consecutive
# nor stable across runs.
fact_rides = spark.sql("""
    SELECT
        monotonically_increasing_id() AS rideSK,
        r.rideid AS rideId,
        l1.lockSK AS startLockSK,
        l2.lockSK AS endLockSK,
        c.customer_SK AS customerSK,
        r.distance AS rideDistance
    FROM
        rides_with_users AS r
    INNER JOIN
        dimLocks AS l1 ON r.startlockId = l1.lockId
    INNER JOIN
        dimLocks AS l2 ON r.endlockId = l2.lockId
    INNER JOIN
        dimCustomer AS c
        ON r.userid = c.customer_id
        AND r.starttime >= c.scd_start
        AND (r.starttime < c.scd_end OR c.scd_end IS NULL)
""")

In [9]:
#LOAD
# Print the first 20 fact rows as a quick visual check.
fact_rides.show(n=20)

+------+------+-----------+---------+----------+--------------------+
|rideSK|rideId|startLockSK|endLockSK|customerSK|        rideDistance|
+------+------+-----------+---------+----------+--------------------+
|     0|     3|       4147|     2270|8589984759|0.699051320786962000|
|     1|    11|       5628|     4177|8589949890|1.061420171911090000|
|     2|    13|       3166|     6046|8589935603|5.575740415708010000|
|     3|    17|       4147|     2270|8589984759|0.699051320786962000|
|     4|    25|       5628|     4177|8589949890|1.061420171911090000|
|     5|    27|       3166|     6046|8589935603|5.575740415708010000|
|     6|    33|       1227|     7149|8589951832|0.489635221161042000|
|     7|    34|       5625|      426|8589993021|2.348743494327990000|
|     8|    42|       2571|     1862|8589955202|0.177913231674759000|
|     9|    50|       2597|     6122|8589945494|1.327670670555210000|
|    10|    58|       3033|     2449|8589985031|4.441904143989020000|
|    11|    67|     

In [10]:
# Write the fact table in Delta format, overwriting whatever is already at the target path.
delta_table_path = "spark-warehouse/factRidesS2"
fact_rides.write.save(delta_table_path, format="delta", mode="overwrite")

In [11]:
# Stop the Spark session; this is the last cell of the notebook.
spark.stop()