# Config stuff

In [1]:
from numpy.ma.core import inner

import ConnectionConfig as cc
from delta import DeltaTable
cc.setupEnvironment()

# Start local cluster

In [2]:
spark = cc.startLocalCluster("FACT_RIDES")
spark.getActiveSession()

# Make tables available

In [11]:
#EXTRACT
# Dimension date
dim_date = spark.read.format("delta").load("spark-warehouse/dimdate")

# Dimension weather
dim_weather = spark.read.format("delta").load("spark-warehouse/dimweather")

# Dimension customer
dim_customer = spark.read.format("delta").load("spark-warehouse/dimuser")

# Dimension lock
dim_lock = spark.read.format("delta").load("spark-warehouse/dimlock")

# Fact rides
fact_rides = spark.read.format("delta").load("spark-warehouse/factrides")

# Make tempviews from tables
dim_date.createOrReplaceTempView("dimDate")
dim_weather.createOrReplaceTempView("dimWeather")
dim_customer.createOrReplaceTempView("dimCustomer")
dim_lock.createOrReplaceTempView("dimLock")
fact_rides.createOrReplaceTempView("factRides")

# Analysis queries

## Student 1

### Wat zijn de drukke momenten (op dagbasis) in de week t.o.v. het weekend?

In [4]:
spark.sql("select dd.DayOfWeekStartMonday, count(fr.count_MV) as CountedRides from factRides fr \
          inner join dimDate dd on fr.date_SK = dd.date_SK \
          group by dd.DayOfWeekStartMonday \
          order by dd.DayOfWeekStartMonday asc ").show()

+--------------------+------------+
|DayOfWeekStartMonday|CountedRides|
+--------------------+------------+
|                   1|      713087|
|                   2|      754211|
|                   3|      542333|
|                   4|      427949|
|                   5|      421904|
|                   6|      540662|
|                   7|      738095|
+--------------------+------------+



### Hebben datumparameters invloed op de afgelegde afstand?

Welke datumparameters kunnen invloed hebben op afgelegde afstand:
- Weekdag
- Weekend

In [10]:
spark.sql("select dd.IsWeekDay, round(avg(fr.distance_km_MV), 3) \
           from factRides fr \
           inner join dimDate dd on fr.date_SK = dd.date_SK \
           group by dd.IsWeekDay").show()

+---------+-----------------------------+
|IsWeekDay|round(avg(distance_km_MV), 3)|
+---------+-----------------------------+
|        N|                        2.490|
|        Y|                        2.485|
+---------+-----------------------------+



### Heeft weer invloed op ritten?

Op welke eigenschappen van de ritten kan het weer een invloed hebben:
1) Op aantal afgelegde ritten in een bepaald weertype
2) Op de gemiddelde duurtijd van een rit in een bepaald weertype
3) Op de gemiddelde afgelegde afstand van een rit in een bepaald weertype

In [6]:
spark.sql("select dw.weather_type, count(count_MV) as CountedRides, \
          round(sum(rideDuration_MV)/count(count_MV)) as AverageRideDuration, \
          round(sum(distance_km_MV)/count(count_MV),3) as AverageRideDistance \
          from factRides fr \
          inner join dimWeather dw on fr.weather_SK = dw.weather_SK \
          group by dw.weather_type \
          having dw.weather_type != 'weertype onbekend'").show()

+------------+------------+-------------------+-------------------+
|weather_type|CountedRides|AverageRideDuration|AverageRideDistance|
+------------+------------+-------------------+-------------------+
|    Neutraal|         219|              666.0|              2.897|
|   Aangenaam|         210|              696.0|              2.928|
| Onaangenaam|          92|              621.0|              2.745|
+------------+------------+-------------------+-------------------+



### Extra vraag 1: Worden in de weekends relatief gezien minder ritten gedaan in slecht weer dan in de week?

In [30]:
spark.sql("""
    select dd.IsWeekDay,
        sum(case when dw.weather_type = 'Onaangenaam' then fr.count_MV else 0 end) as total_rides_bad_weather,
        sum(fr.count_MV) as total_rides_all_weather,
        (total_rides_bad_weather/total_rides_all_weather)*10000 as relative_ratio
    from factRides fr
    inner join dimDate dd on fr.date_SK = dd.date_SK
    inner join dimWeather dw on fr.weather_SK = dw.weather_SK
    group by dd.IsWeekDay
""").show()

+---------+-----------------------+-----------------------+-------------------+
|IsWeekDay|total_rides_bad_weather|total_rides_all_weather|     relative_ratio|
+---------+-----------------------+-----------------------+-------------------+
|        N|                     84|                1278757| 0.6568878997338822|
|        Y|                      8|                2859484|0.02797707558426625|
+---------+-----------------------+-----------------------+-------------------+



### Extra vraag 2: Hebben de seizoenen invloed op de ritten?

In [15]:
spark.sql("select dd.season, count(fr.count_MV) as CountedRides, \
          round(avg(fr.rideDuration_MV),0) as AverageRideDuration, \
          round(avg(fr.distance_km_MV),3) as AverageRideDistance \
          from factRides fr \
          inner join dimDate dd on fr.date_SK = dd.date_SK \
          group by dd.season").show()

+------+------------+-------------------+-------------------+
|season|CountedRides|AverageRideDuration|AverageRideDistance|
+------+------------+-------------------+-------------------+
|summer|     1043785|              580.0|              2.482|
|autumn|     1052308|              585.0|              2.504|
|spring|     1040656|              579.0|              2.478|
|winter|     1001492|              579.0|              2.480|
+------+------------+-------------------+-------------------+



## Student 2

### Wat is de invloed van de woonplaats van de gebruikers op het gebruik van de vehicles?

In [8]:
spark.sql("select dc.city as user_city,\
          count(fr.ride_id) as total_rides,\
          round(AVG(fr.rideDuration_MV),1) AS avg_ride_duration_sec,\
          round(AVG(fr.distance_km_mv),3) AS avg_distance_km\
          from factRides fr\
          left outer join dimCustomer dc on dc.user_sk = fr.user_sk\
          group by dc.city\
          order by total_rides desc").show()


### We willen voorspellen welke sloten preventief onderhoud nodig hebben. Bekijk hoe vaak slotnummers relatief gezien gebruikt worden.

In [71]:
spark.sql("select dl.lockid, (count(fr.start_lockid) + count(fr.end_lockid)) as CountLocks, \
          CountLocks * 1.0 / sum(count(fr.start_lockid) + count(fr.end_lockid)) over() as relative_usage\
          from factRides fr\
          left outer join dimLock dl on dl.lockid = fr.end_lockid\
          group by dl.lockid\
          order by CountLocks desc").show()

# Bij het starten van een rit moet het slot opengaan dus we moeten zowel start als eind slot in rekening brengen
# De sum() over() gaat ervoor zorgen dat hij de berekening gaat doen met alle data over een bepaalde partitie (in dit geval geen dus neemt hij alles)

+------+----------+------------------+
|lockid|CountLocks|    relative_usage|
+------+----------+------------------+
|  1548|     18530|0.0033467323033176|
|  2682|     18428|0.0033283099236663|
|   270|     18408|0.0033246976923622|
|  2772|     18374|0.0033185568991451|
|   738|     18318|0.0033084426514934|
|  3024|     18298|0.0033048304201892|
|  2538|     18296|0.0033044691970588|
|  3258|     18290|0.0033033855276675|
|   702|     18278|0.0033012181888850|
|  2754|     18254|0.0032968835113200|
|  2718|     18250|0.0032961610650592|
|   774|     18236|0.0032936325031463|
|  2825|     18224|0.0032914651643637|
|  4230|     18220|0.0032907427181029|
|  2358|     18162|0.0032802672473208|
|  2790|     18148|0.0032777386854079|
|  3168|     18132|0.0032748489003645|
|  3096|     18128|0.0032741264541037|
|   810|     18122|0.0032730427847125|
|  2988|     18096|0.0032683468840170|
+------+----------+------------------+
only showing top 20 rows



### Als een klant zijn abonnement stopzet, willen we kunnen voorspellen op welke stations dit het meeste effect zal hebben.

In [96]:
spark.sql("select dc.userid, dl.stationid, count(fr.ride_ID) as amount_rides \
          from factRides fr\
          left outer join dimCustomer dc on dc.user_sk = fr.user_sk\
          left outer join dimLock dl on dl.lockid = fr.start_lockid\
          where fr.start_lockid is not null\
          group by dc.userid, dl.stationid\
          order by amount_rides desc").show()

+------+---------+------------+
|userid|stationid|amount_rides|
+------+---------+------------+
| 37367|      162|          17|
|  5750|      180|          14|
| 24821|      162|          13|
| 46611|      205|          13|
| 41934|      294|          13|
| 38243|      180|          12|
|  7362|      160|          12|
| 24711|      180|          12|
| 38203|      180|          12|
| 17990|      180|          12|
|  7485|      161|          12|
| 26189|      180|          12|
| 19281|      278|          12|
| 41885|      161|          12|
| 10430|      161|          12|
|  6014|      180|          12|
| 33185|      294|          12|
| 47280|      161|          12|
| 35983|      162|          12|
| 33761|      161|          12|
+------+---------+------------+
only showing top 20 rows



In [97]:
# Extra zijn dit de ritten die geen start slot hebben (en dus ook geen station)
spark.sql("select dc.userid, count(fr.ride_ID) as amount_rides \
          from factRides fr\
          left outer join dimCustomer dc on dc.user_sk = fr.user_sk\
          left outer join dimLock dl on dl.lockid = fr.start_lockid\
          where fr.start_lockid is null\
          group by dc.userid\
          order by amount_rides desc").show()

+------+------------+
|userid|amount_rides|
+------+------------+
| 24213|          31|
| 45176|          26|
|  6027|          23|
| 34758|          23|
| 37660|          23|
| 38470|          23|
| 34805|          22|
| 38835|          22|
|  3197|          22|
|  7716|          22|
| 14229|          22|
| 32022|          21|
| 28331|          21|
| 37534|          21|
|  2201|          21|
| 57187|          21|
| 39947|          21|
|  2247|          20|
| 54381|          20|
| 23970|          20|
+------+------------+
only showing top 20 rows



### Extra vraag 1: heeft het type abonnement invloed op de lengte van de rit

In [107]:
spark.sql("select dc.subscription_type, count(fr.ride_ID) as total_rides, round(avg(fr.distance_km_mv),3) as avg_distance_km \
          from factRides fr\
          left outer join dimCustomer dc on fr.user_sk = dc.user_sk\
          group by dc.subscription_type\
          order by total_rides desc").show()


+-----------------+-----------+---------------+
|subscription_type|total_rides|avg_distance_km|
+-----------------+-----------+---------------+
|             JAAR|    3001294|          2.581|
|            MAAND|      11832|          2.587|
|              DAG|       1674|          2.666|
+-----------------+-----------+---------------+



### Extra vraag 2: wat zijn de populaire routes tussen verschillende stations

In [123]:
spark.sql("select dls.stationid as start_station, dle.stationid as end_station,\
          count(fr.ride_ID) as total_rides,\
          round(avg(fr.distance_km_mv),3) as avg_distance_km\
          from factRides fr\
          left outer join dimLock dls on fr.start_lockid = dls.lockid\
          left outer join dimLock dle on fr.end_lockid = dle.lockid\
          where fr.distance_km_mv > 0 and dls.stationid is not null and dle.stationid is not null\
          group by dls.stationid, dle.stationid\
          order by total_rides desc").show()

+-------------+-----------+-----------+---------------+
|start_station|end_station|total_rides|avg_distance_km|
+-------------+-----------+-----------+---------------+
|          263|        262|        396|          2.503|
|          181|        179|        356|          0.446|
|          263|        181|        330|          3.238|
|          182|        181|        330|          1.457|
|          262|        263|        326|          2.503|
|          295|        262|        326|          3.725|
|          262|        181|        325|          1.390|
|          262|        182|        322|          1.035|
|          262|         28|        315|          1.694|
|          179|        178|        315|          0.752|
|          262|         55|        314|          1.494|
|          179|        181|        313|          0.446|
|          181|        150|        309|          4.098|
|          181|         39|        309|          0.834|
|          262|        136|        309|         

In [124]:
# Als we de where eruit halen zien we veel ritten die niet aan een station gelinkt zijn en ook vaak dat de rit bij hetzelfde station eindigd dan dat hij gestart is.
spark.sql("select dls.stationid as start_station, dle.stationid as end_station,\
          count(fr.ride_ID) as total_rides,\
          round(avg(fr.distance_km_mv),3) as avg_distance_km\
          from factRides fr\
          left outer join dimLock dls on fr.start_lockid = dls.lockid\
          left outer join dimLock dle on fr.end_lockid = dle.lockid\
          group by dls.stationid, dle.stationid\
          order by total_rides desc").show()

+-------------+-----------+-----------+---------------+
|start_station|end_station|total_rides|avg_distance_km|
+-------------+-----------+-----------+---------------+
|         NULL|       NULL|     246428|          0.397|
|          136|        136|       2045|          0.000|
|           97|         97|       1996|          0.000|
|          124|        124|       1972|          0.000|
|          140|        140|       1969|          0.000|
|           39|         39|       1968|          0.000|
|           52|         52|       1944|          0.000|
|          146|        146|       1935|          0.000|
|          141|        141|       1933|          0.000|
|          121|        121|       1909|          0.000|
|          149|        149|       1908|          0.000|
|          181|        181|       1907|          0.000|
|           55|         55|       1897|          0.000|
|           32|         32|       1879|          0.000|
|           50|         50|       1863|         