In [0]:
def _read_delta(path):
    """Return the Delta table stored at ``path`` as a Spark DataFrame."""
    return spark.read.format("delta").load(path)


# Star-schema inputs for the gold aggregations: two fact tables and three
# dimension tables, all read from the /delta folder.
dfFactTrips = _read_delta("/delta/star_fact_trips")
dfFactPayments = _read_delta("/delta/star_fact_payments")
dfDimDates = _read_delta("/delta/star_dim_dates")
dfDimStations = _read_delta("/delta/star_dim_stations")
dfDimRiders = _read_delta("/delta/star_dim_riders")

from pyspark.sql import functions as F

In [0]:
# Gold table: mean trip duration for every (hour, weekday name) pair.
# The weekday name is derived here: the year, month and day columns of the
# joined frame are glued into a "year-month-day" string, parsed as a date and
# formatted with the "EEEE" pattern (full day name, e.g. "Wednesday").
startDateText = F.concat(
    F.col("year"), F.lit("-"), F.col("month"), F.lit("-"), F.col("day")
)
startWeekdayName = F.date_format(F.to_date(startDateText), "EEEE")

goldDOWAndHourAverageTripDuration = (
    dfFactTrips
    # Attach the date-dimension row of the trip's start date.
    .join(dfDimDates, dfDimDates["date_id"] == dfFactTrips["start_date_id"], "inner")
    .withColumn("day_of_week", startWeekdayName)
    .groupBy("hour", "day_of_week")
    .agg(F.avg("trip_duration").alias("average_trip_duration"))
)

goldDOWAndHourAverageTripDuration.show()

+----+-----------+---------------------+
|hour|day_of_week|average_trip_duration|
+----+-----------+---------------------+
|   4|  Wednesday|   1357.9923017705928|
|   8|   Thursday|   1163.6825682568258|
|   9|   Saturday|   1313.0925583850112|
|   7|   Thursday|    1246.145605805175|
|   3|     Monday|    1488.135593220339|
|   0|     Friday|   1200.3520650813516|
|  23|     Sunday|    1567.898226985876|
|   7|  Wednesday|   1253.6560360661213|
|  12|  Wednesday|    1219.011237991662|
|   3|     Sunday|   1399.0172692059393|
|   0|  Wednesday|    1185.115367180417|
|   9|  Wednesday|   1252.0780303736142|
|  15|  Wednesday|   1223.2479202348911|
|  11|     Sunday|   1312.7756962827214|
|   7|     Monday|   1235.2872087298983|
|  12|    Tuesday|    1357.705638564729|
|   5|    Tuesday|   1182.7008022652194|
|   6|    Tuesday|   1290.0255538623842|
|  21|   Saturday|   1336.5142405658603|
|  17|   Saturday|   1361.2338646772935|
+----+-----------+---------------------+
only showing top 20 rows

In [0]:
# Gold table: mean trip duration for every (start station, end station) pair,
# with both station names attached.
#
# The station dimension is needed twice (once per end of the trip). Instead of
# joining the same dfDimStations object twice and renaming/dropping its columns
# after each join, each role gets its own projection whose columns are already
# uniquely named. The two lookups therefore never share a column name, and the
# join conditions cannot be ambiguous.
startStationLookup = dfDimStations.select(
    F.col("station_id").alias("start_dim_station_id"),
    F.col("name").alias("start_station_name"),
)
endStationLookup = dfDimStations.select(
    F.col("station_id").alias("end_dim_station_id"),
    F.col("name").alias("end_station_name"),
)

goldStationsAverageTripDuration = (
    dfFactTrips
    .join(
        startStationLookup,
        F.col("start_dim_station_id") == F.col("start_station_id"),
        "inner",
    )
    .join(
        endStationLookup,
        F.col("end_dim_station_id") == F.col("end_station_id"),
        "inner",
    )
    .groupBy("start_station_id", "end_station_id", "start_station_name", "end_station_name")
    .agg(F.avg("trip_duration").alias("average_trip_duration"))
)

goldStationsAverageTripDuration.show()

+----------------+--------------+--------------------+--------------------+---------------------+
|start_station_id|end_station_id|  start_station_name|    end_station_name|average_trip_duration|
+----------------+--------------+--------------------+--------------------+---------------------+
|           15529|  TA1309000023|Mies van der Rohe...|Sheffield Ave & W...|   1425.9285714285713|
|    TA1308000029|        LF-005|Stetson Ave & Sou...|Lake Shore Dr & N...|   1978.8067226890757|
|           13061|         18062|Ashland Ave & Div...|Aberdeen St & Ran...|          1009.890625|
|           13042|         13022|Michigan Ave & Oa...|Streeter Dr & Gra...|   1848.7259635747564|
|    KA1504000133|  TA1308000031|  Rush St & Cedar St|Logan Blvd & Elst...|               1634.0|
|    TA1305000003|        LF-005|Fairbanks Ct & Gr...|Lake Shore Dr & N...|   1529.2197183098592|
|             643|         13045|          Smith Park|Dearborn St & Eri...|   1834.3333333333333|
|    TA1309000039|  

In [0]:
# Gold table: mean trip duration for every (hour, rider_age) pair. Each trip
# is matched to the date-dimension row referenced by its start_date_id.
tripsWithStartDate = dfFactTrips.join(
    dfDimDates,
    on=dfDimDates["date_id"] == dfFactTrips["start_date_id"],
    how="inner",
)
meanTripDurationByAgeHour = F.avg("trip_duration").alias("average_trip_duration")

goldAgeAndHourAverageTripDuration = (
    tripsWithStartDate
    .groupBy("hour", "rider_age")
    .agg(meanTripDurationByAgeHour)
)

goldAgeAndHourAverageTripDuration.show()

+----+---------+---------------------+
|hour|rider_age|average_trip_duration|
+----+---------+---------------------+
|  15|       26|   1638.0659501200041|
|   8|       52|    1094.186403508772|
|   7|       55|   1238.9976878612717|
|  22|       53|   2113.6076555023924|
|   3|       22|     1179.11394891945|
|   3|       57|   1170.5890410958905|
|  18|       68|    2082.775956284153|
|  17|       33|    1276.973795286669|
|   5|       40|   1369.2135802469136|
|  22|       33|   1221.2880844645551|
|   3|       30|    1160.981718464351|
|   3|       15|    1165.598253275109|
|  20|       55|    1358.344465648855|
|   5|       49|   1282.0392670157069|
|  11|       23|   1368.6214245653393|
|   7|       33|   1286.8220793140408|
|  12|       37|   1394.5933077830189|
|   0|       25|   1645.4283326368575|
|   0|       57|   1160.5379746835442|
|   6|       37|   1843.9640410958905|
+----+---------+---------------------+
only showing top 20 rows



In [0]:
# Gold table: mean trip duration per value of the is_member flag. Trips are
# matched to riders on rider_id with an inner join, so trips without a
# matching rider row do not contribute to either average.
tripsWithRider = dfFactTrips.join(
    dfDimRiders,
    on=dfDimRiders["rider_id"] == dfFactTrips["rider_id"],
    how="inner",
)

goldMemberAverageTripDuration = (
    tripsWithRider
    .groupBy("is_member")
    .agg(F.avg("trip_duration").alias("average_trip_duration"))
)

goldMemberAverageTripDuration.show()

+---------+---------------------+
|is_member|average_trip_duration|
+---------+---------------------+
|     true|   1314.2656120356567|
|    false|   1279.4272573384933|
+---------+---------------------+



In [0]:
def _total_amount_by(period_column):
    """Sum payment amounts per distinct value of ``period_column``.

    Payments are inner-joined to the date dimension on ``date_id``; the
    joined rows are grouped by ``period_column`` and the ``amount`` column is
    summed into ``total_amount``.

    Args:
        period_column: Name of the grouping column in the joined frame
            (called below with "year", "quarter" and "month").

    Returns:
        A DataFrame with two columns: ``period_column`` and ``total_amount``.
    """
    paymentsWithDate = dfFactPayments.join(
        dfDimDates,
        dfDimDates["date_id"] == dfFactPayments["date_id"],
        "inner",
    )
    return (
        paymentsWithDate
        .groupBy(period_column)
        .agg(F.sum("amount").alias("total_amount"))
    )


goldYearAmount = _total_amount_by("year")

goldQuarterAmount = _total_amount_by("quarter")

# NOTE(review): grouping by "month" (and "quarter") alone pools the same
# calendar period across all years - confirm that is the intended metric.
goldMonthAmount = _total_amount_by("month")

goldMonthAmount.show()

+-----+------------------+
|month|      total_amount|
+-----+------------------+
|   12|1799778.4599999962|
|    1| 1855786.829999997|
|    6|1491227.5800000052|
|    3|1348782.3900000006|
|    5|1441279.1900000027|
|    9| 1641916.400000001|
|    4|1395762.7799999989|
|    8|        1592322.58|
|    7|1538960.8799999964|
|   10|1696207.3200000033|
|    2|1907807.2699999996|
|   11|1747273.5699999903|
+-----+------------------+



In [0]:
from pyspark.sql.functions import col, datediff

# Gold table: average payment amount of members, grouped by the rider's age
# (in whole years) on the day the account was started.
#
# The age is computed from calendar months rather than from days / 365.
# Dividing a day count by 365 ignores leap days, so a rider a few days short
# of a birthday could be reported one year too old. months_between() compares
# calendar positions, and dividing by 12 then truncating gives completed years.
# The result is cast to integer so the column type matches the previously
# written table.
# NOTE(review): if rider_age in the trips fact table was derived with the
# days / 365 formula, consider aligning that upstream calculation as well.
riderAgeAtAccountStart = (
    F.months_between(F.col("account_start_date"), F.col("birthday")) / 12
).cast("integer")

goldMemberAndAgeAmount = (
    dfFactPayments
    .join(dfDimRiders, dfDimRiders["rider_id"] == dfFactPayments["rider_id"], "inner")
    # Only payments made by riders flagged as members are considered.
    .filter(F.col("is_member") == True)
    .withColumn("rider_age", riderAgeAtAccountStart)
    .groupBy("rider_age")
    .agg(F.avg("amount").alias("average_amount"))
)

display(goldMemberAndAgeAmount)

rider_age,average_amount
31,9.0
65,9.0
53,9.0
34,9.0
28,9.0
26,9.0
27,9.0
44,9.0
12,9.0
22,9.0


In [0]:
# Persist every gold aggregate as a Delta table, replacing any previous
# contents ("overwrite" mode). Each table is named after the variable holding
# its DataFrame.
# goldQuarterAmount is included here: it is computed alongside the year and
# month totals but was previously never written out.
goldTablesByName = {
    "goldDOWAndHourAverageTripDuration": goldDOWAndHourAverageTripDuration,
    "goldStationsAverageTripDuration": goldStationsAverageTripDuration,
    "goldAgeAndHourAverageTripDuration": goldAgeAndHourAverageTripDuration,
    "goldMemberAverageTripDuration": goldMemberAverageTripDuration,
    "goldYearAmount": goldYearAmount,
    "goldQuarterAmount": goldQuarterAmount,
    "goldMonthAmount": goldMonthAmount,
    "goldMemberAndAgeAmount": goldMemberAndAgeAmount,
}

for goldTableName, goldFrame in goldTablesByName.items():
    goldFrame.write.format("delta").mode("overwrite").saveAsTable(goldTableName)