# Data Extraction
Load data from csv files which are stored in DBFS.

In [0]:
payment_df = spark.read.format('csv').option('sep', ',').load('/FileStore/divvy/payments.csv')
trip_df = spark.read.format('csv').option('sep', ',').load('/FileStore/divvy/trips.csv')
rider_df = spark.read.format('csv').option('sep', ',').load('/FileStore/divvy/riders.csv')
station_df = spark.read.format('csv').option('sep', ',').load('/FileStore/divvy/stations.csv')
trip_df.show()

+----------------+-------------+-------------------+-------------------+------------+------------+-----+
|             _c0|          _c1|                _c2|                _c3|         _c4|         _c5|  _c6|
+----------------+-------------+-------------------+-------------------+------------+------------+-----+
|89E7AA6C29227EFF| classic_bike|2021-02-12 16:14:56|2021-02-12 16:21:43|         525|         660|71934|
|0FEFDE2603568365| classic_bike|2021-02-14 17:52:38|2021-02-14 18:12:09|         525|       16806|47854|
|E6159D746B2DBB91|electric_bike|2021-02-09 19:10:18|2021-02-09 19:19:10|KA1503000012|TA1305000029|70870|
|B32D3199F1C2E75B| classic_bike|2021-02-02 17:49:41|2021-02-02 17:54:06|         637|TA1305000034|58974|
|83E463F23575F4BF|electric_bike|2021-02-23 15:07:23|2021-02-23 15:22:37|       13216|TA1309000055|39608|
|BDAA7E3494E8D545|electric_bike|2021-02-24 15:43:33|2021-02-24 15:49:05|       18003|KP1705001026|36267|
|A772742351171257| classic_bike|2021-02-01 17:47:42|202

# Workspace Cleanup
Recursively remove the data from delta lake and drop the associated tables if they are exist.

In [0]:
spark.sql("DROP TABLE IF EXISTS payments")
spark.sql("DROP TABLE IF EXISTS trips")
spark.sql("DROP TABLE IF EXISTS riders")
spark.sql("DROP TABLE IF EXISTS stations")
spark.sql("DROP TABLE IF EXISTS trip_dates")
spark.sql("DROP TABLE IF EXISTS payment_dates")

Out[6]: DataFrame[]

# Delta Lake Creation
Save the csv files to the `/delta/divvy` delta lake using overwrite mode. After that, create tables from the location of delta lake.

In [0]:
payment_df.write.format('delta').mode('overwrite').saveAsTable('payments')
trip_df.write.format('delta').mode('overwrite').saveAsTable('trips')
rider_df.write.format('delta').mode('overwrite').saveAsTable('riders')
station_df.write.format('delta').mode('overwrite').saveAsTable('stations')

# Star Schema Design
Generate fact and dimension tables. Create columns which help to query effectively. Designing the tables for the <a href="https://adb-29201057511283.3.azuredatabricks.net/?o=29201057511283#notebook/2129164476750309/command/4298484073601913">business questions</a>. Relational ERD for the Divvy Bikeshare Dataset is the following:
![ERD](/files/project_erd.jpeg)

## Helper Functions
Rename the columns and change the column type according to the ERD diagram.

In [0]:
from pyspark.sql.functions import col, cast


def rename_column(table_path, column_dict):
    df = spark.read.table(table_path)
    for fcol, tcol in column_dict.items():
        df = df.withColumnRenamed(fcol, tcol)

    df.write.format("delta").mode("overwrite").option("overwriteSchema", True).saveAsTable(table_path)

def change_column_type(table_path, type_dict):
    df = spark.read.table(table_path)
    for c, t in type_dict.items():
        df = df.withColumn(c, col(c).cast(t))

    df.write.format("delta").mode("overwrite").option("overwriteSchema", True).saveAsTable(table_path)

## Data Formatting
Rename the column names and change the column types.

In [0]:
payment_columns = {'_c0':'payment_id', '_c1':'date_id', '_c2':'amount', '_c3':'rider_id'}
payment_types = {'payment_id':'int', 'amount':'decimal', 'date_id':'date', 'rider_id':'int'}

trip_columns = {'_c0':'trip_id', '_c1':'rideable_type', '_c2':'started_at', '_c3':'ended_at', '_c4':'start_station_id', '_c5':'end_station_id', '_c6':'rider_id'}
trip_types = {'trip_id':'string', 'rideable_type':'string', 'started_at':'timestamp', 'ended_at':'timestamp', 'start_station_id':'int', 'end_station_id':'int', 'rider_id':'int'}

rider_columns = {'_c0':'rider_id', '_c1':'first', '_c2':'last', '_c3':'address', '_c4':'birthday', '_c5':'account_start_date', '_c6':'account_end_date', '_c7':'is_member'}
rider_types = {'rider_id':'int', 'first':'string', 'last':'string', 'address':'string', 'birthday':'date', 'account_start_date':'date', 'account_end_date':'date', 'is_member':'boolean'}

station_columns = {'_c0':'station_id', '_c1':'name', '_c2':'latitude', '_c3':'longitude'}
station_types = {'station_id':'string', 'name':'string', 'latitude':'float', 'longitude':'float'}

rename_column('riders', rider_columns)
change_column_type('riders', rider_types)

rename_column('stations', station_columns)
change_column_type('stations', station_types)

rename_column('trips', trip_columns)
change_column_type('trips', trip_types)

rename_column('payments', payment_columns)
change_column_type('payments', payment_types)

Create additional columns considering the business outcome questions

In [0]:
from pyspark.sql.functions import date_trunc, datediff, to_date

# Calculate the trip duration in seconds
trip_df = spark.read.table('trips')
trip_df = trip_df.withColumn("duration", (col("ended_at") - col("started_at")).cast("long"))\
                .withColumn("time_id", date_trunc("hour", col("started_at")))

# Calculate the age at account start date in year
rider_df = spark.read.table('riders')
rider_df = rider_df.withColumn("age_at_account_start", (datediff(rider_df.account_start_date, rider_df.birthday)/365).cast("int"))
rider_df.write.format("delta").mode("overwrite").option("overwriteSchema", True).saveAsTable('riders')

# Store all columns from riders table except rider_id
rider_columns = rider_df.columns
rider_columns.remove('rider_id')

# Calculate the age of the rider at time of the ride
trip_df = trip_df.join(rider_df, 'rider_id')\
    .withColumn("age_at_ride_time", (datediff(to_date(col("started_at")), col("birthday"))/365).cast("int"))\
    .drop(*rider_columns)

# Reorder the columns in the following strucutre: Primary Key -> Fields -> Foreign Key(s)
trip_df.select('trip_id', 'duration', 'rideable_type', 'age_at_ride_time', 'started_at', 'ended_at', 'start_station_id', 'end_station_id', 'time_id', 'rider_id')\
    .write.format("delta").mode("overwrite").option("overwriteSchema", True).saveAsTable('trips')

payment_df = spark.read.table('payments')
payment_df.select('payment_id', 'amount', 'date_id', 'rider_id').write.format("delta").mode("overwrite").option("overwriteSchema", True).saveAsTable('payments')

## Generate Date Dimensions
Seperate date table dimension will be created for the payment and for the trip. Because tables differ from their time granurality.
- Trip date dimension will be generated hourly due to the interest of time of the day (morning, afternoon, night, evening) information.
- Payment date dimension will be generated daily due to the interest of money spending per month, quarter, year.

In [0]:
from pyspark.sql.functions import min, max
from pyspark.sql.functions import explode, sequence, to_timestamp

# Retrieve the maximum and minimum dates from payment and trip table
payment_min_date, payment_max_date = payment_df.select(min('date_id'), max('date_id')).first()
trip_min_date, trip_max_date = trip_df.select(min('time_id'), max('time_id')).first()

print(trip_min_date, ' ', trip_max_date)
print(payment_min_date, ' ', payment_max_date)

# Generate temporary date views: one of the view is generated daily, the other view is generated hourly
spark.sql(f"SELECT explode(sequence(to_date('{payment_min_date}'), to_date('{payment_max_date}'), INTERVAL 1 DAY)) AS date").createOrReplaceTempView('payment_dates_view')
spark.sql(f"SELECT explode(sequence(to_timestamp('{trip_min_date}'), to_timestamp('{trip_max_date}'), INTERVAL 1 HOUR)) AS time").createOrReplaceTempView('trip_dates_view')

2021-02-01 01:00:00   2022-01-31 23:00:00
2013-02-01   2022-02-01


In [0]:
%sql SELECT * FROM trip_dates_view LIMIT 20

time
2021-02-01T01:00:00.000+0000
2021-02-01T02:00:00.000+0000
2021-02-01T03:00:00.000+0000
2021-02-01T04:00:00.000+0000
2021-02-01T05:00:00.000+0000
2021-02-01T06:00:00.000+0000
2021-02-01T07:00:00.000+0000
2021-02-01T08:00:00.000+0000
2021-02-01T09:00:00.000+0000
2021-02-01T10:00:00.000+0000


In [0]:
trip_dates = spark.sql("""
SELECT
    time as time_id,
    dayofweek(time) AS day_of_week,
    CASE WHEN date_format(time, 'H') BETWEEN 5 AND 11 THEN 'morning'
        WHEN date_format(time, 'H') BETWEEN 12 AND 16 THEN 'afternoon'
        WHEN date_format(time, 'H') BETWEEN 17 AND 21 THEN 'evening'
        ELSE 'night'
     END AS time_of_day
FROM trip_dates_view
ORDER BY time
""")

# Save to delta lake and create table using delta location
trip_dates.write.format('delta').mode('overwrite').saveAsTable('trip_dates')

In [0]:
payment_dates = spark.sql("""
SELECT
    date AS date_id,
    INT(date_format(date, 'M')) AS month,
    INT(date_format(date, 'Q')) AS quarter,
    INT(date_format(date, 'y')) as year
FROM payment_dates_view
ORDER BY date
""")

# Save to delta lake and create table using delta location
payment_dates.write.format('delta').mode('overwrite').saveAsTable('payment_dates')

# Business Questions

Answering the business questions.
- Analyze how much time is spent per ride
  * Based on date and time factors such as day of week and time of day
  * Based on which station is the starting and / or ending station
  * Based on age of the rider at time of the ride
  * Based on whether the rider is a member or a casual rider
- Analyze how much money is spent
  * Per month, quarter, year
  * Per member, based on the age of the rider at account start
- EXTRA CREDIT - Analyze how much money is spent per member
  * Based on how many rides the rider averages per month
  * Based on how many minutes the rider spends on a bike per month

In [0]:
# Load the fact and dimension tables
payment_df = spark.read.table('payments')
trip_df = spark.read.table('trips')

rider_df = spark.read.table('riders')
station_df = spark.read.table('stations')
trip_date_df = spark.read.table('trip_dates')
payment_date_df = spark.read.table('payment_dates')

## Trip Table Queries

In [0]:
from pyspark.sql.functions import sum, avg

# Analyze how much time is spent per ride on average based on day of week
trip_df.join(trip_date_df, 'time_id')\
    .groupBy('day_of_week')\
    .agg(avg('duration').alias('duration_in_seconds_avg'))\
    .orderBy('duration_in_seconds_avg', ascending=False)\
    .show()

# Analyze how much time is spent per ride in total based on day of week
trip_df.join(trip_date_df, 'time_id')\
    .groupBy('day_of_week')\
    .agg(sum('duration').alias('duration_in_seconds_sum'))\
    .orderBy('duration_in_seconds_sum', ascending=False)\
    .show()

# Analyze how much time is spent per ride on average based on time of day
trip_df.join(trip_date_df, 'time_id')\
    .groupBy('time_of_day')\
    .agg(avg('duration').alias('duration_in_seconds_avg'))\
    .orderBy('duration_in_seconds_avg', ascending=False)\
    .show()

# Analyze how much time is spent per ride in total based on time of day
trip_df.join(trip_date_df, 'time_id')\
    .groupBy('time_of_day')\
    .agg(sum('duration').alias('duration_in_seconds_sum'))\
    .orderBy('duration_in_seconds_sum', ascending=False)\
    .show()

+-----------+-----------------------+
|day_of_week|duration_in_seconds_avg|
+-----------+-----------------------+
|          1|      1669.501400089135|
|          7|     1571.6539107809404|
|          6|     1248.3461199415995|
|          2|     1237.3757179680788|
|          5|     1094.5453685426273|
|          3|     1090.6051999331985|
|          4|     1082.1249497027595|
+-----------+-----------------------+

+-----------+-----------------------+
|day_of_week|duration_in_seconds_sum|
+-----------+-----------------------+
|          7|             1292638192|
|          1|             1191232656|
|          6|              815691825|
|          2|              712858338|
|          4|              666952563|
|          3|              659572941|
|          5|              654854454|
+-----------+-----------------------+

+-----------+-----------------------+
|time_of_day|duration_in_seconds_avg|
+-----------+-----------------------+
|      night|     1609.8759526607973|
|  afterno

In [0]:
# Analyze how much time is spent per ride in total based on starting station
trip_df.groupBy('start_station_id')\
    .agg(avg('duration').alias('duration_in_seconds_avg'))\
    .orderBy('duration_in_seconds_avg', ascending=False)\
    .show()

# Analyze how much time is spent per ride on average based on starting station
trip_df.groupBy('start_station_id')\
    .agg(sum('duration').alias('duration_in_seconds_sum'))\
    .orderBy('duration_in_seconds_sum', ascending=False)\
    .show()

# Analyze how much time is spent per ride in total based on ending station
trip_df.groupBy('end_station_id')\
    .agg(avg('duration').alias('duration_in_seconds_avg'))\
    .orderBy('duration_in_seconds_avg', ascending=False)\
    .show()

# Analyze how much time is spent per ride on average based on ending station
trip_df.groupBy('end_station_id')\
    .agg(sum('duration').alias('duration_in_seconds_sum'))\
    .orderBy('duration_in_seconds_sum', ascending=False)\
    .show()

+----------------+-----------------------+
|start_station_id|duration_in_seconds_avg|
+----------------+-----------------------+
|             556|      32282.28205128205|
|             665|     32138.354838709678|
|             587|     24582.036363636365|
|             564|     21203.466666666667|
|           16915|     19059.147887323943|
|           20205|     18518.119047619046|
|           20230|     17122.447552447553|
|             642|     16987.561904761904|
|             537|     16801.197674418603|
|           20211|                15967.8|
|           20119|      12931.41791044776|
|           20103|     12733.497695852535|
|           20112|     12443.551020408164|
|           16970|     12293.792682926829|
|           20231|     11172.219696969696|
|           18025|      10114.14705882353|
|             585|       9558.39800995025|
|           16905|      8794.760299625468|
|           20224|      8758.176781002638|
|           20121|      8711.121019108281|
+----------

In [0]:
# Analyze how much time is spent per ride on average based on age of the rider at time of the ride
trip_df.join(rider_df, trip_df.rider_id == rider_df.rider_id)\
    .groupBy('age_at_account_start')\
    .agg(avg('duration').alias('duration_in_seconds_avg'))\
    .orderBy('duration_in_seconds_avg', ascending=False)\
    .show()

# Analyze how much time is spent per ride in total based on age of the rider at time of the ride
trip_df.join(rider_df, trip_df.rider_id == rider_df.rider_id)\
    .groupBy('age_at_account_start')\
    .agg(sum('duration').alias('duration_in_seconds_sum'))\
    .orderBy('duration_in_seconds_sum', ascending=False)\
    .show()

+--------------------+-----------------------+
|age_at_account_start|duration_in_seconds_avg|
+--------------------+-----------------------+
|                   7|     3494.3839541547277|
|                  67|      1800.708793849111|
|                   8|     1720.1452229299364|
|                  74|              1707.3125|
|                  62|      1534.881186706604|
|                  11|     1489.5041909519844|
|                  66|     1441.1453224341508|
|                  64|     1428.9561598224195|
|                  59|      1415.002880237852|
|                  32|      1381.481177555961|
|                  21|      1377.010394318029|
|                  57|     1371.7438153124333|
|                  40|      1363.636854531693|
|                  20|     1343.6139980452108|
|                  37|     1340.5873700898362|
|                  29|     1337.6662082350597|
|                  10|     1336.3863935350498|
|                  51|     1334.4031211695417|
|            

In [0]:
# Analyze how much time is spent per ride on average based on rider's membership
trip_df.join(rider_df, trip_df.rider_id == rider_df.rider_id)\
    .groupBy('is_member')\
    .agg(avg('duration').alias('duration_in_seconds_avg'))\
    .orderBy('duration_in_seconds_avg', ascending=False)\
    .show()

# Analyze how much time is spent per ride in total based on rider's membership
trip_df.join(rider_df, trip_df.rider_id == rider_df.rider_id)\
    .groupBy('is_member')\
    .agg(sum('duration').alias('duration_in_seconds_sum'))\
    .orderBy('duration_in_seconds_sum', ascending=False)\
    .show()

+---------+-----------------------+
|is_member|duration_in_seconds_avg|
+---------+-----------------------+
|     true|     1314.2656120356567|
|    false|     1279.4272573384933|
+---------+-----------------------+

+---------+-----------------------+
|is_member|duration_in_seconds_sum|
+---------+-----------------------+
|     true|             4818499899|
|    false|             1175301070|
+---------+-----------------------+



## Payment Table Queries

In [0]:
# Analyze how much money is spent on average per month
payment_df.join(payment_date_df, 'date_id')\
        .groupby('month')\
        .agg(sum('amount').alias('amount_sum'))\
        .orderBy('amount_sum', ascending=False)\
        .show()

# Analyze how much money is spent in total per month
payment_df.join(payment_date_df, 'date_id')\
        .groupby('month')\
        .agg(avg('amount').alias('amount_avg'))\
        .orderBy('amount_avg', ascending=False)\
        .show()

# Analyze how much money is spent on average per quarter
payment_df.join(payment_date_df, 'date_id')\
        .groupby('quarter')\
        .agg(sum('amount').alias('amount_sum'))\
        .orderBy('amount_sum', ascending=False)\
        .show()

# Analyze how much money is spent in total per quarter
payment_df.join(payment_date_df, 'date_id')\
        .groupby('quarter')\
        .agg(avg('amount').alias('amount_avg'))\
        .orderBy('amount_avg', ascending=False)\
        .show()

# Analyze how much money is spent on average per year
payment_df.join(payment_date_df, 'date_id')\
        .groupby('year')\
        .agg(sum('amount').alias('amount_sum'))\
        .orderBy('amount_sum', ascending=False)\
        .show()

# Analyze how much money is spent in total per year
payment_df.join(payment_date_df, 'date_id')\
        .groupby('year')\
        .agg(avg('amount').alias('amount_avg'))\
        .orderBy('amount_avg', ascending=False)\
        .show()

+-----+----------+
|month|amount_sum|
+-----+----------+
|    2|   1908071|
|    1|   1855960|
|   12|   1799954|
|   11|   1747391|
|   10|   1696403|
|    9|   1642068|
|    8|   1592482|
|    7|   1539179|
|    6|   1491380|
|    5|   1441372|
|    4|   1395922|
|    3|   1348931|
+-----+----------+

+-----+----------+
|month|amount_avg|
+-----+----------+
|    8|   10.0111|
|   10|   10.0034|
|    2|   10.0034|
|    1|   10.0006|
|    3|    9.9987|
|    4|    9.9946|
|    6|    9.9934|
|    9|    9.9933|
|   12|    9.9931|
|    7|    9.9894|
|   11|    9.9871|
|    5|    9.9868|
+-----+----------+

+-------+----------+
|quarter|amount_sum|
+-------+----------+
|      4|   5243748|
|      1|   5112962|
|      3|   4773729|
|      2|   4328674|
+-------+----------+

+-------+----------+
|quarter|amount_avg|
+-------+----------+
|      1|   10.0011|
|      3|    9.9980|
|      4|    9.9944|
|      2|    9.9916|
+-------+----------+

+----+----------+
|year|amount_sum|
+----+----------

In [0]:
# Analyze how much money is spent on average per member, based on the age of the rider at account start
payment_df.join(rider_df, 'rider_id')\
        .where(rider_df.is_member == True)\
        .groupby('age_at_account_start')\
        .agg(avg('amount').alias('amount_avg'))\
        .orderBy('amount_avg', ascending=False)\
        .show()

# Analyze how much money is spent in total per member, based on the age of the rider at account start
payment_df.join(rider_df, 'rider_id')\
        .where(rider_df.is_member == True)\
        .groupby('age_at_account_start')\
        .agg(sum('amount').alias('amount_sum'))\
        .orderBy('amount_sum', ascending=False)\
        .show()

+--------------------+----------+
|age_at_account_start|amount_avg|
+--------------------+----------+
|                  31|    9.0000|
|                  65|    9.0000|
|                  53|    9.0000|
|                  34|    9.0000|
|                  28|    9.0000|
|                  27|    9.0000|
|                  26|    9.0000|
|                  44|    9.0000|
|                  12|    9.0000|
|                  22|    9.0000|
|                  47|    9.0000|
|                  52|    9.0000|
|                  13|    9.0000|
|                  16|    9.0000|
|                  20|    9.0000|
|                  40|    9.0000|
|                  57|    9.0000|
|                  54|    9.0000|
|                  48|    9.0000|
|                  19|    9.0000|
+--------------------+----------+
only showing top 20 rows

+--------------------+----------+
|age_at_account_start|amount_sum|
+--------------------+----------+
|                  21|    495810|
|                  23|

## Extra Credit Queries

In [0]:
from pyspark.sql.functions import month, count

# Analyze how much money is spent per member based on how many rides the rider averages per month
trip_df.join(rider_df, 'rider_id')\
    .join(payment_df, 'rider_id')\
    .withColumn('month', month('time_id'))\
    .where(rider_df.is_member == True)\
    .groupby('rider_id', 'month')\
    .agg(avg('amount').alias('avg_amount'), count('trip_id').alias('num_rides'))\
    .orderBy('num_rides', ascending=False)\
    .show()

+--------+-----+----------+---------+
|rider_id|month|avg_amount|num_rides|
+--------+-----+----------+---------+
|    3897|    7|    9.0000|    26866|
|   52113|    7|    9.0000|    25668|
|   71138|    7|    9.0000|    25400|
|   71138|    8|    9.0000|    24100|
|    3897|    8|    9.0000|    22018|
|   52113|    8|    9.0000|    21948|
|   71138|    6|    9.0000|    21900|
|    3897|    6|    9.0000|    21210|
|   52113|    6|    9.0000|    20274|
|    4193|    7|    9.0000|    19028|
|   71138|    9|    9.0000|    18800|
|    3897|    9|    9.0000|    17776|
|   71138|    5|    9.0000|    17200|
|   27866|    7|    9.0000|    17160|
|    3897|    5|    9.0000|    16665|
|   52113|    9|    9.0000|    16554|
|    7667|    7|    9.0000|    16356|
|   37388|    7|    9.0000|    16131|
|    4193|    8|    9.0000|    15611|
|   27866|    6|    9.0000|    15400|
+--------+-----+----------+---------+
only showing top 20 rows



In [0]:
# Analyze how much money is spent per member based on how many minutes the rider spends on a bike per month
trip_df.join(rider_df, 'rider_id')\
    .join(payment_df, 'rider_id')\
    .withColumn('month', month('time_id'))\
    .withColumn('minutes', (trip_df.duration/60).cast('int'))\
    .where(rider_df.is_member == True)\
    .groupby('rider_id', 'minutes', 'month')\
    .agg(avg('amount').alias('avg_amount'), avg('duration').alias('avg_duration'))\
    .orderBy('avg_duration', ascending=False)\
    .show()

# The top 1 rider spent 38.85 day averagely (3356649 seconds) worth of riding at 6th month. Probably the bike was not put back to the station.
trip_df.select('rider_id', 'started_at', 'ended_at', 'duration').where(trip_df.rider_id == 1088).orderBy('duration', ascending=False).show()

+--------+-------+-----+----------+------------+
|rider_id|minutes|month|avg_amount|avg_duration|
+--------+-------+-----+----------+------------+
|    1088|  55944|    6|    9.0000|   3356649.0|
|   53843|  55691|    6|    9.0000|   3341501.0|
|   64320|  53921|    5|    9.0000|   3235296.0|
|   58650|  52701|    6|    9.0000|   3162083.0|
|   43851|  49107|    7|    9.0000|   2946429.0|
|   44213|  47010|    6|    9.0000|   2820651.0|
|   31759|  41645|    7|    9.0000|   2498731.0|
|   39724|  41629|    8|    9.0000|   2497750.0|
|   22453|  40724|    7|    9.0000|   2443476.0|
|   23373|  40705|   10|    9.0000|   2442301.0|
|   41276|  40320|    6|    9.0000|   2419234.0|
|   73511|  39587|    7|    9.0000|   2375231.0|
|   69222|  39439|    6|    9.0000|   2366349.0|
|   68654|  38963|    4|    9.0000|   2337785.0|
|   35099|  38685|    7|    9.0000|   2321116.0|
|   55813|  38418|    6|    9.0000|   2305116.0|
|   16270|  36542|    6|    9.0000|   2192529.0|
|   21888|  35622|  

# References

* Drop delta table: https://stackoverflow.com/a/59042576/10721627
* Rename column name: https://stackoverflow.com/a/67236170/10721627
* Change column type: https://stackoverflow.com/a/74391411/10721627 
* Date difference: https://spark.apache.org/docs/3.1.1/api/python/reference/api/pyspark.sql.functions.datediff.html
* Cast string to timestamp: https://stackoverflow.com/a/52369875/10721627
* Simple string name for types: https://stackoverflow.com/a/32286450/10721627
* Generate date dimension: https://www.bluegranite.com/blog/generate-a-calendar-dimension-in-spark
* Datetime patterns for formatting: https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html
* Join dataframes: https://spark.apache.org/docs/3.1.2/api/python/reference/api/pyspark.sql.DataFrame.join.html
* Rearrange columns: https://stackoverflow.com/a/42913947/10721627
* Columns alias: https://stackoverflow.com/a/33524094/10721627