# Config stuff

In [1]:
# Project-local connection helper, aliased as `cc` for the cells below.
import ConnectionConfig as cc
# NOTE(review): presumably prepares the environment settings Spark needs before a
# session is started — confirm in ConnectionConfig.setupEnvironment.
cc.setupEnvironment()

# Start local cluster

In [2]:
# Start Spark through the project helper; the returned object is used as the
# session handle (`spark.sql`, `spark.stop`) in the rest of the notebook.
# NOTE(review): "DIM_DATE" looks like the application name and 4 like a core/worker
# count — confirm against ConnectionConfig.startLocalCluster.
spark = cc.startLocalCluster("DIM_DATE",4)
# Last expression of the cell, so its result is rendered as the cell output
# (a quick check that a session is active).
spark.getActiveSession()

# Create date dimension

## Step 1: Generate rows for a sequence of dates

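The date dimension must cover every ride, so first find the oldest and the newest date in the rides table. I do this by querying the rides table in the VeloDB database from a query console; the two results are then hard-coded as `startDate` and `endDate` in the cell below.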

In [3]:
#Extract
# Inclusive bounds of the date dimension. Per the note above, these are the oldest
# and newest dates looked up manually in the rides table.
startDate = '2019-09-01'
endDate = '2023-09-17'

# Generate one row per calendar day between the bounds.
# posexplode returns each date together with its 0-based position in the sequence,
# which gives a consecutive, reproducible surrogate key (0, 1, 2, ...).
# monotonically_increasing_id() only guarantees unique, increasing values; they are
# consecutive only while the whole query runs in a single partition.
# The cast keeps date_SK a bigint, the type monotonically_increasing_id() produced.
df_dates = spark.sql(
    f"select calendarDate, cast(pos as bigint) as date_SK "
    f"from (select posexplode(sequence(to_date('{startDate}'), to_date('{endDate}'), interval 1 day)) "
    f"as (pos, calendarDate)) as date_sequence"
)

df_dates.show(10)

+------------+-------+
|calendarDate|date_SK|
+------------+-------+
|  2019-09-01|      0|
|  2019-09-02|      1|
|  2019-09-03|      2|
|  2019-09-04|      3|
|  2019-09-05|      4|
|  2019-09-06|      5|
|  2019-09-07|      6|
|  2019-09-08|      7|
|  2019-09-09|      8|
|  2019-09-10|      9|
+------------+-------+
only showing top 10 rows



## Step 2: Create all dimension fields

In [4]:
#TRANSFORM
# Expose the generated dates to Spark SQL under the view name the query reads from.
df_dates.createOrReplaceTempView('neededDates')

# One output row per calendar date; every attribute is derived from calendarDate.
# The statement is assembled from adjacent string literals, one per SQL line.
dim_date_sql = (
    "select date_SK, "
    # Date as a yyyymmdd integer, e.g. 20190901.
    "  year(calendarDate) * 10000 + month(calendarDate) * 100 + day(calendarDate) as dateInt, "
    "  CalendarDate, "
    "  year(calendarDate) AS CalendarYear, "
    # Full month name, e.g. 'September'.
    "  date_format(calendarDate, 'MMMM') as CalendarMonth, "
    "  month(calendarDate) as MonthOfYear, "
    # Full day name, e.g. 'Sunday'.
    "  date_format(calendarDate, 'EEEE') as CalendarDay, "
    # dayofweek numbers Sunday as 1.
    "  dayofweek(calendarDate) AS DayOfWeek, "
    # weekday numbers Monday as 0, so + 1 yields Monday = 1 ... Sunday = 7.
    "  weekday(calendarDate) + 1 as DayOfWeekStartMonday, "
    # Monday to Friday (weekday 0-4) are flagged 'Y'.
    "  case "
    "    when weekday(calendarDate) < 5 then 'Y' "
    "    else 'N' "
    "  end as IsWeekDay, "
    "  dayofmonth(calendarDate) as DayOfMonth, "
    # 'Y' only when the date equals the last day of its own month.
    "  case "
    "    when calendarDate = last_day(calendarDate) then 'Y' "
    "    else 'N' "
    "  end as IsLastDayOfMonth, "
    "  dayofyear(calendarDate) as DayOfYear, "
    "  weekofyear(calendarDate) as WeekOfYearIso, "
    "  quarter(calendarDate) as QuarterOfYear "
    "from  "
    "  neededDates "
    "order by "
    "  calendarDate"
)

dimDate = spark.sql(dim_date_sql)

dimDate.show()

+-------+--------+------------+------------+-------------+-----------+-----------+---------+--------------------+---------+----------+----------------+---------+-------------+-------------+
|date_SK| dateInt|CalendarDate|CalendarYear|CalendarMonth|MonthOfYear|CalendarDay|DayOfWeek|DayOfWeekStartMonday|IsWeekDay|DayOfMonth|IsLastDayOfMonth|DayOfYear|WeekOfYearIso|QuarterOfYear|
+-------+--------+------------+------------+-------------+-----------+-----------+---------+--------------------+---------+----------+----------------+---------+-------------+-------------+
|      0|20190901|  2019-09-01|        2019|    September|          9|     Sunday|        1|                   7|        N|         1|               N|      244|           35|            3|
|      1|20190902|  2019-09-02|        2019|    September|          9|     Monday|        2|                   1|        Y|         2|               N|      245|           36|            3|
|      2|20190903|  2019-09-03|        2019|    Se

## Step 3: Save the dimension to a Delta table

In [5]:
#LOAD
# Configure the writer first: Delta format, overwriting any existing data.
delta_writer = dimDate.write.format("delta").mode("overwrite")
# Persist the dimension as the table "dimDate".
delta_writer.saveAsTable("dimDate")

In [6]:
# Shut down the Spark session now that the dimension has been written.
spark.stop()