# Config stuff

In [1]:
import ConnectionConfig as cc
# Project-local helper module; must run before the Spark session is created below.
# NOTE(review): presumably sets environment variables/paths that Spark needs — confirm in ConnectionConfig.
cc.setupEnvironment()

# Start local cluster

In [2]:
# Create the Spark session used by every later cell (spark.sql, spark.stop).
# NOTE(review): "DIM_DATE" is presumably the application name and 4 the number of
# local cores/partitions — confirm against ConnectionConfig.startLocalCluster.
spark = cc.startLocalCluster("DIM_DATE",4)
# Last expression of the cell, so its value is rendered as the cell output.
spark.getActiveSession()

# Create date dimension

## Step 1: Generate rows for a sequence of dates

First, find the oldest and the newest date in the rides table. I did this by querying the rides table in the VeloDB database from a query console; the queries I used are in the dimDate.sql file in the "queries" folder. The two dates are then hard-coded as `startDate` and `endDate` in the cell below.

In [14]:
#Extract
# Inclusive boundaries of the date dimension: the oldest and newest ride dates,
# looked up manually in the rides table (see the markdown cell above).
startDate = '2012-09-21'
endDate = '2023-09-17'

# Generate one row per calendar day and number the rows 0, 1, 2, ... in date order.
# row_number() is used instead of monotonically_increasing_id() because the latter
# only guarantees unique, increasing values - not consecutive ones - and its result
# depends on how the data is partitioned, which makes it unsuitable as a stable
# surrogate key. The cast keeps dateSK a bigint, as it was before.
# The window has no PARTITION BY, so Spark moves all rows to a single partition;
# that is fine for the few thousand rows of a date dimension.
df_dates = spark.sql(f"""
    select
        calendarDate,
        cast(row_number() over (order by calendarDate) - 1 as bigint) as dateSK
    from (
        select explode(sequence(to_date('{startDate}'), to_date('{endDate}'), interval 1 day)) as calendarDate
    )
""")

df_dates.show(10)

+------------+------+
|calendarDate|dateSK|
+------------+------+
|  2012-09-21|     0|
|  2012-09-22|     1|
|  2012-09-23|     2|
|  2012-09-24|     3|
|  2012-09-25|     4|
|  2012-09-26|     5|
|  2012-09-27|     6|
|  2012-09-28|     7|
|  2012-09-29|     8|
|  2012-09-30|     9|
+------------+------+
only showing top 10 rows



## Step 2: Create all dimension fields

In [15]:
#TRANSFORM
# Register the generated dates as a temporary view so the SQL below can select from it.
df_dates.createOrReplaceTempView('neededDates')

# The query is assembled from adjacent string literals (joined by Python at compile
# time), which lets each derived column carry its own comment.
dimDate = spark.sql(
    "select dateSK, "
    # Date as an integer in yyyyMMdd form, e.g. 2012-09-21 -> 20120921.
    "  year(calendarDate) * 10000 + month(calendarDate) * 100 + day(calendarDate) as dateInt, "
    "  CalendarDate, "
    "  year(calendarDate) AS CalendarYear, "
    # 'MMMM' gives the full month name, e.g. September.
    "  date_format(calendarDate, 'MMMM') as CalendarMonth, "
    "  month(calendarDate) as MonthOfYear, "
    # 'EEEE' gives the full day name, e.g. Friday.
    "  date_format(calendarDate, 'EEEE') as CalendarDay, "
    # dayofweek(): 1 = Sunday ... 7 = Saturday.
    "  dayofweek(calendarDate) AS DayOfWeek, "
    # weekday(): 0 = Monday ... 6 = Sunday, so +1 yields 1 = Monday ... 7 = Sunday.
    "  weekday(calendarDate) + 1 as DayOfWeekStartMonday, "
    # Monday..Friday (weekday 0-4) are flagged 'Y', Saturday/Sunday 'N'.
    "  case "
    "    when weekday(calendarDate) < 5 then 'Y' "
    "    else 'N' "
    "  end as IsWeekDay, "
    "  dayofmonth(calendarDate) as DayOfMonth, "
    # 'Y' when the date equals the last day of its own month.
    "  case "
    "    when calendarDate = last_day(calendarDate) then 'Y' "
    "    else 'N' "
    "  end as IsLastDayOfMonth, "
    "  dayofyear(calendarDate) as DayOfYear, "
    "  weekofyear(calendarDate) as WeekOfYearIso, "
    "  quarter(calendarDate) as QuarterOfYear "
    "from  "
    "  neededDates "
    "order by "
    "  calendarDate"
)

# Preview the first rows (show() prints 20 by default).
dimDate.show()

+------+--------+------------+------------+-------------+-----------+-----------+---------+--------------------+---------+----------+----------------+---------+-------------+-------------+
|dateSK| dateInt|CalendarDate|CalendarYear|CalendarMonth|MonthOfYear|CalendarDay|DayOfWeek|DayOfWeekStartMonday|IsWeekDay|DayOfMonth|IsLastDayOfMonth|DayOfYear|WeekOfYearIso|QuarterOfYear|
+------+--------+------------+------------+-------------+-----------+-----------+---------+--------------------+---------+----------+----------------+---------+-------------+-------------+
|     0|20120921|  2012-09-21|        2012|    September|          9|     Friday|        6|                   5|        Y|        21|               N|      265|           38|            3|
|     1|20120922|  2012-09-22|        2012|    September|          9|   Saturday|        7|                   6|        N|        22|               N|      266|           38|            3|
|     2|20120923|  2012-09-23|        2012|    Septembe

## Step 3: Save the dimension to a Delta table

In [16]:
#LOAD
# Persist the date dimension in Delta format as the table "dimDate".
# mode("overwrite") replaces whatever the table contained before, so re-running
# the notebook rebuilds the dimension instead of appending to it.
(
    dimDate.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("dimDate")
)

In [17]:
# Shut down the Spark session now that the dimension has been written.
spark.stop()