In [1]:
from pyspark.sql import SparkSession
from delta import configure_spark_with_delta_pip
import ConnectionConfig as cc
cc.setupEnvironment()

In [2]:
# This sets up a local cluster with 4 nodes and retrieve the active spark session
spark = cc.startLocalCluster("DIM_DATE",4)
spark.getActiveSession()

In [3]:
#Generate a data Sequence 
#This SQL statement generates a sequence of dates between beginDate and endDate with an increment of 1 day.
#dateSK using monotonically_increasing_id() is good for creating a surrogate key.
from pyspark.sql.functions import *

beginDate = '2009-01-01'
endDate = '2023-12-31'

df_SQL = spark.sql(f"select explode(sequence(to_date('{beginDate}'), to_date('{endDate}'), interval 1 day)) as calendarDate, monotonically_increasing_id() as dateSK ")


df_SQL.createOrReplaceTempView('neededDates' )

# Show the DataFrame
dimDate = spark.sql("select * from neededDates")
dimDate.show()

# Define the path to save the Delta table
delta_table_path = "spark-warehouse/dimDate"

# Save dimDate DataFrame as a Delta table
dimDate.write.format("delta").mode("overwrite").save(delta_table_path)

+------------+------+
|calendarDate|dateSK|
+------------+------+
|  2009-01-01|     0|
|  2009-01-02|     1|
|  2009-01-03|     2|
|  2009-01-04|     3|
|  2009-01-05|     4|
|  2009-01-06|     5|
|  2009-01-07|     6|
|  2009-01-08|     7|
|  2009-01-09|     8|
|  2009-01-10|     9|
|  2009-01-11|    10|
|  2009-01-12|    11|
|  2009-01-13|    12|
|  2009-01-14|    13|
|  2009-01-15|    14|
|  2009-01-16|    15|
|  2009-01-17|    16|
|  2009-01-18|    17|
|  2009-01-19|    18|
|  2009-01-20|    19|
+------------+------+
only showing top 20 rows



In [4]:
#Creating a Dimension table "dimdate"
dimDate = spark.sql("select dateSK, \
  year(calendarDate) * 10000 + month(calendarDate) * 100 + day(calendarDate) as dateInt, \
  CalendarDate, \
  year(calendarDate) AS CalendarYear, \
  date_format(calendarDate, 'MMMM') as CalendarMonth, \
  month(calendarDate) as MonthOfYear, \
  date_format(calendarDate, 'EEEE') as CalendarDay, \
  dayofweek(calendarDate) AS DayOfWeek, \
  weekday(calendarDate) + 1 as DayOfWeekStartMonday, \
  case \
    when weekday(calendarDate) < 5 then 'Y' \
    else 'N' \
  end as IsWeekDay, \
  dayofmonth(calendarDate) as DayOfMonth, \
  case \
    when calendarDate = last_day(calendarDate) then 'Y' \
    else 'N' \
  end as IsLastDayOfMonth, \
  dayofyear(calendarDate) as DayOfYear, \
  weekofyear(calendarDate) as WeekOfYearIso, \
  quarter(calendarDate) as QuarterOfYear \
from  \
  neededDates \
order by \
  calendarDate")

dimDate.show()

+------+--------+------------+------------+-------------+-----------+-----------+---------+--------------------+---------+----------+----------------+---------+-------------+-------------+
|dateSK| dateInt|CalendarDate|CalendarYear|CalendarMonth|MonthOfYear|CalendarDay|DayOfWeek|DayOfWeekStartMonday|IsWeekDay|DayOfMonth|IsLastDayOfMonth|DayOfYear|WeekOfYearIso|QuarterOfYear|
+------+--------+------------+------------+-------------+-----------+-----------+---------+--------------------+---------+----------+----------------+---------+-------------+-------------+
|     0|20090101|  2009-01-01|        2009|      January|          1|   Thursday|        5|                   4|        Y|         1|               N|        1|            1|            1|
|     1|20090102|  2009-01-02|        2009|      January|          1|     Friday|        6|                   5|        Y|         2|               N|        2|            1|            1|
|     2|20090103|  2009-01-03|        2009|      Januar

In [5]:
#from pyspark.sql.functions import explode, expr, sequence,col, date_format
df_SparkSQL = df_SQL \
    .withColumn("year", date_format("calendarDate",'yyyy')) \
    .withColumn("month", date_format("calendarDate",'MMMM')) \
    .withColumn("lasyDayOfMonth" \
                ,expr("case when calendarDate = last_day(calendarDate) then 'Y' \
                else 'N' \
                end as IsLastDayOfMonth"))
df_SparkSQL.show() 

+------------+------+----+-------+--------------+
|calendarDate|dateSK|year|  month|lasyDayOfMonth|
+------------+------+----+-------+--------------+
|  2009-01-01|     0|2009|January|             N|
|  2009-01-02|     1|2009|January|             N|
|  2009-01-03|     2|2009|January|             N|
|  2009-01-04|     3|2009|January|             N|
|  2009-01-05|     4|2009|January|             N|
|  2009-01-06|     5|2009|January|             N|
|  2009-01-07|     6|2009|January|             N|
|  2009-01-08|     7|2009|January|             N|
|  2009-01-09|     8|2009|January|             N|
|  2009-01-10|     9|2009|January|             N|
|  2009-01-11|    10|2009|January|             N|
|  2009-01-12|    11|2009|January|             N|
|  2009-01-13|    12|2009|January|             N|
|  2009-01-14|    13|2009|January|             N|
|  2009-01-15|    14|2009|January|             N|
|  2009-01-16|    15|2009|January|             N|
|  2009-01-17|    16|2009|January|             N|


In [6]:
spark.stop()