# Calendar table creation

This notebook creates a basic calendar table to support data analysis in a Power BI dashboard.

### Tables used:
- None

### Tables created:
- Spark DB: ds3_main (stage 3 data science main)
    - Calendar



In [2]:
from pyspark.sql.functions import *
from pyspark.sql.window import Window

# Data lake account plus the abfss:// root URI of each stage container.
storage_account = 'saeduanalytics'
stage1, stage2, stage3 = (
    f'abfss://{container}@{storage_account}.dfs.core.windows.net'
    for container in ('stage1', 'stage2', 'stage3')
)

StatementMeta(spark1, 524, 1, Finished, Available)



In [4]:
# date range (both bounds are inclusive)
start = "2020-01-01"
# NOTE(review): the calendar stops at 2021-12-30, so 2021-12-31 is never generated -- confirm this is intended.
stop = "2021-12-30"

# Create the calendar dataframe: one row per calendar day from start to stop.
# sequence() steps whole days on DATE values, so the result does not depend on the session
# time zone. (Stepping an epoch-seconds range by a fixed 86400 seconds can repeat or skip a
# date when the range crosses a daylight-saving change.)
df = spark.range(1).select(
    explode(
        sequence(to_date(lit(start)), to_date(lit(stop)), expr("interval 1 day"))
    ).alias("Date")
)

# Derived attributes; date_format() returns each of them as a string.
# 'yyyy' is the calendar year. The week-based 'YYYY' reports the *following* year for the
# last days of December and is rejected by the Spark 3 datetime pattern parser.
df = df.withColumn('Year', date_format('Date', 'yyyy'))
df = df.withColumn('Month', date_format('Date', 'MMMM'))
df = df.withColumn('MonthNum', date_format('Date', 'M'))
# NOTE(review): with the legacy (SimpleDateFormat) patterns 'W' is week-of-MONTH, so the value
# restarts every month; week-based letters are also unsupported by Spark 3. If week-of-year is
# what the dashboard needs, weekofyear('Date') is the replacement -- confirm intent before changing.
df = df.withColumn('Week', date_format('Date', 'W'))
# NOTE(review): 'D' is day-of-YEAR (1..366), not day-of-month ('d') -- confirm intent.
df = df.withColumn('Day', date_format('Date', 'D'))

df.show()

StatementMeta(spark1, 524, 3, Finished, Available)

+----------+----+-------+--------+----+---+
|      Date|Year|  Month|MonthNum|Week|Day|
+----------+----+-------+--------+----+---+
|2020-01-01|2020|January|       1|   1|  1|
|2020-01-02|2020|January|       1|   1|  2|
|2020-01-03|2020|January|       1|   1|  3|
|2020-01-04|2020|January|       1|   1|  4|
|2020-01-05|2020|January|       1|   2|  5|
|2020-01-06|2020|January|       1|   2|  6|
|2020-01-07|2020|January|       1|   2|  7|
|2020-01-08|2020|January|       1|   2|  8|
|2020-01-09|2020|January|       1|   2|  9|
|2020-01-10|2020|January|       1|   2| 10|
|2020-01-11|2020|January|       1|   2| 11|
|2020-01-12|2020|January|       1|   3| 12|
|2020-01-13|2020|January|       1|   3| 13|
|2020-01-14|2020|January|       1|   3| 14|
|2020-01-15|2020|January|       1|   3| 15|
|2020-01-16|2020|January|       1|   3| 16|
|2020-01-17|2020|January|       1|   3| 17|
|2020-01-18|2020|January|       1|   3| 18|
|2020-01-19|2020|January|       1|   4| 19|
|2020-01-20|2020|January|       

## Write Data Back to Lake

In [5]:
# Persist the calendar as parquet under the stage 3 ds3_main directory, replacing any previous output.
df.write.mode('overwrite').parquet(stage3 + '/ds3_main/Calendar')

StatementMeta(spark1, 524, 4, Finished, Available)



## Load to Spark DB

In [4]:
# Create spark db to allow for access to the data in the data lake via SQL on-demand.
# This is only creating metadata for SQL on-demand, pointing to the parquet data in the data lake.
# This also makes it possible to connect in Power BI via the azure sql data source connector.
def create_spark_db(db_name, source_path, table_name='Calendar'):
    """Register a parquet folder in the data lake as a table in a Spark database.

    Args:
        db_name: Name of the Spark database; created if it does not exist yet.
        source_path: Directory that contains the table's folder.
        table_name: Name of the table and of its folder under ``source_path``.
            Defaults to 'Calendar', the only table this notebook creates.

    The values are interpolated into the SQL text as-is, so they must be plain
    identifiers / paths coming from the notebook itself, not from user input.
    """
    spark.sql(f'CREATE DATABASE IF NOT EXISTS {db_name}')
    # Remove any existing table definition first so the table below is always
    # re-created against the current source_path.
    spark.sql(f"DROP TABLE IF EXISTS {db_name}.{table_name}")
    spark.sql(f"create table if not exists {db_name}.{table_name} using PARQUET location '{source_path}/{table_name}'")

# Register the parquet folder <stage3>/ds3_main/Calendar as the table ds3_main.Calendar.
create_spark_db('ds3_main', stage3 + '/ds3_main')