In [1]:
# Script to load the date dimension table

In [2]:
# Import required libraries
import sys
from lib.spark_session import get_spark_session
from lib.utils import date_data, get_string_cols, get_rundate
from lib.job_control import insert_log, get_max_timestamp
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.functions import current_timestamp, expr, to_date, date_format
from datetime import datetime
from delta import DeltaTable

In [3]:
# Job parameters: run date plus the target and staging table identifiers
rundate = get_rundate()
schema_name, table_name = "edw", "dim_date"
table_full_name = ".".join((schema_name, table_name))
staging_table_full_name = "edw_stg.dim_date_stg"

# Log which run date this execution was triggered for
print("SPARK_APP: JOB triggered for rundate - " + rundate)

SPARK_APP: JOB triggered for rundate - 20220101


In [4]:
# Build the Spark session for this load and report where its UI is served
app_name = f"Dimension load - {table_full_name}"
spark: SparkSession = get_spark_session(app_name)
spark_ui_url = spark.sparkContext.uiWebUrl
print("SPARK_APP: Spark UI - " + spark_ui_url)

SPARK_APP: Spark UI - http://16804892cba9:4040


In [5]:
# Spark Configs
# Use 8 partitions for this session's Spark SQL shuffles
spark.conf.set("spark.sql.shuffle.partitions", 8)

In [6]:
# Load the staging table that feeds the dimension
df_stg = spark.read.table(staging_table_full_name)

# Report the row count and schema of the staging data
stg_row_count = df_stg.count()
print("SPARK_APP: Staging Data Count - " + str(stg_row_count))
print("SPARK_APP: Printing Staging Schema --")
df_stg.printSchema()

SPARK_APP: Staging Data Count - 729
SPARK_APP: Printing Staging Schema --
root
 |-- date: date (nullable = true)
 |-- day: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- year: integer (nullable = true)
 |-- day_of_week: string (nullable = true)
 |-- insert_dt: timestamp (nullable = true)
 |-- rundate: string (nullable = true)
 |-- update_dt: timestamp (nullable = true)



In [7]:
# Derive the surrogate key: the "date" column formatted as yyyyMMdd
row_wid_col = date_format("date", "yyyyMMdd")
df_dim_temp = df_stg.withColumn("row_wid", row_wid_col)

# Report the row count and schema after adding the key
dim_temp_row_count = df_dim_temp.count()
print("SPARK_APP: Dim Temp Data Count - " + str(dim_temp_row_count))
print("SPARK_APP: Printing Dim Temp Schema --")
df_dim_temp.printSchema()

SPARK_APP: Dim Temp Data Count - 729
SPARK_APP: Printing Dim Temp Schema --
root
 |-- date: date (nullable = true)
 |-- day: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- year: integer (nullable = true)
 |-- day_of_week: string (nullable = true)
 |-- insert_dt: timestamp (nullable = true)
 |-- rundate: string (nullable = true)
 |-- update_dt: timestamp (nullable = true)
 |-- row_wid: string (nullable = true)



In [8]:
# Get the delta table for Upserts (SCD1)
dt_dim = DeltaTable.forName(spark, table_full_name)

# Validate if the load is full load: the job control lookup is compared against
# the "1900-01-01" sentinel timestamp string.
if get_max_timestamp(spark, schema_name, table_name) == "1900-01-01 00:00:00.000000":
    print("SPARK_APP: Table is set for full load")
    # Truncate the Dimension table.
    # vacuum(0) is only permitted while Delta's retention duration check is
    # disabled, so switch it off just for the truncate and always restore the
    # previous setting afterwards -- otherwise the safety check would stay off
    # for every later vacuum issued in this Spark session.
    retention_check_key = "spark.databricks.delta.retentionDurationCheck.enabled"
    retention_check_prev = spark.conf.get(retention_check_key, "true")
    spark.conf.set(retention_check_key, False)
    try:
        dt_dim.delete("1=1")
        dt_dim.vacuum(0)
    finally:
        spark.conf.set(retention_check_key, retention_check_prev)

# Create the UPSERT logic: rows are matched on the calendar date; matched rows
# have every column overwritten from the source, unmatched rows are inserted.
(
    dt_dim.alias("dim_date")
    .merge(df_dim_temp.alias("dim_temp"), "dim_date.date = dim_temp.date")
    .whenMatchedUpdateAll()
    .whenNotMatchedInsertAll()
    .execute()
)

SPARK_APP: Table is set for full load


In [9]:
# Add this run's details to JOB CONTROL, stamped with the current local time
job_logged_at = datetime.now()
insert_log(spark, schema_name, table_name, job_logged_at, rundate)
print("SPARK_APP: Update JOB Control Log")

SPARK_APP: Update JOB Control Log


In [10]:
# Show the job control row with the latest insert_dt for this table
latest_job_control_sql = (
    f"select * from edw.job_control where table_name = '{table_name}' "
    "order by insert_dt desc limit 1"
)
spark.sql(latest_job_control_sql).show(truncate=False)

+-----------+----------+--------------------------+--------+--------------------------+
|schema_name|table_name|max_timestamp             |rundate |insert_dt                 |
+-----------+----------+--------------------------+--------+--------------------------+
|edw        |dim_date  |2025-12-26 12:57:01.086986|20220101|2025-12-26 12:57:01.608665|
+-----------+----------+--------------------------+--------+--------------------------+



In [11]:
# Show the version and operation metrics from the first row of the Delta table history
history_metric_cols = [
    "version",
    "operationMetrics.executionTimeMs",
    "operationMetrics.numTargetRowsInserted",
    "operationMetrics.numTargetRowsUpdated",
    "operationMetrics.numOutputRows",
]
dt_dim.history().limit(1).select(*history_metric_cols).show(1, False)

+-------+---------------+---------------------+--------------------+-------------+
|version|executionTimeMs|numTargetRowsInserted|numTargetRowsUpdated|numOutputRows|
+-------+---------------+---------------------+--------------------+-------------+
|1      |6589           |729                  |0                   |729          |
+-------+---------------+---------------------+--------------------+-------------+



In [12]:
# Produce the symlink-format manifest for the dimension table (for Athena access)
manifest_mode = "symlink_format_manifest"
dt = DeltaTable.forName(spark, table_full_name)
dt.generate(manifest_mode)
print("SPARK_APP: Symlink Manifest file generated")

SPARK_APP: Symlink Manifest file generated


In [14]:
# Preview the loaded dimension. The table name comes from the job parameters
# rather than a hard-coded literal, so the preview always targets the table
# this notebook actually loaded.
spark.sql(f"select * from {table_full_name}").show()

+--------+----------+---+-----+----+-----------+--------+--------------------+--------------------+
| row_wid|      date|day|month|year|day_of_week| rundate|           insert_dt|           update_dt|
+--------+----------+---+-----+----+-----------+--------+--------------------+--------------------+
|20220101|2022-01-01|  1|    1|2022|   Saturday|20220101|2025-12-25 13:17:...|2025-12-25 13:17:...|
|20220114|2022-01-14| 14|    1|2022|     Friday|20220101|2025-12-25 13:17:...|2025-12-25 13:17:...|
|20220219|2022-02-19| 19|    2|2022|   Saturday|20220101|2025-12-25 13:17:...|2025-12-25 13:17:...|
|20220225|2022-02-25| 25|    2|2022|     Friday|20220101|2025-12-25 13:17:...|2025-12-25 13:17:...|
|20220302|2022-03-02|  2|    3|2022|  Wednesday|20220101|2025-12-25 13:17:...|2025-12-25 13:17:...|
|20220314|2022-03-14| 14|    3|2022|     Monday|20220101|2025-12-25 13:17:...|2025-12-25 13:17:...|
|20220319|2022-03-19| 19|    3|2022|   Saturday|20220101|2025-12-25 13:17:...|2025-12-25 13:17:...|


In [15]:
# Stop the Spark session now that the load and its checks have finished
spark.stop()