In [1]:
# Script to create data for the Date Dimension landing table

In [2]:
# Import required libraries
import sys
from lib.spark_session import get_spark_session
from lib.utils import date_data, get_string_cols, get_rundate
from lib.job_control import insert_log, get_max_timestamp
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.functions import current_timestamp, lit
from datetime import datetime
from delta import DeltaTable

In [3]:
# JOB Parameters
# rundate comes from the shared helper; schema and table identify the landing target
rundate = get_rundate()
schema_name, table_name = "edw_ld", "dim_date_ld"
table_full_name = ".".join((schema_name, table_name))
print("SPARK_APP: JOB triggered for rundate - " + rundate)

SPARK_APP: JOB triggered for rundate - 20220101


In [4]:
# Generate Spark Session
spark: SparkSession = get_spark_session(f"Landing load -{table_full_name}")
# uiWebUrl can be None (e.g. when the Spark UI is disabled); formatting it with an
# f-string avoids the TypeError that "str" + None would raise
print(f"SPARK_APP: Spark UI - {spark.sparkContext.uiWebUrl}")

SPARK_APP: Spark UI - http://16804892cba9:4040


In [5]:
# Spark Configs
# Set the number of shuffle partitions for this session to 8
spark.conf.set("spark.sql.shuffle.partitions", 8)

In [6]:
# Column names for the date landing DataFrame, in the order the generated rows supply them
_cols = [
    "date",
    "day",
    "month",
    "year",
    "day_of_week",
]
# Generate the date rows starting from rundate
# NOTE(review): the second argument (2) is presumably a span in years - confirm in lib.utils.date_data
_data = date_data(rundate, 2)

In [7]:
# Build the raw DataFrame from the generated rows and print its schema
df_raw = spark.createDataFrame(_data, schema=_cols)
print("SPARK_APP: Printing Raw Schema --")
df_raw.printSchema()

# Count the landing rows (count() is a Spark action) and report the total
_raw_count = df_raw.count()
print(f"SPARK_APP: Landing data count - {_raw_count}")

SPARK_APP: Printing Raw Schema --
root
 |-- date: string (nullable = true)
 |-- day: string (nullable = true)
 |-- month: string (nullable = true)
 |-- year: string (nullable = true)
 |-- day_of_week: string (nullable = true)

SPARK_APP: Landing data count - 729


In [8]:
# Cast all columns to string using the select expressions built by get_string_cols
_string_exprs = get_string_cols(spark, df_raw)
df_casted = df_raw.selectExpr(_string_exprs)
print("SPARK_APP: Casted all Columns to string")

# Append the audit columns: load timestamp and the job's rundate
df_ld = (
    df_casted
    .withColumn("insert_dt", current_timestamp())
    .withColumn("rundate", lit(rundate))
)
print("SPARK_APP: Added AUDIT column")

# Report the row count and schema of the final landing DataFrame
_final_count = df_ld.count()
print(f"SPARK_APP: Final layer data count - {_final_count}")
print("SPARK_APP: Printing Landing Schema --")
df_ld.printSchema()

SPARK_APP: Casted all Columns to string
SPARK_APP: Added AUDIT column
SPARK_APP: Final layer data count - 729
SPARK_APP: Printing Landing Schema --
root
 |-- date: string (nullable = true)
 |-- day: string (nullable = true)
 |-- month: string (nullable = true)
 |-- year: string (nullable = true)
 |-- day_of_week: string (nullable = true)
 |-- insert_dt: timestamp (nullable = false)
 |-- rundate: string (nullable = false)



In [9]:
# Preview the landing DataFrame (first 20 rows, long values truncated)
df_ld.show(n=20, truncate=True)

+----------+---+-----+----+-----------+--------------------+--------+
|      date|day|month|year|day_of_week|           insert_dt| rundate|
+----------+---+-----+----+-----------+--------------------+--------+
|2022-01-01| 01|   01|2022|   Saturday|2025-12-25 12:30:...|20220101|
|2022-01-02| 02|   01|2022|     Sunday|2025-12-25 12:30:...|20220101|
|2022-01-03| 03|   01|2022|     Monday|2025-12-25 12:30:...|20220101|
|2022-01-04| 04|   01|2022|    Tuesday|2025-12-25 12:30:...|20220101|
|2022-01-05| 05|   01|2022|  Wednesday|2025-12-25 12:30:...|20220101|
|2022-01-06| 06|   01|2022|   Thursday|2025-12-25 12:30:...|20220101|
|2022-01-07| 07|   01|2022|     Friday|2025-12-25 12:30:...|20220101|
|2022-01-08| 08|   01|2022|   Saturday|2025-12-25 12:30:...|20220101|
|2022-01-09| 09|   01|2022|     Sunday|2025-12-25 12:30:...|20220101|
|2022-01-10| 10|   01|2022|     Monday|2025-12-25 12:30:...|20220101|
|2022-01-11| 11|   01|2022|    Tuesday|2025-12-25 12:30:...|20220101|
|2022-01-12| 12|   0

In [10]:
# Write the data to landing layer checking if table exists
# The sentinel timestamp selects "overwrite"; any other max timestamp selects "append"
# NOTE(review): presumably get_max_timestamp returns the sentinel when no prior load is logged - confirm
_write_mode = (
    "append"
    if get_max_timestamp(spark, schema_name, table_name) != "1900-01-01 00:00:00.000000"
    else "overwrite"
)
df_ld.write.format("delta").mode(_write_mode).saveAsTable(table_full_name)

print("SPARK_APP: Data written to landing layer")

SPARK_APP: Data written to landing layer


In [12]:
# Record this run in JOB CONTROL for the landing table
# NOTE(review): the timestamp argument appears to be stored as max_timestamp - confirm in lib.job_control
_logged_at = datetime.now()
insert_log(spark, schema_name, table_name, _logged_at, rundate)
print("SPARK_APP: Update JOB Control Log")

SPARK_APP: Update JOB Control Log


In [13]:
# Show the most recent JOB CONTROL entry for this landing table.
# Filter on schema_name as well as table_name so that a same-named table in
# another schema cannot be returned instead of this job's entry.
spark.sql(
    f"""
    select *
    from edw.job_control
    where schema_name = '{schema_name}'
      and table_name = '{table_name}'
    order by insert_dt desc
    limit 1
    """
).show(truncate=False)

+-----------+-----------+--------------------------+--------+--------------------------+
|schema_name|table_name |max_timestamp             |rundate |insert_dt                 |
+-----------+-----------+--------------------------+--------+--------------------------+
|edw_ld     |dim_date_ld|2025-12-25 12:34:49.381687|20220101|2025-12-25 12:34:50.038781|
+-----------+-----------+--------------------------+--------+--------------------------+



In [14]:
# Get the logs from delta table version
dt = DeltaTable.forName(spark, table_full_name)
dt.history().limit(1).select("version","operationMetrics.executionTimeMs",
                             "operationMetrics.numTargetRowsInserted",
                             "operationMetrics.numTargetRowsUpdated",
                             "operationMetrics.numOutputRows").show(1, False)

+-------+---------------+---------------------+--------------------+-------------+
|version|executionTimeMs|numTargetRowsInserted|numTargetRowsUpdated|numOutputRows|
+-------+---------------+---------------------+--------------------+-------------+
|0      |null           |null                 |null                |729          |
+-------+---------------+---------------------+--------------------+-------------+



In [15]:
# Generate Symlink manifest for Athena access
dt.generate("symlink_format_manifest")
print("SPARK_APP: Symlink Manifest file generated")

SPARK_APP: Symlink Manifest file generated


In [17]:
# Stop the Spark session now that the landing load has finished
spark.stop()