### Importing the Needed Modules

In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
import sys
sys.path.append("/Workspace/Users/mohammedthoufiq9360@gmail.com/Retail-And-Ecommerce-Analytics-Platform")
from src.paths import DIM_DATES_PATH
from src.schema_definitions import DIM_DATES_SCHEMA

### Creating a DataFrame with start and end date

In [0]:
# Calendar bounds of the date dimension, as yyyy-MM-dd strings.
start_date, end_date = "2020-01-01", "2029-12-31"

# Single-row frame holding both bounds in the string columns `start` and `end`.
dates_df = spark.createDataFrame(
    data=[(start_date, end_date)],
    schema=["start", "end"],
)

### Creating a date column covering every day between the start and end dates

In [0]:
# Build the bounds frame under its own name so this cell never reads the
# variable it overwrites. Previously `dates_df` was both input and output,
# so running the cell a second time failed because the `start` / `end`
# columns had already been replaced by `date`.
date_bounds_df = spark.createDataFrame([(start_date, end_date)], ["start", "end"])

# sequence() yields an array with one entry per day from `start` to `end`;
# explode() turns that array into one row per date.
dates_df = date_bounds_df.select(
    explode(
        sequence(
            to_date(col("start")),
            to_date(col("end"))
        )
    ).alias("date")
)

### Adding Business-Related Columns

In [0]:
# Derive every calendar attribute of the dimension from the `date` column in a
# single projection. Column order: date first, then the derived attributes.
dim_dates_df = dates_df.select(
    col("date"),
    # Surrogate key: the date rendered as yyyyMMdd and cast to an integer.
    date_format(col("date"), "yyyyMMdd").cast("int").alias("date_sk"),
    dayofmonth(col("date")).alias("day"),
    month(col("date")).alias("month"),
    date_format(col("date"), "MMMM").alias("month_name"),
    quarter(col("date")).alias("quarter"),
    year(col("date")).alias("year"),
    # dayofweek() numbers days from 1 (Sunday) to 7 (Saturday).
    dayofweek(col("date")).alias("day_of_week"),
    date_format(col("date"), "EEEE").alias("day_name"),
    # Weekend flag: True when the day of week is 1 or 7, False otherwise.
    when(dayofweek(col("date")).isin(1, 7), True).otherwise(False).alias("is_weekend"),
)


### Dim_dates Schema for Reference

In [0]:
# Display the expected schema (column name -> type name) imported from
# src.schema_definitions, for comparison with the DataFrame built above.
DIM_DATES_SCHEMA

{'date_sk': 'integer',
 'date': 'date',
 'day': 'integer',
 'month': 'integer',
 'month_name': 'string',
 'quarter': 'integer',
 'year': 'integer',
 'day_of_week': 'integer',
 'day_name': 'string',
 'is_weekend': 'boolean'}

### Selecting needed columns

In [0]:
# Project the dimension down to its final columns, in the order the table
# should expose them.
dim_dates_df = dim_dates_df.select(
    [
        # key and calendar date
        "date_sk", "date",
        # month-level attributes
        "day", "month", "month_name",
        # quarter / year
        "quarter", "year",
        # weekday attributes
        "day_of_week", "day_name", "is_weekend",
    ]
)

### Schema Enforcement Check

In [0]:
# Compare the DataFrame against DIM_DATES_SCHEMA and stop the notebook before
# the write if they disagree. Three things are checked:
#   * unknown columns  - present in the DataFrame but not in the schema
#   * missing columns  - declared in the schema but absent from the DataFrame
#   * type mismatches  - same column name, different type name
expected_cols = set(DIM_DATES_SCHEMA.keys())
incoming_cols = set(dim_dates_df.columns)

unknown_cols = incoming_cols - expected_cols
missing_cols = expected_cols - incoming_cols

# Assumes DIM_DATES_SCHEMA values are Spark type names such as 'integer',
# 'date', 'string', 'boolean' (as shown when the schema is displayed), which is
# the form DataType.typeName() returns. TODO confirm for any other types used.
actual_types = {
    field.name: field.dataType.typeName() for field in dim_dates_df.schema.fields
}
type_mismatches = {
    name: {"expected": DIM_DATES_SCHEMA[name], "actual": actual_types[name]}
    for name in sorted(expected_cols & incoming_cols)
    if actual_types[name] != DIM_DATES_SCHEMA[name]
}

# NOTE(review): the message says "Bronze" although this notebook builds the
# Gold dim_dates table; presumably copied from another notebook. Confirm
# before renaming, since downstream log parsing may depend on the text.
print("Unknown columns in Bronze:", unknown_cols)

# Enforce rather than only report: a silent mismatch would otherwise be
# written straight into the table.
if unknown_cols or missing_cols or type_mismatches:
    raise ValueError(
        "dim_dates schema check failed: "
        f"unknown columns={sorted(unknown_cols)}, "
        f"missing columns={sorted(missing_cols)}, "
        f"type mismatches={type_mismatches}"
    )


Unknown columns in Bronze: set()


### Creating Gold Dim_dates Table

In [0]:
# Persist the dimension as a Delta table named by DIM_DATES_PATH (imported from
# src.paths). Mode "overwrite" replaces any existing contents of that table.
dim_dates_df.write.saveAsTable(
    DIM_DATES_PATH,
    format="delta",
    mode="overwrite",
)


In [0]:
# Read the saved table back and render a five-row preview.
(
    spark.read
    .table(DIM_DATES_PATH)
    .limit(5)
    .display()
)

date_sk,date,day,month,month_name,quarter,year,day_of_week,day_name,is_weekend
20200101,2020-01-01,1,1,January,1,2020,4,Wednesday,False
20200102,2020-01-02,2,1,January,1,2020,5,Thursday,False
20200103,2020-01-03,3,1,January,1,2020,6,Friday,False
20200104,2020-01-04,4,1,January,1,2020,7,Saturday,True
20200105,2020-01-05,5,1,January,1,2020,1,Sunday,True


In [0]:
# Row count of the saved table; shown as the cell's output.
(
    spark.read
    .table(DIM_DATES_PATH)
    .count()
)

3653