In [0]:
%sql
-- Create the target schema. IF NOT EXISTS keeps this cell safe to re-run.
CREATE DATABASE IF NOT EXISTS processed_data

New Year's Day	1	1
Valentine's Day	2	14
St. Patrick's Day	3	17
Independence Day	7	4
Halloween	10	31
Veterans Day	11	11
Christmas	12	25

In [0]:
%sql
-- Rebuild processed_data.dim_date as a straight copy of the raw table.
DROP TABLE IF EXISTS processed_data.dim_date;

CREATE TABLE processed_data.dim_date
AS
SELECT *
FROM raw_data.dim_date;

In [0]:
from pyspark.sql import *
from pyspark.sql.functions import *
import datetime
import pandas as pd

In [0]:
# Fixed-date holidays: label -> (month, day).
# This dict is the single source of truth for the is_fixed, hldy_mnth and
# hldy_day columns derived below, so the three can no longer drift apart.
fixed_holidays = {
    "New Year's Day": (1, 1),
    "Valentine's Day": (2, 14),
    "Independence Day": (7, 4),
    "Halloween": (10, 31),
    "Christmas": (12, 25),
    "St. Patrick's Day": (3, 17),
    "Veterans Day": (11, 11),
}

fixed_holiday_list = list(fixed_holidays.keys())

df_dim_hldy = spark.sql("select * from raw_data.dim_hldy")

# Surrogate key.
# NOTE(review): monotonically_increasing_id() yields unique, increasing values
# that are not guaranteed to be consecutive; confirm that is acceptable for
# hldy_id before other tables start referencing it.
df_dim_hldy = df_dim_hldy.withColumn("hldy_id", monotonically_increasing_id() + 1)

# Flag holidays whose label appears in the fixed-date mapping.
df_dim_hldy = df_dim_hldy.withColumn(
    "is_fixed",
    when(col("hldy_label").isin(fixed_holiday_list), lit("Yes")).otherwise(lit("No")),
)


def _fixed_holiday_part(position):
    """Build a Column holding one component of a fixed holiday's date.

    position selects the tuple element of fixed_holidays' values
    (0 = month, 1 = day). Labels not present in fixed_holidays map to 0.
    """
    chain = None
    for label, month_day in fixed_holidays.items():
        matches_label = col("hldy_label") == label
        value = lit(month_day[position])
        chain = when(matches_label, value) if chain is None else chain.when(matches_label, value)
    return chain.otherwise(lit(0))


df_dim_hldy = df_dim_hldy.withColumn("hldy_mnth", _fixed_holiday_part(0))
df_dim_hldy = df_dim_hldy.withColumn("hldy_day", _fixed_holiday_part(1))

# Keep only the dimension's columns, in their final order.
df_dim_hldy = df_dim_hldy.select("hldy_id", "hldy_label", "is_fixed", "hldy_mnth", "hldy_day")


df_dim_hldy.show()

-- Declares processed_data.dim_hldy with the same five columns that the
-- holiday DataFrame selects (hldy_id, hldy_label, is_fixed, hldy_mnth, hldy_day).
-- NOTE(review): this DDL follows Python statements with no cell marker in
-- between; confirm it lives in its own %sql cell.
-- NOTE(review): the following cell writes the DataFrame with
-- mode("overwrite").saveAsTable to this same table name; confirm whether this
-- explicit definition is still needed.
CREATE TABLE IF NOT EXISTS processed_data.dim_hldy (
    hldy_id LONG,
    hldy_label STRING ,
    is_fixed STRING,
    hldy_mnth INT,
    hldy_day INT
);

In [0]:
# Persist the holiday dimension as a managed table, overwriting any earlier copy.
(
    df_dim_hldy.write
    .mode("overwrite")
    .saveAsTable("processed_data.dim_hldy")
)

In [0]:
%sql
-- Inspect the holiday dimension table.
SELECT *
FROM processed_data.dim_hldy

In [0]:
from datetime import date, timedelta
# Calendar spine: one row per day from 2016-01-01 through 2024-12-31.
date_range = pd.date_range(start="2016-01-01", end="2024-12-31")
calendar_pdf = pd.DataFrame({"date": date_range})
df_dates = spark.createDataFrame(calendar_pdf)

# Derive year, month, day-of-month and day-of-week with Spark SQL expressions.
date_part_sql = {
    "year": "year(date)",
    "month": "month(date)",
    "day": "day(date)",
    "dow": "dayofweek(date)",
}
for part_name, part_expr in date_part_sql.items():
    df_dates = df_dates.withColumn(part_name, expr(part_expr))

# Fixed-date holidays: label -> (month, day).
# Every value must be a (month, day) tuple, because the record-building loop
# later in this cell unpacks each value into exactly those two fields.
fixed_holidays = {
    "New_Years_Day": (1, 1),
    "Independence_Day": (7, 4),
    "Halloween": (10, 31),
    "Christmas": (12, 25),
    "Valentines_Day": (2, 14),
    # Was `date(11, 11)`, which raises TypeError: datetime.date needs
    # (year, month, day) and would not unpack as (month, day) anyway.
    "Veterans_Day": (11, 11),
}
# NOTE(review): the earlier holiday-dimension cell also lists St. Patrick's Day
# (3, 17) as a fixed holiday; it is absent here - confirm whether that is intended.

# Floating holidays: observances whose calendar date changes from year to year.
def _nth_weekday(year, month, weekday, nth):
    """Return the date of the nth occurrence of a weekday within a month.

    Args:
        year: Calendar year.
        month: Calendar month (1-12).
        weekday: Day of week in datetime numbering (0 = Monday ... 6 = Sunday).
        nth: Positive to count from the start of the month (1 = first),
            negative to count from the end (-1 = last).

    Raises:
        ValueError: If nth is 0 or the requested occurrence is not in the month.
    """
    if nth == 0:
        raise ValueError("nth must be a non-zero integer")

    if nth > 0:
        first_day = date(year, month, 1)
        first_occurrence = first_day + timedelta(days=(weekday - first_day.weekday()) % 7)
        result = first_occurrence + timedelta(weeks=nth - 1)
    else:
        # Step back from the last day of the month to the requested weekday.
        if month == 12:
            next_month_start = date(year + 1, 1, 1)
        else:
            next_month_start = date(year, month + 1, 1)
        last_day = next_month_start - timedelta(days=1)
        last_occurrence = last_day - timedelta(days=(last_day.weekday() - weekday) % 7)
        result = last_occurrence + timedelta(weeks=nth + 1)

    if result.month != month:
        raise ValueError(f"{year}-{month:02d} has no occurrence {nth} of weekday {weekday}")
    return result


def _easter_sunday(year):
    """Return Western (Gregorian) Easter Sunday using the anonymous Gregorian algorithm."""
    a = year % 19
    b, c = divmod(year, 100)
    d, e = divmod(b, 4)
    f = (b + 8) // 25
    g = (b - f + 1) // 3
    h = (19 * a + b - d - g + 15) % 30
    i, k = divmod(c, 4)
    l = (32 + 2 * e + 2 * i - h - k) % 7
    m = (a + 11 * h + 22 * l) // 451
    month, day_index = divmod(h + l - 7 * m + 114, 31)
    return date(year, month, day_index + 1)


def get_floating_holidays(year):
    """Return a dict mapping floating-holiday labels to their date in `year`.

    Args:
        year: Calendar year to compute the holidays for.

    Returns:
        dict of label (str) -> datetime.date.
    """
    holidays = {
        "Presidents_Day": _nth_weekday(year, 2, 0, 3),  # 3rd Monday of Feb
        "Easter": _easter_sunday(year),  # computed; previously hard-coded to Mar 31
        "Mothers_Day": _nth_weekday(year, 5, 6, 2),  # 2nd Sunday of May
        # Last Monday of May; the old arithmetic with nth=-1 landed in April.
        "Memorial_Day": _nth_weekday(year, 5, 0, -1),
        "Fathers_Day": _nth_weekday(year, 6, 6, 3),  # 3rd Sunday of June
        "Labor_Day": _nth_weekday(year, 9, 0, 1),  # 1st Monday of Sep
        "Columbus_Day": _nth_weekday(year, 10, 0, 2),  # 2nd Monday of Oct
        # Tuesday after the first Monday of Nov (not simply the first Tuesday,
        # which can fall on Nov 1).
        "Election_Day": _nth_weekday(year, 11, 0, 1) + timedelta(days=1),
        "Thanksgiving": _nth_weekday(year, 11, 3, 4),  # 4th Thursday of Nov
        "Martin_Luther_King_Day": _nth_weekday(year, 1, 0, 3),  # 3rd Monday of Jan
    }

    return holidays

# Build (label, "YYYY-MM-DD") records for 2016-2024: for each year the
# fixed-date holidays come first, followed by the floating ones.
holiday_records = []
for yr in range(2016, 2025):
    floating_by_label = get_floating_holidays(yr)

    fixed_rows = [
        (label, f"{yr}-{mm:02d}-{dd:02d}")
        for label, (mm, dd) in fixed_holidays.items()
    ]
    floating_rows = [
        (label, on_date.strftime("%Y-%m-%d"))
        for label, on_date in floating_by_label.items()
    ]
    holiday_records.extend(fixed_rows)
    holiday_records.extend(floating_rows)

df_hldy = spark.createDataFrame(holiday_records, ["hldy_label", "date"])

# Left-join the calendar spine to the holiday records so that every date is
# kept, then add a row id and the fixed/floating flag.
fixed_labels = list(fixed_holidays.keys())
is_fixed_flag = when(col("hldy_label").isin(fixed_labels), lit("Yes")).otherwise(lit("No"))

df_dim_time_hldy = (
    df_dates.join(df_hldy, "date", "left")
    .withColumn("hldy_id", expr("monotonically_increasing_id()"))
    .withColumn("is_fixed", is_fixed_flag)
)

df_dim_time_hldy.show()

# Persist the result, replacing any previous version of the table.
(
    df_dim_time_hldy.write
    .mode("overwrite")
    .saveAsTable("processed_data.dim_time_hldy")
)


In [0]:
%sql
-- Inspect the date-to-holiday mapping table.
SELECT *
FROM processed_data.dim_time_hldy

In [0]:
%sql
-- Peek at 20 rows of the raw date dimension.
SELECT *
FROM hive_metastore.raw_data.dim_date
LIMIT 20

In [0]:
%sql
-- Drop the copied date dimension so it can be regenerated below.
-- IF EXISTS keeps the cell re-runnable and matches the earlier DROP in this notebook.
DROP TABLE IF EXISTS processed_data.dim_date

In [0]:
#new date dimension table from 2016 to 2024----------------
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, date_format, year, month, dayofmonth, dayofweek, weekofyear, quarter, expr, lit
from pyspark.sql.types import IntegerType
import datetime

# Build one Python date per calendar day, 2016-01-01 through 2024-12-31 inclusive,
# and load them into a single-column Spark DataFrame.
start_date = datetime.date(2016, 1, 1)
end_date = datetime.date(2024, 12, 31)
total_days = (end_date - start_date).days + 1
date_list = [start_date + datetime.timedelta(days=offset) for offset in range(total_days)]
date_rows = [(calendar_day,) for calendar_day in date_list]
df = spark.createDataFrame(date_rows, ["date"])

# Generate Required Columns
#
# weekofyear() returns the ISO-8601 week number, so it has to be paired with the
# ISO week-numbering year rather than the calendar year. Otherwise days around
# New Year get the wrong week id (e.g. 2018-12-31 is ISO week 1 and would
# collide with the first week of January 2018).
iso_week_year = "extract(YEAROFWEEK FROM date)"

df = (
    df.withColumn("fscldt_id", date_format("date", "yyyyMMdd").cast(IntegerType()))
    .withColumn("fscldt_label", date_format("date", "MMM d, yyyy"))
    .withColumn("fsclwk_id", expr(f"{iso_week_year} * 100 + weekofyear(date)"))
    .withColumn(
        "fsclwk_label",
        expr(f"concat('WK ', lpad(weekofyear(date), 2, '0'), ', ', {iso_week_year})"),
    )
    .withColumn("fsclmth_id", expr("year(date) * 100 + month(date)"))
    .withColumn("fsclmth_label", date_format("date", "MMM, yyyy"))
    .withColumn("fsclqrtr_id", expr("year(date) * 10 + quarter(date)"))
    .withColumn("fsclqrtr_label", expr("concat('Q', quarter(date), ', ', year(date))"))
    .withColumn("fsclyr_id", year(col("date")))
    .withColumn("fsclyr_label", year(col("date")).cast("string"))
    .withColumn("fscldow", dayofweek(col("date")))
    .withColumn("fscldom", dayofmonth(col("date")))
    # Day of quarter = days elapsed since the quarter's first day, 1-based.
    # The previous formula (day-of-month + 30 per earlier quarter) did not
    # count the days of earlier months within the same quarter.
    .withColumn("fscldoq", expr("datediff(date, trunc(date, 'QUARTER')) + 1"))
    .withColumn("fscldoy", expr("dayofyear(date)"))
    .withColumn("fsclwoy", weekofyear(col("date")))
    .withColumn("fsclmoy", month(col("date")))
    .withColumn("fsclqoy", quarter(col("date")))
)

# Add Last Year and Two Years Ago Fiscal Dates
# fscldt_id is a yyyyMMdd integer, so subtracting 10000 per year keeps the same
# month and day. Feb 29 (id ending in 0229) has no counterpart one or two years
# earlier, so it is mapped to Feb 28 instead of producing a non-existent date id.
leap_day_shift = "CASE WHEN fscldt_id % 10000 = 229 THEN 1 ELSE 0 END"
df = (
    df.withColumn("ly_fscldt_id", expr(f"fscldt_id - 10000 - {leap_day_shift}"))
    .withColumn("lly_fscldt_id", expr(f"fscldt_id - 20000 - {leap_day_shift}"))
)

# Season columns, both keyed on month-of-year (fsclmoy):
#   ssn_id    - four-letter season code concatenated with fsclyr_id
#   ssn_label - human-readable season name
season_id_sql = """
    CASE 
        WHEN fsclmoy IN (12,1,2) THEN concat('WINT', fsclyr_id) 
        WHEN fsclmoy IN (3,4,5) THEN concat('SPRG', fsclyr_id) 
        WHEN fsclmoy IN (6,7,8) THEN concat('SUMR', fsclyr_id) 
        ELSE concat('FALL', fsclyr_id) 
    END
"""
season_label_sql = """
    CASE 
        WHEN fsclmoy IN (12,1,2) THEN 'Winter' 
        WHEN fsclmoy IN (3,4,5) THEN 'Spring' 
        WHEN fsclmoy IN (6,7,8) THEN 'Summer' 
        ELSE 'Fall' 
    END
"""
df = df.withColumn("ssn_id", expr(season_id_sql))
df = df.withColumn("ssn_label", expr(season_label_sql))

# Preview the first 10 rows before persisting.
df.show(10)

# Overwrite processed_data.dim_date with the regenerated date dimension.
(
    df.write
    .mode("overwrite")
    .saveAsTable("processed_data.dim_date")
)
