In [0]:
# Notebook parameters.
# The selectable values for each widget are listed first, then the widgets
# are registered, and finally the current selections are read back.
environment_choices = ["fq_dev", "fq_test", "fq_prod"]
source_choices = ["posist", "netsuite", "other", "excel_sheet", "foodquest_sharepoint"]
domain_choices = ["discount", "sales", "cost", "wastage", "hr_workforce"]

# Dropdown: the value must be one of the listed environments.
dbutils.widgets.dropdown(
    name="environment",
    defaultValue="fq_dev",
    choices=environment_choices,
    label="Select environment",
)

# Comboboxes for the source system and the business domain.
dbutils.widgets.combobox(
    name="source",
    defaultValue="foodquest_sharepoint",
    choices=source_choices,
    label="Source",
)
dbutils.widgets.combobox(
    name="domain",
    defaultValue="hr_workforce",
    choices=domain_choices,
    label="Domain",
)

# Current widget selections, exposed as plain module-level strings.
environment, source, domain = (
    dbutils.widgets.get(widget_name)
    for widget_name in ("environment", "source", "domain")
)

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit, to_date, concat_ws, last_day, expr, explode, row_number
from pyspark.sql.window import Window
from delta.tables import DeltaTable
import sys

def get_external_location(name: str) -> str:
    """Look up the URL of an external location by name.

    Runs ``DESCRIBE EXTERNAL LOCATION`` for ``name`` through the notebook's
    ``spark`` session and returns the ``url`` value of the first result row.

    Args:
        name: Name of the external location; it is interpolated into the SQL
            statement between backticks.

    Returns:
        The value of the ``url`` column in the first row of the result.

    Raises:
        IndexError: If the query returns no rows.
    """
    describe_stmt = f"DESCRIBE EXTERNAL LOCATION `{name}`"
    url_rows = spark.sql(describe_stmt).select("url").collect()
    first_row = url_rows[0]
    return first_row[0]

# Resolve the storage URL of every layer for the selected environment.
# The lookups run in the same order as the names on the left-hand side.
bronze_path, silver_path, gold_path, checkpoint_path, staging_path = (
    get_external_location(f"{environment}_extloc_{layer}")
    for layer in ("bronze", "silver", "gold", "checkpoint", "staging")
)

In [0]:
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number, col, trim

def list_excel_files_recursively(root_path):
    """Collect the paths of all ``.xlsx`` files below ``root_path``.

    Lists ``root_path`` with ``dbutils.fs.ls`` and descends into every
    directory entry. The extension check is case-insensitive. Paths are
    returned in listing order, with a directory's contents placed where the
    directory itself appears in its parent's listing.

    Args:
        root_path: Directory path to start listing from.

    Returns:
        A list of file paths (strings) ending in ``.xlsx``.
    """
    excel_paths = []
    for entry in dbutils.fs.ls(root_path):
        if not entry.isDir():
            # Plain file: keep it only when it has the Excel extension.
            if entry.path.lower().endswith(".xlsx"):
                excel_paths.append(entry.path)
            continue
        # Directory: splice in everything found underneath it.
        excel_paths += list_excel_files_recursively(entry.path)
    return excel_paths


In [0]:
from datetime import date

# Folder under the staging location that receives the HR workforce extracts.
# The external-location URL may or may not end with "/", so the separator is
# normalised here instead of relying on plain string concatenation (which
# would otherwise glue the last URL segment onto "FoodQuest").
base_path = staging_path.rstrip("/") + "/FoodQuest/HRWorkforce/hr-output"

# Year / month / day components of today's date, as zero-padded strings.
# date.today() uses the local date of the machine running this cell.
today = date.today()
year  = today.strftime("%Y")
month = today.strftime("%m")
day   = today.strftime("%d")

# Files are expected under <base>/<YYYY>/<MM>/<DD>/.
root_path = f"{base_path}/{year}/{month}/{day}/"
print("Reading from path:", root_path)

In [0]:
# Echo the date components used to build the input path, one per line.
print(year, month, day, sep="\n")

In [0]:
# Substrings that identify a "path does not exist" failure from the listing.
# Any other failure (permissions, credentials, network, ...) is re-raised so
# it is not misreported as "no data for today".
# NOTE(review): marker list is based on typical dbutils.fs.ls error text —
# confirm against the runtime in use.
_not_found_markers = (
    "FileNotFoundException",
    "PATH_NOT_FOUND",
    "does not exist",
    "No such file or directory",
)

try:
    excel_files = list_excel_files_recursively(root_path)
except Exception as exc:
    error_text = str(exc)
    if not any(marker in error_text for marker in _not_found_markers):
        raise
    print(f"No folder found for today: {root_path}")
    dbutils.notebook.exit("NO_DATA_FOR_TODAY")

# The folder exists but holds no .xlsx files: stop the notebook the same way.
if not excel_files:
    print(f"Folder exists but no Excel files found at: {root_path}")
    dbutils.notebook.exit("NO_DATA_FOR_TODAY")

print(f"Found {len(excel_files)} Excel files")

In [0]:
from pyspark.sql.functions import lit, current_timestamp,expr

# Read every discovered workbook into its own DataFrame and tag each row
# with the source file, a load timestamp and a generated id.
dfs = []

for file_path in excel_files:
    # Reader settings: "excel" format, headerRows=2, inferSchema=False.
    workbook_df = (
        spark.read
        .format("excel")
        .option("headerRows", 2)
        .option("inferSchema", False)
        .load(file_path)
    )

    # Lineage / audit columns, added in the same order as before.
    with_source = workbook_df.withColumn("file_path", lit(file_path))
    with_timestamp = with_source.withColumn("ingestion_ts", current_timestamp())
    df = with_timestamp.withColumn("sys_id", expr("uuid ()"))

    dfs.append(df)


In [0]:
from functools import reduce

def _union_by_name(left_df, right_df):
    """Stack two DataFrames, matching their columns by name."""
    return left_df.unionByName(right_df)


# Fold the per-file DataFrames into a single one, left to right.
final_df = reduce(_union_by_name, dfs)
final_df.display()

In [0]:
# from pyspark.sql.functions import count
# final_df.agg(count("*")).display()

In [0]:
# final_df.select("file_path").distinct().display()

In [0]:
from pyspark.sql.functions import col, when, date_format, expr
from pyspark.sql.types import *

# Convert the "date" column, read as a day offset from 1899-12-30, into a
# "yyyy-MM-dd" string; rows with a null date stay null.
serial_as_date = expr("date_add('1899-12-30', cast(date as int))")
formatted_date = when(
    col("date").isNotNull(),
    date_format(serial_as_date, "yyyy-MM-dd"),
).otherwise(None)

# (source column, bronze column) pairs, applied in this order.
# The "_cN" names are the columns that sit next to each named header.
column_renames = [
    ("Leave Categories", "leave_categories"),
    ("RHC", "rhc_count"),
    ("_c3", "rhc_value"),
    ("RC", "rc_count"),
    ("_c5", "rc_value"),
    ("QCC", "qcc_count"),
    ("_c7", "qcc_value"),
    ("RAC", "rac_count"),
    ("_c9", "rac_value"),
    ("PAC", "pac_count"),
    ("_c11", "pac_value"),
    ("SAC", "sac_count"),
    ("_c13", "sac_value"),
    ("LTM", "ltm_count"),
    ("_c15", "ltm_value"),
    ("TM", "tm_count"),
    ("_c17", "tm_value"),
    ("CRL", "crl_count"),
    ("_c19", "crl_value"),
    ("Trainee (CTMs or TMs)", "trainee_count"),
    ("_c21", "trainee_value"),
]

df_bronze_base = final_df.withColumn("Date", formatted_date)
for source_name, bronze_name in column_renames:
    df_bronze_base = df_bronze_base.withColumnRenamed(source_name, bronze_name)


In [0]:
# df_bronze_base.display()

In [0]:
# Stamp every row with the date this run computed earlier (`today`).
run_date_column = lit(today)
df_bronze_base = df_bronze_base.withColumn("RunDate", run_date_column)

In [0]:
# from pyspark.sql.functions import count
# df_bronze_base.agg(count("*")).display()

In [0]:
# spark.sql(f"""
# CREATE TABLE IF NOT EXISTS fq_dev_catalog.bronze.hr_workforce
# USING DELTA
# TBLPROPERTIES (delta.enableChangeDataFeed = true, delta.autoOptimize.optimizeWrite = true, delta.autoOptimize.autoCompact = true, delta.columnMapping.mode = 'name')
# """)

In [0]:
from delta.tables import DeltaTable

# Target bronze table. The catalog is derived from the selected `environment`
# widget so that test/prod runs no longer overwrite the dev table; for the
# default "fq_dev" this resolves to the previously hard-coded
# "fq_dev_catalog.bronze.hr_workforce".
# NOTE(review): assumes each environment's catalog is named
# "<environment>_catalog" — confirm for fq_test / fq_prod.
table_name = f"{environment}_catalog.bronze.hr_workforce"

# Batch write to the Delta table: "overwrite" replaces the current table
# contents with this run's rows; mergeSchema is enabled on the write.
(
    df_bronze_base.write
    .format("delta")
    .mode("overwrite")
    .option("mergeSchema", "true")
    .saveAsTable(table_name)
)


In [0]:
# %sql
# select * from  fq_dev_catalog.bronze.hr_workforce

In [0]:
# %sql
# select count (*) from  fq_dev_catalog.bronze.hr_workforce