In [0]:
# Notebook parameters. Each widget is created with (name, default, choices, label)
# and its current value is read back into a Python variable at the end of the cell.

# Environment prefix; later cells build catalog and external-location names from it.
dbutils.widgets.dropdown(
    "environment",
    "fq_dev",
    ["fq_dev", "fq_test", "fq_prod"],
    "Select environment",
)

# Source system of the data being processed.
dbutils.widgets.combobox(
    "source",
    "foodquest_sharepoint",
    ["posist", "netsuite", "other","excel_sheet","foodquest_sharepoint"],
    "Source",
)

# Business domain of the data being processed.
dbutils.widgets.combobox(
    "domain",
    "hr_workforce",
    ["discount", "sales", "cost","wastage","hr_workforce"],
    "Domain",
)

# Current widget selections, read in the same order the widgets were declared.
environment = dbutils.widgets.get("environment")
source = dbutils.widgets.get("source")
domain = dbutils.widgets.get("domain")

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit, to_date, concat_ws, last_day, expr, explode, row_number
from pyspark.sql.window import Window
from delta.tables import DeltaTable
import sys

def get_external_location(name: str) -> str:
    """Return the ``url`` of the external location called ``name``.

    Runs ``DESCRIBE EXTERNAL LOCATION`` through the notebook's ``spark`` session
    and returns the ``url`` value of the first row of the result.
    """
    described = spark.sql(f"DESCRIBE EXTERNAL LOCATION `{name}`")
    first_row = described.select("url").collect()[0]
    return first_row[0]

# Resolve one storage URL per layer. Each external location is named
# "<environment>_extloc_<layer>"; lookups run in the order listed.
bronze_path, silver_path, gold_path, checkpoint_path, staging_path = (
    get_external_location(f"{environment}_extloc_{layer}")
    for layer in ("bronze", "silver", "gold", "checkpoint", "staging")
)

In [0]:
from delta.tables import DeltaTable
from pyspark.sql.functions import col, current_timestamp, max as _max

# Source and bookkeeping tables follow the selected environment, matching the
# "<environment>_catalog" convention used for dim_store and the silver target in
# this notebook. With the default widget value ("fq_dev") the names are unchanged.
SOURCE_TABLE = f"{environment}_catalog.bronze.hr_workforce"
VERSION_TABLE = f"{environment}_catalog.bronze.version_control"
CONSUMER_NAME = "hr_workforce_silver_trial"

# 1. Latest committed version of the bronze Delta table (max over its history).
delta_table = DeltaTable.forName(spark, SOURCE_TABLE)

latest_version = (
    delta_table.history()
    .select(_max("version").alias("latest_version"))
    .collect()[0]["latest_version"]
)

# 2. Highest version already recorded for this (source table, consumer) pair.
#    The aggregate yields NULL when no matching row exists.
if spark.catalog.tableExists(VERSION_TABLE):
    last_processed_version = (
        spark.table(VERSION_TABLE)
        .filter(col("source_table") == SOURCE_TABLE)
        .filter(col("consumer_name") == CONSUMER_NAME)
        .agg(_max("last_processed_version").alias("last_version"))
        .collect()[0]["last_version"]
    )
else:
    last_processed_version = None

# No bookkeeping table/row means nothing was processed yet: use -1 so that
# version 0 is included in the first run.
if last_processed_version is None:
    last_processed_version = -1

# 3. Decide whether there is anything to process.
if last_processed_version >= latest_version:
    print(
        f"[INFO] No new data to process | "
        f"Latest version: {latest_version}, "
        f"Last processed: {last_processed_version}"
    )
    df_bronze_workforce_new = None
    # Every later cell dereferences df_bronze_workforce_new, so a None here would
    # surface as an AttributeError in the next cell. Stop the run cleanly instead.
    dbutils.notebook.exit("No new data to process")
else:
    start_version = last_processed_version + 1
    print(
        f"[INFO] Processing versions from {start_version} to {latest_version}"
    )

    # NOTE(review): `startingVersion` is documented for Delta streaming and
    # change-data-feed reads. Confirm that this plain batch read (no
    # `readChangeFeed`) really limits rows to versions >= start_version rather
    # than returning the current table snapshot.
    df_bronze_workforce_new = (
        spark.read
        .format("delta")
        .option("startingVersion", start_version)
        .table(SOURCE_TABLE)
    )


In [0]:
# The version-check cell sets df_bronze_workforce_new to None when there is no
# new bronze version, so only print the schema when a frame was actually loaded.
if df_bronze_workforce_new is None:
    print("[INFO] No new bronze data loaded; no schema to print")
else:
    df_bronze_workforce_new.printSchema()

In [0]:
# from pyspark.sql.functions import count
# df_bronze_workforce_new.agg(count("*")).display()

In [0]:
# df_bronze_workforce_new.display()

In [0]:
from delta.tables import DeltaTable
from pyspark.sql.functions import current_timestamp, lit

# Rejected-rows table in the selected environment's catalog, following the
# "<environment>_catalog" convention used elsewhere in this notebook.
REJECTED_TABLE = f"{environment}_catalog.silver.workforce_rejected"

def upsert_rejected(
    df,
    rejection_type,
    rejection_reason,
    run_date,
    run_id,
    rejected_stage="SILVER"
):
    """Insert rejected rows into ``REJECTED_TABLE`` unless they are already there.

    ``df`` is tagged with six audit columns (``rejection_type``,
    ``rejection_reason``, ``rejected_stage``, ``run_date``, ``run_id``,
    ``rejection_ts``) and merged into the Delta table. A source row is inserted
    only when no target row has the same ``run_date``, ``business_date``,
    ``deployment_name``, ``leave_categories``, ``designation`` and
    ``rejection_reason``; existing rows are never updated.

    Args:
        df: Rows to record. Must contain the merge key columns ``business_date``,
            ``deployment_name``, ``leave_categories`` and ``designation``.
        rejection_type: Literal stored in ``rejection_type``.
        rejection_reason: Literal stored in ``rejection_reason`` (part of the key).
        run_date: Literal stored in ``run_date`` (part of the key).
        run_id: Literal stored in ``run_id``.
        rejected_stage: Literal stored in ``rejected_stage``; defaults to "SILVER".
    """
    rejected_df = (
        df
        .withColumn("rejection_type", lit(rejection_type))
        .withColumn("rejection_reason", lit(rejection_reason))
        .withColumn("rejected_stage", lit(rejected_stage))
        .withColumn("run_date", lit(run_date))
        .withColumn("run_id", lit(run_id))
        .withColumn("rejection_ts", current_timestamp())
    )

    target = DeltaTable.forName(spark, REJECTED_TABLE)

    # Null-safe equality (<=>): with plain "=", a row whose key contains NULL
    # (for example a missing business_date) never matches its own earlier copy
    # and would be inserted again on every rerun.
    (
        target.alias("t")
        .merge(
            rejected_df.alias("s"),
            """
            t.run_date <=> s.run_date
            AND t.business_date <=> s.business_date
            AND t.deployment_name <=> s.deployment_name
            AND t.leave_categories <=> s.leave_categories
            AND t.designation <=> s.designation
            AND t.rejection_reason <=> s.rejection_reason
            """
        )
        .whenNotMatchedInsertAll()
        .execute()
    )



In [0]:
# Store master for the selected environment, reduced to the single column used
# for matching: deployment_name.
store_df = spark.read.table(
    f"{environment}_catalog.silver.dim_store"
).select("deployment_name")

In [0]:
# from pyspark.sql.functions import lit, current_timestamp, col

# workforce_rejected_stores = (
#     df_silver_workforce.alias("b")
#     .join(
#         store_df.alias("s"),
#         col("b.deployment_name") == col("s.deployment_name"),
#         "left"
#     )
#     .filter(col("s.deployment_name").isNull())
#     .select("b.*")   # keep only budget columns
#     .withColumn(
#         "rejection_reason",
#         lit("STORE_NOT_FOUND_IN_MASTER_TABLE_dim_store")
#     )
#     .withColumn("rejection_ts", current_timestamp())
# )

In [0]:
# Reference validation: keep the bronze rows whose deployment_name has no match
# in dim_store (left join to the store list, then keep rows where the store side
# is NULL). Only the bronze columns ("b.*") are retained.
# NOTE(review): deployment_name is (re)derived from file_path, and "Date" is
# renamed to business_date, only in the later df_silver_base cell. Confirm the
# bronze frame already carries the columns this check and the rejected-table
# merge rely on.
rejected_reference_df = (
    df_bronze_workforce_new.alias("b")
    .join(
        store_df.alias("s"),
        col("b.deployment_name") == col("s.deployment_name"),
        "left"
    )
    .filter(col("s.deployment_name").isNull())
    .select("b.*")
)

# NOTE(review): write_rejected is not defined anywhere in the visible notebook.
# The helper defined above is upsert_rejected, which additionally requires
# run_date and run_id and merges on business_date / designation.
# TODO confirm which helper is intended and supply the missing arguments.
write_rejected(
    rejected_reference_df,
    rejection_type="REFERENCE_VALIDATION",
    rejection_reason="STORE_NOT_FOUND_IN_MASTER_TABLE_dim_store"
)


In [0]:
from pyspark.sql.functions import col, regexp_extract, count

# Cast each "<designation>_value" column to decimal(12,3), one column at a time
# and in the order listed.
df_silver_base = df_bronze_workforce_new
for _value_col in (
    "rhc_value",
    "rc_value",
    "qcc_value",
    "rac_value",
    "pac_value",
    "sac_value",
    "ltm_value",
    "tm_value",
    "crl_value",
    "trainee_value",
):
    df_silver_base = df_silver_base.withColumn(
        _value_col, col(_value_col).cast("decimal(12,3)")
    )

df_silver_base = (
    df_silver_base
    # deployment_name = text between the last "/" and a trailing ".xlsx" in file_path.
    .withColumn("deployment_name", regexp_extract(col("file_path"), r"/([^/]+)\.xlsx$", 1))
    # The bronze "Date" column becomes business_date from here on.
    .withColumnRenamed("Date", "business_date")
    .withColumn("source_system", lit("FoodQuest_sharepoint"))
)

In [0]:
# df_silver_base.display()

In [0]:
from pyspark.sql.functions import col,element_at, split

# Project the silver column set: date and leave category first, then a
# (count, value) pair per designation, then store name and lineage columns.
df_silver_workforce = df_silver_base.select(
    "business_date",
    "leave_categories",
    *[
        f"{designation}_{measure}"
        for designation in (
            "rhc", "rc", "qcc", "rac", "pac", "sac", "ltm", "tm", "crl", "trainee"
        )
        for measure in ("count", "value")
    ],
    "deployment_name",
    "ingestion_ts",
    "file_path",
    "source_system",
)


In [0]:
# df_silver_workforce.display()

In [0]:
# df_silver_workforce.select("deployment_name").distinct().display()

In [0]:
# df_silver_workforce.agg(count("*")).display()

In [0]:
# store_df = (
#         spark.read
#             .table(f"{environment}_catalog.silver.dim_store")
#             .select(
#             "deployment_name"
#         )
#     )

In [0]:
# from pyspark.sql.functions import lit, current_timestamp, col

# workforce_rejected_stores = (
#     df_silver_workforce.alias("b")
#     .join(
#         store_df.alias("s"),
#         col("b.deployment_name") == col("s.deployment_name"),
#         "left"
#     )
#     .filter(col("s.deployment_name").isNull())
#     .select("b.*")   # keep only budget columns
#     .withColumn(
#         "rejection_reason",
#         lit("STORE_NOT_FOUND_IN_MASTER_TABLE_dim_store")
#     )
#     .withColumn("rejection_ts", current_timestamp())
# )

In [0]:

from pyspark.sql.functions import col, current_timestamp, lit

# workforce_rejected_stores is used by this cell and by the write cell below, but
# its only definition in the notebook is commented out, so a fresh run fails with
# a NameError. Rebuild it here: silver workforce rows whose deployment_name has
# no match in dim_store, tagged with the rejection reason and a timestamp.
# This must run before the later cell that renames store_df.deployment_name.
workforce_rejected_stores = (
    df_silver_workforce.alias("b")
    .join(
        store_df.alias("s"),
        col("b.deployment_name") == col("s.deployment_name"),
        "left"
    )
    .filter(col("s.deployment_name").isNull())
    .select("b.*")   # keep only the workforce columns
    .withColumn(
        "rejection_reason",
        lit("STORE_NOT_FOUND_IN_MASTER_TABLE_dim_store")
    )
    .withColumn("rejection_ts", current_timestamp())
)

workforce_rejected_stores.select("deployment_name").distinct().display()

In [0]:
# %sql
# drop table if exists fq_dev_catalog.silver.workforce_rejected_stores

In [0]:
# spark.sql(f"""
# CREATE TABLE IF NOT EXISTS fq_dev_catalog.silver.workforce_rejected_stores
# USING DELTA
# TBLPROPERTIES (delta.enableChangeDataFeed = true, delta.autoOptimize.optimizeWrite = true, delta.autoOptimize.autoCompact = true, delta.columnMapping.mode = 'name')
# """)

In [0]:
# Rejected-stores table in the selected environment's catalog, following the
# "<environment>_catalog" convention used elsewhere in this notebook.
target_table = f"{environment}_catalog.silver.workforce_rejected_stores"


def write_workforce_rejected_to_silver(df):
    """Overwrite ``target_table`` with the rows of ``df`` as a Delta table.

    The write uses mode "overwrite" with ``mergeSchema`` enabled. The frame that
    is passed in is the one written; previously the argument was ignored and the
    global ``workforce_rejected_stores`` was written instead.

    Returns:
        Whatever ``saveAsTable`` returns.
    """
    query = (
        df.write
        .format("delta")
        .mode("overwrite")
        .option("mergeSchema", "true")
        .saveAsTable(target_table)
    )
    return query


write_workforce_rejected_to_silver(workforce_rejected_stores)

In [0]:
# %sql
# select count("*") from fq_dev_catalog.silver.workforce_rejected_stores

In [0]:
# %sql
# select distinct deployment_name from fq_dev_catalog.silver.workforce_rejected_stores

In [0]:
from pyspark.sql.functions import col

# Give the store key a distinct name so the join in create_final_workforce can
# tell it apart from the workforce deployment_name column.
store_df = store_df.withColumnRenamed(
    existing="deployment_name", new="store_deployment_name"
)

def create_final_workforce(df, store_df):
    """Keep the workforce rows whose store exists in ``store_df``.

    ``df`` is inner-joined to ``store_df`` on
    ``deployment_name == store_deployment_name`` and projected back to the
    workforce columns, so the store key column does not appear in the result.

    Args:
        df: Workforce frame with the per-designation count/value columns.
        store_df: Store frame exposing ``store_deployment_name``.

    Returns:
        The joined frame restricted to the workforce and metadata columns.
    """
    designations = (
        "rhc", "rc", "qcc", "rac", "pac", "sac", "ltm", "tm", "crl", "trainee"
    )
    metric_columns = [
        f"{designation}_{measure}"
        for designation in designations
        for measure in ("count", "value")
    ]
    output_columns = [
        "business_date",
        "leave_categories",
        *metric_columns,
        "deployment_name",
        # metadata
        "ingestion_ts",
        "file_path",
        "source_system",
    ]

    matched = df.join(
        store_df,
        on=col("deployment_name") == col("store_deployment_name"),
        how="inner",
    )
    return matched.select(*output_columns)


# Workforce rows restricted to stores present in the (renamed) store list.
df_final_base = create_final_workforce(df=df_silver_workforce, store_df=store_df)


In [0]:
# df_final_base.display()

In [0]:
# df_final_base.select("deployment_name").distinct().display()

In [0]:
# df_final_base.agg(count("*")).display()

In [0]:
from pyspark.sql.functions import row_number, last, col
from pyspark.sql.window import Window
from datetime import date
today = date.today()

# Number the rows of each store; the numbering drives the forward fill of
# business_date in the next cell.
# NOTE(review): deployment_name is extracted from file_path, so rows of one store
# may all share the same file_path. row_number() over tied sort keys is not
# guaranteed to follow the original sheet order - confirm an explicit row-order
# column is not needed.
w_row = Window.partitionBy("deployment_name").orderBy("file_path")

df = df_final_base.withColumn("row_num", row_number().over(w_row))


In [0]:
# Running window per store: every row from the first one up to the current row,
# ordered by the row number assigned in the previous cell.
w_fill = (
    Window.partitionBy("deployment_name")
    .orderBy("row_num")
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)

# Forward fill: replace business_date with the most recent non-null value seen
# so far in the window, then drop the helper column.
forward_filled_date = last("business_date", ignorenulls=True).over(w_fill)
df_filled = df.withColumn("business_date", forward_filled_date).drop("row_num")

In [0]:
# df_filled.display()

In [0]:
# df_filled.agg(count("*")).display()

In [0]:
# df_filled.selectExpr("max(business_date)").show()


In [0]:
# df_filled.agg(count("*")).display()

In [0]:
from pyspark.sql.functions import expr

# Unpivot the ten (count, value) column pairs into rows: one output row per
# input row and designation, carrying designation, employee_count and
# employee_cost. ingestion_ts is not carried forward from here.
df_schema = df_filled.select(
    "business_date",
    "leave_categories",
    "deployment_name",
    "file_path",
    "source_system",
    expr("""
        stack(
            10,
            'RHC', rhc_count, rhc_value,
            'RC',  rc_count,  rc_value,
            'QCC', qcc_count, qcc_value,
            'RAC', rac_count, rac_value,
            'PAC', pac_count, pac_value,
            'SAC', sac_count, sac_value,
            'LTM', ltm_count, ltm_value,
            'TM',  tm_count,  tm_value,
            'CRL', crl_count, crl_value,
            'Trainee', trainee_count, trainee_value
        ) as (designation, employee_count, employee_cost)
    """),
)

# df_schema.display()

In [0]:
# df_schema.filter(col("business_date")=="2026-12-31").display()

In [0]:
# from pyspark.sql.functions import count
# df_schema.agg(count("*")).display()

In [0]:
from pyspark.sql.functions import col
from pyspark.sql.types import DateType, StringType, IntegerType, DoubleType
from pyspark.sql.functions import col, trim, when

# Reorder the unpivoted columns and cast business_date to a DATE.
business_date_as_date = col("business_date").cast("date").alias("business_date")

final_df = df_schema.select(
    business_date_as_date,
    "deployment_name",
    "leave_categories",
    "designation",
    "employee_count",
    "employee_cost",
    "file_path",
    "source_system",
)


In [0]:
# final_df.display()

In [0]:
# from pyspark.sql.functions import count
# final_df.agg(count("*")).display()

In [0]:
from pyspark.sql.functions import col

# Keep a single row per (business_date, deployment_name, leave_categories,
# designation) combination.
final_df_unique = final_df.dropDuplicates(
    subset=["business_date", "deployment_name", "leave_categories", "designation"]
)


In [0]:
# Discard rows in which any of the four key columns is NULL.
final_df_filled = final_df_unique.na.drop(
    how="any",
    subset=["business_date", "deployment_name", "leave_categories", "designation"],
)


In [0]:
# from pyspark.sql.functions import count
# final_df_filled.agg(count("*")).display()

In [0]:
from pyspark.sql.functions import col

# Final column set for the silver table: leave_categories is exposed as
# leave_category and file_path is left out.
final_workforce_data = final_df_filled.select(
    "business_date",
    "deployment_name",
    col("leave_categories").alias("leave_category"),
    "designation",
    "employee_count",
    "employee_cost",
    "source_system",
)


In [0]:
from datetime import date
from pyspark.sql.functions import lit

# Stamp every row with the date on which this notebook run executes.
today = date.today()

final_workforce_data = final_workforce_data.withColumn("run_date", lit(today))


In [0]:
# final_workforce_data.display()

In [0]:
# final_workforce_data.agg(count("*")).display()

In [0]:
# %sql
# CREATE TABLE IF NOT EXISTS fq_dev_catalog.silver.hr_workforce (
#   business_date DATE ,
#   deployment_name STRING NOT NULL,
#   leave_category STRING NOT NULL,
#   designation STRING NOT NULL,
#   employee_count LONG ,
#   employee_cost DECIMAL(12,3),
#   source_system STRING NOT NULL,
#   run_date DATE
# )
# USING DELTA
# TBLPROPERTIES (
#    delta.enableChangeDataFeed = true,
#    delta.autoOptimize.optimizeWrite = true,
#    delta.autoOptimize.autoCompact = true,
#    delta.columnMapping.mode = 'name'
# );


In [0]:
from delta.tables import DeltaTable
import sys

def upsert_to_silver_batch(
    df,
    table_name,
    business_keys
):
    """Merge ``df`` into the Delta table ``table_name`` on ``business_keys``.

    Rows matching on every key have ``employee_count``, ``employee_cost``,
    ``source_system`` and ``run_date`` overwritten from ``df``; rows without a
    match are inserted with the four key columns plus those four columns.

    Args:
        df: Source frame; an empty frame is reported and skipped.
        table_name: Name of the target Delta table.
        business_keys: Column names compared for equality between target and source.

    Returns:
        None.

    Raises:
        SystemExit: with code 1 after printing the error, for any exception
            raised while checking or merging.
    """
    # Columns refreshed when a key match is found.
    refreshed_columns = ("employee_count", "employee_cost", "source_system", "run_date")
    # Key columns, written only when a new row is inserted.
    key_columns = ("business_date", "deployment_name", "leave_category", "designation")

    try:
        # Nothing to merge for an empty frame.
        if df.isEmpty():
            print("No data to upsert")
            return

        silver_table = DeltaTable.forName(spark, table_name)

        # "t.<key> = s.<key>" for every business key, joined with AND.
        merge_condition = " AND ".join(f"t.{key} = s.{key}" for key in business_keys)

        update_set = {name: f"s.{name}" for name in refreshed_columns}
        insert_values = {name: f"s.{name}" for name in key_columns + refreshed_columns}

        merge_builder = silver_table.alias("t").merge(df.alias("s"), merge_condition)
        (
            merge_builder
            .whenMatchedUpdate(set=update_set)
            .whenNotMatchedInsert(values=insert_values)
            .execute()
        )

        print("Batch upsert to silver completed successfully")

    except Exception as e:
        print(f"Upsert to Silver failed: {e}")
        sys.exit(1)

In [0]:
# Columns that identify one silver row; they form the merge condition.
business_keys = ["business_date", "deployment_name", "leave_category", "designation"]

# Silver target in the catalog of the selected environment.
silver_table_name = f"{environment}_catalog.silver.hr_workforce"

upsert_to_silver_batch(
    df=final_workforce_data,
    table_name=silver_table_name,
    business_keys=business_keys,
)

In [0]:
# spark.sql(f"""
# INSERT INTO {VERSION_TABLE} (source_table, consumer_name, last_processed_version, updated_at)
# SELECT
#   '{SOURCE_TABLE}',
#   '{CONSUMER_NAME}',
#   {latest_version},
#   current_timestamp()
# WHERE NOT EXISTS (
#   SELECT 1
#   FROM {VERSION_TABLE}
#   WHERE
#     source_table = '{SOURCE_TABLE}'
#     AND consumer_name = '{CONSUMER_NAME}'
# )
# """)

In [0]:
# spark.sql(f"""
# UPDATE {VERSION_TABLE}
# SET
#   last_processed_version = {latest_version},
#   updated_at = current_timestamp()
# WHERE
#   source_table = '{SOURCE_TABLE}'
#   AND consumer_name = '{CONSUMER_NAME}'
# """)

In [0]:
# %sql
# select * from fq_dev_catalog.silver.hr_workforce

In [0]:
# %sql
# select * from fq_dev_catalog.silver.hr_workforce
# where deployment_name = "ALBAIK - SQ DB27 - AL TALLAH - 1005028"

In [0]:
# %sql
# select count("*") from fq_dev_catalog.silver.hr_workforce

In [0]:
# %sql
# select distinct deployment_name from fq_dev_catalog.silver.hr_workforce