In [0]:
def _register_widgets():
    """Create the three notebook parameter widgets: environment, source and domain."""
    widgets = dbutils.widgets

    # Target environment: dropdown.
    widgets.dropdown(
        name="environment",
        defaultValue="fq_dev",
        choices=["fq_dev", "fq_test", "fq_prod"],
        label="Select environment",
    )

    # Source system: combobox.
    widgets.combobox(
        name="source",
        defaultValue="excel_sheet",
        choices=["posist", "netsuite", "excel_sheet", "other"],
        label="Source",
    )

    # Business domain: combobox.
    widgets.combobox(
        name="domain",
        defaultValue="budget",
        choices=["discount", "sales", "cost", "wastage", "budget"],
        label="Domain",
    )


_register_widgets()

# Read the selected values; they are used further down to build external
# location names, checkpoint paths and the bronze table name.
environment, source, domain = (
    dbutils.widgets.get(widget_name)
    for widget_name in ("environment", "source", "domain")
)

In [0]:
def _external_location_url(layer):
    """Return the `url` of the external location `<environment>_extloc_<layer>`.

    Runs `DESCRIBE EXTERNAL LOCATION` for the currently selected `environment`
    and returns the first value of the `url` column.
    """
    described = spark.sql(
        f"DESCRIBE EXTERNAL LOCATION `{environment}_extloc_{layer}`"
    )
    return described.select("url").collect()[0][0]


# Storage roots for each layer of the selected environment.
bronze_path = _external_location_url("bronze")
silver_path = _external_location_url("silver")
gold_path = _external_location_url("gold")
checkpoint = _external_location_url("checkpoint")
staging = _external_location_url("staging")

In [0]:
# spark.sql(f"""
# CREATE TABLE IF NOT EXISTS `{environment}_catalog`.`bronze`.`{domain}`
# USING DELTA
# TBLPROPERTIES (delta.enableChangeDataFeed = true, delta.autoOptimize.optimizeWrite = true, delta.autoOptimize.autoCompact = true, delta.columnMapping.mode = 'name')
# """)

In [0]:
# from pyspark.sql.functions import current_timestamp, col, regexp_replace
# import traceback
# from datetime import datetime


# # ----------------------------------------
# # Global Exception Logger
# # ----------------------------------------
# def log_exception(step: str, error: Exception):
#     print("ERROR OCCURRED")
#     print(f" Step       : {step}")
#     print(f" Time       : {datetime.now()}")
#     print(f" Error Type : {type(error).__name__}")
#     print(f" Message    : {error}")
#     print(" Stacktrace:")
#     traceback.print_exc()





In [0]:
from pyspark.sql.functions import current_timestamp, col, regexp_replace, expr
import time

def read_budget_files():
    """Build the Auto Loader streaming DataFrame for the budget CSV files.

    Reads CSV files with the `cloudFiles` source from the
    `/budget/Budget-Overall-24,25,26/` folder under the staging location and
    appends three columns:

    * `ingestion_ts` - current timestamp,
    * `sys_id`       - value of the SQL expression `uuid()`,
    * `file_path`    - `_metadata.file_path` with `%20` replaced by a space.

    Relies on the notebook-level names `spark`, `staging`, `checkpoint`,
    `source` and `domain`.

    Returns:
        The streaming DataFrame. No sink is started by this function.
    """
    schema_location = f"{checkpoint}/{source}/{domain}/infer_schema_budget"

    df = (
        spark.readStream
        .format("cloudFiles")
        # Auto Loader settings
        .option("cloudFiles.format", "csv")
        .option("cloudFiles.schemaLocation", schema_location)
        .option("cloudFiles.inferColumnTypes", True)
        .option("cloudFiles.allowOverwrites", "true")
        # CSV parsing settings
        .option("header", True)
        # NOTE(review): `cloudFiles.inferColumnTypes` is already set above, so
        # `inferSchema` may be redundant here - confirm before removing.
        .option("inferSchema", True)
        # Multi-line records: specified once (was previously passed twice
        # under two different key casings with the same value).
        .option("multiLine", "true")
        .load(staging + "/budget/Budget-Overall-24,25,26/")
        .withColumn("ingestion_ts", current_timestamp())
        .withColumn("sys_id", expr("uuid()"))
        .withColumn("file_path", regexp_replace(col("_metadata.file_path"), "%20", " "))
    )
    return df
    
# Define the streaming DataFrame for the budget files; no sink is started here.
df_budget=read_budget_files()

# NOTE(review): fixed 20-second pause with no dependency visible in this
# notebook - confirm whether it is still required or can be removed.
time.sleep(20)



In [0]:
from pyspark.sql.functions import count
# Debug check: count the non-null values of the raw (pre-rename) `DateOrg` column.
# NOTE(review): df_budget comes from readStream, so display() presumably starts
# a streaming query that keeps running until it is stopped (see the stop loop
# later in this notebook) - confirm this cell is wanted in scheduled runs.
df_budget.agg(count("DateOrg")).display()


In [0]:
from pyspark.sql.functions import col, lit, substring, current_timestamp
import re

def to_snake_case(name):
    """Lower-case `name`, turning every run of whitespace/hyphens into one underscore.

    Leading and trailing separator runs also become a single underscore;
    letter case boundaries are not split (`"DateOrg"` -> `"dateorg"`).
    """
    pieces = re.split(r'[\s\-]+', name)
    return "_".join(pieces).lower()

def to_snake_case_df(df):
    """Return `df` with every column name passed through `to_snake_case`.

    All columns are renamed positionally in a single `toDF` call rather than
    by chaining one `withColumnRenamed` call per column.

    Args:
        df: DataFrame whose columns should be renamed.

    Returns:
        A DataFrame with the same columns, in the same order, under their
        snake_case names.
    """
    return df.toDF(*(to_snake_case(col_name) for col_name in df.columns))


In [0]:
# Normalise all column names to snake_case before the stream is written to bronze.
df_budget=to_snake_case_df(df_budget)


In [0]:
# df_budget.display()

In [0]:
def write_budget_to_bronze(df):
    """Start the streaming write of `df` into the bronze Delta table.

    The stream appends to `<environment>_catalog.bronze.<domain>` using an
    `availableNow` trigger, with `mergeSchema` enabled and its checkpoint under
    `<checkpoint>/<source>/<domain>/checkpoint_budget_final`.

    Relies on the notebook-level names `environment`, `source`, `domain` and
    `checkpoint`.

    Returns:
        The object returned by `toTable`, i.e. the handle of the started query.
    """
    target_table = f"`{environment}_catalog`.`bronze`.`{domain}`"
    checkpoint_dir = f"{checkpoint}/{source}/{domain}/checkpoint_budget_final"

    writer = df.writeStream.format("delta").outputMode("append")
    writer = writer.trigger(availableNow=True)
    writer = writer.queryName(f"{domain}_bronze_autoloader")
    writer = writer.option("checkpointLocation", checkpoint_dir)
    writer = writer.option("mergeSchema", "true")

    return writer.toTable(target_table)

# Start the bronze write and block until the query terminates.
query = write_budget_to_bronze(df_budget)
query.awaitTermination()

print(" Bronze Auto Loader finished processing all available files.")
    


In [0]:
# Show the current status of the bronze write query assigned to `query` above.
print(query.status)

In [0]:
# Sanity check: report when no streaming queries are active on this Spark session.
if not spark.streams.active:
    print("No active streaming queries")

In [0]:
# Stop every streaming query that is still active on this Spark session.
# The loop variable is deliberately not called `query`, so the handle of the
# bronze write query assigned earlier in the notebook is not overwritten.
for active_query in spark.streams.active:
    active_query.stop()

In [0]:
  # %sql select count("*")from fq_dev_catalog.bronze.budget

In [0]:
# from pyspark.sql.functions import col, sum
# df.agg((sum("budget"))).display()


In [0]:
# from pyspark.sql.functions import count
# cdf_df = (
#     spark.read
#          .format("delta")
#          .option("readChangeFeed", "true")
#          .option("startingVersion", 0)
#          .table("fq_dev_catalog.bronze.budget")
# )

# cdf_df.display()



In [0]:
# from delta.tables import *

# table_name = 'fq_dev_catalog.bronze.budget'

# #Let's get the last table version to only see the last update modifications
# last_version = str(DeltaTable.forName(spark, table_name).history(1).head()["version"])
# print(f"our Delta table last version is {last_version}, let's select the last changes to see our DELETE and UPDATE operations (last 2 versions):")

# changes = spark.read.format("delta") \
#                     .option("readChangeData", "true") \
#                     .option("startingVersion", int(last_version) -1) \
#                     .table(table_name)
# display(changes)

In [0]:
# %sql
#  describe history fq_dev_catalog.bronze.budget