In [0]:

# Notebook parameters, exposed as Databricks widgets.
ENVIRONMENT_CHOICES = ["fq_dev", "fq_test", "fq_prod"]
SOURCE_CHOICES = ["posist", "netsuite", "excel_sheet", "other"]
DOMAIN_CHOICES = ["discount", "sales", "cost", "wastage"]

# Target environment: dropdown widget, defaults to dev.
dbutils.widgets.dropdown(
    name="environment", defaultValue="fq_dev", choices=ENVIRONMENT_CHOICES, label="Select environment"
)

# Source system: combobox widget.
dbutils.widgets.combobox(
    name="source", defaultValue="netsuite", choices=SOURCE_CHOICES, label="Source"
)

# Business domain: combobox widget.
dbutils.widgets.combobox(
    name="domain", defaultValue="wastage", choices=DOMAIN_CHOICES, label="Domain"
)

# Read the current widget values once; later cells use these three names.
environment, source, domain = (
    dbutils.widgets.get(widget_name) for widget_name in ("environment", "source", "domain")
)


In [0]:
def _external_location_url(layer):
    """Return the storage URL of the ``{environment}_extloc_{layer}`` external location.

    Runs ``DESCRIBE EXTERNAL LOCATION`` for that name and returns the ``url``
    column of the first row of the result.

    Args:
        layer: Suffix of the external location name, e.g. ``"bronze"``.

    Returns:
        The value of the ``url`` column for that external location.
    """
    location_name = f"{environment}_extloc_{layer}"
    return (
        spark.sql(f"DESCRIBE EXTERNAL LOCATION `{location_name}`")
        .select("url")
        .collect()[0][0]
    )


# One lookup per layer; the variable names are what the later cells read.
bronze_path = _external_location_url("bronze")
silver_path = _external_location_url("silver")
gold_path = _external_location_url("gold")
checkpoint = _external_location_url("checkpoint")
staging = _external_location_url("staging")

In [0]:
from pyspark.sql.functions import current_timestamp, col, regexp_replace, expr
import time


def read_wastage_stream():
    """Build the Auto Loader streaming DataFrame over the staged wastage JSON files.

    Reads multi-line JSON through the ``cloudFiles`` source from
    ``<staging>/FoodQuest/Netsuite/Wastage/*/*/*.json`` and appends three columns:

    * ``ingestion_ts`` - ``current_timestamp()``
    * ``sys_id``       - ``uuid()`` evaluated as a SQL expression
    * ``file_path``    - ``_metadata.file_path`` with every ``%20`` replaced by a space

    The schema location is derived from the ``checkpoint``, ``source`` and
    ``domain`` notebook variables. Note that the input path itself is fixed to
    ``Netsuite/Wastage`` and does not follow the ``source``/``domain`` widgets.

    Returns:
        The streaming DataFrame; nothing is written by this function.
    """
    # Join with exactly one "/" so the path is valid whether or not the
    # staging URL ends with a slash (plain concatenation glued the two parts
    # together when it did not).
    landing_glob = staging.rstrip("/") + "/FoodQuest/Netsuite/Wastage/*/*/*.json"

    # NOTE(review): the schema location is joined with "/" and is left as-is so
    # existing Auto Loader schema state keeps its path - confirm whether the
    # checkpoint URL carries a trailing slash.
    schema_location = f"{checkpoint}/{source}/{domain}/infer_wastage_schema"

    df = (
        spark.readStream
            .format("cloudFiles")
            .option("cloudFiles.format", "json")
            .option("cloudFiles.allowOverwrites", "true")
            .option("cloudFiles.schemaLocation", schema_location)
            .option("cloudFiles.inferColumnTypes", True)
            .option("multiLine", "true")
            .load(landing_glob)
            .withColumn("ingestion_ts", current_timestamp())
            .withColumn("sys_id", expr("uuid()"))
            .withColumn("file_path", regexp_replace(col("_metadata.file_path"), "%20", " "))
    )
    return df

# Defining the stream does not start a query, so no pause is needed here;
# the write cell starts the query and waits for it to finish.
wastage_df = read_wastage_stream()


In [0]:
# display(wastage_df)

In [0]:
def write_wastage_stream(df):
    """Start the append-mode Delta write of ``df`` into the bronze table.

    The target is ``{environment}_catalog.bronze.{domain}`` and the checkpoint
    lives under ``{checkpoint}/{source}/{domain}/wastage_checkpoint_all_data``;
    all four values come from notebook-level variables.

    Args:
        df: Streaming DataFrame to persist.

    Returns:
        Whatever ``toTable`` returns for the started write (the caller awaits it).
    """
    target_table = f"`{environment}_catalog`.`bronze`.`{domain}`"
    checkpoint_dir = f"{checkpoint}/{source}/{domain}/wastage_checkpoint_all_data"

    writer = df.writeStream.format("delta").outputMode("append")
    # availableNow trigger: consume what is available when the query starts, then stop.
    writer = writer.trigger(availableNow=True).queryName(f"{domain}_bronze_autoloader")
    writer = writer.option("checkpointLocation", checkpoint_dir).option("mergeSchema", "true")
    return writer.toTable(target_table)

# Start the bronze write and block until the streaming query terminates.
query = write_wastage_stream(wastage_df)
query.awaitTermination()

# Reached only after awaitTermination() has returned.
print(" Bronze Auto Loader finished processing all available files.")


In [0]:
# Show the status of the bronze write query started in the previous cell.
print(query.status)

# Report when the session has no streaming queries left running.
active_streams = spark.streams.active
if not active_streams:
    print("No active streaming queries")


In [0]:
# if not spark.streams.active:
#     print("No active streaming queries")


In [0]:
# Stop every streaming query still active in this session. The loop uses its
# own variable name so the `query` handle from the write cell is not overwritten.
for active_query in spark.streams.active:
    active_query.stop()

In [0]:
# from pyspark.sql.functions import count
# wastage_df.agg(count("results")).display()

In [0]:
# spark.sql(f"""
# CREATE TABLE IF NOT EXISTS `{environment}_catalog`.`bronze`.`{domain}`
# USING DELTA
# TBLPROPERTIES (delta.enableChangeDataFeed = true, delta.autoOptimize.optimizeWrite = true, delta.autoOptimize.autoCompact = true)
# """)


In [0]:
# %sql
# select * from fq_dev_catalog.bronze.wastage

In [0]:
# %sql
# select count("*") from fq_dev_catalog.bronze.wastage

In [0]:
# %sql
# describe history fq_dev_catalog.bronze.wastage

In [0]:
# %sql
# select distinct _commit_version from table_changes('fq_dev_catalog.bronze.wastage',0)

In [0]:
# from pyspark.sql.functions import col
# cdf_df = (
#     spark.read
#          .format("delta")
#          .option("readChangeFeed", "true")
#          .option("startingVersion", 0)
#          .table("fq_dev_catalog.bronze.wastage")
# )

# cdf_df.select(col("_commit_version")).distinct().display()

In [0]:
# cdf_df.display()