# Day 7: Workflows & Job Orchestration

In [0]:
 from pyspark.sql import functions as F

# Register a "month" dropdown on the notebook; it defaults to "Oct".
widgets = dbutils.widgets
widgets.dropdown("month", "Oct", ["Oct", "Nov"])

# Read back the current selection so later cells can use it.
selected_month = widgets.get("month")

print(f"Running pipeline for month: {selected_month}")

Running pipeline for month: Oct


In [0]:
# --- TASK 1: BRONZE ---
def run_bronze(month):
    """Load the raw 2019 CSV for ``month`` and overwrite the bronze Delta table.

    Args:
        month: Label substituted into both the source file name
            (``2019-{month}.csv``) and the target table name
            (``bronze_{month}``).

    Returns:
        The status string ``"Bronze {month} complete."``.
    """
    source_csv = f"/Volumes/workspace/ecommerce/ecommerce_data/2019-{month}.csv"
    target_table = f"workspace.ecommerce.bronze_{month}"

    # The first CSV row is treated as the header; column types are inferred.
    csv_df = spark.read.csv(source_csv, header=True, inferSchema=True)

    # "overwrite" replaces whatever the table held from a previous run.
    delta_writer = csv_df.write.format("delta").mode("overwrite")
    delta_writer.saveAsTable(target_table)

    return f"Bronze {month} complete."

# --- TASK 2: SILVER ---
def run_silver(month):
    """Clean the bronze table for ``month`` and overwrite the silver Delta table.

    Rows with a null ``user_id`` are removed, then duplicate rows are dropped.

    Args:
        month: Label substituted into the source (``bronze_{month}``) and
            target (``silver_{month}``) table names.

    Returns:
        The status string ``"Silver {month} complete."``.
    """
    source_table = f"workspace.ecommerce.bronze_{month}"
    target_table = f"workspace.ecommerce.silver_{month}"

    bronze_rows = spark.read.table(source_table)

    # Two-step cleaning: require a user_id first, then de-duplicate.
    rows_with_user = bronze_rows.dropna(subset=["user_id"])
    unique_rows = rows_with_user.dropDuplicates()

    unique_rows.write.format("delta").mode("overwrite").saveAsTable(target_table)
    return f"Silver {month} complete."

# --- TASK 3: GOLD ---
def run_gold(month):
    """Aggregate the silver table for ``month`` into per-brand revenue.

    Groups the silver rows by ``brand``, sums ``price`` into a
    ``total_revenue`` column, and overwrites the gold Delta table.

    Args:
        month: Label substituted into the source (``silver_{month}``) and
            target (``gold_revenue_{month}``) table names.

    Returns:
        The status string ``"Gold {month} complete."``.
    """
    source_table = f"workspace.ecommerce.silver_{month}"
    target_table = f"workspace.ecommerce.gold_revenue_{month}"

    silver_rows = spark.read.table(source_table)

    # NOTE(review): `price` is summed over every silver row. If the data also
    # contains rows that are not completed sales, they inflate total_revenue;
    # confirm against the table schema.
    by_brand = silver_rows.groupBy("brand")
    revenue_col = F.sum("price").alias("total_revenue")
    brand_revenue = by_brand.agg(revenue_col)

    brand_revenue.write.format("delta").mode("overwrite").saveAsTable(target_table)
    return f"Gold {month} complete."

# Run the three stages in order for the selected month. Each stage reads the
# table written by the one before it, so the order matters. Print each
# stage's status message as it finishes.
for stage in (run_bronze, run_silver, run_gold):
    print(stage(selected_month))

Bronze Oct complete.
Silver Oct complete.
Gold Oct complete.
