In [0]:
# Expose an "env" notebook widget (default "dev") and read its current value.
dbutils.widgets.text("env", "dev")
env = dbutils.widgets.get("env")
# The catalog name is derived from the selected environment, e.g. "supply_dev".
catalog = f"supply_{env}"

In [0]:
# Read the raw supply-chain table from the bronze schema of the selected catalog.
df_bronze = spark.table(f"{catalog}.bronze.makeup_supply_chain_raw")
# Preview only the first 10 rows rather than rendering the whole table.
display(df_bronze.limit(10))

In [0]:
# (row count, column count) of the bronze table.
(df_bronze.count(), len(df_bronze.columns))

## Product & Sales Information
| Field                        | Description                                     |
| ---------------------------- | ----------------------------------------------- |
| **Product Type**             | Category or type of product in the supply chain |
| **SKU (Stock Keeping Unit)** | Unique identifier for each product              |
| **Price**                    | Selling price of the product                    |
| **Number of Products Sold**  | Quantity of units sold in a given period        |
| **Revenue Generated**        | Revenue from product sales                      |

## Customer Information
| Field                     | Description                                            |
| ------------------------- | ------------------------------------------------------ |
| **Customer Demographics** | Customer characteristics (age, gender, location, etc.) |

## Inventory & Stock
| Field            | Description                 |
| ---------------- | --------------------------- |
| **Availability** | Product availability status |
| **Stock Levels** | Quantity currently in stock |

## Orders & Shipping
| Field                    | Description                                 |
| ------------------------ | ------------------------------------------- |
| **Order Quantities**     | Number of units in each order               |
| **Shipping Times**       | Time taken to deliver products              |
| **Shipping Carriers**    | Carrier or service responsible for shipment |
| **Shipping Costs**       | Cost associated with shipping               |
| **Transportation Modes** | Mode of transport (air, sea, land)          |
| **Routes**               | Shipping paths used for delivery            |

## Suppliers & Manufacturing
| Field                       | Description                                   |
| --------------------------- | --------------------------------------------- |
| **Supplier Name**           | Vendor providing the product/material         |
| **Location**                | Warehouse, supplier, or distribution location |
| **Lead Time**               | Time required to receive goods from supplier  |
| **Production Volumes**      | Units produced in a given period              |
| **Manufacturing Lead Time** | Time required to manufacture a product        |
| **Manufacturing Costs**     | Costs associated with production              |

## Quality & Inspection
| Field                  | Description                               |
| ---------------------- | ----------------------------------------- |
| **Inspection Results** | Outcome of quality checks                 |
| **Defect Rates**       | Percentage or count of defective products |

## General Cost Information
| Field     | Description                                      |
| --------- | ------------------------------------------------ |
| **Costs** | Operational costs across supply chain activities |


In [0]:
from utils.dq_profiling import run_all_profiling_checks
from utils.dq_reporting import profiling_report_to_df
from utils.config_loader import load_config

In [0]:
# Resolve this notebook's path from the Databricks notebook context; it is
# passed to load_config as its third argument and reused to locate the repo root.
nb_path = (
    dbutils.notebook.entry_point.getDbutils()
    .notebook()
    .getContext()
    .notebookPath()
    .get()
)
# Only the DQ config is needed for the profiling step. The silver config is
# loaded further down, in the structural-cleaning section, right before it is
# used, so it is not loaded a second time here.
dq_config = load_config('transactions', 'dq', nb_path)


In [0]:
from pathlib import Path

# Build a filesystem path for the notebook by prepending "/Workspace", then step
# up two levels (parents[1] is the parent of the notebook's own directory).
# NOTE(review): assumes notebookPath() comes back without the "/Workspace"
# prefix and that the notebook sits one directory below the repo root — confirm.
repo_root = Path("/Workspace" + nb_path).parents[1]
rules_path = repo_root / "configs" / "data_quality" / "transactions_dq.yaml"

# Fail fast with an explicit message if the rules file is missing.
# NOTE(review): rules_path is only checked for existence here; the visible
# cells pass dq_config (from load_config) to the profiler, not this path.
if not rules_path.exists():
    raise FileNotFoundError(f"Data quality rules file not found at {rules_path}")

print(f"Using data quality rules from: {rules_path}")

In [0]:
# Run the profiling checks against the bronze data with the DQ config. The
# result is iterated with .items() by the reporting cell that follows.
bronze_report = run_all_profiling_checks(df_bronze, dq_config)


In [0]:

# Dump every section of the profiling report to the cell output.
print("\n===== DATA QUALITY REPORT =====\n")

for section, result in bronze_report.items():
    print(f"\n--- {section.upper()} ---")

    # Anything exposing .show() is treated as a Spark DataFrame.
    if hasattr(result, "show"):
        result.show(truncate=False)
        continue

    # Non-dict values are printed as-is.
    if not isinstance(result, dict):
        print(result)
        continue

    # Metrics dicts (e.g. duplicates) are printed one "name: value" line per entry.
    for metric, metric_value in result.items():
        print(f"{metric}: {metric_value}")

In [0]:
# Convert the profiling report into a DataFrame and render it.
# NOTE(review): "transactions" presumably names the dataset the metrics belong to — confirm
# against profiling_report_to_df.
metrics_df = profiling_report_to_df(spark, "transactions", bronze_report)
display(metrics_df)

In [0]:
# Persist the profiling metrics as a Delta table; "overwrite" replaces whatever
# the table held before.
# NOTE(review): the table name ends in "_dev" even though the catalog already
# varies with the env widget — confirm the suffix is intended for non-dev runs.
(
    metrics_df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable(f"{catalog}.bronze.data_quality_metrics_dev")
)

Now that the data quality checks are done, we apply the silver structural cleaning to the bronze data.

In [0]:
from utils.silver_structural import apply_structural_cleaning

In [0]:
# Load the 'transactions' / 'silver' config right before the structural-cleaning step uses it.
silver_config = load_config('transactions', 'silver', nb_path)

In [0]:
# Echo the loaded silver config so it can be inspected in the cell output.
silver_config

In [0]:
# Apply the structural cleaning to the bronze data using the silver config.
# The helper returns two values, unpacked here as silver_df and metadata_df
# (what metadata_df contains is defined by apply_structural_cleaning, not visible here).
silver_df, metadata_df = apply_structural_cleaning(df_bronze, silver_config)

In [0]:
# Bare expression: shows the notebook's default representation of silver_df.
silver_df

In [0]:
# Bare expression: shows the notebook's default representation of metadata_df.
metadata_df

In [0]:
# Show the schema of the untouched bronze DataFrame for reference.
df_bronze.schema