In [0]:
# Notebook configuration: catalog and schema names.
# `catalog` and `gold_schema` are combined into a fully qualified table name
# when the fact table is loaded; `silver_schema` is not referenced by the
# cells visible in this notebook.
catalog = "main"
silver_schema = "instacart_silver"
gold_schema = "instacart_gold"

# Banner marking the start of the data quality run.
print("Running Data Quality Checks...")


In [0]:
# Check for Null Values
from pyspark.sql.functions import col, sum, when

# Read the gold fact table using the catalog/schema configured for this notebook.
fact_table_name = f"{catalog}.{gold_schema}.fact_order_items"
fact_df = spark.table(fact_table_name)

# Build one aggregate per key column: each row contributes 1 when the column
# is NULL and 0 otherwise, so the total is that column's NULL count.
# NOTE: `sum` here is pyspark.sql.functions.sum from the import above, which
# replaces Python's builtin `sum` for the rest of the notebook.
key_columns = ["order_id", "product_id", "user_id"]
null_count_exprs = []
for column_name in key_columns:
    null_flag = when(col(column_name).isNull(), 1).otherwise(0)
    null_count_exprs.append(sum(null_flag).alias(column_name + "_nulls"))

# Single-row result with columns order_id_nulls, product_id_nulls, user_id_nulls.
null_check = fact_df.select(null_count_exprs)

display(null_check)


In [0]:
# Duplicate check on the (order_id, product_id) key of the fact table.
# Step 1: count how many rows share each (order_id, product_id) pair.
pair_counts = fact_df.groupBy("order_id", "product_id").count()

# Step 2: keep only the pairs that occur more than once.
duplicate_check = pair_counts.where("count > 1")

# The printed figure is the number of duplicated key pairs (rows of
# `duplicate_check`), not the number of surplus fact rows.
print("Duplicate records:", duplicate_check.count())


In [0]:
# Row Count Consistency Check
# Table names are assembled from the notebook's `catalog` / `gold_schema`
# settings so this cell reads the same gold table as the other checks rather
# than a separately hard-coded path that could drift from the configuration.
bronze_schema = "instacart_bronze"  # the config cell defines no bronze setting, so it is named here
bronze_prior = spark.table(f"{catalog}.{bronze_schema}.order_products__prior")
gold_fact = spark.table(f"{catalog}.{gold_schema}.fact_order_items")

# Count each table once and reuse the numbers for the report and the delta,
# instead of leaving the comparison to be done by eye.
bronze_prior_count = bronze_prior.count()
gold_fact_count = gold_fact.count()

print("Bronze Prior Count:", bronze_prior_count)
print("Gold Fact Count:", gold_fact_count)
# NOTE(review): whether these counts are expected to match exactly depends on
# how the gold table is built — confirm before turning this into an assertion.
print("Row Count Difference (Gold - Bronze):", gold_fact_count - bronze_prior_count)
