In [0]:
from pyspark.sql.functions import col, expr, lit

# Select the catalog first, then the schema, so every unqualified table
# name used below resolves inside workspace.instacart.
for _use_stmt in ("USE CATALOG workspace", "USE SCHEMA instacart"):
    spark.sql(_use_stmt)

# ==========================================
# 1. CREATE SILVER_ORDERS
# ==========================================
print("Processing silver_orders...")
df_orders = spark.read.table("bronze_orders")

# Nulls in 'days_since_prior_order' are replaced with 0 before the table
# is persisted; no other column is touched.
df_orders_clean = df_orders.na.fill(0, subset=["days_since_prior_order"])

# Replace any existing silver_orders table with the cleaned data (Delta).
(
    df_orders_clean.write
    .saveAsTable("silver_orders", format="delta", mode="overwrite")
)
print("Created silver_orders")

# ==========================================
# 2. CREATE SILVER_PRODUCTS (Fixed with expr)
# ==========================================
print("Processing silver_products...")
df_products = spark.read.table("bronze_products")
df_aisles = spark.read.table("bronze_aisles")
df_departments = spark.read.table("bronze_departments")

# try_cast is a SQL function, so it is reached through expr(). It yields
# NULL for values that cannot be parsed as an int (e.g. " Blunted") instead
# of failing the job. Rows where either key could not be cast are dropped.
df_products_clean = (
    df_products
    .withColumn("aisle_id", expr("try_cast(aisle_id as int)"))
    .withColumn("department_id", expr("try_cast(department_id as int)"))
    .filter(col("aisle_id").isNotNull() & col("department_id").isNotNull())
)

# Left joins keep every cleaned product even when a lookup table has no
# matching row; 'aisle' / 'department' are then NULL for that product.
df_products_joined = (
    df_products_clean
    .join(df_aisles, "aisle_id", "left")
    .join(df_departments, "department_id", "left")
    .select(
        "product_id",
        "product_name",
        "aisle",
        "department",
        "aisle_id",
        "department_id",
    )
)

df_products_joined.write.format("delta").mode("overwrite").saveAsTable("silver_products")

# Count the persisted table rather than df_products_joined: DataFrames are
# lazy, so counting the DataFrame would re-run the reads, casts, filter and
# both joins a second time just to produce the log line.
silver_products_rows = spark.read.table("silver_products").count()
print(f"Created silver_products. Rows: {silver_products_rows}")

# ==========================================
# 3. CREATE SILVER_ORDER_ITEMS
# ==========================================
print("Processing silver_order_items...")
df_prior = spark.read.table("bronze_order_products__prior")
df_train = spark.read.table("bronze_order_products__train")

# Stack the prior and train order lines into one table.
# union() pairs columns purely by position, so a different column order in
# the two bronze tables would silently mix values between columns.
# unionByName() pairs them by column name and raises if the names do not
# line up, turning a silent corruption into a visible failure.
df_order_items = df_prior.unionByName(df_train)

df_order_items.write.format("delta").mode("overwrite").saveAsTable("silver_order_items")
print("Created silver_order_items")

Processing silver_orders...
Created silver_orders
Processing silver_products...
Created silver_products. Rows: 49687
Processing silver_order_items...
Created silver_order_items
