In [0]:
from pyspark.sql.functions import col, count, when

# Bronze-layer tables to profile, in the order they are reported below.
table_names = [
    "bronze_aisles",
    "bronze_departments",
    "bronze_orders",
    "bronze_products",
    "bronze_order_products__prior",
    "bronze_order_products__train",
]

# Select the catalog and schema first; the tables are then read by bare name.
for statement in ("USE CATALOG workspace", "USE SCHEMA instacart"):
    spark.sql(statement)


def _null_count_columns(frame):
    """Build one aggregate per column of *frame* that counts its NULL entries.

    ``when`` without an ``otherwise`` yields NULL for rows that do not match,
    and ``count`` skips NULLs, so each aggregate tallies only the rows where
    the column itself is NULL. Every aggregate is aliased to its column name.
    """
    return [
        count(when(col(name).isNull(), name)).alias(name)
        for name in frame.columns
    ]


# Print a small profile for every table: size, NULLs, schema and a preview.
for table in table_names:
    print(f"--- Analyzing: {table} ---")
    df = spark.read.table(table)

    # 1. Shape: total number of rows
    row_count = df.count()
    print(f"Total Rows: {row_count}")

    # 2. NULLs per column, rendered as a single-row result
    print("Null Value Counts:")
    null_counts = df.select(_null_count_columns(df))
    null_counts.show()

    # 3. Column names, types and nullability
    print("Schema:")
    df.printSchema()

    # 4. First five rows, rendered by the notebook's display helper
    print(f"Sample rows from {table}:")
    preview = df.limit(5)
    display(preview)

    print("\n")

--- Analyzing: bronze_aisles ---
Total Rows: 134
Null Value Counts:
+--------+-----+-------------------+
|aisle_id|aisle|ingestion_timestamp|
+--------+-----+-------------------+
|       0|    0|                  0|
+--------+-----+-------------------+

Schema:
root
 |-- aisle_id: integer (nullable = true)
 |-- aisle: string (nullable = true)
 |-- ingestion_timestamp: timestamp (nullable = true)

Sample rows from bronze_aisles:


aisle_id,aisle,ingestion_timestamp
1,prepared soups salads,2026-02-11T14:46:07.382Z
2,specialty cheeses,2026-02-11T14:46:07.382Z
3,energy granola bars,2026-02-11T14:46:07.382Z
4,instant foods,2026-02-11T14:46:07.382Z
5,marinades meat preparation,2026-02-11T14:46:07.382Z




--- Analyzing: bronze_departments ---
Total Rows: 21
Null Value Counts:
+-------------+----------+-------------------+
|department_id|department|ingestion_timestamp|
+-------------+----------+-------------------+
|            0|         0|                  0|
+-------------+----------+-------------------+

Schema:
root
 |-- department_id: integer (nullable = true)
 |-- department: string (nullable = true)
 |-- ingestion_timestamp: timestamp (nullable = true)

Sample rows from bronze_departments:


department_id,department,ingestion_timestamp
1,frozen,2026-02-11T14:46:13.962Z
2,other,2026-02-11T14:46:13.962Z
3,bakery,2026-02-11T14:46:13.962Z
4,produce,2026-02-11T14:46:13.962Z
5,alcohol,2026-02-11T14:46:13.962Z




--- Analyzing: bronze_orders ---
Total Rows: 3421083
Null Value Counts:
+--------+-------+--------+------------+---------+-----------------+----------------------+-------------------+
|order_id|user_id|eval_set|order_number|order_dow|order_hour_of_day|days_since_prior_order|ingestion_timestamp|
+--------+-------+--------+------------+---------+-----------------+----------------------+-------------------+
|       0|      0|       0|           0|        0|                0|                206209|                  0|
+--------+-------+--------+------------+---------+-----------------+----------------------+-------------------+

Schema:
root
 |-- order_id: integer (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- eval_set: string (nullable = true)
 |-- order_number: integer (nullable = true)
 |-- order_dow: integer (nullable = true)
 |-- order_hour_of_day: integer (nullable = true)
 |-- days_since_prior_order: double (nullable = true)
 |-- ingestion_timestamp: timestamp (nullable = true)

Sample rows from bronze_orders:


order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,ingestion_timestamp
2539329,1,prior,1,2,8,,2026-02-11T14:46:20.567Z
2398795,1,prior,2,3,7,15.0,2026-02-11T14:46:20.567Z
473747,1,prior,3,3,12,21.0,2026-02-11T14:46:20.567Z
2254736,1,prior,4,4,7,29.0,2026-02-11T14:46:20.567Z
431534,1,prior,5,4,15,28.0,2026-02-11T14:46:20.567Z




--- Analyzing: bronze_products ---
Total Rows: 49688
Null Value Counts:
+----------+------------+--------+-------------+-------------------+
|product_id|product_name|aisle_id|department_id|ingestion_timestamp|
+----------+------------+--------+-------------+-------------------+
|         0|           0|       0|            0|                  0|
+----------+------------+--------+-------------+-------------------+

Schema:
root
 |-- product_id: integer (nullable = true)
 |-- product_name: string (nullable = true)
 |-- aisle_id: string (nullable = true)
 |-- department_id: string (nullable = true)
 |-- ingestion_timestamp: timestamp (nullable = true)

Sample rows from bronze_products:


product_id,product_name,aisle_id,department_id,ingestion_timestamp
1,Chocolate Sandwich Cookies,61,19,2026-02-11T14:46:28.268Z
2,All-Seasons Salt,104,13,2026-02-11T14:46:28.268Z
3,Robust Golden Unsweetened Oolong Tea,94,7,2026-02-11T14:46:28.268Z
4,Smart Ones Classic Favorites Mini Rigatoni With Vodka Cream Sauce,38,1,2026-02-11T14:46:28.268Z
5,Green Chile Anytime Sauce,5,13,2026-02-11T14:46:28.268Z




--- Analyzing: bronze_order_products__prior ---
Total Rows: 32434489
Null Value Counts:
+--------+----------+-----------------+---------+-------------------+
|order_id|product_id|add_to_cart_order|reordered|ingestion_timestamp|
+--------+----------+-----------------+---------+-------------------+
|       0|         0|                0|        0|                  0|
+--------+----------+-----------------+---------+-------------------+

Schema:
root
 |-- order_id: integer (nullable = true)
 |-- product_id: integer (nullable = true)
 |-- add_to_cart_order: integer (nullable = true)
 |-- reordered: integer (nullable = true)
 |-- ingestion_timestamp: timestamp (nullable = true)

Sample rows from bronze_order_products__prior:


order_id,product_id,add_to_cart_order,reordered,ingestion_timestamp
1322049,7736,3,0,2026-02-11T14:46:40.786Z
1322049,27681,4,0,2026-02-11T14:46:40.786Z
1322050,30764,1,0,2026-02-11T14:46:40.786Z
1322050,20738,2,0,2026-02-11T14:46:40.786Z
1322050,38775,3,0,2026-02-11T14:46:40.786Z




--- Analyzing: bronze_order_products__train ---
Total Rows: 1384617
Null Value Counts:
+--------+----------+-----------------+---------+-------------------+
|order_id|product_id|add_to_cart_order|reordered|ingestion_timestamp|
+--------+----------+-----------------+---------+-------------------+
|       0|         0|                0|        0|                  0|
+--------+----------+-----------------+---------+-------------------+

Schema:
root
 |-- order_id: integer (nullable = true)
 |-- product_id: integer (nullable = true)
 |-- add_to_cart_order: integer (nullable = true)
 |-- reordered: integer (nullable = true)
 |-- ingestion_timestamp: timestamp (nullable = true)

Sample rows from bronze_order_products__train:


order_id,product_id,add_to_cart_order,reordered,ingestion_timestamp
1,49302,1,1,2026-02-11T14:46:50.509Z
1,11109,2,1,2026-02-11T14:46:50.509Z
1,10246,3,0,2026-02-11T14:46:50.509Z
1,49683,4,0,2026-02-11T14:46:50.509Z
1,43633,5,1,2026-02-11T14:46:50.509Z




