In [0]:
# Databricks notebook source
# ============================================
# Project: Instacart Lakehouse AI Project
# Layer: Bronze
# Description: Convert raw CSV files to Bronze Delta Tables
# Author: Shweta Pawar
# ============================================

# COMMAND ----------

# Volume locations used by this notebook: where the raw CSV files live and
# where the Bronze Delta output is written.
raw_path = "/Volumes/main/default/instacart_vol/instacart/raw"
bronze_path = "/Volumes/main/default/instacart_vol/instacart/bronze"

# Create the Bronze directory up front so the writes below have a target.
dbutils.fs.mkdirs(bronze_path)
print(f"Bronze path ready: {bronze_path}")

# COMMAND ----------

# Raw CSV extracts to convert; each one is written to its own Delta directory.
files = [
    "orders.csv",
    "products.csv",
    "departments.csv",
    "aisles.csv",
    "order_products__prior.csv",
    "order_products__train.csv",
]

for file in files:
    # Drop the ".csv" extension so the output directory is named after the dataset.
    table_name = file.replace(".csv", "")
    source_csv = f"{raw_path}/{file}"
    target_dir = f"{bronze_path}/{table_name}"

    # The first line of each file is read as the header; no schema is supplied.
    df = spark.read.option("header", True).csv(source_csv)

    # Overwrite mode replaces whatever is already stored at the target path.
    delta_writer = df.write.format("delta").mode("overwrite")
    delta_writer.save(target_dir)

    print(f"Bronze table created: {table_name}")


### Validate Bronze Layer

In [0]:
# Validate the Bronze layer: every expected Delta table must load and hold data.
bronze_path = "/Volumes/main/default/instacart_vol/instacart/bronze"

tables = [
    "orders",
    "products",
    "departments",
    "aisles",
    "order_products__prior",
    "order_products__train"
]

# Row count per table, collected so all counts are reported before any failure.
row_counts = {}

for table in tables:
    df = spark.read.format("delta").load(f"{bronze_path}/{table}")
    row_counts[table] = df.count()
    print(f"{table} row count:", row_counts[table])

# Printing counts alone lets an empty load pass unnoticed; treat a zero-row
# table as a failed ingestion rather than a successful validation.
empty_tables = [name for name, row_count in row_counts.items() if row_count == 0]
if empty_tables:
    raise ValueError(f"Bronze validation failed; empty tables: {empty_tables}")


In [0]:
# Make sure the target schema exists before any table is registered in it.
spark.sql("CREATE SCHEMA IF NOT EXISTS main.instacart_bronze")

raw_path = "/Volumes/main/default/instacart_vol/instacart/raw"

# Raw CSV extracts to register as tables in main.instacart_bronze.
files = [
    "orders.csv",
    "products.csv",
    "departments.csv",
    "aisles.csv",
    "order_products__prior.csv",
    "order_products__train.csv",
]

for file in files:
    # The table is named after the file, minus its ".csv" extension.
    table_name = file.replace(".csv", "")
    qualified_name = f"main.instacart_bronze.{table_name}"

    # The first line of each file is read as the header; no schema is supplied.
    df = spark.read.option("header", True).csv(f"{raw_path}/{file}")

    # Overwrite mode replaces any existing table of the same name.
    table_writer = df.write.format("delta").mode("overwrite")
    table_writer.saveAsTable(qualified_name)

    print(f"Managed Bronze table created: {qualified_name}")


In [0]:
# Spot-check row counts for two of the registered Bronze tables.
count_queries = (
    "SELECT COUNT(*) FROM main.instacart_bronze.orders",
    "SELECT COUNT(*) FROM main.instacart_bronze.order_products__prior",
)

for count_query in count_queries:
    spark.sql(count_query).show()
