# 🥉 Bronze Layer — Raw Data Ingestion
**Project:** End-to-End Retail Lakehouse | Microsoft Fabric

**Layer:** Bronze (Raw / Landing Zone)

**Purpose:** Ingest raw CSV data as-is into Delta Lake format with metadata columns.

```
Source CSVs → Fabric Lakehouse Files → Bronze Delta Tables
```

> 📌 **Before running:** Upload all CSVs from the `data/` folder into your Fabric Lakehouse under `Files/raw_data/`.

In [None]:
# ============================================================
# CELL 1 — Configuration
# ============================================================
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    current_timestamp, lit, input_file_name, col, to_timestamp
)
import datetime

# Your Fabric workspace and Lakehouse names — update these!
WORKSPACE_NAME = "your_workspace"
LAKEHOUSE_NAME = "RetailLakehouse"

# OneLake root of the Lakehouse. Both zones below are built from this single
# prefix so the workspace/lakehouse only have to be changed in one place.
LAKEHOUSE_ROOT = (
    f"abfss://{WORKSPACE_NAME}@onelake.dfs.fabric.microsoft.com/"
    f"{LAKEHOUSE_NAME}.Lakehouse"
)

# Source path (Files section of Lakehouse)
RAW_PATH = f"{LAKEHOUSE_ROOT}/Files/raw_data"

# Target path (Tables section — Delta Lake)
BRONZE_PATH = f"{LAKEHOUSE_ROOT}/Tables/bronze"

# Ingestion run timestamp, in UTC, formatted as "YYYY-MM-DD HH:MM:SS".
# Built from a timezone-aware "now" because datetime.utcnow() is deprecated
# since Python 3.12; the resulting string is the same.
RUN_TIMESTAMP = datetime.datetime.now(datetime.timezone.utc).strftime(
    "%Y-%m-%d %H:%M:%S"
)

print(f"‚úÖ Config loaded. Run timestamp: {RUN_TIMESTAMP}")

In [None]:
# ============================================================
# CELL 2 — Helper Function: Ingest CSV → Bronze Delta
# ============================================================
def ingest_to_bronze(source_file: str, table_name: str, schema=None):
    """
    Read one CSV from the raw Files zone and overwrite a Bronze Delta table.

    The CSV columns are kept and four metadata columns are appended:
    ``_bronze_ingested_at`` (current_timestamp()), ``_source_file``
    (input_file_name()), ``_batch_run_ts`` (the RUN_TIMESTAMP string) and
    ``_is_deleted`` (always False at ingestion time).

    Args:
        source_file: filename under RAW_PATH (the raw_data/ folder)
        table_name: target folder name under BRONZE_PATH (bronze/<name>)
        schema: optional explicit schema; when None, column types are
            inferred from the CSV

    Returns:
        Number of rows read from the CSV.
    """
    print(f"\nüì• Ingesting: {source_file} ‚Üí bronze_{table_name}")

    # Read CSV. `is not None` (rather than truthiness) so that any schema the
    # caller passes is honoured; inference is only requested when there is no
    # explicit schema to apply.
    reader = spark.read.option("header", True)
    if schema is not None:
        reader = reader.schema(schema)
    else:
        reader = reader.option("inferSchema", True)

    df = reader.csv(f"{RAW_PATH}/{source_file}")

    # Add Bronze metadata columns
    df = (
        df.withColumn("_bronze_ingested_at", current_timestamp())
        .withColumn("_source_file", input_file_name())
        .withColumn("_batch_run_ts", lit(RUN_TIMESTAMP))
        .withColumn("_is_deleted", lit(False))
    )

    # NOTE: count() is its own Spark action and `df` is not cached, so the
    # source is read here and again by the write below.
    row_count = df.count()
    print(f"   Rows read: {row_count:,}")
    print(f"   Columns:   {len(df.columns)}")

    # Write to Bronze Delta (overwrite for full load, append for incremental).
    # overwriteSchema lets a changed column set replace the existing table schema.
    target_path = f"{BRONZE_PATH}/{table_name}"
    (
        df.write.format("delta")
        .mode("overwrite")
        .option("overwriteSchema", "true")
        .save(target_path)
    )

    print(f"   ‚úÖ Written to: bronze/{table_name}")
    return row_count

print("‚úÖ Helper function defined.")

In [None]:
# ============================================================
# CELL 3 — Ingest Customers
# ============================================================
customers_count = ingest_to_bronze(
    source_file="customers.csv",
    table_name="customers",
)

# Preview: first 5 rows of the freshly written table, values not truncated
(
    spark.read
    .format("delta")
    .load(f"{BRONZE_PATH}/customers")
    .show(5, truncate=False)
)

In [None]:
# ============================================================
# CELL 4 — Ingest Products
# ============================================================
products_count = ingest_to_bronze(
    source_file="products.csv",
    table_name="products",
)

# Preview: first 5 rows of the freshly written table, values not truncated
(
    spark.read
    .format("delta")
    .load(f"{BRONZE_PATH}/products")
    .show(5, truncate=False)
)

In [None]:
# ============================================================
# CELL 5 — Ingest Stores
# ============================================================
stores_count = ingest_to_bronze(
    source_file="stores.csv",
    table_name="stores",
)

# Preview: first 5 rows of the freshly written table, values not truncated
(
    spark.read
    .format("delta")
    .load(f"{BRONZE_PATH}/stores")
    .show(5, truncate=False)
)

In [None]:
# ============================================================
# CELL 6 — Ingest Transactions (main fact table)
# ============================================================
transactions_count = ingest_to_bronze(
    source_file="transactions.csv",
    table_name="transactions",
)

# Preview: first 5 rows of the freshly written table, values not truncated
(
    spark.read
    .format("delta")
    .load(f"{BRONZE_PATH}/transactions")
    .show(5, truncate=False)
)

In [None]:
# ============================================================
# CELL 7 — Bronze Layer Summary
# ============================================================
print("=" * 50)
print("ü•â BRONZE LAYER INGESTION COMPLETE")
print("=" * 50)

# (display name, rows ingested) for every Bronze table loaded above
summary = [
    ("bronze_customers",    customers_count),
    ("bronze_products",     products_count),
    ("bronze_stores",       stores_count),
    ("bronze_transactions", transactions_count),
]

# Do not name the loop variable `count`: loop variables leak into the
# notebook's global namespace, where pyspark's `count` function is also
# imported. Rebinding it to an int here would break any later call to that
# function when cells are re-run out of order.
for bronze_table, n_rows in summary:
    print(f"  ‚úÖ {bronze_table:<30} {n_rows:>10,} rows")

total_rows = sum(rows for _, rows in summary)
print(f"\n  Total rows ingested: {total_rows:,}")
print(f"  Run timestamp: {RUN_TIMESTAMP}")

In [None]:
# ============================================================
# CELL 8 — Data Quality Check on Bronze
# ============================================================
from pyspark.sql.functions import count, isnan, when, isnull

def null_check(table_name):
    """
    Print the number of NULL values per data column of a Bronze table.

    Loads the Delta table at BRONZE_PATH/<table_name>, skips the metadata
    columns (names starting with "_") and shows a single-row result holding
    one NULL count per remaining column.

    Args:
        table_name: folder name of the table under BRONZE_PATH

    Returns:
        None; the result is printed via DataFrame.show().
    """
    # Function-scope import under an alias, so the check keeps working even if
    # a notebook-global name such as `count` has been rebound to something
    # else (e.g. used as a loop variable in another cell).
    from pyspark.sql import functions as F

    df = spark.read.format("delta").load(f"{BRONZE_PATH}/{table_name}")
    data_cols = [c for c in df.columns if not c.startswith("_")]

    # when() without otherwise() yields NULL for non-matching rows and count()
    # ignores NULLs, so each expression counts only the rows where the column
    # IS NULL.
    null_counts = df.select([
        F.count(F.when(F.isnull(F.col(c)), c)).alias(c) for c in data_cols
    ])
    print(f"\nüîç Null check ‚Äî bronze_{table_name}:")
    null_counts.show(truncate=False)

null_check("transactions")
null_check("customers")