In [0]:
# Auto-reload setup
# Load IPython's autoreload extension and select mode 2, which re-imports
# modules before each cell executes so that edits to the imported project
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [0]:
# Cell 1: Setup

import sys
repo_path = "/Workspace/Repos/vchhatbar11@outlook.com/ecom-project/src"
if repo_path not in sys.path:
    sys.path.append(repo_path)

from config.storage_config import *
from config.catalog_config import *
from config.spark_config import configure_spark
from utils.logger import log_info, log_metric

from pyspark.sql.types import StructType, StructField, IntegerType
from pyspark.sql.functions import col

# Hand the notebook's Spark session to the project helper that applies the
# session-level settings (the settings themselves live in config.spark_config).
configure_spark(spark)

# Announce the write target and the read source for this run.
for startup_message in (
    f"Target: {CATALOG_NAME}.{BRONZE_SCHEMA}",
    f"Source: {BRONZE_ORDER_ITEMS}",
):
    log_info(startup_message)

In [0]:
# Cell 2: Check data exists
#
# Lists the bronze source location and reports how many CSV files it holds
# and their combined size. This cell only logs; it does not stop the run
# when nothing is found.

log_info("Checking bronze storage...")

files = dbutils.fs.ls(BRONZE_ORDER_ITEMS)

# Sizes of the entries that count as CSV input: the name must end in ".csv"
# and must not start with "._" (presumably macOS sidecar files - TODO confirm).
csv_sizes = [
    entry.size
    for entry in files
    if entry.name.endswith('.csv') and not entry.name.startswith('._')
]

csv_count = len(csv_sizes)
total_bytes = sum(csv_sizes)

# Bytes -> GiB (1024**3 bytes per unit).
total_gb = total_bytes / (1024**3)

log_info(f"Found {csv_count} CSV files")
log_info(f"Total size: {total_gb:.2f} GB")

In [0]:
# Explicit two-column schema, supplied up front so Spark does not have to
# infer column types from the files.
schema = (
    StructType()
    .add(StructField("order_id", IntegerType(), nullable=True))
    .add(StructField("product_id", IntegerType(), nullable=True))
)

log_info("Reading CSV files with explicit schema...")

# The first line of each file is treated as a header row.
csv_reader = spark.read.schema(schema).option("header", "true")
df = csv_reader.csv(BRONZE_ORDER_ITEMS)

# Mark the frame for caching: it is counted and previewed here and written
# to the table in a later cell.
df.cache()

# count() is the first action on the frame, so it also populates the cache.
row_count = df.count()
log_metric("Rows loaded", f"{row_count:,}")
log_info("Data cached in memory")

# Eyeball the first few rows without truncating values.
df.show(5, truncate=False)

In [0]:
# Prepare data with partition column
#
# Derives an integer bucket from order_id, (re)creates the Iceberg table
# partitioned by that bucket, verifies the written row count against the
# count taken at load time, and releases the cached source frame.
log_info("Adding partition column...")

# Width of one partition bucket in order_id units: ids 0..9,999,999 fall in
# bucket 0, the next ten million in bucket 1, and so on.
ORDER_ID_BUCKET_SIZE = 10_000_000

df_partitioned = df.withColumn(
    "order_id_bucket",
    # Division produces a double; the int cast truncates it to the bucket number.
    (col("order_id") / ORDER_ID_BUCKET_SIZE).cast("int")
)

# Set catalog context so the unqualified table name used below resolves
# inside CATALOG_NAME.BRONZE_SCHEMA.
spark.sql(f"USE CATALOG {CATALOG_NAME}")
spark.sql(f"USE SCHEMA {BRONZE_SCHEMA}")

# Drop existing table (if re-running). Only the last dotted component of the
# configured table identifier is used, relying on the USE statements above.
table_name = BRONZE_ORDER_ITEMS_TABLE.split('.')[-1]
spark.sql(f"DROP TABLE IF EXISTS {table_name}")
log_info(f"Dropped existing table: {table_name}")

# Create Iceberg table
log_info("Creating Iceberg table ...")

(
    df_partitioned.writeTo(table_name)
    .using("iceberg")
    # partitionedBy is documented as taking Column objects, so pass one
    # explicitly instead of relying on a bare string being coerced.
    .partitionedBy(col("order_id_bucket"))
    .create()
)

log_info(f"Iceberg table created: {BRONZE_ORDER_ITEMS_TABLE}")

# Verify row count
final_count = spark.sql(f"SELECT COUNT(*) as cnt FROM {table_name}").collect()[0]['cnt']
log_metric("Table row count", f"{final_count:,}")

# The write has completed and the following cells query the table rather than
# the source frame, so release the cache here. Done before the assertion so
# the cache is freed even when the counts disagree; a failed write above
# leaves the cache in place for a retry.
df.unpersist()

assert final_count == row_count, (
    f"Row count mismatch! loaded={row_count:,}, table={final_count:,}"
)

In [0]:
# Quick validation queries
#
# Three read-only checks against the freshly written table: a small sample,
# per-column null counts, and summary statistics. Results are only displayed;
# nothing here fails the run.
log_info("Running validation checks...")

sample_query = f"SELECT * FROM {BRONZE_ORDER_ITEMS_TABLE} LIMIT 5"
null_count_query = f"""
    SELECT 
        SUM(CASE WHEN order_id IS NULL THEN 1 ELSE 0 END) as null_order_ids,
        SUM(CASE WHEN product_id IS NULL THEN 1 ELSE 0 END) as null_product_ids
    FROM {BRONZE_ORDER_ITEMS_TABLE}
"""
all_rows_query = f"SELECT * FROM {BRONZE_ORDER_ITEMS_TABLE}"

# Check 1: Sample data
log_info("Sample records:")
sample_rows = spark.sql(sample_query)
sample_rows.show(truncate=False)

# Check 2: Null counts
log_info("Null value check:")
null_counts = spark.sql(null_count_query)
null_counts.show()

# Check 3: Basic stats (describe() summarises every column of the table)
log_info("Basic statistics:")
table_stats = spark.sql(all_rows_query).describe()
table_stats.show()

log_info("Validation complete")

In [0]:
# Show table details
#
# Runs two metadata statements against the table and prints each result in
# full, preceded by a log heading.
# NOTE(review): DESCRIBE DETAIL is documented for Delta tables - confirm it is
# supported for this Iceberg table on the target runtime.
for heading, statement in (
    ("Table metadata:", f"DESCRIBE DETAIL {BRONZE_ORDER_ITEMS_TABLE}"),
    ("Table schema:", f"DESCRIBE {BRONZE_ORDER_ITEMS_TABLE}"),
):
    log_info(heading)
    spark.sql(statement).show(truncate=False)