In [0]:
# Auto-reload setup: load IPython's autoreload extension and select mode 2,
# which reloads imported modules before each cell executes, so edits to the
# project's .py files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [0]:
import sys
# Location of the project's source tree; added to sys.path so the project
# modules imported below (config, utils.logger) can be resolved.
repo_path = "/Workspace/Users/vchhatbar11@outlook.com/ecom-project/src"
# Guard the append: notebook cells get re-run, and an unconditional append
# would add one more duplicate sys.path entry on every execution.
if repo_path not in sys.path:
    sys.path.append(repo_path)

from config import *
from utils.logger import log_info, log_metric

# Cell 2: Analyze partition skew
log_info("=" * 60)
log_info("DATA QUALITY ANALYSIS: Partition Skew")
log_info("=" * 60)

# Distribution analysis: row counts per range of one million order_ids.
# pct_of_total is computed against the table's actual row count (a window SUM
# over the grouped counts) rather than a hard-coded 800,000,000, so the
# percentages remain correct whatever the table's current size is.
spark.sql(f"""
    SELECT 
        FLOOR(order_id / 1000000) as order_id_millions,
        COUNT(*) as row_count,
        ROUND(COUNT(*) * 100.0 / SUM(COUNT(*)) OVER (), 2) as pct_of_total
    FROM {BRONZE_ORDER_ITEMS_TABLE}
    GROUP BY order_id_millions
    ORDER BY order_id_millions
""").show(100, truncate=False)

# Cell 3: Skew metrics
# Row count per order_id_bucket, largest bucket first.
skew_stats = spark.sql(f"""
    SELECT 
        order_id_bucket,
        COUNT(*) as rows,
        ROUND(COUNT(*) / 1000000.0, 2) as rows_millions
    FROM {BRONZE_ORDER_ITEMS_TABLE}
    GROUP BY order_id_bucket
    ORDER BY rows DESC
""")

skew_stats.show(20, truncate=False)

# Row count the skew factor is measured against.
# NOTE(review): presumably the expected/target rows per partition — confirm.
BASELINE_ROWS_PER_PARTITION = 10_000_000

# The result is ordered by rows DESC, so its first row is the largest
# partition. first() fetches only that row and returns None for an empty
# result, whereas collect()[0] pulled every bucket to the driver and raised
# IndexError when the table had no rows.
max_partition = skew_stats.first()
if max_partition is None:
    log_info("No rows found; skipping skew metrics")
else:
    log_metric("Largest partition rows", f"{max_partition['rows']:,}")
    log_metric(
        "Skew factor",
        f"{max_partition['rows'] / BASELINE_ROWS_PER_PARTITION:.1f}x",
    )