In [0]:
# Databricks notebook source
# COMMAND ----------
# Name of the Bronze table profiled by this notebook.
SOURCE_TABLE = "workspace.default.bronze_events"

# COMMAND ----------
# Read the Bronze table into `df` and preview up to 20 rows of it.
df = spark.read.table(SOURCE_TABLE)
bronze_preview = df.limit(20)
display(bronze_preview)


# Dataset Profiling – Bronze E-commerce Events

This notebook performs exploratory data profiling on the Bronze layer
to understand data quality, schema consistency, null patterns, and
value distributions before designing the Silver layer.


In [0]:
# Bronze table profiled by this notebook (repeats the value assigned in the first cell).
SOURCE_TABLE = "workspace.default.bronze_events"


## Profiling Objectives

We analyze the following aspects:
- Row count and schema
- Event type distribution
- Null value patterns
- Cardinality of key fields
- Timestamp coverage


In [0]:
# Basic shape of the Bronze table: row count, column count and column names.
row_total = df.count()
column_names = df.columns
print(f"Total rows: {row_total}")
print(f"Total columns: {len(column_names)}")
print(f"Columns: {column_names}")


In [0]:
# Number of rows per event_type, most frequent first.
event_type_counts = df.groupBy("event_type").count()
display(event_type_counts.sort("count", ascending=False))


In [0]:
# Earliest and latest event_time values present in the table.
time_bound_exprs = [
    "min(event_time) as min_time",
    "max(event_time) as max_time",
]
display(df.selectExpr(*time_bound_exprs))


In [0]:
from pyspark.sql import functions as F


In [0]:
# Fraction of NULL values per column, shown as a single-row result.
#
# df.count() is a Spark action; calling it inside the comprehension would
# trigger one extra count job per column, so it is evaluated once up front.
total_rows = df.count()

display(
    df.select([
        # F.when(...) yields a non-null value only where the column is NULL,
        # and F.count counts non-null values, so this counts the NULL rows.
        (F.count(F.when(F.col(c).isNull(), c)) / total_rows).alias(c)
        for c in df.columns
    ])
)


In [0]:
# Summary statistics of `price`, restricted to rows whose event_type is "purchase".
purchase_prices = df.filter(df["event_type"] == "purchase").select("price")
display(purchase_prices.summary())
