In [0]:
import logging

# Dedicated notebook logger: INFO and above, isolated from the root logger so
# records are emitted only by the handler attached below.
logger = logging.getLogger("runtime_logger")
logger.propagate = False
logger.setLevel(logging.INFO)

# Re-running this cell must not stack handlers (each extra handler would
# duplicate every log line), so detach whatever a previous run attached.
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)

formatter = logging.Formatter('%(asctime)s | %(levelname)s | %(message)s')

# Console-only output; StreamHandler() with no argument writes to stderr.
console_handler = logging.StreamHandler()
console_handler.setFormatter(formatter)
logger.addHandler(console_handler)

logger.info("Logger initialized (console only)")

2026-01-15 01:20:58,055 | INFO | Logger initialized (console only)


Part A was already completed in 2.4.
Part B – Set Up Logging (≈15 min): implemented in the cell above.

In [0]:
import os
import random
import numpy as np

# Single seed shared by every random number generator seeded in this cell.
SEED = 0

# NOTE: PYTHONHASHSEED is read once, when the interpreter starts. Assigning it
# here does NOT change str/bytes hashing in the already-running kernel; it is
# only inherited by child processes started after this point. To make hashing
# deterministic in this kernel, set the variable in the cluster/kernel
# environment before Python launches.
os.environ["PYTHONHASHSEED"] = str(SEED)

# Seed the stdlib RNG and NumPy's legacy global RNG.
random.seed(SEED)
np.random.seed(SEED)

print("Random seeds fixed")

Random seeds fixed


Part C – Reproducibility Setup (≈15 min)

In [0]:
%pip freeze

annotated-types==0.7.0
anyio==4.6.2
argon2-cffi==21.3.0
argon2-cffi-bindings==21.2.0
arrow==1.3.0
asttokens==2.0.5
astunparse==1.6.3
async-lru==2.0.4
attrs==24.3.0
azure-common==1.1.28
azure-core==1.34.0
azure-identity==1.20.0
azure-mgmt-core==1.5.0
azure-mgmt-web==8.0.0
azure-storage-blob==12.23.0
azure-storage-file-datalake==12.17.0
babel==2.16.0
beautifulsoup4==4.12.3
black==24.10.0
bleach==6.2.0
blinker==1.7.0
boto3==1.36.2
botocore==1.36.3
cachetools==5.5.1
certifi==2025.1.31
cffi==1.17.1
chardet==4.0.0
charset-normalizer==3.3.2
click==8.1.7
cloudpickle==3.0.0
comm==0.2.1
contourpy==1.3.1
cryptography==43.0.3
cycler==0.11.0
Cython==3.0.12
databricks-connect==17.2.4
databricks-sdk==0.49.0
dbus-python==1.3.2
debugpy==1.8.11
decorator==5.1.1
defusedxml==0.7.1
Deprecated==1.2.13
distlib==0.3.9
distro==1.9.0
distro-info==1.7+build1
docstring-to-markdown==0.11
executing==0.8.3
facets-overview==1.1.1
fastapi==0.115.12
fastjsonschema==2.21.1
filelock==3.18.0
fonttools==4.55.3
fqdn==1.5.1


In [0]:
import hashlib
import json
from pathlib import Path

def sha256_file(path, chunk_size=8192):
    """Return the hexadecimal SHA-256 digest of the file at ``path``.

    The file is opened in binary mode and fed to the hash ``chunk_size`` bytes
    at a time, so the whole file is never held in memory at once.

    Args:
        path: Location of the file to hash (anything ``open`` accepts).
        chunk_size: Number of bytes requested per read.

    Returns:
        The digest as a lowercase hex string.
    """
    digest = hashlib.sha256()
    with open(path, "rb") as stream:
        while True:
            block = stream.read(chunk_size)
            if not block:  # empty bytes means end of file
                break
            digest.update(block)
    return digest.hexdigest()


In [0]:
%sql
SHOW CATALOGS;

catalog
bronze
samples
system
workspace


In [0]:
%sql
SHOW SCHEMAS IN bronze;

databaseName
information_schema


In [0]:
import json
import os
import tempfile

import mlflow
from pyspark.sql.functions import (
    col,
    collect_list,
    concat_ws,
    sha2,
    sort_array,
)

# Tables whose content fingerprints are recorded for data lineage.
tables = [
    "bronze.device_messages_raw",
    "bronze.rapid_step_tests_raw",
]

# Maps table name -> SHA-256 fingerprint of the table's full contents.
hashes = {}

for table in tables:
    df = spark.table(table)

    # Row-level hash: SHA-256 over all columns cast to string and joined
    # with "||".
    # NOTE(review): concat_ws skips NULL values, so rows that differ only in
    # *which* column is NULL (or that contain the "||" delimiter in a value)
    # can produce the same row hash. Left unchanged so that new hashes stay
    # comparable with the ones already logged; consider coalescing NULLs to a
    # sentinel if stricter fingerprints are needed.
    row_hashes = df.select(
        sha2(
            concat_ws("||", *[col(c).cast("string") for c in df.columns]),
            256
        ).alias("row_hash")
    )

    # Dataset-level hash: SHA-256 over the concatenation of all row hashes in
    # ascending order. The ordering is done with sort_array *inside* the
    # aggregate because collect_list gives no ordering guarantee after a
    # shuffle, so a preceding orderBy() is not enough to make the result
    # deterministic.
    # NOTE: all row hashes are gathered into a single array on one executor;
    # this is fine for small tables but will not scale to very large ones.
    dataset_hash = (
        row_hashes
        .agg(
            sha2(concat_ws("", sort_array(collect_list("row_hash"))), 256)
            .alias("dataset_hash")
        )
        .collect()[0]["dataset_hash"]
    )

    hashes[table] = dataset_hash

# Log hashes to MLflow (no filesystem permissions needed). The JSON is written
# into a throwaway directory under a stable file name so the artifact is
# findable in MLflow and no temporary file is left behind on the driver.
with tempfile.TemporaryDirectory() as tmp_dir:
    tmp_path = os.path.join(tmp_dir, "dataset_hashes.json")
    with open(tmp_path, "w") as f:
        json.dump(hashes, f, indent=2)
    mlflow.log_artifact(tmp_path, artifact_path="data_lineage")

print("Dataset hashes logged to MLflow:")
hashes

Dataset hashes logged to MLflow:


{'bronze.device_messages_raw': '938e400aff171de7f5f8202a9ed4de935d0bc5c31fa0cc7724c3be2156785fd8',
 'bronze.rapid_step_tests_raw': '6131c7c90b4f92b5e5a39df31dd696c0d08ce8b5a32e6a5b5e519e81a3769a5f'}

In [0]:
#### Load from Unity Catalog → Pandas
import pandas as pd

# Pull both raw tables fully into driver memory as pandas DataFrames.
messages_pd = spark.table("bronze.device_messages_raw").toPandas()
tests_pd = spark.table("bronze.rapid_step_tests_raw").toPandas()

# ---- device_messages_raw cleanup ----
# Trim stray whitespace from the column names, then from the text columns.
messages_pd.columns = messages_pd.columns.str.strip()

# NOTE(review): astype(str) also converts missing values into literal strings
# (e.g. "None"/"nan"), which then show up as their own group downstream —
# confirm this is intended.
for text_col in ("sensor_type", "message_origin"):
    messages_pd[text_col] = messages_pd[text_col].astype(str).str.strip()

# Unparseable timestamps become NaT instead of raising.
messages_pd["timestamp"] = pd.to_datetime(messages_pd["timestamp"], errors="coerce")

# ---- rapid_step_tests_raw cleanup ----
tests_pd.columns = tests_pd.columns.str.strip()

# Unparseable start times become NaT instead of raising.
tests_pd["start_time"] = pd.to_datetime(tests_pd["start_time"], errors="coerce")

# Non-numeric step counts are coerced to NaN, replaced by 0, then stored as int.
steps_numeric = pd.to_numeric(tests_pd["total_steps"], errors="coerce")
tests_pd["total_steps"] = steps_numeric.fillna(0).astype(int)
#### Join on device_id
# Inner join: keep only devices present in both tables.
# NOTE(review): if a device has several tests and several messages this is a
# many-to-many join, so each test's total_steps is repeated once per matching
# message — verify that downstream sums are meant to be weighted this way.
etl_df = pd.merge(tests_pd, messages_pd, on="device_id", how="inner")

#### Create a tidy table
# Keep the five columns of interest and give them the generic names used by
# the metric calculations.
tidy_columns = {
    "device_id": "device_id",
    "start_time": "test_start_time",
    "sensor_type": "item_name",
    "message_origin": "category",
    "total_steps": "quantity",
}
tidy_df = etl_df[list(tidy_columns)].rename(columns=tidy_columns)
def _quantity_totals(frame, key):
    """Sum ``quantity`` per distinct ``key`` value, largest total first."""
    totals = frame.groupby(key, as_index=False)["quantity"].sum()
    return totals.sort_values("quantity", ascending=False)


#### Top 5 “items” by quantity (sensor activity)
top_5_items = _quantity_totals(tidy_df, "item_name").head(5)

#### “Revenue by category” → Activity by message origin
activity_by_category = _quantity_totals(tidy_df, "category")

#### Busiest hour of day (tests started)
# Adds an "hour" column (hour component of the test start time) to tidy_df.
tidy_df["hour"] = tidy_df["test_start_time"].dt.hour

# Single-row frame: the hour with the highest summed quantity.
busiest_hour = _quantity_totals(tidy_df, "hour").head(1)
#### Save results (MLflow — permission safe)
import os
import tempfile
from datetime import datetime

import mlflow

# Run timestamp embedded in each artifact file name.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

def log_df(df, name):
    """Log ``df`` to MLflow as a CSV artifact called ``name``.

    The frame is written (without its index) into a throwaway directory and
    uploaded under the ``etl_metrics`` artifact path. The directory is removed
    afterwards, so nothing is left behind on the local disk.

    Args:
        df: DataFrame to export.
        name: File name the artifact should have in MLflow, e.g.
            ``"top_5_items_20260115_012058.csv"``. Only the final path
            component is used.
    """
    with tempfile.TemporaryDirectory() as tmp_dir:
        # MLflow names the artifact after the local file, so the file itself
        # must carry the requested name.
        csv_path = os.path.join(tmp_dir, os.path.basename(name))
        df.to_csv(csv_path, index=False)
        mlflow.log_artifact(csv_path, artifact_path="etl_metrics")

log_df(top_5_items, f"top_5_items_{timestamp}.csv")
log_df(activity_by_category, f"activity_by_category_{timestamp}.csv")
log_df(busiest_hour, f"busiest_hour_{timestamp}.csv")

print("ETL metrics logged to MLflow artifacts")


ETL metrics logged to MLflow artifacts


Part D – ETL with Pandas (≈40 min)