In [0]:
import pandas as pd
import os
import logging

# Show where the notebook is executing; relative paths resolve against this.
print("Current working directory:", os.getcwd())

# Single definition of the data folder so both reads always stay in sync.
DATA_DIR = "/Workspace/Repos/win185@ensign.edu/Databricks/data"

menu_df = pd.read_csv(os.path.join(DATA_DIR, "menu_items.csv"))
orders_df = pd.read_csv(os.path.join(DATA_DIR, "order_details.csv"))

# logging.info() on the unconfigured root logger is discarded because the
# root level defaults to WARNING.  Emit through the notebook's named ETL
# logger instead, and make sure INFO records have somewhere to go even when
# this cell runs before the logging setup cell.
load_logger = logging.getLogger("etl_logger")
load_logger.setLevel(logging.INFO)
if not load_logger.hasHandlers():
    load_logger.addHandler(logging.StreamHandler())

load_logger.info(f"Loaded menu_items.csv with {len(menu_df)} rows")
load_logger.info(f"Loaded order_details.csv with {len(orders_df)} rows")


In [0]:
# =========================
# Logging & Reproducibility Setup
# =========================

import logging
import os
import sys
from datetime import datetime
import platform
import random
import numpy as np

# -------------------------
# Reproducibility
# -------------------------
# Pin the NumPy and stdlib random generators to seed 0 and export the same
# value as PYTHONHASHSEED in this process's environment.
np.random.seed(0)
random.seed(0)
os.environ.update(PYTHONHASHSEED="0")

# -------------------------
# Paths
# -------------------------
# Logs live in a "logs" folder under the current working directory; one file
# per run, named after the minute the cell was executed.
PROJECT_ROOT = os.getcwd()
LOG_DIR = os.path.join(PROJECT_ROOT, "logs")
os.makedirs(LOG_DIR, exist_ok=True)

timestamp = datetime.now().strftime("%Y%m%d_%H%M")
LOG_FILE = os.path.join(LOG_DIR, f"run_{timestamp}.log")

# -------------------------
# Logging Configuration
# -------------------------
logger = logging.getLogger("etl_logger")
logger.setLevel(logging.INFO)

# This logger owns its console and file handlers, so records must not also
# bubble up to the root logger; otherwise every line is printed twice in
# environments where the root logger already has a handler attached.
logger.propagate = False

# Re-running this cell must not leak handlers: detach AND close the old ones
# (handlers.clear() alone would leave the previous log file handle open).
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)
    stale_handler.close()

formatter = logging.Formatter(
    "%(asctime)s | %(levelname)s | %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S"
)

# Console handler
console_handler = logging.StreamHandler(sys.stdout)
console_handler.setFormatter(formatter)

# File handler (explicit encoding so non-ASCII messages are written the same
# way regardless of the platform's default locale)
file_handler = logging.FileHandler(LOG_FILE, encoding="utf-8")
file_handler.setFormatter(formatter)

logger.addHandler(console_handler)
logger.addHandler(file_handler)

# -------------------------
# Log run metadata
# -------------------------
logger.info("=== ETL RUN STARTED ===")
logger.info(f"Timestamp: {timestamp}")
logger.info(f"Python version: {sys.version}")
logger.info(f"Platform: {platform.platform()}")
logger.info(f"Working directory: {PROJECT_ROOT}")
logger.info(f"Log file: {LOG_FILE}")


Part A is already done in 2.4.
Part B – Set Up Logging (≈15 min): implemented in the cell above.

In [0]:
import os
import random
import numpy as np

# PYTHONHASHSEED is read when a Python interpreter starts, so exporting it
# here is picked up by processes launched from this kernel; it does not
# re-seed str/bytes hashing in the interpreter that is already running.
os.environ.update(PYTHONHASHSEED="0")

# Seed both pseudo-random generators used in this notebook.
np.random.seed(0)
random.seed(0)

print("Random seeds fixed")

Part C – Reproducibility Setup (≈15 min)

In [0]:
%pip freeze > /Workspace/Repos/win185@ensign.edu/Databricks/requirements.txt

In [0]:
import os
import hashlib
import json

data_dir = "/Workspace/Repos/win185@ensign.edu/Databricks/data"
output_file = "data_hashes.json"


def _sha256_of_file(path, chunk_size=4096):
    """Return the hex SHA-256 digest of the file at ``path``.

    The file is read in ``chunk_size``-byte blocks so that large files are
    never loaded into memory at once.
    """
    digest = hashlib.sha256()
    with open(path, "rb") as f:
        for byte_block in iter(lambda: f.read(chunk_size), b""):
            digest.update(byte_block)
    return digest.hexdigest()


# os.listdir() returns entries in arbitrary order; sort them so the JSON
# written below has the same key order on every run.  Entries that merely
# end in ".csv" but are not regular files (e.g. a directory) are skipped.
csv_files = sorted(
    f for f in os.listdir(data_dir)
    if f.endswith(".csv") and os.path.isfile(os.path.join(data_dir, f))
)

# Map each CSV file name to the digest of its raw bytes.
hashes = {
    filename: _sha256_of_file(os.path.join(data_dir, filename))
    for filename in csv_files
}

# Write hashes to JSON (relative path: lands in the current working directory)
with open(output_file, "w") as f:
    json.dump(hashes, f, indent=2)

print(f"Data hashes written to {output_file}")
print(json.dumps(hashes, indent=2))


The cell above computes a SHA-256 hash for every CSV file in the data directory and writes the results to `data_hashes.json`.

In [0]:
%sql
SHOW CATALOGS;

In [0]:
%sql
SHOW SCHEMAS IN bronze;

In [0]:
import mlflow
import json
import tempfile
from pyspark.sql import SparkSession
from pyspark.sql.functions import sha2, concat_ws, col, collect_list

# Extra names needed by this cell only.
import os
from pyspark.sql.functions import sort_array

# Ensure Spark session exists (safe in Databricks too)
spark = SparkSession.builder.getOrCreate()

# Map logical table names to CSV paths
tables = {
    "menu_items": "/Volumes/workspace/bronze/restaurant/menu_items.csv",
    "order_details": "/Volumes/workspace/bronze/restaurant/order_details.csv"
}

hashes = {}

for name, path in tables.items():
    # Read CSV with Spark
    df = spark.read.csv(path, header=True, inferSchema=True)

    # Row-level hash: SHA-256 over all columns rendered as strings.
    # NOTE(review): concat_ws skips NULL values, so two rows that differ only
    # in *which* column is NULL produce the same row hash — confirm whether
    # that is acceptable for this lineage check.
    row_hashes = df.select(
        sha2(
            concat_ws(
                "||",
                *[col(c).cast("string") for c in df.columns]
            ),
            256
        ).alias("row_hash")
    )

    # Dataset-level hash: SHA-256 over all row hashes in sorted order.
    # The sort is done on the collected array itself: collect_list() gives no
    # ordering guarantee, so an orderBy() placed before the aggregation does
    # not make the result order-independent.
    dataset_hash = (
        row_hashes
        .groupBy()
        .agg(
            sha2(
                concat_ws("", sort_array(collect_list("row_hash"))),
                256
            ).alias("dataset_hash")
        )
        .collect()[0]["dataset_hash"]
    )

    hashes[name] = dataset_hash

# Write the hashes once, under a stable file name, inside a temporary
# directory that is removed as soon as the artifact has been logged.
with tempfile.TemporaryDirectory() as tmp_dir:
    tmp_path = os.path.join(tmp_dir, "data_hashes.json")
    with open(tmp_path, "w") as f:
        json.dump(hashes, f, indent=2)

    # Log artifact to MLflow: reuse the active run if there is one, otherwise
    # open a run just for this upload.
    active_run = mlflow.active_run()
    if active_run is None:
        with mlflow.start_run():
            mlflow.log_artifact(tmp_path, artifact_path="data_lineage")
    else:
        mlflow.log_artifact(tmp_path, artifact_path="data_lineage")

print("Dataset hashes logged to MLflow:")
print(hashes)

In [0]:
####Load from Unity Catalog → Pandas
import pandas as pd

messages_pd = spark.table("bronze.device_messages_raw").toPandas()
tests_pd = spark.table("bronze.rapid_step_tests_raw").toPandas()

# ---- device_messages_raw cleanup ----
messages_pd.columns = messages_pd.columns.str.strip()

# Use the nullable "string" dtype rather than astype(str): astype(str) turns
# missing values into the literal text "nan"/"None", which would then show up
# as real categories in the group-bys below.
messages_pd = messages_pd.assign(
    sensor_type=messages_pd["sensor_type"].astype("string").str.strip(),
    message_origin=messages_pd["message_origin"].astype("string").str.strip(),
    timestamp=pd.to_datetime(messages_pd["timestamp"], errors="coerce"),
)

# ---- rapid_step_tests_raw cleanup ----
tests_pd.columns = tests_pd.columns.str.strip()

# Unparseable start times become NaT; unparseable step counts become 0.
tests_pd = tests_pd.assign(
    start_time=pd.to_datetime(tests_pd["start_time"], errors="coerce"),
    total_steps=(
        pd.to_numeric(tests_pd["total_steps"], errors="coerce")
        .fillna(0)
        .astype(int)
    ),
)

####Join on device_id
# NOTE(review): this pairs every test with every message of the same device,
# so a test's total_steps is repeated once per matching message and the sums
# below scale with message count — confirm that is the intended metric.
etl_df = tests_pd.merge(
    messages_pd,
    on="device_id",
    how="inner"
)

####Create a tidy table
tidy_df = (
    etl_df[[
        "device_id",
        "start_time",
        "sensor_type",
        "message_origin",
        "total_steps"
    ]]
    .rename(columns={
        "start_time": "test_start_time",
        "sensor_type": "item_name",
        "message_origin": "category",
        "total_steps": "quantity"
    })
    # Hour of day in which each test started (NaN where the start time is NaT)
    .assign(hour=lambda frame: frame["test_start_time"].dt.hour)
)


def _quantity_by(frame, key):
    """Sum ``quantity`` per value of ``key``, largest total first."""
    return (
        frame
        .groupby(key, as_index=False)["quantity"]
        .sum()
        .sort_values("quantity", ascending=False)
    )


####Top 5 “items” by quantity (sensor activity)
top_5_items = _quantity_by(tidy_df, "item_name").head(5)

####“Revenue by category” → Activity by message origin
activity_by_category = _quantity_by(tidy_df, "category")

####Busiest hour of day (tests started)
busiest_hour = _quantity_by(tidy_df, "hour").head(1)
####Save results (MLflow — permission safe)
import mlflow
import tempfile
from datetime import datetime

# Second-resolution run stamp (YYYYMMDD_HHMMSS) taken from the local clock.
timestamp = f"{datetime.now():%Y%m%d_%H%M%S}"

def log_df(df, name):
    """Log ``df`` to MLflow as a CSV artifact called ``name``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to export; written without its index.
    name : str
        File name the artifact gets under the ``etl_metrics`` artifact
        folder.  Only the final path component is used.

    The CSV is written into a temporary directory so that the artifact keeps
    the requested file name (MLflow names artifacts after the local file),
    the file is fully written and closed before it is uploaded, and nothing
    is left behind on local disk afterwards.
    """
    import os  # local import: this cell does not import os itself

    with tempfile.TemporaryDirectory() as tmp_dir:
        local_path = os.path.join(tmp_dir, os.path.basename(name))
        df.to_csv(local_path, index=False)
        mlflow.log_artifact(local_path, artifact_path="etl_metrics")

# Upload each metric table via log_df, passing a timestamped file name.
metric_frames = {
    "top_5_items": top_5_items,
    "activity_by_category": activity_by_category,
    "busiest_hour": busiest_hour,
}
for metric_name, metric_frame in metric_frames.items():
    log_df(metric_frame, f"{metric_name}_{timestamp}.csv")

print("ETL metrics logged to MLflow artifacts")


Part D – ETL with Pandas (≈40 min)