PART 0 — Setup + Confirm Tables Exist (Counts toward Data Import)

In [0]:
# Databricks notebook source
# COMMAND ----------
# Fully qualified names of the two source tables; edit these if your tables
# use the alternative spellings noted on the right.
DEVICE_TABLE = "workspace.bronze.device_messages_raw"   # OR workspace.bronze.device_message_raw
STEP_TABLE   = "workspace.bronze.rapid_step_tests_raw"  # OR workspace.bronze.rapid_step_test_raw
# COMMAND ----------
# Load both tables and preview ten rows of each (rubric: Data Import)
device_raw, step_raw = (spark.table(table_name) for table_name in (DEVICE_TABLE, STEP_TABLE))

for raw_df in (device_raw, step_raw):
    display(raw_df.limit(10))
# COMMAND ----------
# Print each schema so column names can be checked before joining/plotting
for raw_df in (device_raw, step_raw):
    raw_df.printSchema()


PART 1 — SQL (1.1–1.7) + Markdown Notes Above Each


In [0]:
%sql
-- Part 1 walkthrough: preview both bronze tables, count their rows, check key
-- columns for nulls, build cleaned temp views, and preview a join of the two.
-- NOTE(review): every statement lives in this one cell; if the notebook only
-- renders the result of the final statement, move each statement to its own
-- %sql cell so all results are visible.

-- Preview ten rows of the device messages table.
SELECT *
FROM workspace.bronze.device_messages_raw
LIMIT 10;

-- Preview ten rows of the step tests table.
SELECT *
FROM workspace.bronze.rapid_step_tests_raw
LIMIT 10;

-- Row count of each table, one result row per table.
SELECT 'device_messages_raw' AS table_name, COUNT(*) AS row_count
FROM workspace.bronze.device_messages_raw
UNION ALL
SELECT 'rapid_step_tests_raw' AS table_name, COUNT(*) AS row_count
FROM workspace.bronze.rapid_step_tests_raw;

-- Null check on device messages: how many rows lack session_key or timestamp.
SELECT
  COUNT(*) AS total_rows,
  SUM(CASE WHEN session_key IS NULL THEN 1 ELSE 0 END) AS null_session_key,
  SUM(CASE WHEN timestamp IS NULL THEN 1 ELSE 0 END) AS null_timestamp
FROM workspace.bronze.device_messages_raw;

-- Null check on step tests: how many rows lack total_steps.
SELECT
  COUNT(*) AS total_rows,
  SUM(CASE WHEN total_steps IS NULL THEN 1 ELSE 0 END) AS null_total_steps
FROM workspace.bronze.rapid_step_tests_raw;

-- Device messages plus a proper TIMESTAMP column, event_ts.
-- Assumes `timestamp` holds epoch milliseconds -- TODO confirm against the schema.
-- The value is divided by 1000 and truncated to whole seconds, then converted
-- directly with timestamp_seconds. Converting directly avoids formatting the
-- instant as a local-time string and parsing it back, which depends on the
-- session time zone and is ambiguous around daylight-saving transitions.
CREATE OR REPLACE TEMP VIEW device_clean AS
SELECT
  *,
  timestamp_seconds(CAST(timestamp / 1000 AS BIGINT)) AS event_ts
FROM workspace.bronze.device_messages_raw;

-- Step tests exposed under a view name; no columns are changed.
CREATE OR REPLACE TEMP VIEW step_test_clean AS
SELECT *
FROM workspace.bronze.rapid_step_tests_raw;

-- Preview of device messages matched to step tests on device_id.
-- LEFT JOIN keeps device messages that have no step test (total_steps is NULL).
-- NOTE(review): neither side is aggregated, so if a device_id repeats on both
-- sides the result has one row per message/test pair. Without ORDER BY the 50
-- rows returned are arbitrary.
SELECT
  d.device_id,
  d.session_key,
  d.event_ts,
  s.total_steps
FROM device_clean d
LEFT JOIN step_test_clean s
  ON d.device_id = s.device_id
LIMIT 50;


STEP 1 — Create ONE Python Cell (Part 2.1–2.2)

In [0]:
# Load the raw device-message and step-test tables, then drop rows that are
# missing the columns later cells group, join, or plot on. No join happens here.
# Table names come from the DEVICE_TABLE / STEP_TABLE constants defined in the
# setup cell, so renaming a table only needs to be done in one place.
device_df = spark.table(DEVICE_TABLE)
step_df = spark.table(STEP_TABLE)

# Device messages need a device_id (group/join key) and a timestamp (plot axis).
device_df2 = device_df.dropna(subset=["device_id", "timestamp"])
# Step tests need a device_id (join key) and a total_steps value.
step_df2 = step_df.dropna(subset=["device_id", "total_steps"])

display(device_df2.limit(10))
display(step_df2.limit(10))


STEP 2 — Feature Engineering (Part 2.3)

In [0]:
# Build one feature row per device_id from the cleaned device messages:
# message count, mean distance, and earliest/latest timestamp.
# These per-device summaries are what later cells attach to the step tests.

from pyspark.sql.functions import (
    count, avg, min as spark_min, max as spark_max,
    col, regexp_replace
)

# Remove the literal "cm" from the distance text and cast the rest to double.
# Presumably values look like "12cm"; verify against the table schema.
distance_cm_expr = regexp_replace(col("distance"), "cm", "").cast("double")
device_with_distance = device_df2.withColumn("distance_cm", distance_cm_expr)

per_device_aggregates = [
    count("*").alias("device_message_count"),
    avg("distance_cm").alias("avg_distance_cm"),
    spark_min("timestamp").alias("min_timestamp"),
    spark_max("timestamp").alias("max_timestamp"),
]

features_df = device_with_distance.groupBy("device_id").agg(*per_device_aggregates)

display(features_df.limit(20))

STEP 3 — Join Features to Step Tests (Part 2.4)

In [0]:
# Attach the per-device features to every step-test row. The left join keeps
# step tests whose device has no feature row; their feature columns are null.
joined_df = step_df2.join(
    features_df,
    on="device_id",
    how="left",
)
display(joined_df.limit(50))


STEP 4 — Validation Check (Part 2.5)

In [0]:
from pyspark.sql.functions import col, count, sum as spark_sum, when


def _null_count(column_name):
    """Return an aggregate expression counting rows where `column_name` is null."""
    is_missing = col(column_name).isNull()
    return spark_sum(when(is_missing, 1).otherwise(0))


# Single-row summary of the joined data: total rows, plus how many rows have a
# null in each feature column.
validation_df = joined_df.agg(
    count("*").alias("rows"),
    _null_count("device_message_count").alias("null_message_count"),
    _null_count("avg_distance_cm").alias("null_avg_distance"),
)

display(validation_df)


STEP 5 — Plot (Part 2.6)

In [0]:
import matplotlib.pyplot as plt
from pyspark.sql.functions import col, regexp_replace

# Plot distance over time for a single device.
# Choose the smallest device_id so every rerun plots the same device; taking
# an arbitrary first row could select a different device on each run.
first_device_row = device_df2.select("device_id").orderBy("device_id").first()
if first_device_row is None:
    # first() returns None on an empty DataFrame; fail with a clear message.
    raise ValueError("device_df2 has no rows, so there is no device to plot")
one_device = first_device_row["device_id"]

# Pull only this device's readings to the driver, ordered by timestamp.
pdf = (
    device_df2
    .withColumn(
        "distance_cm",
        # Remove the literal "cm" from the distance text, then cast to double.
        regexp_replace(col("distance"), "cm", "").cast("double")
    )
    .filter(col("device_id") == one_device)
    .select("timestamp", "distance_cm")
    .orderBy("timestamp")
    .toPandas()
)

# NOTE(review): the x axis shows the raw timestamp column values; convert them
# first if a calendar date/time axis is wanted.
plt.figure()
plt.plot(pdf["timestamp"], pdf["distance_cm"])
plt.xlabel("timestamp")
plt.ylabel("distance (cm)")
plt.title(f"Distance over time for device_id = {one_device}")
plt.xticks(rotation=30)
plt.tight_layout()
plt.show()


### Reflection

Accurate data preparation ensures that step counts and device metrics are correctly represented without duplication or distortion.
By carefully cleaning and aggregating the data, I reduce the risk of misleading results or biased interpretations.
This supports ethical data use by promoting transparency, accuracy, and honest reporting of activity patterns.

