In [0]:
# Read the two raw bronze tables used by the rest of this notebook.
df_device = spark.read.table("workspace.bronze.device_message_raw")
df_steps = spark.read.table("workspace.bronze.rapid_step_test_raw")


In [0]:
# Render both raw DataFrames, device messages first, for a quick visual check.
for _raw_df in (df_device, df_steps):
    display(_raw_df)

In [0]:
from pyspark.sql.functions import regexp_extract, col

# Capture the first run of digits in the raw "distance" value and store it as
# an integer in a new "distance_cm" column.
# NOTE(review): when "distance" has no digits, regexp_extract yields an empty
# string; confirm how the int cast treats that under this cluster's ANSI setting.
distance_digits = regexp_extract(col("distance"), r"(\d+)", 1)
df_device = df_device.withColumn("distance_cm", distance_digits.cast("int"))

In [0]:
from pyspark.sql.functions import lit

# Add a constant "source" column to each DataFrame naming where its rows
# came from.
device_tag = lit("device")
step_tag = lit("step")
df_device = df_device.withColumn("source", device_tag)
df_steps = df_steps.withColumn("source", step_tag)

In [0]:
# Keep only the columns needed to match device messages to a step test:
# the device id and the test's start/stop bounds.
# distinct() collapses repeated (device_id, start_time, stop_time) rows so a
# test that appears more than once in the raw table cannot multiply the
# device rows it matches in the later left join.
df_steps_window = df_steps.select(
    "device_id", "start_time", "stop_time"
).distinct()

In [0]:
from pyspark.sql.functions import when, regexp_extract, col

# A device message matches a step test when it comes from the same device and
# its timestamp lies between that test's start_time and stop_time.
same_device = col("d.device_id") == col("s.device_id")
inside_window = col("d.timestamp").between(
    col("s.start_time"), col("s.stop_time")
)

# Left join: every device row is kept; rows with no matching window get NULLs
# in the "s" columns.
joined = df_device.alias("d").join(
    df_steps_window.alias("s"),
    same_device & inside_window,
    "left",
)

# A non-NULL start_time means a window matched, so the row is labelled "step".
step_label = when(col("s.start_time").isNotNull(), "step").otherwise("no_step")

# df_device already has a "distance_cm" column from an earlier cell;
# withColumn replaces it with the same digit-extraction result.
distance_cm = regexp_extract(col("d.distance"), r"(\d+)", 1).cast("int")

df_labeled = joined.withColumn("step_label", step_label).withColumn(
    "distance_cm", distance_cm
)

In [0]:
# Output projection: SQL expressions for the columns kept in the final table,
# renaming sensor_type and the device-side device_id to camelCase.
final_columns = [
    "timestamp",
    "sensor_type as sensorType",
    "distance_cm",
    "d.device_id as deviceId",
    "step_label",
    "source",
]
df_final = df_labeled.selectExpr(*final_columns)

In [0]:
# Switch the current schema to "silver" so the unqualified table name below
# resolves there.
spark.sql("USE silver")
# Save df_final as a table, overwriting any existing contents.
df_final.write.saveAsTable("labeled_step_test", mode="overwrite")


In [0]:
%sql
-- Row count for each step_label value in labeled_step_test.
SELECT step_label, COUNT(*) AS row_count
FROM labeled_step_test
GROUP BY step_label

In [0]:
%sql
-- Sanity check: show up to 50 rows whose step_label is missing or is not one
-- of the two expected values. The IS NULL test is needed because NOT IN
-- never matches a NULL.
SELECT *
FROM labeled_step_test
WHERE step_label IS NULL
   OR step_label NOT IN ('step', 'no_step')
LIMIT 50;

In [0]:
%sql
-- Sanity check: show up to 50 rows whose source is missing or is not one of
-- the two expected values. The IS NULL test is needed because NOT IN never
-- matches a NULL.
SELECT *
FROM labeled_step_test
WHERE source IS NULL
   OR source NOT IN ('device', 'step')
LIMIT 50;

In [0]:
# Register df_final as a temp view named "final_df" so SQL cells can query it.
df_final.createOrReplaceTempView("final_df")

In [0]:
%sql
-- Rebuild labeled_step_test from the final_df temp view.
-- NOTE(review): an earlier cell already writes df_final to this same table
-- name with saveAsTable; confirm whether both writes are needed.
CREATE OR REPLACE TABLE labeled_step_test AS
SELECT * FROM final_df;

When automating health-related data pipelines, engineers need to be careful with patient privacy and make sure sensitive data is properly secured and accessible only to the right people. Data accuracy is just as important, since bad or incomplete data can lead to misleading results or poor decisions. It's also important to watch for bias in the data and to avoid building systems that unfairly impact certain groups. Engineers should be clear that these pipelines are meant to support analysis, not to provide medical diagnoses or treatment advice. In the end, ethical automation is about protecting people first and being thoughtful about how health data is collected, processed, and used.
