Step 1 - Load the Parquet Files into the Data Lake

In [0]:
# Read the raw device-message export from the repo's data folder.
df_device = spark.read.parquet("/Workspace/Repos/win185@ensign.edu/Databricks/data/device_messages.parquet")

# Confirm rows exist. A bare `df_device.count()` in the middle of a cell is
# evaluated and then discarded (a notebook only displays a cell's last
# expression), so capture the count, print it, and fail fast when the file is
# empty instead of letting an empty frame flow into the bronze write.
device_row_count = df_device.count()
print(f"device_messages.parquet rows: {device_row_count}")
assert device_row_count > 0, "device_messages.parquet contains no rows"

# Show the inferred schema and a small sample for a visual check.
df_device.printSchema()
df_device.show(5)

Write The Bronze Device Messages

In [0]:
# Persist the raw device messages as a Delta table in the bronze schema,
# overwriting the table's previous contents.
(
    df_device.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("workspace.bronze.device_messages_raw")
)

Load and Write The Bronze Rapid Step Tests

In [0]:
# Read the raw rapid-step-test export from the repo's data folder.
df_tests = spark.read.parquet("/Workspace/Repos/win185@ensign.edu/Databricks/data/rapid_step_tests.parquet")

# Confirm rows exist. A bare `df_tests.count()` in the middle of a cell is
# evaluated and then discarded (a notebook only displays a cell's last
# expression), so capture the count, print it, and fail fast when the file is
# empty instead of letting an empty frame flow into the bronze write.
tests_row_count = df_tests.count()
print(f"rapid_step_tests.parquet rows: {tests_row_count}")
assert tests_row_count > 0, "rapid_step_tests.parquet contains no rows"

# Show the inferred schema and a small sample for a visual check.
df_tests.printSchema()
df_tests.show(5)

In [0]:
# Persist the raw rapid step tests as a Delta table in the bronze schema,
# overwriting the table's previous contents.
(
    df_tests.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("workspace.bronze.rapid_step_tests_raw")
)

Recheck Bronze Table Row Counts

In [0]:
# Row count of the bronze device-messages table, read back from the catalog.
(
    spark.read
    .table("workspace.bronze.device_messages_raw")
    .count()
)

In [0]:
# Row count of the bronze rapid-step-tests table, read back from the catalog.
(
    spark.read
    .table("workspace.bronze.rapid_step_tests_raw")
    .count()
)

Inspect Column Names

In [0]:
# Print the column names and types of the bronze device-messages table.
(
    spark.read
    .table("workspace.bronze.device_messages_raw")
    .printSchema()
)

In [0]:
# Print the column names and types of the bronze rapid-step-tests table.
(
    spark.read
    .table("workspace.bronze.rapid_step_tests_raw")
    .printSchema()
)

Now we rebuild the silver table correctly

In [0]:
from pyspark.sql.functions import col, regexp_replace

# Build the silver table: clean the device readings, join them to the step
# tests by device, and persist the result as a Delta table.

# Step 1: Load bronze tables
df_device = spark.read.table("workspace.bronze.device_messages_raw")
df_tests = spark.read.table("workspace.bronze.rapid_step_tests_raw")

# Step 2: Clean the 'distance' column by removing the literal "cm" and casting
# the remainder to float. Rows whose distance_cm ends up NULL are dropped in
# Step 4.
df_device_clean = df_device.withColumn(
    "distance_cm",
    regexp_replace(col("distance"), "cm", "").cast("float"),
)

# Step 3: Inner-join device and test data on 'device_id'.
# NOTE(review): device_id is the only join key, so every message of a device is
# paired with every test row of that same device — confirm this fan-out is
# intended (no time-window condition is applied here).
df_joined = df_device_clean.join(df_tests, on="device_id", how="inner")

# Step 4: Select only the relevant columns and filter out null distances
df_silver = (
    df_joined
    .select(
        "timestamp",
        "sensor_type",
        "distance_cm",
        "device_id",
        "test_time",
        "total_steps",
    )
    .filter(col("distance_cm").isNotNull())
)

# Step 5: Write to silver table.
# Overwrite mode with overwriteSchema replaces both the data and the table
# schema. The former extra `mergeSchema` option asked for the opposite
# (evolving the existing schema) and is redundant when the schema is being
# replaced, so it is dropped to keep the intent unambiguous.
(
    df_silver.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("workspace.silver.labeled_step_test")
)

Last Step - Verify Silver Data

In [0]:
# Read the silver table back and display summary statistics for distance_cm
# to confirm the write landed and the values look sensible.
(
    spark.read
    .table("workspace.silver.labeled_step_test")
    .select("distance_cm")
    .summary()
    .show()
)