In [0]:
from pyspark.sql.functions import col, current_timestamp, lit
from pyspark.sql.types import (
    StructType, StructField,
    IntegerType, TimestampType
)

In [0]:
# Explicit schema for the incoming vitals CSV files, declared up front so the
# streaming reader does not have to infer it. Every column is nullable.
vitals_schema = (
    StructType()
    .add("patient_id", IntegerType(), True)
    .add("heart_rate", IntegerType(), True)
    .add("systolic_bp", IntegerType(), True)
    .add("oxygen_level", IntegerType(), True)
    .add("event_time", TimestampType(), True)
)

In [0]:
# Streaming DataFrame over the CSV files in the vitals landing folder.
# The schema and the header flag are handed to the reader as keyword
# arguments of csv() instead of separate builder calls.
vitals_stream = spark.readStream.csv(
    "/Volumes/workspace/healthcare_analytics/dataextract/vitals_stream",
    schema=vitals_schema,
    header="true",
)

In [0]:
# Sanity check: True means this DataFrame is backed by a streaming source
# (it was created via spark.readStream) rather than a one-off batch read.
vitals_stream.isStreaming

True

In [0]:
# Bronze layer: keep the source columns untouched and append two metadata
# columns to every row.
#   ingest_ts     - timestamp produced by Spark's current_timestamp() when the
#                   row is processed (not a value read from the source files).
#   source_system - constant literal identifying where the data came from.
vitals_bronze = (
    vitals_stream
    .withColumn("ingest_ts", current_timestamp())
    .withColumn("source_system", lit("simulated_vitals"))
)

In [0]:
# Create the checkpoint directory used by the bronze vitals streaming write.
dbutils.fs.mkdirs("/Volumes/workspace/healthcare_analytics/datastore/checkpoints/vitals_bronze")

True

In [0]:
# List the checkpoints folder to confirm the vitals_bronze directory is there.
display(dbutils.fs.ls("/Volumes/workspace/healthcare_analytics/datastore/checkpoints"))

path,name,size,modificationTime
dbfs:/Volumes/workspace/healthcare_analytics/datastore/checkpoints/vitals_bronze/,vitals_bronze/,0,1768760093019


In [0]:
# Write the bronze stream to a Delta table in append mode.
# The checkpoint location records which input has already been processed, so
# re-running this cell only picks up files that arrived since the last run.
bronze_query = (
    vitals_bronze.writeStream
    .format("delta")
    .option(
        "checkpointLocation",
        "/Volumes/workspace/healthcare_analytics/datastore/checkpoints/vitals_bronze"
    )
    .outputMode("append")
    .trigger(availableNow=True)   # required for Serverless
    .toTable("healthcare_analytics.bronze_vitals_stream")
)

# toTable() starts the query and returns a StreamingQuery handle right away.
# Block until the availableNow run has finished so that cells executed after
# this one (e.g. under "Run All") read a fully written table, and so that a
# failure inside the stream is raised here instead of passing silently.
bronze_query.awaitTermination()

<pyspark.sql.connect.streaming.query.StreamingQuery at 0xff928c5f0ef0>

In [0]:
# Row count of the bronze Delta table, as a quick check that the write landed data.
spark.read.table("healthcare_analytics.bronze_vitals_stream").count()

1000