In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
import uuid, random
from datetime import datetime, timedelta

In [0]:
# Linking EventHub connection string using Azure Key Vault for security and establishing permission for databricks to access EventHub
connection_string = dbutils.secrets.get(scope="hospitalanalyticsvaultscope",key="eventhubkey")
event_hub_name = "hospital-data"

event_hubs_conf = {
    'eventhubs.connectionString': sc._jvm.org.apache.spark.eventhubs.EventHubsUtils.encrypt(connection_string)
}

# Departments in hospital
departments = ["Emergency", "Surgery", "ICU", "Pediatrics", "Maternity", "Oncology", "Cardiology"]
genders = ["Male", "Female"]



In [0]:
# creating a fake patient data
def generate_patient_event():
    admission_time = datetime.utcnow() - timedelta(hours=random.randint(0,72))
    discharge_time = admission_time + timedelta(hours = random.randint(1,72))

    event = {
        "patient_id":str(uuid.uuid4()),
        "gender":random.choice(genders),
        "age":random.randint(1,100),
        "department":random.choice(departments),
        "admission_time":admission_time.isoformat(),
        "discharge_time":admission_time.isoformat(),
        "bed_id":random.randint(1,500),
        "hospital_id":random.randint(1,7)
    }

    if random.random() < 0.05:
        event['age'] = random.randint(101,150)
    if random.random() < 0.05:
        event['admission_time'] = (datetime.utcnow() + timedelta(hours=random.randint(1,72))).isoformat()

    return event        

In [0]:
# Generate micro batch of Events:
def generate_batch(n):
    return [generate_patient_event() for i in range(n)]
  

In [0]:
import time
# convert to spark DataFrame (simulate streaming using foreachBatch)
schema = StructType([
    StructField("patient_id", StringType()),
    StructField("gender",StringType()),
    StructField("age",IntegerType()),
    StructField("department",StringType()),
    StructField("admission_time",StringType()),
    StructField("discharge_time",StringType()),
    StructField("bed_id",IntegerType()),
    StructField("hospital_id",IntegerType())
])


In [0]:
# Data which was generated is streamed to Azure EventHub 
while True:
    batch = generate_batch(10)
    df = spark.createDataFrame(batch, schema)
   
    df_json = df.select(to_json(struct("*")).alias("body"))

    df_json.write.format("eventhubs").options(**event_hubs_conf).option("eventhubs.name","hospital-data").save()

    df.show()
    time.sleep(2)

+--------------------+------+---+----------+--------------------+--------------------+------+-----------+
|          patient_id|gender|age|department|      admission_time|      discharge_time|bed_id|hospital_id|
+--------------------+------+---+----------+--------------------+--------------------+------+-----------+
|2cd54130-e271-479...|Female| 52|       ICU|2025-09-04T06:00:...|2025-09-04T06:00:...|   116|          5|
|eda52907-ab95-451...|Female|100|Cardiology|2025-09-04T03:00:...|2025-09-04T03:00:...|   387|          3|
|82a87953-693a-421...|Female| 56|Cardiology|2025-09-05T23:00:...|2025-09-05T23:00:...|   411|          5|
|91254b73-00b8-4a1...|Female| 84|Pediatrics|2025-09-05T06:00:...|2025-09-05T06:00:...|   198|          1|
|07f07c1b-fe50-433...|  Male| 65|       ICU|2025-09-06T06:00:...|2025-09-06T06:00:...|   446|          3|
|ae24c930-9659-478...|Female| 77| Emergency|2025-09-03T18:00:...|2025-09-03T18:00:...|   474|          4|
|899dcdd7-abd6-4c1...|  Male|113|  Oncology|20

com.databricks.backend.common.rpc.CommandCancelledException
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$5(SequenceExecutionState.scala:136)
	at scala.Option.getOrElse(Option.scala:189)
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$3(SequenceExecutionState.scala:136)
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$3$adapted(SequenceExecutionState.scala:133)
	at scala.collection.immutable.Range.foreach(Range.scala:158)
	at com.databricks.spark.chauffeur.SequenceExecutionState.cancel(SequenceExecutionState.scala:133)
	at com.databricks.spark.chauffeur.ExecContextState.cancelRunningSequence(ExecContextState.scala:730)
	at com.databricks.spark.chauffeur.ExecContextState.$anonfun$cancel$1(ExecContextState.scala:448)
	at scala.Option.getOrElse(Option.scala:189)
	at com.databricks.spark.chauffeur.ExecContextState.cancel(ExecContextState.scala:448)
	at com.databricks.spark.chauffeur.ChauffeurState.cancelExecutio

In [0]:
connection_string = connection_string = dbutils.secrets.get(scope="hospitalanalyticsvaultscope",key="eventhubkey")
 
event_hub_name = "hospital-data"

event_hubs_conf = {
    'eventhubs.connectionString': sc._jvm.org.apache.spark.eventhubs.EventHubsUtils.encrypt(connection_string)
}

In [0]:
# Extracting data(binary format) from EventHub to the Bronze layer in ADLS Gen2 Storage
raw_df = (spark.readStream.format("eventhubs").options(**event_hubs_conf).load())

json_df = raw_df.selectExpr("cast(body as STRING) as raw_json")

spark.conf.set(
    "fs.azure.account.key.hospitaldatastorag.dfs.core.windows.net",
    dbutils.secrets.get(scope="hospitalanalyticsvaultscope",key="storageacct")
)

bronze_path = "abfss://bronze@hospitaldatastorag.dfs.core.windows.net/patient_flow"

(
json_df.writeStream.format("delta").outputMode("append").option("checkpointLocation","dbfs:/mnt/bronze/_checkpoints/patient_flow").start(bronze_path)
)

<pyspark.sql.streaming.query.StreamingQuery at 0x7fa3a4c95a20>

In [0]:
display(spark.read.format("delta").load(bronze_path).count())

1330