#### **Source**: Event Hub
#### **Destination**: Lakehouse Table

In [10]:
# Name of the Azure Key Vault; used to build the vault URL when the secret is fetched.
key_vault = "akvforfabric"
# Name of the secret inside that vault that is read as the Event Hub connection string.
key_vault_secret_name = "ehconnstr"


StatementMeta(, c94b3534-4fcc-48b5-ae39-77c8c8c8da5b, 12, Finished, Available)

##### Set the configuration to connect to the Event Hub

In [11]:
# Get the Event Hub connection string from Key Vault.
from notebookutils import mssparkutils

# IoT Hub connection string (Event Hub compatible), read from the vault configured above.
EH_CS = mssparkutils.credentials.getSecret(
    f"https://{key_vault}.vault.azure.net/", key_vault_secret_name
)
# NOTE: the connection string is a credential, so it is deliberately not printed;
# cell output is stored with the notebook and visible to anyone who can open it.

StatementMeta(, c94b3534-4fcc-48b5-ae39-77c8c8c8da5b, 13, Finished, Available)

[REDACTED]


In [12]:
# Declare the Delta table path and the streaming checkpoint path.
# Both are derived from one table name so they cannot drift apart.
streaming_table_name = "Struc_streaming_flight_data"
table_delta_file_location = f"Tables/{streaming_table_name}"
checkpointLocation = f"Files/_checkpoint/{streaming_table_name}"

StatementMeta(, c94b3534-4fcc-48b5-ae39-77c8c8c8da5b, 14, Finished, Available)

In [13]:
# Alias the secret fetched from Key Vault under the name used when building ehConf.
connectionString = EH_CS

'''
#  Setting the starting offset.
#  In case we would like to restart the job, it will ingest the data from the beginning. However, this depends on the
#  Event Hubs message retention policy.
import json
startTime = "2000-01-01T00:01:05.662231Z"

startingEventPosition = {
"offset": None,
"seqNo": -1, #not in use
"enqueuedTime": startTime,
"isInclusive": True
}


ehConf = {
  'eventhubs.connectionString' : sc._jvm.org.apache.spark.eventhubs.EventHubsUtils.encrypt(connectionString),
  'eventhubs.startingPosition' : json.dumps(startingEventPosition)
}

# delete the delta file if it exists
if mssparkutils.fs.exists(table_delta_file_location):
  mssparkutils.fs.rm(table_delta_file_location, recurse=True)
if mssparkutils.fs.exists(checkpointLocation):
  mssparkutils.fs.rm(checkpointLocation, recurse=True)

'''
# Configuration for the normal retry scenario: only the connection string is set,
# with no explicit starting position. The connection string is passed through
# EventHubsUtils.encrypt before it is handed to the connector.
encrypted_connection_string = sc._jvm.org.apache.spark.eventhubs.EventHubsUtils.encrypt(
    connectionString
)
ehConf = {"eventhubs.connectionString": encrypted_connection_string}

StatementMeta(, c94b3534-4fcc-48b5-ae39-77c8c8c8da5b, 15, Finished, Available)

In [14]:
# Open a streaming DataFrame over the "eventhubs" source, configured by ehConf.
df = (
    spark.readStream
    .format("eventhubs")
    .options(**ehConf)
    .load()
)

StatementMeta(, c94b3534-4fcc-48b5-ae39-77c8c8c8da5b, 16, Finished, Available)

In [15]:
# Spark schema for the sample JSON payload below.
#JSON = {"icao24": "e80219", "firstSeen": 1689789634, "firstSeen_time": "2023-07-19 23:30:34", "estDepartureAirport": null, "lastSeen": 1689790990, "lastseen_time": "2023-07-19 23:53:10", "estArrivalAirport": "SKBO", "durationOfFlightInMinutes": 22.6, "callsign": "ARE4015 ", "estDepartureAirportHorizDistance": 0, "estDepartureAirportVertDistance": 0, "estArrivalAirportHorizDistance": 4697, "estArrivalAirportVertDistance": 33, "departureAirportCandidatesCount": 0, "arrivalAirportCandidatesCount": 31}
from pyspark.sql.types import *

# One (column name, Spark type, nullable) entry per attribute, in payload order.
_FLIGHT_EVENT_COLUMNS = [
    ("icao24", StringType(), False),
    ("firstSeen", IntegerType(), False),
    ("firstSeen_time", TimestampType(), False),
    ("estDepartureAirport", StringType(), True),
    ("lastSeen", IntegerType(), False),
    ("lastseen_time", TimestampType(), False),
    ("estArrivalAirport", StringType(), True),
    ("durationOfFlightInMinutes", DoubleType(), True),
    ("callsign", StringType(), True),
    ("estDepartureAirportHorizDistance", IntegerType(), True),
    ("estDepartureAirportVertDistance", IntegerType(), True),
    ("estArrivalAirportHorizDistance", IntegerType(), True),
    ("estArrivalAirportVertDistance", IntegerType(), True),
    ("departureAirportCandidatesCount", IntegerType(), True),
    ("arrivalAirportCandidatesCount", IntegerType(), True),
]

Schema = StructType(
    [
        StructField(column_name, column_type, is_nullable)
        for column_name, column_type, is_nullable in _FLIGHT_EVENT_COLUMNS
    ]
)


StatementMeta(, c94b3534-4fcc-48b5-ae39-77c8c8c8da5b, 17, Finished, Available)

In [16]:
# Transform the raw stream before it is written.
from pyspark.sql.functions import from_json, col

# 1. Expose the event body as a string column.
body_as_text = df.withColumn("bodyAsString", df["body"].cast("string"))
# 2. Parse that string as JSON using Schema, keeping the result as one struct column.
parsed_events = body_as_text.select(from_json("bodyAsString", Schema).alias("events"))
# 3. Flatten the struct into top-level columns and repartition the data in memory.
df1 = parsed_events.select("events.*").repartition(24)

StatementMeta(, c94b3534-4fcc-48b5-ae39-77c8c8c8da5b, 18, Finished, Available)

In [17]:
def write2table(df2, epoch_id):
    """Append one micro-batch to the Delta table at ``table_delta_file_location``.

    Used as the ``foreachBatch`` callback of the streaming query.

    Args:
        df2: DataFrame holding the rows of the current micro-batch.
        epoch_id: Micro-batch identifier passed in by ``foreachBatch``; not
            used by this function.
    """
    # partitionBy partitions the data on disk by estDepartureAirport.
    # The target path comes from the shared constant rather than a second
    # hard-coded copy, so the writer and the path declaration cannot diverge.
    (
        df2.write.format("delta")
        .mode("append")
        .partitionBy("estDepartureAirport")
        .save(table_delta_file_location)
    )

StatementMeta(, c94b3534-4fcc-48b5-ae39-77c8c8c8da5b, 19, Finished, Available)

In [18]:
# Run a micro-batch every 15 seconds, handing each batch to write2table.
streaming_query = (
    df1.writeStream
    .outputMode("append")
    .trigger(processingTime="15 seconds")
    .option("checkpointLocation", checkpointLocation)
    .foreachBatch(write2table)
    .start()
)
# Block until the query terminates. A timeout value can be passed here in case
# the streaming job should be stopped after some time.
streaming_query.awaitTermination()

StatementMeta(, c94b3534-4fcc-48b5-ae39-77c8c8c8da5b, 20, Finished, Cancelled)