In [1]:
from azure.eventhub import EventData
from azure.eventhub import EventHubClient
from pyspark.sql.functions import *
from pyspark.sql.types import *
import json
from datetime import datetime

## Connection details

In [3]:
# Name of the Databricks secret scope that holds the Event Hubs settings
kv_scope = 'key-vault-secret'

# Variables: every connection detail is read from the secret scope, nothing is hardcoded
eventhubs_namespace = dbutils.secrets.get(scope =kv_scope, key = 'traffic-eventhubs-namespace') 
eventhubs_accesskey = dbutils.secrets.get(scope =kv_scope, key = 'traffic-eventhubs-accesskey') 
eventhubs_accessid = dbutils.secrets.get(scope =kv_scope, key = 'traffic-eventhubs-accessid') 
eventhubs_name = dbutils.secrets.get(scope =kv_scope, key = 'traffic-eventhubs-name') 

# Build connection string with the above information.
# Placeholders, in order: namespace, shared access key name, shared access key, entity path.
cameraHubConnectionString = 'Endpoint=sb://{}.servicebus.windows.net/;SharedAccessKeyName={};SharedAccessKey={};EntityPath={}'.format(
  eventhubs_namespace,
  eventhubs_accessid,
  eventhubs_accesskey,
  eventhubs_name)

# Do not print the connection string itself: it embeds the shared access key, and
# notebook output is stored and shared together with the notebook.
print('Event Hubs connection string built (value hidden).')

## Event schema definition
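1. Everything is defined as a string (the JSON carries every value as a string, e.g. `"Lane":"2"`); otherwise we get null values. The typed casts are applied later, when the stream is transformed.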

In [5]:
# Schema of the JSON body of an incoming camera event.
# Every field is a nullable string; typed casts happen when the stream is parsed.
cameraEventSchema = StructType([
  StructField(fieldName, StringType(), True)
  for fieldName in (
    'TrajectId',
    'CameraId',
    'EventTime',
    'Lane',
    'Country',
    'LicensePlate',
    'Make',
    'Color',
  )
])



## Configure event hub reader

In [7]:
# Starting position: begin reading at the end of the stream ('@latest').
# seqNo and enqueuedTime are not in use; they are only present to complete the structure.
startingEventPosition = dict(
  offset='@latest',
  seqNo=-1,
  enqueuedTime=None,
  isInclusive=True,
)

# Source settings: connection string, consumer group, JSON-encoded starting
# position, and the cap on the number of events read per trigger.
ehConf = {
  'eventhubs.connectionString': cameraHubConnectionString,
  'eventhubs.consumerGroup': 'db-ingestion',
  'eventhubs.startingPosition': json.dumps(startingEventPosition),
  'maxEventsPerTrigger': 5,
}

# Streaming DataFrame over the event hub, configured with the options above
incomingStream = (
  spark.readStream
  .format('eventhubs')
  .options(**ehConf)
  .load()
)

## Transform streams to readable dataframes
1. First we define the 4 Event Hubs properties (Offset, Time, Timestamp and Body)
1. Then, by using the `from_json` method on the Body property, we apply the schema defined above
1. After this, we select from the deserialized JSON the 8 properties which we will ingest

This is the json we'll get
```json
{"TrajectId":"01","CameraId":"Camera1","EventTime":"2019-12-09T09:59:58.2710792+00:00","Lane":"2","Country":"BE","LicensePlate":"1-KHC-729","Make":"Renault","Color":"Gray"}
```

In [9]:
# Define parsing query selecting the required properties from the incoming telemetry data.
# The chain is wrapped in parentheses rather than joined with backslashes, so no
# dangling line continuation is left after the final .select(...) that could absorb
# whatever line happens to follow the statement.
cameraData = (
  incomingStream
  # Event Hubs properties
  .withColumn('Offset', col('offset'))
  .withColumn('Body', col('body'))
  # Deserialize the body (cast to string first) using the camera event schema
  .withColumn('CameraEvents', from_json(col('Body').cast(StringType()), cameraEventSchema))
  .withColumn('Time (readable)', col('CameraEvents.EventTime').cast(TimestampType()))
  .withColumn('Timestamp', col('enqueuedTime'))
  # Promote the parsed fields to top-level columns with their target types
  .withColumn('TrajectId', col('CameraEvents.TrajectId').cast(StringType()))
  .withColumn('CameraId', col('CameraEvents.CameraId').cast(StringType()))
  .withColumn('EventTime', col('CameraEvents.EventTime').cast(TimestampType()))
  .withColumn('Lane', col('CameraEvents.Lane').cast(IntegerType()))
  .withColumn('Country', col('CameraEvents.Country').cast(StringType()))
  .withColumn('LicensePlate', col('CameraEvents.LicensePlate').cast(StringType()))
  .withColumn('Make', col('CameraEvents.Make').cast(StringType()))
  .withColumn('Color', col('CameraEvents.Color').cast(StringType()))
  # Only these columns are kept; the helper columns defined above are dropped here
  .select('TrajectId', 'CameraId', 'EventTime', 'Lane', 'Country', 'LicensePlate', 'Make', 'Color')
)

## Stream all IoT telemetry to a Spark table

This is needed as having multiple queries on the same EventHub stream would result in epoch issues. 

For this, the [following stackoverflow post](https://stackoverflow.com/questions/54750779/reusing-an-event-hub-stream-for-multiple-queries-in-azure-data-bricks/54761116#54761116) gives more details.

In [11]:
# The table name carries today's date as a YYYYMMDD suffix
date_suffix = datetime.today().strftime('%Y%m%d')
delta_table_name = 'CameraTelemetry' + date_suffix
print('Saving all data in table', delta_table_name)

In [12]:
# Checkpoint directory is derived from the table name
checkpoint_location = '/data/' + delta_table_name + '/_checkpoints/data_file'

# Append the parsed camera events to the Delta table named above.
# Left as the cell's last expression so the returned value is still displayed.
(
  cameraData.writeStream
  .format('delta')
  .outputMode('append')
  .option('checkpointLocation', checkpoint_location)
  .table(delta_table_name)
)
