In [1]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
from datetime import datetime, timedelta
import json
import pyspark.sql.functions

## Load variables from key vault

In [3]:
# Name of the Databricks secret scope that all lookups below read from.
kv_scope = 'key-vault-secret'

# Variables
# Fetch the four secrets in one pass; the keys are listed in the same order as
# the variables they are unpacked into.
storage_account_name, storage_account_access_key, eventgrid_accesskey, eventgrid_topic = (
  dbutils.secrets.get(scope=kv_scope, key=secret_key)
  for secret_key in (
    'traffic-storage-accountname',
    'traffic-storage-accountkey',
    'traffic-eventgrid-accesskey',
    'traffic-eventgrid-topicendpoint',
  )
)

In [4]:
# Both table names end in the same yyyymmdd suffix. Format the date once so the
# two names cannot disagree if this cell happens to run across midnight.
table_date_suffix = datetime.today().strftime('%Y%m%d')

traffic_table_name = 'CameraTelemetry' + table_date_suffix
speed_table_name = 'SpeedMeasurements' + table_date_suffix

## Mounting the segment configuration json from blob

- Using the mount functionality to load the blob file

In [6]:
mount_name = 'traffic-config'
mount_point = '/mnt/' + mount_name

# Ask the workspace for its registered mounts and compare the exact mount point.
# Listing '/mnt/' and prefix-matching names would also match siblings such as
# 'traffic-config-old/', would treat a plain directory as a mount, and would fail
# outright when '/mnt/' does not exist yet.
to_be_mounted = all(existing.mountPoint != mount_point for existing in dbutils.fs.mounts())

if to_be_mounted:
  # Mount the 'traffic-config' blob container, authenticating with the storage account key.
  dbutils.fs.mount(
    source = 'wasbs://traffic-config@' + storage_account_name + '.blob.core.windows.net',
    mount_point = mount_point,
    extra_configs = {'fs.azure.account.key.' + storage_account_name + '.blob.core.windows.net': storage_account_access_key})
else:
  print('Traffic config already mounted')

## Parsing segment configuration

- Reading the json file (`multiLine=True` !!)
- Adding a calculated field for the minimum duration, `MinDuration` (`(distance / speedlimit) * 3.6`), where the factor 3.6 converts the speed limit from km/h to meters/second
- Only returning the relevant fields for the calculation query

In [8]:
# Duration needed to cover the camera distance when driving exactly at the speed limit.
# The 3.6 factor is the conversion mentioned in the notes above
# (presumably distance in meters and speed limit in km/h, giving seconds - TODO confirm).
min_duration_expr = (col('cameraDistance') / col('speedLimit')) * 3.6

# Read the mounted segment configuration (multi-line JSON), expose segmentId as
# TrajectId, add the calculated duration and keep only the columns used later on.
segment_config = (
  spark.read.json('/mnt/' + mount_name, multiLine=True)
    .withColumn('TrajectId', col('segmentId'))
    .withColumn('MinDuration', min_duration_expr)
    .select('TrajectId','MinDuration', 'CameraDistance', 'SpeedLimit')
)

display(segment_config)

TrajectId,MinDuration,CameraDistance,SpeedLimit
dev,80.0,2000,90
edge01,80.0,2000,90
01,80.0,2000,90
02,150.0,5000,120
03,261.81818181818187,8000,110
04,49.09090909090909,1500,110


In [9]:
# Lower bound for "recent" events: 20 minutes before the current UTC time.
lookback_window = timedelta(minutes=20)
timestamp_from = datetime.utcnow() - lookback_window
print(timestamp_from)

In [10]:
# Streaming read of the camera telemetry Delta table named in traffic_table_name.
# NOTE(review): the EventTime cutoff below is a hard-coded literal, while the table
# name follows today's date - confirm whether it should be derived from a variable.
cameraStream = (
  spark.readStream
    .format('delta')
    .table(traffic_table_name)
    .filter(col('EventTime') > '2019-12-12 18:30:00')
)

## Query that shows traffic (car count) per 30 second window

In [12]:
import pyspark.sql.functions as F

# Count camera events per 30-second window, restricted to events newer than
# timestamp_from, and expose each window's start time next to its count.
traffic_df = (
  cameraStream
    .filter(F.col('EventTime') > timestamp_from)
    .groupBy(F.window('EventTime', '30 seconds'))
    .agg(F.count('*').alias('count'))
    .select(F.col('window.start').alias('start'), 'count')
)

display(traffic_df)

start,count
2019-12-12T19:12:00.000+0000,15
2019-12-12T19:04:00.000+0000,49
2019-12-12T19:10:00.000+0000,14
2019-12-12T19:03:30.000+0000,40
2019-12-12T19:07:00.000+0000,45
2019-12-12T19:00:30.000+0000,11
2019-12-12T19:07:30.000+0000,45
2019-12-12T19:02:30.000+0000,46
2019-12-12T19:06:30.000+0000,40
2019-12-12T19:06:00.000+0000,46


## Query that shows the number of cars, grouped by make

In [14]:
# Number of camera events per make within each 30-second window, highest counts first.
make_window = window('EventTime', '30 seconds')

make_df = (
  cameraStream
    .groupBy('Make', make_window)
    .agg(count('*').alias('count'))
    .orderBy(col('count').desc())
    .select(col('window.start').alias('start'), 'Make', 'count')
)

display(make_df)

start,Make,count
2019-12-12T19:04:00.000+0000,BMW,11
2019-12-12T19:07:00.000+0000,Suzuki,11
2019-12-12T19:03:30.000+0000,Mazda,10
2019-12-12T19:07:30.000+0000,Suzuki,9
2019-12-12T19:08:00.000+0000,Mercedes,8
2019-12-12T19:05:30.000+0000,Mazda,8
2019-12-12T19:06:30.000+0000,Mazda,8
2019-12-12T19:04:30.000+0000,BMW,7
2019-12-12T19:09:00.000+0000,Suzuki,7
2019-12-12T19:09:30.000+0000,Mazda,7


## Query that shows number of cars per segment

In [16]:
# Number of camera events per segment (TrajectId) within each 60-second window,
# highest counts first; the window start is exposed as WindowTime.
segment_window = window('EventTime', '60 seconds')

segment_df = (
  cameraStream
    .groupBy('TrajectId', segment_window)
    .agg(count('*').alias('count'))
    .orderBy(col('count').desc())
    .select('TrajectId', col('window.start').alias('WindowTime'), 'count')
)

display(segment_df)

TrajectId,WindowTime,count
2,2019-12-12T19:15:00.000+0000,72
2,2019-12-12T19:12:00.000+0000,67
2,2019-12-12T19:16:00.000+0000,63
2,2019-12-12T19:10:00.000+0000,63
2,2019-12-12T19:04:00.000+0000,62
2,2019-12-12T19:17:00.000+0000,60
2,2019-12-12T19:11:00.000+0000,60
2,2019-12-12T19:13:00.000+0000,59
2,2019-12-12T19:07:00.000+0000,57
2,2019-12-12T19:14:00.000+0000,57


## Query that shows the number of cars, grouped by country

In [18]:
import pyspark.sql.functions as F

# Total number of camera events per country (no time window), highest counts first.
events_per_country = cameraStream.groupBy('Country').agg(F.count('*').alias('count'))

country_df = (
  events_per_country
    .orderBy(F.col('count').desc())
    .select('Country', 'count')
)

display(country_df)

Country,count
BE,612
NL,85
FR,64
DE,40
PL,34
HU,15
PT,3


## Speed measurements & reports

In [20]:
# Streaming read of the speed measurements table named in speed_table_name.
# 'ignoreChanges' is passed as a reader option - presumably so that rewrites of
# existing rows in the source table do not stop the stream (TODO confirm).
speedMeasurementStream = (
  spark.readStream
    .option('ignoreChanges', 'true')
    .table(speed_table_name)
)

In [21]:
# Average measured speed per segment (TrajectId) for each 60-second window of
# LastEvent; the window start is exposed as WindowTime.
speed_window = window('LastEvent', '60 seconds')

avg_speed_df = (
  speedMeasurementStream
    .groupBy('TrajectId', speed_window)
    .agg(avg('Speed').alias('Speed'))
    .select('TrajectId', col('window.start').alias('WindowTime'), 'Speed')
)

display(avg_speed_df)

TrajectId,WindowTime,Speed
1,2019-12-12T19:21:00.000+0000,127.4933470231131
2,2019-12-12T19:04:00.000+0000,99.82374904228888
2,2019-12-12T19:12:00.000+0000,103.04600699223626
1,2019-12-12T19:18:00.000+0000,124.01225921087658
1,2019-12-12T19:03:00.000+0000,121.59701260422068
2,2019-12-12T19:17:00.000+0000,102.52696346354172
2,2019-12-12T19:03:00.000+0000,151.32113479939554
2,2019-12-12T19:08:00.000+0000,102.90531378192937
1,2019-12-12T19:25:00.000+0000,125.90463053783732
1,2019-12-12T19:13:00.000+0000,121.01931843385852


In [22]:
# Average measured speed per make for each 30-second window of LastEvent;
# the window start is exposed as WindowTime.
make_speed_window = window('LastEvent', '30 seconds')

avg_speed_bymake_df = (
  speedMeasurementStream
    .groupBy('Make', make_speed_window)
    .agg(avg('Speed').alias('Speed'))
    .select('Make', col('window.start').alias('WindowTime'), 'Speed')
)

display(avg_speed_bymake_df)

Make,WindowTime,Speed
BMW,2019-12-12T19:21:30.000+0000,133.7225274725274
Volvo,2019-12-12T19:09:30.000+0000,138.46153846153845
Volkswagen,2019-12-12T19:07:00.000+0000,118.6377829820453
Mercedes,2019-12-12T19:16:00.000+0000,125.46532550662222
Volkswagen,2019-12-12T19:03:00.000+0000,116.12903225806444
Mercedes,2019-12-12T19:11:30.000+0000,126.39498432601872
Opel,2019-12-12T19:09:30.000+0000,99.37489765842476
Volvo,2019-12-12T19:26:00.000+0000,138.46153846153845
Suzuki,2019-12-12T19:20:30.000+0000,133.3333333333334
Suzuki,2019-12-12T19:05:30.000+0000,88.23529411764699
