# Spark Streaming Transform Data
read data from Kafka topic, filter and reduce and write back to other Kafka Topic

In [20]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark.sql.types import *
from pyspark.sql.window import Window
from pyspark.streaming import StreamingContext
from pyspark.sql import Row
import pyspark.sql.functions as f
from pyspark.sql.avro.functions import to_avro, from_avro

# DELETE
from schema_registry.client import SchemaRegistryClient


import datetime
from datetime import datetime
import json


# use 95% of the screen for jupyter cell
from IPython.core.display import display, HTML
display(HTML("<style>.container {width:100% !important; }<style>"))


schemaRegistryAddr="http://kafka-cp-schema-registry.kafka.svc.cluster.local:8081"


  from IPython.core.display import display, HTML


In [21]:
appName="jupyter-stream"

conf = SparkConf()

# CLUSTER MANAGER
################################################################################
# set Kubernetes Master as Cluster Manager(“k8s://https://” is NOT a typo, this is how Spark knows the “provider” type).
conf.setMaster("k8s://https://kubernetes.default.svc.cluster.local:443")

# CONFIGURE KUBERNETES
################################################################################
# set the namespace that will be used for running the driver and executor pods.
conf.set("spark.kubernetes.namespace","frontend")
# set the docker image from which the Worker pods are created
conf.set("spark.kubernetes.container.image", "thinkportgmbh/workshops:spark-3.3.1")
conf.set("spark.kubernetes.container.image.pullPolicy", "Always")

# set service account to be used
conf.set("spark.kubernetes.authenticate.driver.serviceAccountName", "spark")
# authentication for service account(required to create worker pods):
conf.set("spark.kubernetes.authenticate.caCertFile", "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt")
conf.set("spark.kubernetes.authenticate.oauthTokenFile", "/var/run/secrets/kubernetes.io/serviceaccount/token")


# CONFIGURE SPARK
################################################################################
conf.set("spark.sql.adaptive.enabled", "False")
# set driver host. In this case the ingres service for the spark driver
# find name of the driver service with 'kubectl get services' or in the helm chart configuration
conf.set("spark.driver.host", "jupyter-spark-driver.frontend.svc.cluster.local")
# set the port, If this port is busy, spark-shell tries to bind to another port.
conf.set("spark.driver.port", "29413")
# add the postgres driver jars into session
conf.set("spark.jars", "/opt/spark/jars/spark-sql-kafka-0-10_2.12-3.3.1.jar, /opt/spark/jars/kafka-clients-3.3.1.jar, /opt/spark/jars/spark-avro_2.12-3.3.1.jar")
conf.set("spark.driver.extraClassPath","/opt/spark/jars/spark-sql-kafka-0-10_2.12-3.3.1.jar, /opt/spark/jars/kafka-clients-3.3.1.jar, /opt/spark/jars/spark-avro_2.12-3.3.1.jar")
conf.set("spark.executor.extraClassPath","/opt/spark/jars/spark-sql-kafka-0-10_2.12-3.3.1.jar, /opt/spark/jars/kafka-clients-3.3.1.jar, /opt/spark/jars/spark-avro_2.12-3.3.1.jar")
#conf.set("spark.executor.extraLibrary","/opt/spark/jars/spark-sql-kafka-0-10_2.12-3.3.1.jar, /opt/spark/jars/kafka-clients-3.3.1.jar")



# CONFIGURE S3 CONNECTOR
conf.set("spark.hadoop.fs.s3a.endpoint", "minio.minio.svc.cluster.local:9000")
conf.set("spark.hadoop.fs.s3a.access.key", "trainadm")
conf.set("spark.hadoop.fs.s3a.secret.key", "train@thinkport")
conf.set("spark.hadoop.fs.s3a.path.style.access", "true")
conf.set("spark.hadoop.fs.s3.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
conf.set("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider")
conf.set("spark.hadoop.fs.s3a.connection.ssl.enabled", "false")



# conf.set("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.1")

# CONFIGURE WORKER (Customize based on workload)
################################################################################
# set number of worker pods
conf.set("spark.executor.instances", "1")
# set memory of each worker pod
conf.set("spark.executor.memory", "1G")
# set cpu of each worker pod
conf.set("spark.executor.cores", "2")

# SPARK SESSION
################################################################################
# and last, create the spark session and pass it the config object

spark = SparkSession\
    .builder\
    .config(conf=conf) \
    .config('spark.sql.session.timeZone', 'Europe/Berlin') \
    .appName(appName)\
    .getOrCreate()

# also get the spark context
sc=spark.sparkContext
ssc = StreamingContext(sc, 2)

# change the log level to warning, to see less output
sc.setLogLevel('WARN')

# get the configuration object to check all the configurations the session was startet with
for entry in sc.getConf().getAll():
        if entry[0] in ["spark.app.name","spark.kubernetes.namespace","spark.executor.memory","spark.executor.cores","spark.driver.host","spark.master"]:
            print(entry[0],"=",entry[1])
            


spark.kubernetes.namespace = frontend
spark.master = k8s://https://kubernetes.default.svc.cluster.local:443
spark.app.name = jupyter-stream
spark.executor.memory = 1G
spark.executor.cores = 2
spark.driver.host = jupyter-spark-driver.frontend.svc.cluster.local


#### Read Schema from Kafka registry

#### Read Stream from Kafka

In [22]:

df_step_1 = (spark
      .readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", "kafka-cp-kafka.kafka.svc.cluster.local:9092")
      .option("subscribe", "twitter-json")
      .option("startingOffsets", "earliest")
      .load()
     )

df_step_1.printSchema()

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)



In [23]:
# Schema einmal aus einer Rohnachricht bestimmen
# PS am besten noch in eine Datei rausspeichern
jsonSchema=StructType([StructField('payload', StructType([StructField('Contributors', ArrayType(StringType(), True), True), StructField('CreatedAt', LongType(), True), StructField('CurrentUserRetweetId', LongType(), True), StructField('FavoriteCount', LongType(), True), StructField('Favorited', BooleanType(), True), StructField('GeoLocation', StringType(), True), StructField('HashtagEntities', ArrayType(StructType([StructField('End', LongType(), True), StructField('Start', LongType(), True), StructField('Text', StringType(), True)]), True), True), StructField('Id', LongType(), True), StructField('InReplyToScreenName', StringType(), True), StructField('InReplyToStatusId', LongType(), True), StructField('InReplyToUserId', LongType(), True), StructField('Lang', StringType(), True), StructField('MediaEntities', ArrayType(StructType([StructField('DisplayURL', StringType(), True), StructField('End', LongType(), True), StructField('ExpandedURL', StringType(), True), StructField('ExtAltText', StringType(), True), StructField('Id', LongType(), True), StructField('MediaURL', StringType(), True), StructField('MediaURLHttps', StringType(), True), StructField('Sizes', ArrayType(ArrayType(StringType(), True), True), True), StructField('Start', LongType(), True), StructField('Text', StringType(), True), StructField('Type', StringType(), True), StructField('URL', StringType(), True), StructField('VideoAspectRatioHeight', LongType(), True), StructField('VideoAspectRatioWidth', LongType(), True), StructField('VideoDurationMillis', LongType(), True), StructField('VideoVariants', ArrayType(StructType([StructField('Bitrate', LongType(), True), StructField('ContentType', StringType(), True), StructField('Url', StringType(), True)]), True), True)]), True), True), StructField('Place', StructType([StructField('Country', StringType(), True), StructField('CountryCode', StringType(), True), StructField('FullName', StringType(), True), StructField('Id', StringType(), True), StructField('Name', StringType(), True), StructField('PlaceType', StringType(), True), StructField('StreetAddress', StringType(), True), StructField('URL', StringType(), True)]), True), StructField('PossiblySensitive', BooleanType(), True), StructField('Retweet', BooleanType(), True), StructField('RetweetCount', LongType(), True), StructField('Retweeted', BooleanType(), True), StructField('RetweetedByMe', BooleanType(), True), StructField('Source', StringType(), True), StructField('SymbolEntities', ArrayType(StructType([StructField('End', LongType(), True), StructField('Start', LongType(), True), StructField('Text', StringType(), True)]), True), True), StructField('Text', StringType(), True), StructField('Truncated', BooleanType(), True), StructField('URLEntities', ArrayType(StructType([StructField('DisplayURL', StringType(), True), StructField('End', LongType(), True), StructField('ExpandedURL', StringType(), True), StructField('Start', LongType(), True), StructField('Text', StringType(), True), StructField('URL', StringType(), True)]), True), True), StructField('User', StructType([StructField('BiggerProfileImageURL', StringType(), True), StructField('BiggerProfileImageURLHttps', StringType(), True), StructField('ContributorsEnabled', BooleanType(), True), StructField('CreatedAt', LongType(), True), StructField('DefaultProfile', BooleanType(), True), StructField('DefaultProfileImage', BooleanType(), True), StructField('Description', StringType(), True), StructField('FavouritesCount', LongType(), True), StructField('FollowRequestSent', BooleanType(), True), StructField('FollowersCount', LongType(), True), StructField('FriendsCount', LongType(), True), StructField('GeoEnabled', BooleanType(), True), StructField('Id', LongType(), True), StructField('Lang', StringType(), True), StructField('ListedCount', LongType(), True), StructField('Location', StringType(), True), StructField('MiniProfileImageURL', StringType(), True), StructField('MiniProfileImageURLHttps', StringType(), True), StructField('Name', StringType(), True), StructField('OriginalProfileImageURL', StringType(), True), StructField('OriginalProfileImageURLHttps', StringType(), True), StructField('ProfileBackgroundColor', StringType(), True), StructField('ProfileBackgroundImageURL', StringType(), True), StructField('ProfileBackgroundImageUrlHttps', StringType(), True), StructField('ProfileBackgroundTiled', BooleanType(), True), StructField('ProfileBannerIPadRetinaURL', StringType(), True), StructField('ProfileBannerIPadURL', StringType(), True), StructField('ProfileBannerMobileRetinaURL', StringType(), True), StructField('ProfileBannerMobileURL', StringType(), True), StructField('ProfileBannerRetinaURL', StringType(), True), StructField('ProfileBannerURL', StringType(), True), StructField('ProfileImageURL', StringType(), True), StructField('ProfileImageURLHttps', StringType(), True), StructField('ProfileLinkColor', StringType(), True), StructField('ProfileSidebarBorderColor', StringType(), True), StructField('ProfileSidebarFillColor', StringType(), True), StructField('ProfileTextColor', StringType(), True), StructField('ProfileUseBackgroundImage', BooleanType(), True), StructField('Protected', BooleanType(), True), StructField('ScreenName', StringType(), True), StructField('ShowAllInlineMedia', BooleanType(), True), StructField('StatusesCount', LongType(), True), StructField('TimeZone', StringType(), True), StructField('Translator', BooleanType(), True), StructField('URL', StringType(), True), StructField('UtcOffset', LongType(), True), StructField('Verified', BooleanType(), True), StructField('WithheldInCountries', ArrayType(StringType(), True), True)]), True), StructField('UserMentionEntities', ArrayType(StructType([StructField('End', LongType(), True), StructField('Id', LongType(), True), StructField('Name', StringType(), True), StructField('ScreenName', StringType(), True), StructField('Start', LongType(), True), StructField('Text', StringType(), True)]), True), True), StructField('WithheldInCountries', ArrayType(StringType(), True), True)]), True), StructField('schema', StructType([StructField('doc', StringType(), True), StructField('fields', ArrayType(StructType([StructField('doc', StringType(), True), StructField('field', StringType(), True), StructField('fields', ArrayType(StructType([StructField('doc', StringType(), True), StructField('field', StringType(), True), StructField('items', StructType([StructField('optional', BooleanType(), True), StructField('type', StringType(), True)]), True), StructField('name', StringType(), True), StructField('optional', BooleanType(), True), StructField('type', StringType(), True), StructField('version', LongType(), True)]), True), True), StructField('items', StructType([StructField('doc', StringType(), True), StructField('fields', ArrayType(StructType([StructField('doc', StringType(), True), StructField('field', StringType(), True), StructField('items', StructType([StructField('doc', StringType(), True), StructField('fields', ArrayType(StructType([StructField('doc', StringType(), True), StructField('field', StringType(), True), StructField('optional', BooleanType(), True), StructField('type', StringType(), True)]), True), True), StructField('name', StringType(), True), StructField('optional', BooleanType(), True), StructField('type', StringType(), True)]), True), StructField('keys', StructType([StructField('optional', BooleanType(), True), StructField('type', StringType(), True)]), True), StructField('optional', BooleanType(), True), StructField('type', StringType(), True), StructField('values', StructType([StructField('doc', StringType(), True), StructField('fields', ArrayType(StructType([StructField('doc', StringType(), True), StructField('field', StringType(), True), StructField('optional', BooleanType(), True), StructField('type', StringType(), True)]), True), True), StructField('name', StringType(), True), StructField('optional', BooleanType(), True), StructField('type', StringType(), True)]), True)]), True), True), StructField('name', StringType(), True), StructField('optional', BooleanType(), True), StructField('type', StringType(), True)]), True), StructField('name', StringType(), True), StructField('optional', BooleanType(), True), StructField('type', StringType(), True), StructField('version', LongType(), True)]), True), True), StructField('name', StringType(), True), StructField('optional', BooleanType(), True), StructField('type', StringType(), True)]), True), StructField('partition', IntegerType(), True)])

In [24]:
df_step_2= (df_step_1
            # cast binary to string and string with json schema to json object
            .select(f.from_json(f.col("value").cast("string"),jsonSchema).alias("data"))
            # from json take only the payload
            .select(f.col("data.payload").alias("p"))
           )

df_step_2.printSchema()


root
 |-- p: struct (nullable = true)
 |    |-- Contributors: array (nullable = true)
 |    |    |-- element: string (containsNull = true)
 |    |-- CreatedAt: long (nullable = true)
 |    |-- CurrentUserRetweetId: long (nullable = true)
 |    |-- FavoriteCount: long (nullable = true)
 |    |-- Favorited: boolean (nullable = true)
 |    |-- GeoLocation: string (nullable = true)
 |    |-- HashtagEntities: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- End: long (nullable = true)
 |    |    |    |-- Start: long (nullable = true)
 |    |    |    |-- Text: string (nullable = true)
 |    |-- Id: long (nullable = true)
 |    |-- InReplyToScreenName: string (nullable = true)
 |    |-- InReplyToStatusId: long (nullable = true)
 |    |-- InReplyToUserId: long (nullable = true)
 |    |-- Lang: string (nullable = true)
 |    |-- MediaEntities: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- DisplayU

In [25]:
df_step_3=(df_step_2
           .select(
               "p.Id",
               "p.CreatedAt",
               "p.Place.Country",
               "p.Place.CountryCode",
               f.col("p.User.id").alias("UserID"),
               "p.User.ScreenName",
               "p.RetweetCount",
               "p.User.FriendsCount",
               "p.User.FollowersCount",
               "p.Lang",
                f.col("p.User.Lang").alias("UserLang"),
               "p.User.Location",
               f.col("p.HashtagEntities.Text").alias("Hashtags"),
               "p.Text"
               
           )
          )

df_step_3.printSchema()

root
 |-- Id: long (nullable = true)
 |-- CreatedAt: long (nullable = true)
 |-- Country: string (nullable = true)
 |-- CountryCode: string (nullable = true)
 |-- UserID: long (nullable = true)
 |-- ScreenName: string (nullable = true)
 |-- RetweetCount: long (nullable = true)
 |-- FriendsCount: long (nullable = true)
 |-- FollowersCount: long (nullable = true)
 |-- Lang: string (nullable = true)
 |-- UserLang: string (nullable = true)
 |-- Location: string (nullable = true)
 |-- Hashtags: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- Text: string (nullable = true)



In [26]:
stream_query_debug=(df_step_3
                    .writeStream.format("console")
                    .option("truncate", "false")
                    .outputMode("append")
                    .start()
                    .awaitTermination()
                   )

22/12/06 13:58:23 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-851c178f-9dcc-4a2d-900d-6d2cb36727f6. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.


                                                                                

-------------------------------------------
Batch: 0
-------------------------------------------
+-------------------+-------------+-------+-----------+-------------------+---------------+------------+------------+--------------+----+--------+----------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|Id                 |CreatedAt    |Country|CountryCode|

                                                                                

-------------------------------------------
Batch: 2
-------------------------------------------
+-------------------+-------------+-------+-----------+------------------+---------------+------------+------------+--------------+----+--------+--------+---------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------+
|Id                 |CreatedAt    |Country|CountryCode|UserID            |ScreenName     |RetweetCount|FriendsCount|FollowersCount|Lang|UserLang|Location|Hashtags                                                                                     |Text                                                                                                                                            |
+-------------------+-------------+-------+-----------+------------------+---------------+------------+------------

                                                                                

-------------------------------------------
Batch: 3
-------------------------------------------
+-------------------+-------------+-------+-----------+------------------+---------------+------------+------------+--------------+----+--------+--------+--------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------+
|Id                 |CreatedAt    |Country|CountryCode|UserID            |ScreenName     |RetweetCount|FriendsCount|FollowersCount|Lang|UserLang|Location|Hashtags                                    |Text                                                                                                                                          |
+-------------------+-------------+-------+-----------+------------------+---------------+------------+------------+--------------+----+--------+--------+--------------------------------------------+------------------

                                                                                

-------------------------------------------
Batch: 4
-------------------------------------------
+-------------------+-------------+-------+-----------+----------+----------+------------+------------+--------------+----+--------+---------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|Id                 |CreatedAt    |Country|CountryCode|UserID    |ScreenName|RetweetCount|FriendsCount|FollowersCount|Lang|UserLang|Location       |Hashtags           

                                                                                

-------------------------------------------
Batch: 5
-------------------------------------------
+-------------------+-------------+-------+-----------+------------------+---------------+------------+------------+--------------+----+--------+--------+----------------+-------------------------------------------------------------------------------------------------------------------------------------------------+
|Id                 |CreatedAt    |Country|CountryCode|UserID            |ScreenName     |RetweetCount|FriendsCount|FollowersCount|Lang|UserLang|Location|Hashtags        |Text                                                                                                                                             |
+-------------------+-------------+-------+-----------+------------------+---------------+------------+------------+--------------+----+--------+--------+----------------+------------------------------------------------------------------------------------------------

                                                                                

-------------------------------------------
Batch: 6
-------------------------------------------
+-------------------+-------------+-------+-----------+--------+-------------+------------+------------+--------------+----+--------+----------------+----------------+-----------------------------------------------------------------------------------------+
|Id                 |CreatedAt    |Country|CountryCode|UserID  |ScreenName   |RetweetCount|FriendsCount|FollowersCount|Lang|UserLang|Location        |Hashtags        |Text                                                                                     |
+-------------------+-------------+-------+-----------+--------+-------------+------------+------------+--------------+----+--------+----------------+----------------+-----------------------------------------------------------------------------------------+
|1600127789545521154|1670335168000|null   |null       |29717423|lukaspfeiffer|0           |771         |1066          |de  |null 

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/usr/local/lib/python3.9/dist-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/usr/local/lib/python3.9/dist-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/usr/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 

In [None]:
#spark.streams.active[0].stop()
#stream_query_debug.stop()

df_step_4=df_step_3.select(f.to_json(f.struct(df_step_3.schema.names)).alias('value')) 

df_step_4.printSchema()

In [17]:
df_step_5=df_step_3.select(to_avro(f.struct(df_step_3.schema.names)).alias('value')) 

df_step_5.printSchema()

root
 |-- value: binary (nullable = false)



                                                                                

In [18]:
kafka_write=(df_step_4
             .writeStream
             .format("kafka")
             .outputMode("append")
             .option("kafka.bootstrap.servers", "kafka-cp-kafka.kafka.svc.cluster.local:9092")
             .option("topic", "spark-target-schema-avro")
             .option("checkpointLocation", "/opt/spark/work-dir/12")
             .start()
             .awaitTermination()
            )

# topic
# spark-target-schema2 --> json mit struct f.to_json(f.struct(df_step_4.schema.names)
# spark-target-schema3 --> read from twitter-json mit orginal Schema und struct f.to_json(f.struct(df_step_4.schema.names)
# spark-target-schema-avro --> read from twitter-json mit orginal Schema und struct f.to_avro(f.struct(df_step_4.schema.names)




ERROR:root:KeyboardInterrupt while sending command.                             
Traceback (most recent call last):
  File "/usr/local/lib/python3.9/dist-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/usr/local/lib/python3.9/dist-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/usr/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 

In [None]:
s3_write_json=(df_step_4
          .writeStream
          .format("json")
          .outputMode("append")
          .option("path", "s3a://spark-stream-to-s3/json")
          .option("checkpointLocation", "/opt/spark/work-dir/5")
          .trigger(processingTime='10 seconds')
          .start()
          .awaitTermination()
         )

In [None]:
s3_write_avro=(df_step_4
          .writeStream
          .format("avro")
          .outputMode("append")
          .option("path", "s3a://spark-stream-to-s3/avro")
          .option("checkpointLocation", "/opt/spark/work-dir/6")
          .start()
          .awaitTermination()
         )

In [None]:
s3_write_parquet=(df_step_4
          .writeStream
          .format("parquet")
          .outputMode("append")
          .option("path", "s3a://spark-stream-to-s3/parquet")
          .option("checkpointLocation", "/opt/spark/work-dir/7")
          .start()
          .awaitTermination()
         )

In [None]:
!pyspark --version


In [19]:
# Terminate Spark Session
# shut down executor pods
spark.stop()


22/12/01 23:58:57 WARN ExecutorPodsWatchSnapshotSource: Kubernetes client has been closed.
22/12/01 23:59:50 ERROR MicroBatchExecution: Query [id = 49bbe66e-4bcf-41ea-b086-f931e24ac560, runId = b2491bba-b795-44f4-9c83-eb6f118c595a] terminated with error
org.apache.spark.SparkException: The Spark SQL phase planning failed with an internal error. Please, fill a bug report in, and provide the full stack trace.
	at org.apache.spark.sql.execution.QueryExecution$.toInternalError(QueryExecution.scala:500)
	at org.apache.spark.sql.execution.QueryExecution$.withInternalError(QueryExecution.scala:512)
	at org.apache.spark.sql.execution.QueryExecution.$anonfun$executePhase$1(QueryExecution.scala:185)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:779)
	at org.apache.spark.sql.execution.QueryExecution.executePhase(QueryExecution.scala:184)
	at org.apache.spark.sql.execution.QueryExecution.sparkPlan$lzycompute(QueryExecution.scala:145)
	at org.apache.spark.sql.execution.QueryEx

Exception in thread "stream execution thread for [id = 49bbe66e-4bcf-41ea-b086-f931e24ac560, runId = b2491bba-b795-44f4-9c83-eb6f118c595a]" org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:301)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEndpointRef.askSync(RpcEndpointRef.scala:103)
	at org.apache.spark.rpc.RpcEndpointRef.askSync(RpcEndpointRef.scala:87)
	at org.apache.spark.sql.execution.streaming.state.StateStoreCoordinatorRef.deactivateInstances(StateStoreCoordinator.scala:119)
	at org.apache.spark.sql.streaming.StreamingQueryManager.notifyQueryTermination(StreamingQueryManager.scala:406)
	at org.apache.spark.sql.execution.streaming.StreamExecution.$anonfun$runStream$3(StreamExecution.scala:357)
	at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
	at org.apache.spark.util.UninterruptibleThread.runUninterruptibly(Uninterrup