# Prototyping Spark Streaming Application
read data from Kafka topic, filter and reduce and write back to other Kafka Topic

In [None]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark.sql.types import *
from pyspark.sql.window import Window
from pyspark.sql import Row
import pyspark.sql.functions as f

import datetime
from datetime import datetime
import json


# use 95% of the screen for jupyter cell
from IPython.core.display import display, HTML
display(HTML("<style>.container {width:100% !important; }<style>"))

In [11]:
appName="jupyter-stream"

conf = SparkConf()

# CLUSTER MANAGER
################################################################################
# set Kubernetes Master as Cluster Manager(“k8s://https://” is NOT a typo, this is how Spark knows the “provider” type).
conf.setMaster("k8s://https://kubernetes.default.svc.cluster.local:443")

# CONFIGURE KUBERNETES
################################################################################
# set the namespace that will be used for running the driver and executor pods.
conf.set("spark.kubernetes.namespace","frontend")
# set the docker image from which the Worker pods are created
conf.set("spark.kubernetes.container.image", "thinkportgmbh/workshops:spark-3.3.1")
conf.set("spark.kubernetes.container.image.pullPolicy", "Always")

# set service account to be used
conf.set("spark.kubernetes.authenticate.driver.serviceAccountName", "spark")
# authentication for service account(required to create worker pods):
conf.set("spark.kubernetes.authenticate.caCertFile", "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt")
conf.set("spark.kubernetes.authenticate.oauthTokenFile", "/var/run/secrets/kubernetes.io/serviceaccount/token")


# CONFIGURE SPARK
################################################################################
conf.set("spark.sql.session.timeZone", "Europe/Berlin")
# set driver host. In this case the ingres service for the spark driver
# find name of the driver service with 'kubectl get services' or in the helm chart configuration
conf.set("spark.driver.host", "jupyter-spark-driver.frontend.svc.cluster.local")
# set the port, If this port is busy, spark-shell tries to bind to another port.
conf.set("spark.driver.port", "29413")
# add the postgres driver jars into session
conf.set("spark.jars", "/opt/spark/jars/spark-sql-kafka-0-10_2.12-3.3.1.jar, /opt/spark/jars/kafka-clients-3.3.1.jar")
conf.set("spark.driver.extraClassPath","/opt/spark/jars/spark-sql-kafka-0-10_2.12-3.3.1.jar, /opt/spark/jars/kafka-clients-3.3.1.jar")
conf.set("spark.executor.extraClassPath","/opt/spark/jars/spark-sql-kafka-0-10_2.12-3.3.1.jar, /opt/spark/jars/kafka-clients-3.3.1.jar")
#conf.set("spark.executor.extraLibrary","/opt/spark/jars/spark-sql-kafka-0-10_2.12-3.3.1.jar, /opt/spark/jars/kafka-clients-3.3.1.jar")



# conf.set("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.1")

# CONFIGURE WORKER (Customize based on workload)
################################################################################
# set number of worker pods
conf.set("spark.executor.instances", "1")
# set memory of each worker pod
conf.set("spark.executor.memory", "1G")
# set cpu of each worker pod
conf.set("spark.executor.cores", "2")

# SPARK SESSION
################################################################################
# and last, create the spark session and pass it the config object

spark = SparkSession\
    .builder\
    .config(conf=conf) \
    .config('spark.sql.session.timeZone', 'Europe/Berlin') \
    .appName(appName)\
    .getOrCreate()

# also get the spark context
sc=spark.sparkContext
# change the log level to warning, to see less output
sc.setLogLevel('WARN')

# get the configuration object to check all the configurations the session was startet with
for entry in sc.getConf().getAll():
        if entry[0] in ["spark.app.name","spark.kubernetes.namespace","spark.executor.memory","spark.executor.cores","spark.driver.host","spark.master"]:
            print(entry[0],"=",entry[1])

22/11/28 21:53:27 ERROR WriteToDataSourceV2Exec: Data source write support org.apache.spark.sql.execution.streaming.sources.MicroBatchWrite@1cc7e865 is aborting.
22/11/28 21:53:27 ERROR WriteToDataSourceV2Exec: Data source write support org.apache.spark.sql.execution.streaming.sources.MicroBatchWrite@1cc7e865 aborted.
22/11/28 21:53:27 ERROR WriteToDataSourceV2Exec: Data source write support org.apache.spark.sql.execution.streaming.sources.MicroBatchWrite@3d32cdd6 is aborting.
22/11/28 21:53:27 ERROR WriteToDataSourceV2Exec: Data source write support org.apache.spark.sql.execution.streaming.sources.MicroBatchWrite@3d32cdd6 aborted.
22/11/28 21:53:27 ERROR MicroBatchExecution: Query [id = cc2589f2-d62f-40b3-ae5d-10f46f880923, runId = b767eb46-1b8c-4e84-933c-ee633c0ea7bf] terminated with error
org.apache.spark.SparkException: Writing job aborted
	at org.apache.spark.sql.errors.QueryExecutionErrors$.writingJobAbortedError(QueryExecutionErrors.scala:767)
	at org.apache.spark.sql.execution.

Exception in thread "stream execution thread for [id = e531fbee-2dfc-47ac-af88-1fccb37fad16, runId = 6a4de177-adca-4a7f-8a8e-6b07ab78c751]" org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:301)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEndpointRef.askSync(RpcEndpointRef.scala:103)
	at org.apache.spark.rpc.RpcEndpointRef.askSync(RpcEndpointRef.scala:87)
	at org.apache.spark.sql.execution.streaming.state.StateStoreCoordinatorRef.deactivateInstances(StateStoreCoordinator.scala:119)
	at org.apache.spark.sql.streaming.StreamingQueryManager.notifyQueryTermination(StreamingQueryManager.scala:406)
	at org.apache.spark.sql.execution.streaming.StreamExecution.$anonfun$runStream$3(StreamExecution.scala:357)
	at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
	at org.apache.spark.util.UninterruptibleThread.runUninterruptibly(Uninterrup

spark.kubernetes.namespace = frontend
spark.master = k8s://https://kubernetes.default.svc.cluster.local:443
spark.app.name = jupyter-stream
spark.executor.memory = 1G
spark.executor.cores = 2
spark.driver.host = jupyter-spark-driver.frontend.svc.cluster.local


In [12]:
df = (spark
      .readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", "kafka-cp-kafka.kafka.svc.cluster.local:9092")
      .option("subscribe", "twitter-json")
      .option("startingOffsets", "earliest")
      .load()
     )

df.printSchema()

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)



In [13]:
# UDF to parse array stored as string using JSON
def parse_array_from_string(x):
    
    if x is not None:
        res = json.loads(x)
    else:
        res =[None]
    return res

hashtag_array = f.udf(parse_array_from_string, ArrayType(StringType()))


In [15]:
df2 = (df
       # take only the value as String
       .selectExpr("CAST(value AS STRING)")
       # take only required fields
       .select(
           f.get_json_object(f.col("value"),"$.payload.CreatedAt").cast("long").alias("tweet_created"),
           f.get_json_object(f.col("value"),"$.payload.Id").cast("long").alias("tweet_id"),
           f.get_json_object(f.col("value"),"$.payload.Lang").alias("tweet_language"),
           f.get_json_object(f.col("value"),"$.payload.HashtagEntities[*].Text").alias("hashtag"),
           f.get_json_object(f.col("value"),"$.payload.Place.Country").alias("country"),
           f.get_json_object(f.col("value"),"$.payload.Place.CountryCode").alias("country_code"),
           f.get_json_object(f.col("value"),"$.payload.User.ScreenName").alias("user"),
           f.get_json_object(f.col("value"),"$.payload.User.Lang").alias("user_language"),   
           f.get_json_object(f.col("value"),"$.payload.User.Location").alias("user_location"),
           f.get_json_object(f.col("value"),"$.payload.User.StatusesCount").cast("int").alias("statuses_count"),
           f.get_json_object(f.col("value"),"$.payload.RetweetCount").cast("int").alias("retweet_count"),
           f.get_json_object(f.col("value"),"$.payload.Text").alias("tweet_text")    
       )
       # convert to timestamp (cast string to long --> convert from number to timestamp)
       .withColumn("tweet_created",f.from_unixtime(f.col("tweet_created")/1000))
       # using udf function to convert string to json array
       .withColumn("hashtag",hashtag_array(f.col("hashtag")))
     )

df2.printSchema()

#df2.writeStream.format("console").option("truncate", "false").outputMode("append").start().awaitTermination()

root
 |-- tweet_created: string (nullable = true)
 |-- tweet_id: long (nullable = true)
 |-- tweet_language: string (nullable = true)
 |-- hashtag: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- country: string (nullable = true)
 |-- country_code: string (nullable = true)
 |-- user: string (nullable = true)
 |-- user_language: string (nullable = true)
 |-- user_location: string (nullable = true)
 |-- statuses_count: integer (nullable = true)
 |-- retweet_count: integer (nullable = true)
 |-- tweet_text: string (nullable = true)



In [17]:
kafka_write=(df2
             .selectExpr("to_json(struct(*)) AS value")
             .writeStream
             .format("kafka")
             .outputMode("append")
             .option("kafka.bootstrap.servers", "kafka-cp-kafka.kafka.svc.cluster.local:9092")
             .option("topic", "twitter-table")
             .option("checkpointLocation", "/opt/spark/work-dir")
             .start()
             .awaitTermination()
            )

22/11/28 21:55:44 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


ERROR:root:KeyboardInterrupt while sending command.                             
Traceback (most recent call last):
  File "/usr/local/lib/python3.9/dist-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/usr/local/lib/python3.9/dist-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/usr/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 

In [None]:
!pyspark --version


In [18]:
# Terminate Spark Session
# shut down executor pods
spark.stop()


22/11/29 10:02:19 WARN ExecutorPodsWatchSnapshotSource: Kubernetes client has been closed.
22/11/29 10:02:23 ERROR MicroBatchExecution: Query [id = c7187e51-525f-4869-a45d-34438b3fa6cb, runId = d7c5d0ea-a886-43cf-affc-b4595f29f84f] terminated with error
org.apache.spark.SparkException: The Spark SQL phase planning failed with an internal error. Please, fill a bug report in, and provide the full stack trace.
	at org.apache.spark.sql.execution.QueryExecution$.toInternalError(QueryExecution.scala:500)
	at org.apache.spark.sql.execution.QueryExecution$.withInternalError(QueryExecution.scala:512)
	at org.apache.spark.sql.execution.QueryExecution.$anonfun$executePhase$1(QueryExecution.scala:185)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:779)
	at org.apache.spark.sql.execution.QueryExecution.executePhase(QueryExecution.scala:184)
	at org.apache.spark.sql.execution.QueryExecution.sparkPlan$lzycompute(QueryExecution.scala:145)
	at org.apache.spark.sql.execution.QueryEx

Exception in thread "stream execution thread for [id = c7187e51-525f-4869-a45d-34438b3fa6cb, runId = d7c5d0ea-a886-43cf-affc-b4595f29f84f]" org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:301)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEndpointRef.askSync(RpcEndpointRef.scala:103)
	at org.apache.spark.rpc.RpcEndpointRef.askSync(RpcEndpointRef.scala:87)
	at org.apache.spark.sql.execution.streaming.state.StateStoreCoordinatorRef.deactivateInstances(StateStoreCoordinator.scala:119)
	at org.apache.spark.sql.streaming.StreamingQueryManager.notifyQueryTermination(StreamingQueryManager.scala:406)
	at org.apache.spark.sql.execution.streaming.StreamExecution.$anonfun$runStream$3(StreamExecution.scala:357)
	at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
	at org.apache.spark.util.UninterruptibleThread.runUninterruptibly(Uninterrup