# Spark Streaming - Write to Parquet

In [1]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark.sql.types import *
from pyspark.sql.window import Window
from pyspark.sql import Row
import pyspark.sql.functions as f

import datetime
from datetime import datetime
import json


# Let the notebook cells use the full browser width.
# `IPython.display` is the supported import path; `IPython.core.display`
# is deprecated and emits a DeprecationWarning on import.
from IPython.display import display, HTML
# the style element must be closed with </style>, otherwise the markup is malformed
display(HTML("<style>.container {width:100% !important; }</style>"))

  from IPython.core.display import display, HTML


In [12]:
# Build the SparkConf for a client-mode driver (this notebook) that spawns its
# executors as pods on Kubernetes, then create the SparkSession from it.
import os

appName="jupyter-stream-to-s3"

conf = SparkConf()

# CLUSTER MANAGER
################################################################################
# set Kubernetes Master as Cluster Manager ("k8s://https://" is NOT a typo, this is how Spark knows the "provider" type).
conf.setMaster("k8s://https://kubernetes.default.svc.cluster.local:443")

# CONFIGURE KUBERNETES
################################################################################
# set the namespace that will be used for running the driver and executor pods.
conf.set("spark.kubernetes.namespace","frontend")
# set the docker image from which the Worker pods are created
conf.set("spark.kubernetes.container.image", "thinkportgmbh/workshops:spark-3.3.1")
conf.set("spark.kubernetes.container.image.pullPolicy", "Always")

# set service account to be used
conf.set("spark.kubernetes.authenticate.driver.serviceAccountName", "spark")
# authentication for service account (required to create worker pods):
conf.set("spark.kubernetes.authenticate.caCertFile", "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt")
conf.set("spark.kubernetes.authenticate.oauthTokenFile", "/var/run/secrets/kubernetes.io/serviceaccount/token")


# CONFIGURE SPARK
################################################################################
conf.set("spark.sql.session.timeZone", "Europe/Berlin")
# set driver host. In this case the ingress service for the spark driver
# find name of the driver service with 'kubectl get services' or in the helm chart configuration
conf.set("spark.driver.host", "jupyter-spark-driver.frontend.svc.cluster.local")
# set the port. If this port is busy, spark-shell tries to bind to another port.
conf.set("spark.driver.port", "29413")

# add the Kafka connector jars (Structured Streaming source + Kafka client) to the session
KAFKA_JARS = [
    "/opt/spark/jars/spark-sql-kafka-0-10_2.12-3.3.1.jar",
    "/opt/spark/jars/kafka-clients-3.3.1.jar",
]
# spark.jars is a comma-separated list
conf.set("spark.jars", ",".join(KAFKA_JARS))
# extraClassPath is a JVM classpath: entries are separated by ':' on Linux.
# A comma-separated value is treated as one single (non-existent) path entry.
conf.set("spark.driver.extraClassPath", ":".join(KAFKA_JARS))
conf.set("spark.executor.extraClassPath", ":".join(KAFKA_JARS))
# alternative: let Spark resolve the connector from Maven instead of local jars
#   conf.set("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.1")

# CONFIGURE S3 CONNECTOR
################################################################################
conf.set("spark.hadoop.fs.s3a.endpoint", "minio.minio.svc.cluster.local:9000")
# Credentials are read from the environment when available; the fallbacks are
# the workshop's training credentials. Do not put real secrets in the notebook.
conf.set("spark.hadoop.fs.s3a.access.key", os.environ.get("S3A_ACCESS_KEY", "trainadm"))
conf.set("spark.hadoop.fs.s3a.secret.key", os.environ.get("S3A_SECRET_KEY", "train@thinkport"))
conf.set("spark.hadoop.fs.s3a.path.style.access", "true")
conf.set("spark.hadoop.fs.s3.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
conf.set("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider")
conf.set("spark.hadoop.fs.s3a.connection.ssl.enabled", "false")

# CONFIGURE WORKER (Customize based on workload)
################################################################################
# set number of worker pods
conf.set("spark.executor.instances", "1")
# set memory of each worker pod
conf.set("spark.executor.memory", "1G")
# set cpu of each worker pod
conf.set("spark.executor.cores", "2")

# SPARK SESSION
################################################################################
# and last, create the spark session and pass it the config object
# (the session time zone is already part of `conf`, so it is not set again here)
spark = (SparkSession
    .builder
    .config(conf=conf)
    .appName(appName)
    .getOrCreate()
)

# also get the spark context
sc=spark.sparkContext
# change the log level to warning, to see less output
sc.setLogLevel('WARN')

# print the most relevant settings the session was actually started with
for key, value in sc.getConf().getAll():
    if key in ["spark.app.name","spark.kubernetes.namespace","spark.executor.memory","spark.executor.cores","spark.driver.host","spark.master"]:
        print(key,"=",value)

spark.kubernetes.namespace = frontend
spark.app.name = jupyter-stream-to-s3
spark.master = k8s://https://kubernetes.default.svc.cluster.local:443
spark.executor.memory = 1G
spark.executor.cores = 2
spark.driver.host = jupyter-spark-driver.frontend.svc.cluster.local


In [13]:
# Step 1: subscribe to the Kafka topic as a streaming DataFrame,
# reading the topic from its earliest available offset.
df_step_1 = (spark
      .readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", "kafka-cp-kafka.kafka.svc.cluster.local:9092")
      .option("subscribe", "twitter-json")
      .option("startingOffsets", "earliest")
      .load()
     )

# show the schema of the Kafka source; the DataFrame defined in this cell is
# `df_step_1` (a bare `df` is not defined anywhere and fails on a fresh kernel)
df_step_1.printSchema()

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)



In [14]:
   # Step 2: keep only the Kafka message value, cast from binary to string
   df_step_2 = df_step_1.selectExpr("CAST(value AS STRING)")


In [15]:
# Step 3: pull the fields of interest out of the JSON string in `value`.
# Each pair is (JSON path inside the message, name of the resulting column);
# every extracted column is still a string at this point.
df_step_3 = df_step_2.select(
    *[
        f.get_json_object(f.col("value"), json_path).alias(column_name)
        for json_path, column_name in (
            ("$.payload.CreatedAt", "tweet_created"),
            ("$.payload.Id", "tweet_id"),
            ("$.payload.Lang", "tweet_language"),
            ("$.payload.HashtagEntities[*].Text", "hashtag"),
            ("$.payload.Place.Country", "country"),
            ("$.payload.Place.CountryCode", "country_code"),
            ("$.payload.User.ScreenName", "user"),
            ("$.payload.User.Lang", "user_language"),
            ("$.payload.User.Location", "user_location"),
            ("$.payload.User.StatusesCount", "statuses_count"),
            ("$.payload.RetweetCount", "retweet_count"),
            ("$.payload.Text", "tweet_text"),
        )
    ]
)

In [16]:
# UDF to parse array stored as string using JSON
def parse_array_from_string(x):
    """Turn the string form of an array into a Python list.

    Args:
        x: a JSON-encoded array such as '["a", "b"]', a plain string, or None.

    Returns:
        - [None] when x is None,
        - the decoded list when x is a JSON array,
        - [x] (the input wrapped in a one-element list) when x is not valid
          JSON or decodes to something that is not a list.
    """
    if x is None:
        return [None]

    try:
        parsed = json.loads(x)
    except (TypeError, ValueError):
        # Not JSON at all. NOTE(review): Spark's get_json_object with a `[*]`
        # path appears to return a single match unwrapped (e.g. `spark`
        # instead of `["spark"]`) - confirm; either way one odd record must
        # not raise and bring down the whole streaming query.
        return [x]

    if isinstance(parsed, list):
        return parsed

    # valid JSON but a scalar/object (e.g. the text `2022`): keep the original
    # text as the only element so the result is still a list
    return [x]

# wrap the parser as a Spark UDF whose result type is array<string>
hashtag_array = f.udf(parse_array_from_string, returnType=ArrayType(StringType()))

In [17]:
 # Step 4: cast the extracted string columns to their proper data types
 df_step_4 = (
     df_step_3
     # string -> long, divided by 1000 (presumably epoch milliseconds -> seconds,
     # TODO confirm), then formatted by from_unixtime.
     # Note: from_unixtime yields a formatted date-time *string*, not a TimestampType.
     .withColumn(
         "tweet_created",
         f.from_unixtime(f.col("tweet_created").cast("long") / 1000),
     )
     .withColumn("tweet_id", f.col("tweet_id").cast("long"))
     # the UDF turns the JSON array string into a real array column
     .withColumn("hashtag", hashtag_array(f.col("hashtag")))
     .withColumn("statuses_count", f.col("statuses_count").cast("int"))
     .withColumn("retweet_count", f.col("retweet_count").cast("int"))
 )

In [19]:
# Start the streaming write: every 2 seconds a micro-batch is appended as
# Parquet files below the S3 path.
# NOTE(review): the checkpoint lives on a driver-local path; consider a
# dedicated directory on shared storage so a restarted driver can resume.
stream_query = (
    df_step_4.writeStream
    .format("parquet")
    .trigger(processingTime='2 seconds')
    .options(
        path="s3a://twitter/twitter_clean",
        checkpointLocation="/opt/spark/work-dir",
    )
    # start() returns immediately with the query handle;
    # call stream_query.awaitTermination() to block the cell instead
    .start()
)

22/11/29 16:55:18 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


[Stage 0:>                                                          (0 + 1) / 1]

In [21]:
# Stop the streaming query that was started by the writeStream cell
stream_query.stop()

In [22]:
# List the streaming queries that are still active on this session
spark.streams.active

[]

In [11]:
# Shut down the Spark session once the work is done
spark.stop()

22/11/29 16:50:04 WARN ExecutorPodsWatchSnapshotSource: Kubernetes client has been closed.
