# Spark Aufgaben

1. Read Twitter Streams from Avro
2. Do analysis Tasks
3. Write Result to Delta several times
4. Du some History analysis on it



In [109]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark.sql.types import *
from pyspark.sql.window import Window
from pyspark.sql import Row
import pyspark.sql.functions as f

from delta import *


import datetime
from datetime import datetime
import json


# use 95% of the screen for jupyter cell
from IPython.core.display import display, HTML
display(HTML("<style>.container {width:100% !important; }<style>"))

  from IPython.core.display import display, HTML


In [110]:
appName="jupyter-spark"

conf = SparkConf()

# CLUSTER MANAGER
################################################################################
# set Kubernetes Master as Cluster Manager(“k8s://https://” is NOT a typo, this is how Spark knows the “provider” type).
conf.setMaster("k8s://https://kubernetes.default.svc.cluster.local:443")

# CONFIGURE KUBERNETES
################################################################################
# set the namespace that will be used for running the driver and executor pods.
conf.set("spark.kubernetes.namespace","frontend")
# set the docker image from which the Worker pods are created
conf.set("spark.kubernetes.container.image", "thinkportgmbh/workshops:spark-3.3.1")
conf.set("spark.kubernetes.container.image.pullPolicy", "Always")

# set service account to be used
conf.set("spark.kubernetes.authenticate.driver.serviceAccountName", "spark")
# authentication for service account(required to create worker pods):
conf.set("spark.kubernetes.authenticate.caCertFile", "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt")
conf.set("spark.kubernetes.authenticate.oauthTokenFile", "/var/run/secrets/kubernetes.io/serviceaccount/token")


# CONFIGURE SPARK
################################################################################
conf.set("spark.sql.session.timeZone", "Europe/Berlin")
# set driver host. In this case the ingres service for the spark driver
# find name of the driver service with 'kubectl get services' or in the helm chart configuration
conf.set("spark.driver.host", "jupyter-spark-driver.frontend.svc.cluster.local")
# set the port, If this port is busy, spark-shell tries to bind to another port.
conf.set("spark.driver.port", "29413")
# add the postgres driver jars into session
conf.set("spark.jars", "/opt/spark/jars/spark-sql-kafka-0-10_2.12-3.3.1.jar, /opt/spark/jars/kafka-clients-3.3.1.jar, /opt/spark/jars/spark-avro_2.12-3.3.1.jar")
#conf.set("spark.driver.extraClassPath","/opt/spark/jars/spark-sql-kafka-0-10_2.12-3.3.1.jar, /opt/spark/jars/kafka-clients-3.3.1.jar, /opt/spark/jars/spark-avro_2.12-3.3.1.jar")
conf.set("spark.executor.extraClassPath","/opt/spark/jars/spark-sql-kafka-0-10_2.12-3.3.1.jar, /opt/spark/jars/kafka-clients-3.3.1.jar, /opt/spark/jars/spark-avro_2.12-3.3.1.jar")
#conf.set("spark.executor.extraLibrary","/opt/spark/jars/spark-sql-kafka-0-10_2.12-3.3.1.jar, /opt/spark/jars/kafka-clients-3.3.1.jar")

# CONFIGURE S3 CONNECTOR
conf.set("spark.hadoop.fs.s3a.endpoint", "minio.minio.svc.cluster.local:9000")
conf.set("spark.hadoop.fs.s3a.access.key", "trainadm")
conf.set("spark.hadoop.fs.s3a.secret.key", "train@thinkport")
conf.set("spark.hadoop.fs.s3a.path.style.access", "true")
conf.set("spark.hadoop.fs.s3.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
conf.set("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider")
conf.set("spark.hadoop.fs.s3a.connection.ssl.enabled", "false")

# conf.set("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.1")

# CONFIGURE WORKER (Customize based on workload)
################################################################################
# set number of worker pods
conf.set("spark.executor.instances", "1")
# set memory of each worker pod
conf.set("spark.executor.memory", "1G")
# set cpu of each worker pod
conf.set("spark.executor.cores", "2")
# Number of possible tasks = cores * executores

# SPARK SESSION
################################################################################
# and last, create the spark session and pass it the config object

spark = SparkSession\
    .builder\
    .config(conf=conf) \
    .config('spark.sql.session.timeZone', 'Europe/Berlin') \
    .appName(appName)\
    .getOrCreate()

# also get the spark context
sc=spark.sparkContext
# change the log level to warning, to see less output
sc.setLogLevel('WARN')

# get the configuration object to check all the configurations the session was startet with
for entry in sc.getConf().getAll():
        if entry[0] in ["spark.app.name","spark.kubernetes.namespace","spark.executor.memory","spark.executor.cores","spark.driver.host","spark.master"]:
            print(entry[0],"=",entry[1])

spark.kubernetes.namespace = frontend
spark.master = k8s://https://kubernetes.default.svc.cluster.local:443
spark.app.name = jupyter-spark
spark.executor.memory = 1G
spark.executor.cores = 2
spark.driver.host = jupyter-spark-driver.frontend.svc.cluster.local


In [111]:
df=spark.read.format("json").load("s3a://spark-target-schema3/topics/spark-target-schema3/")
df.printSchema()
print(df.count())

root
 |-- CreatedAt: long (nullable = true)
 |-- FollowersCount: long (nullable = true)
 |-- FriendsCount: long (nullable = true)
 |-- Hashtags: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- Id: long (nullable = true)
 |-- Lang: string (nullable = true)
 |-- Location: string (nullable = true)
 |-- RetweetCount: long (nullable = true)
 |-- ScreenName: string (nullable = true)
 |-- Text: string (nullable = true)
 |-- UserID: long (nullable = true)
 |-- partition: integer (nullable = true)

70


In [112]:
df.withColumn("CreatedAt",f.from_unixtime(f.col("CreatedAt")/1000)).select("CreatedAt","Hashtags").show

<bound method DataFrame.show of DataFrame[CreatedAt: string, Hashtags: array<string>]>

In [113]:
spark.read.format("avro").load("s3a://spark-stream-to-s3/avro").show()

+-------------------+-------------------+--------------+
|      tweet_created|           tweet_id|tweet_language|
+-------------------+-------------------+--------------+
|2022-11-29 12:15:55|1597549913642090496|            en|
|2022-11-29 12:15:55|1597549916074790913|            en|
|2022-11-29 12:15:56|1597549917639245824|            en|
|2022-11-29 12:15:56|1597549918113402880|            en|
|2022-11-29 12:15:56|1597549920029900802|            en|
|2022-11-29 12:15:58|1597549927374127105|            en|
|2022-11-29 12:15:58|1597549928733097984|            en|
|2022-11-29 12:15:59|1597549930490601472|            en|
|2022-11-29 12:15:59|1597549931857940482|            en|
|2022-11-29 12:16:22|1597550030130388992|            en|
|2022-11-29 12:16:24|1597550037373943809|            en|
|2022-11-29 12:16:34|1597550080382599169|            en|
|2022-11-29 12:16:53|1597550157830455296|            en|
|2022-11-29 12:18:35|1597550584345030657|            es|
|2022-11-29 12:19:37|1597550846

In [114]:
spark.read.format("parquet").load("s3a://spark-stream-to-s3/parquet").show()

+-------------------+-------------------+--------------+--------------------+-------+------------+---------------+-------------+--------------------+--------------+-------------+--------------------+
|      tweet_created|           tweet_id|tweet_language|             hashtag|country|country_code|           user|user_language|       user_location|statuses_count|retweet_count|          tweet_text|
+-------------------+-------------------+--------------+--------------------+-------+------------+---------------+-------------+--------------------+--------------+-------------+--------------------+
|2022-11-29 12:15:56|1597549921485332480|            en|[ai, ml, artifici...|   null|        null|        cuongcz|         null|        New York, NY|         37313|            0|#ai #ml #artifici...|
|2022-11-29 12:15:57|1597549922915590144|            en|[ai, ml, artifici...|   null|        null|        cuongcz|         null|        New York, NY|         37314|            0|#ai #ml #artifici...|


In [115]:
"""
df = (spark
      .read
      .json("s3a://json-500/topics/twitter-table")
      .withColumnRenamed("hashtag","hashtags")
     ).cache()
"""      


'\ndf = (spark\n      .read\n      .json("s3a://json-500/topics/twitter-table")\n      .withColumnRenamed("hashtag","hashtags")\n     ).cache()\n'

In [116]:
"""
df.select("hashtags").show(2,truncate=False)
print(df.columns)
df.count()
"""

'\ndf.select("hashtags").show(2,truncate=False)\nprint(df.columns)\ndf.count()\n'

In [117]:
"""
df_avro = df = spark.read.format("avro").load("s3a://arvo-100/topics/twitter-avroConverter").cache()
"""

'\ndf_avro = df = spark.read.format("avro").load("s3a://arvo-100/topics/twitter-avroConverter").cache()\n'

In [118]:
"""
df_avro.select("CreatedAt","Id").show(2,truncate=False)
df_avro.columns
"""

'\ndf_avro.select("CreatedAt","Id").show(2,truncate=False)\ndf_avro.columns\n'

## Aufgaben

In [119]:
df.printSchema()

root
 |-- CreatedAt: long (nullable = true)
 |-- FollowersCount: long (nullable = true)
 |-- FriendsCount: long (nullable = true)
 |-- Hashtags: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- Id: long (nullable = true)
 |-- Lang: string (nullable = true)
 |-- Location: string (nullable = true)
 |-- RetweetCount: long (nullable = true)
 |-- ScreenName: string (nullable = true)
 |-- Text: string (nullable = true)
 |-- UserID: long (nullable = true)
 |-- partition: integer (nullable = true)



In [120]:
df.head()


Row(CreatedAt=1669938163000, FollowersCount=4813, FriendsCount=84, Hashtags=['MachineLearning', 'BigData', 'Analytics', 'DataScience', 'AI', 'IoT', 'IIoT', 'Python', 'RStats', 'TensorFlow', 'JavaScript', 'ReactJS', 'CloudComputing', 'Serverless', 'Linux', 'Programming', 'Coding', '100DaysofCode', 'blockchain', 'AWS', 'SQL', 'NLP', 'CodeNewbies', 'DL'], Id=1598462628237807616, Lang='en', Location='Toronto, Ontario', RetweetCount=0, ScreenName='Sheraj99', Text='Scikit-learn for #MachineLearning Cheatsheet #BigData #Analytics #DataScience #AI #IoT #IIoT #Python #RStats #TensorFlow #JavaScript #ReactJS #CloudComputing #Serverless #Linux #Programming #Coding #100DaysofCode #blockchain #AWS #SQL #NLP #CodeNewbies #DL https://t.co/9K8SQRyVK4 https://t.co/SArtolUrYr', UserID=268619848, partition=0)

In [121]:

df2=df.withColumn("CreatedAt",f.from_unixtime(f.col("CreatedAt")/1000))
df2.select("CreatedAt").show()


+-------------------+
|          CreatedAt|
+-------------------+
|2022-12-02 00:42:43|
|2022-12-02 00:43:00|
|2022-12-02 00:43:33|
|2022-12-02 00:44:03|
|2022-12-02 00:44:12|
|2022-12-02 00:44:24|
|2022-12-02 00:44:25|
|2022-12-02 00:44:26|
|2022-12-02 00:44:35|
|2022-12-02 00:45:00|
|2022-12-02 00:38:58|
|2022-12-02 00:39:07|
|2022-12-02 00:40:01|
|2022-12-02 00:40:14|
|2022-12-02 00:40:41|
|2022-12-02 00:40:47|
|2022-12-02 00:41:37|
|2022-12-02 00:42:30|
|2022-12-02 00:42:32|
|2022-12-02 00:42:36|
+-------------------+
only showing top 20 rows



### 1. Zählen der Tweets pro Stunde

In [103]:
df_hourly=(df2  
        # if stream did run less than an hour take minutes
        .withColumn("hour", f.hour(f.col("CreatedAt")))
        .groupBy("hour")
        .count()
        .withColumnRenamed("count","total")
        .sort("hour")
    )

df_hourly.show(20)

[Stage 63:>                                                         (0 + 2) / 2]

+----+-----+
|hour|total|
+----+-----+
|   0|   70|
+----+-----+



                                                                                

### 1. Top 10 User nach Tweet-Anzahl

In [122]:
df_top_user=(df2
                .groupBy("ScreenName")
                .agg(
                    f.count("ScreenName").alias("total")
                    )
                .orderBy(f.col("total").desc())
                .limit(10)
                )

df_top_user.show()


+--------------+-----+
|    ScreenName|total|
+--------------+-----+
|       sdogdev|    6|
| jayeshmthakur|    4|
|periscopeislit|    4|
| flutterbyamey|    3|
| MedicaliPhone|    2|
|     TimedoArt|    2|
|     aisear_ch|    2|
|  magnifintech|    2|
|  amit_rai_333|    2|
|  hernangraffe|    2|
+--------------+-----+



### Top 5 Hashtags der Top 10 User

In [123]:
# Top 5 Hashtags der Top 10 User
# a) reduziere Gesamtdaten auf die Menge der 10 Top User via Join
df_top3_per_user=(df_top_user
            # filter via join
            .join(df,[df_top_user.ScreenName==df.ScreenName],how="left")
            # hashtags array in Zeilen Einträge exploden
            .withColumn("hashtag",f.explode(f.col("Hashtags")))
            # hashtag array Spalte entfernen
            .drop("Hashtags")
            # groupieren und counten by hashtag
            .groupBy("hashtag").count()
            # rückwärts sortieren
            .sort(f.col("count").desc())
            # top 5 selectieren
            .limit(5)
            
    )
    
df_top3_per_user.show()

[Stage 84:>                                                         (0 + 2) / 2]

+-------------+-----+
|      hashtag|count|
+-------------+-----+
|           AI|   18|
|       Python|   16|
|       RStats|   15|
|   TensorFlow|   14|
|DataAnalytics|   11|
+-------------+-----+



                                                                                

 ### Top 10 Influencer (User deren #BigData-tweets mit den meisten Followern) 

In [None]:
# CHECK WARUM IST RETWEET COUNT IMMER 0??
df_withRetweets=(df_top_user
            # filter via join auf die Top 10 Influencer
            .join(df,[df_top_user.user==df.user],how="left")
            .where(f.col("retweet_count")>0)
                )   
  
df_withRetweets.show()

In [None]:
# Top 10 Influencer (User deren #BigData-tweets am meisten retweeted wurden) (des letzten Monats)
df_withRetweets=(df_top_user
            # filter via join auf die Top 10 Influencer
            .join(df,[df_top_user.user==df.user],how="left")
            # filter auf wurde retweeded hat eine Wert
            .where(f.col("tweet_id").isNotNull())
            # filter auf die etzten zwei Wochen
            #.filter(f.add_months(f.current_date(), -1) < f.col("created_date"))
            .filter(f.array_contains(f.col("hashtags"),"BigData"))
            # groupieren bei retweed_id und name
            .groupBy("tweet_id",df.user)
            .agg(f.max("retweet_count"))
            .groupBy(df.user)
            .agg(f.sum("max(retweet_count)"))
            .withColumnRenamed("sum(max(retweet_count))","total")
            .orderBy(f.col("total").desc())
            .limit(10)
            
    )

df_withRetweets.show()

### Aufgabe der Titel noch fehlt( Filter nach Location der TOP 10 User, Top  Hashtag pro Location)  IN SQL oder Spark

In [None]:
sql="""
SELECT user_location, word, total FROM (
    SELECT user_location, word, total, rank() over(partition by a.user_location order by a.total desc) as rank FROM
    (
        SELECT user_location,word, count(*) as total
        FROM tweets
        LATERAL VIEW explode(tweets.hashtags) tweets as word
        WHERE word not in ("BigData","bigdata","")
        AND tweets.user_location IN
        (SELECT t.user_location
                FROM (
                         SELECT user_location, count(*) as total from tweets
                         WHERE user_location not in ("","null","REMOTE","Earth")
                         GROUP BY user_location
                         ORDER BY total DESC LIMIT 10
                     ) as t
        )
        GROUP BY user_location, word
        ) as a
    ) as b
WHERE rank=1
ORDER BY total DESC LIMIT 10;
"""

df.registerTempTable("tweets")

df_result = spark.sql(sql)
df_result.show()

In [None]:
df3=(df
    .select("user_location")
    .where(~f.col("user_location").isin("","null","REMOTE","Earth"))
    .groupBy("user_location")
    .count()
    .withColumnRenamed("count","location_total")
    .orderBy(f.col("location_total").desc())
    .limit(10)
    )

df3.show()

In [None]:
df4=(df
    .select("user_location","hashtag")
    .withColumn("singletag",f.explode(f.col("hashtag")))
    .groupBy("user_location","singletag")
    .count()
    .withColumnRenamed("count","tags_total")
    )

df4.show()

In [None]:
df5=(df3.alias("a")
    .join(f.broadcast(df4.alias("b")),[df3.user_location==df4.user_location],how="left")
    .select("a.user_location","a.location_total","b.singletag","b.tags_total")
    .withColumn("rank",f.row_number().over(Window.partitionBy("a.user_location").orderBy(f.col("b.tags_total").desc())))
    .filter(f.col("rank")==1)
    .sort(f.col("location_total").desc())
    .limit(100)
    )
df5.show()

## Schreiben der Daten nach Parquet und Delta

In [None]:
writer_parquet=(df
                .write.partitionBy("tweet_language")
                .mode("overwrite")
                .format("parquet")
                .save("s3a://solution/twitter_parquet")
            )



In [None]:
writer_delta=(df
                #.where(f.col("tweet_language").isin("en","de"))
                .write.partitionBy("tweet_language")
                .mode("overwrite")
                .format("delta")
                .save("s3a://solution/twitter_delta")
             )

## Delta History and Time Travel

In [None]:
# Load Delta file in s3 into Delta Table Object
dt = DeltaTable.forPath(spark, "s3a://solution/twitter_delta")
dt.toDF().show(2)


#### Time Travel Aufgabe
1. Excecute write to delta several times and check how the history adds new entries  
2. Load one of the versions and check all the `tweet_language` (via distinct().show())
3. write again delta with activated filter to write only out the `tweet_language` `en` and `de` 
4. confirm in the history log that less files were written out
5. Load the latest version and check that only two entries are available
6. Load older versions and confirm that there are still all data available
7. Revert old version to latest again

In [None]:
# get the metadata for the full history of the table
fullHistoryDF = dt.history()    

# get the metadata for the last operation
lastOperationDF = dt.history(1) 

fullHistoryDF.select("version","readVersion","timestamp","userId","operation","operationParameters","operationMetrics").show()

In [None]:
# load latest delta version
df_timetravel = spark.read.format("delta").load("s3a://solution/twitter_delta")

df_timetravel.select("tweet_language").distinct().show()

In [None]:
#load specific historic version
df_timetravel = spark.read.format("delta").option("versionAsOf", 2).load("s3a://solution/twitter_delta")

df_timetravel.select("tweet_language").distinct().show()

In [None]:
# write old version back as latest
f_timetravel = (spark
                .read.format("delta").option("versionAsOf", 0).load("s3a://solution/twitter_delta")
                .write.partitionBy("tweet_language").mode("overwrite").format("delta").save("s3a://solution/twitter_delta")
               )
                

In [16]:
spark.stop()

22/12/01 23:00:29 WARN ExecutorPodsWatchSnapshotSource: Kubernetes client has been closed.
