# Spark Aufgaben

1. Read Twitter Streams from Avro
2. Do analysis Tasks
3. Write Result to Delta several times
4. Du some History analysis on it



In [1]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark.sql.types import *
from pyspark.sql.window import Window
from pyspark.sql import Row
import pyspark.sql.functions as f

from delta import *


import datetime
from datetime import datetime
import json


# use 95% of the screen for jupyter cell
from IPython.core.display import display, HTML
display(HTML("<style>.container {width:100% !important; }<style>"))

  from IPython.core.display import display, HTML


In [2]:
appName="jupyter-spark"

conf = SparkConf()

# CLUSTER MANAGER
################################################################################
# set Kubernetes Master as Cluster Manager(“k8s://https://” is NOT a typo, this is how Spark knows the “provider” type).
conf.setMaster("k8s://https://kubernetes.default.svc.cluster.local:443")

# CONFIGURE KUBERNETES
################################################################################
# set the namespace that will be used for running the driver and executor pods.
conf.set("spark.kubernetes.namespace","frontend")
# set the docker image from which the Worker pods are created
conf.set("spark.kubernetes.container.image", "thinkportgmbh/workshops:spark-3.3.1")
conf.set("spark.kubernetes.container.image.pullPolicy", "Always")

# set service account to be used
conf.set("spark.kubernetes.authenticate.driver.serviceAccountName", "spark")
# authentication for service account(required to create worker pods):
conf.set("spark.kubernetes.authenticate.caCertFile", "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt")
conf.set("spark.kubernetes.authenticate.oauthTokenFile", "/var/run/secrets/kubernetes.io/serviceaccount/token")


# CONFIGURE SPARK
################################################################################
conf.set("spark.sql.session.timeZone", "Europe/Berlin")
# set driver host. In this case the ingres service for the spark driver
# find name of the driver service with 'kubectl get services' or in the helm chart configuration
conf.set("spark.driver.host", "jupyter-spark-driver.frontend.svc.cluster.local")
# set the port, If this port is busy, spark-shell tries to bind to another port.
conf.set("spark.driver.port", "29413")
# add the postgres driver jars into session
conf.set("spark.jars", "/opt/spark/jars/spark-sql-kafka-0-10_2.12-3.3.1.jar, /opt/spark/jars/kafka-clients-3.3.1.jar, /opt/spark/jars/spark-avro_2.12-3.3.1.jar")
#conf.set("spark.driver.extraClassPath","/opt/spark/jars/spark-sql-kafka-0-10_2.12-3.3.1.jar, /opt/spark/jars/kafka-clients-3.3.1.jar, /opt/spark/jars/spark-avro_2.12-3.3.1.jar")
conf.set("spark.executor.extraClassPath","/opt/spark/jars/spark-sql-kafka-0-10_2.12-3.3.1.jar, /opt/spark/jars/kafka-clients-3.3.1.jar, /opt/spark/jars/spark-avro_2.12-3.3.1.jar")
#conf.set("spark.executor.extraLibrary","/opt/spark/jars/spark-sql-kafka-0-10_2.12-3.3.1.jar, /opt/spark/jars/kafka-clients-3.3.1.jar")

# CONFIGURE S3 CONNECTOR
conf.set("spark.hadoop.fs.s3a.endpoint", "minio.minio.svc.cluster.local:9000")
conf.set("spark.hadoop.fs.s3a.access.key", "trainadm")
conf.set("spark.hadoop.fs.s3a.secret.key", "train@thinkport")
conf.set("spark.hadoop.fs.s3a.path.style.access", "true")
conf.set("spark.hadoop.fs.s3.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
conf.set("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider")
conf.set("spark.hadoop.fs.s3a.connection.ssl.enabled", "false")

# conf.set("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.1")

# CONFIGURE WORKER (Customize based on workload)
################################################################################
# set number of worker pods
conf.set("spark.executor.instances", "1")
# set memory of each worker pod
conf.set("spark.executor.memory", "1G")
# set cpu of each worker pod
conf.set("spark.executor.cores", "2")
# Number of possible tasks = cores * executores

# SPARK SESSION
################################################################################
# and last, create the spark session and pass it the config object

spark = SparkSession\
    .builder\
    .config(conf=conf) \
    .config('spark.sql.session.timeZone', 'Europe/Berlin') \
    .appName(appName)\
    .getOrCreate()

# also get the spark context
sc=spark.sparkContext
# change the log level to warning, to see less output
sc.setLogLevel('WARN')

# get the configuration object to check all the configurations the session was startet with
for entry in sc.getConf().getAll():
        if entry[0] in ["spark.app.name","spark.kubernetes.namespace","spark.executor.memory","spark.executor.cores","spark.driver.host","spark.master"]:
            print(entry[0],"=",entry[1])

22/12/07 16:51:46 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


spark.kubernetes.namespace = frontend
spark.master = k8s://https://kubernetes.default.svc.cluster.local:443
spark.app.name = jupyter-spark
spark.executor.memory = 1G
spark.executor.cores = 2
spark.driver.host = jupyter-spark-driver.frontend.svc.cluster.local


In [3]:
df=spark.read.format("avro").load("s3a://twitter/avro")

print(df.count())

22/12/07 16:52:20 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties




1775


                                                                                

In [4]:
df.show()

<bound method DataFrame.show of DataFrame[tweet_id: string, created_at: timestamp, tweet_message: string, user_name: string, user_location: string, user_follower_count: int, user_friends_count: int, retweet_count: int, language: string, hashtags: array<string>]>

In [5]:
spark.read.format("avro").load("s3a://twitter/avro").show()

                                                                                

+-------------------+-------------------+--------------------+---------------+--------------------+-------------------+------------------+-------------+--------+--------------------+
|           tweet_id|         created_at|       tweet_message|      user_name|       user_location|user_follower_count|user_friends_count|retweet_count|language|            hashtags|
+-------------------+-------------------+--------------------+---------------+--------------------+-------------------+------------------+-------------+--------+--------------------+
|1600474456073424898|2022-12-07 13:57:00|This #Tesla Cyber...|  Paula_Piccard|            New York|              73768|              9401|            0|      en|                null|
|1600474576818954242|2022-12-07 13:57:29|AI Isn't Artifici...|   DTN_Graphics|             Nigeria|                159|               298|            0|      en|                null|
|1600474724580003846|2022-12-07 13:58:04|RT @Long_Shot_Ads...|WorldTrendsInfo|       

## Aufgaben

In [6]:
df.printSchema()

root
 |-- tweet_id: string (nullable = true)
 |-- created_at: timestamp (nullable = true)
 |-- tweet_message: string (nullable = true)
 |-- user_name: string (nullable = true)
 |-- user_location: string (nullable = true)
 |-- user_follower_count: integer (nullable = true)
 |-- user_friends_count: integer (nullable = true)
 |-- retweet_count: integer (nullable = true)
 |-- language: string (nullable = true)
 |-- hashtags: array (nullable = true)
 |    |-- element: string (containsNull = true)



### 1. Zählen der Tweets pro Stunde

In [None]:
df_hourly=(df  
        # if stream did run less than an hour take minutes
        .withColumn("hour", f.hour(f.col("created_at")))
        .groupBy("hour")
        .count()
        .withColumnRenamed("count","total")
        .sort("hour")
    )

df_hourly.show(20)

### 1. Top 10 User nach Tweet-Anzahl

In [28]:
df_top_user=(df
                .groupBy("user_name")
                .agg(
                    f.count("user_name").alias("total")
                    )
                .orderBy(f.col("total").desc())
                .limit(10)
                .withColumnRenamed("user_name","user")
                )

df_top_user.show()




+--------------+-----+
|          user|total|
+--------------+-----+
|   Eli_Krumova|  127|
|Khulood_Almani|   82|
|       cuongcz|   61|
| Soul_Dispatch|   54|
|  chidambara09|   50|
|       sdogdev|   48|
|TechNativeWire|   28|
| flutterbyamey|   28|
|      mikejo_m|   26|
|    TechNative|   22|
+--------------+-----+



                                                                                

### Top 5 Hashtags der Top 10 User

In [8]:
from pyspark.sql.functions import explode
from pyspark.sql.functions import lower, col
# Top 5 Hashtags der Top 10 User
# a) reduziere Gesamtdaten auf die Menge der 10 Top User via Join
df_top5_per_user=(df_top_user
            # filter via join
            .join(df,[df_top_user.user==df.user_name],how="left")
            # hashtags array in Zeilen Einträge exploden
              # groupieren und counten by hashtag
            .withColumn("hashtags",explode("hashtags"))
            .withColumn("hashtags", lower(col('hashtags')))
            .groupBy("hashtags").agg(f.count("hashtags"))
                  
                  
          #  # rückwärts sortieren
            .sort(f.col("count(hashtags)").desc())
            # top 5 selectieren
            .limit(5)
            
    )
    
df_top5_per_user.show()



+---------------+---------------+
|       hashtags|count(hashtags)|
+---------------+---------------+
|        bigdata|            226|
|             ai|            225|
|    datascience|            223|
|machinelearning|            177|
|      analytics|            153|
+---------------+---------------+



                                                                                

 ### Top 10 Influencer (User mit #BigData-tweets mit den meisten Followern) 

In [16]:
df_top_influencer=(df
                .groupBy("user_name")
                .agg(
                    f.max("user_follower_count").alias("follower")
                    )
                .orderBy(f.col("follower").desc())
                .limit(10)
                )
df_top_influencer.show()



+---------------+--------+
|      user_name|follower|
+---------------+--------+
|     KirkDBorne|  371809|
| Ronald_vanLoon|  294773|
|         rwang0|  140016|
|     sallyeaves|  132261|
|    ipfconline1|  128732|
| SpirosMargaris|  126185|
|DataScienceDojo|  116258|
|    IainLJBrown|  113804|
|      mvollmer1|  100366|
|      JimMarous|   96994|
+---------------+--------+



                                                                                

In [29]:
# Top 10 Influencer mit den meisten tweets
df_withRetweets=(df_top_influencer
            # filter via join auf die Top 10 Influencer
            .join(df_top_user, [df_top_influencer.user_name==df_top_user.user],how="left")
            .orderBy(f.col("follower").desc())
            .limit(10)
            .orderBy(f.col("total").desc())     
           
            
    )

df_withRetweets.show()

                                                                                

+---------------+--------+----+-----+
|      user_name|follower|user|total|
+---------------+--------+----+-----+
|     KirkDBorne|  371809|null| null|
| Ronald_vanLoon|  294773|null| null|
|         rwang0|  140016|null| null|
|     sallyeaves|  132261|null| null|
|    ipfconline1|  128732|null| null|
| SpirosMargaris|  126185|null| null|
|DataScienceDojo|  116258|null| null|
|    IainLJBrown|  113804|null| null|
|      mvollmer1|  100366|null| null|
|      JimMarous|   96994|null| null|
+---------------+--------+----+-----+



### Aufgabe der Titel noch fehlt( Filter nach Location der TOP 10 User, Top  Hashtag pro Location)  IN SQL oder Spark

In [None]:
sql="""
SELECT user_location, word, total FROM (
    SELECT user_location, word, total, rank() over(partition by a.user_location order by a.total desc) as rank FROM
    (
        SELECT user_location,word, count(*) as total
        FROM tweets
        LATERAL VIEW explode(tweets.hashtags) tweets as word
        WHERE word not in ("BigData","bigdata","")
        AND tweets.user_location IN
        (SELECT t.user_location
                FROM (
                         SELECT user_location, count(*) as total from tweets
                         WHERE user_location not in ("","null","REMOTE","Earth")
                         GROUP BY user_location
                         ORDER BY total DESC LIMIT 10
                     ) as t
        )
        GROUP BY user_location, word
        ) as a
    ) as b
WHERE rank=1
ORDER BY total DESC LIMIT 10;
"""

df.registerTempTable("tweets")

df_result = spark.sql(sql)
df_result.show()

In [None]:
df3=(df
    .select("user_location")
    .where(~f.col("user_location").isin("","null","REMOTE","Earth"))
    .groupBy("user_location")
    .count()
    .withColumnRenamed("count","location_total")
    .orderBy(f.col("location_total").desc())
    .limit(10)
    )

df3.show()

In [None]:
df4=(df
    .select("user_location","hashtag")
    .withColumn("singletag",f.explode(f.col("hashtag")))
    .groupBy("user_location","singletag")
    .count()
    .withColumnRenamed("count","tags_total")
    )

df4.show()

In [None]:
df5=(df3.alias("a")
    .join(f.broadcast(df4.alias("b")),[df3.user_location==df4.user_location],how="left")
    .select("a.user_location","a.location_total","b.singletag","b.tags_total")
    .withColumn("rank",f.row_number().over(Window.partitionBy("a.user_location").orderBy(f.col("b.tags_total").desc())))
    .filter(f.col("rank")==1)
    .sort(f.col("location_total").desc())
    .limit(100)
    )
df5.show()

## Schreiben der Daten nach Parquet und Delta

In [None]:
writer_parquet=(df
                .write.partitionBy("tweet_language")
                .mode("overwrite")
                .format("parquet")
                .save("s3a://solution/twitter_parquet")
            )



In [None]:
writer_delta=(df
                #.where(f.col("tweet_language").isin("en","de"))
                .write.partitionBy("tweet_language")
                .mode("overwrite")
                .format("delta")
                .save("s3a://solution/twitter_delta")
             )

## Delta History and Time Travel

In [None]:
# Load Delta file in s3 into Delta Table Object
dt = DeltaTable.forPath(spark, "s3a://solution/twitter_delta")
dt.toDF().show(2)


#### Time Travel Aufgabe
1. Excecute write to delta several times and check how the history adds new entries  
2. Load one of the versions and check all the `tweet_language` (via distinct().show())
3. write again delta with activated filter to write only out the `tweet_language` `en` and `de` 
4. confirm in the history log that less files were written out
5. Load the latest version and check that only two entries are available
6. Load older versions and confirm that there are still all data available
7. Revert old version to latest again

In [None]:
# get the metadata for the full history of the table
fullHistoryDF = dt.history()    

# get the metadata for the last operation
lastOperationDF = dt.history(1) 

fullHistoryDF.select("version","readVersion","timestamp","userId","operation","operationParameters","operationMetrics").show()

In [None]:
# load latest delta version
df_timetravel = spark.read.format("delta").load("s3a://solution/twitter_delta")

df_timetravel.select("tweet_language").distinct().show()

In [None]:
#load specific historic version
df_timetravel = spark.read.format("delta").option("versionAsOf", 2).load("s3a://solution/twitter_delta")

df_timetravel.select("tweet_language").distinct().show()

In [None]:
# write old version back as latest
f_timetravel = (spark
                .read.format("delta").option("versionAsOf", 0).load("s3a://solution/twitter_delta")
                .write.partitionBy("tweet_language").mode("overwrite").format("delta").save("s3a://solution/twitter_delta")
               )
                

In [None]:
spark.stop()