# Operations on Streaming DataFrames/Datasets Demo

### Demo

In [1]:
import os
import pathlib
import findspark

from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, split, current_timestamp, window
from pyspark.sql.types import StructType, TimestampType

# The Spark distribution is expected two directory levels above the
# notebook's working directory.
prj_dir = pathlib.Path().resolve().parent.parent
# Join the two path components explicitly; os.path.join accepts the Path
# object and returns a plain string.
spark_home = os.path.join(prj_dir, 'spark-3.5.0-bin-hadoop3')
# Point findspark at that distribution before the session is created.
findspark.init(spark_home)

# Reuse the active session if one exists, otherwise start a new one.
spark = (
    SparkSession.builder
    .appName("UserInteractionAnalyzer")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/11/23 16:31:05 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
# Directory (relative to the working directory) that is watched for incoming
# CSV files; the parquet sink also writes beneath it.
staging_dir = "monitoring_data"

In [4]:
# Explicit schema for the CSV files picked up by the stream.
# Column order in each file: userA, userB, timestamp, interaction
userSchema = (
    StructType()
    .add("userA", "integer")
    .add("userB", "integer")
    .add("timestamp", TimestampType())
    .add("interaction", "string")
)

In [5]:
# Streaming DataFrame over the comma-separated CSV files that appear in the
# staging directory, parsed with the explicit schema defined above.
activity = (
    spark.readStream
    .schema(userSchema)
    .option("sep", ",")
    .csv(staging_dir + "/*.csv")
)

In [6]:
# Keep only the rows whose interaction code is 'MT', then project the target
# user id. Filtering happens before the projection so the predicate refers to
# a column that still exists at that point, and the literal uses standard
# single quotes so it is never parsed as an identifier.
user_b = (
    activity
    .where("interaction = 'MT'")
    .select("userB")
)

In [None]:
# Forget terminations recorded by earlier runs of this cell; otherwise
# awaitAnyTermination() below would return (or re-raise) immediately.
spark.streams.resetTerminated()

# Sink 1: persist the filtered user ids as parquet files every 10 seconds.
# The checkpoint directory lets the query resume where it left off.
query = (
    user_b.writeStream
    .trigger(processingTime='10 seconds')
    .format("parquet")
    .option("checkpointLocation", "applicationHistory")
    .option("path", staging_dir + "/out")
    .start()
)

# Sink 2: echo the same micro-batches to the console for inspection.
# No checkpoint location is set, so Spark uses a temporary one.
query2 = (
    user_b.writeStream
    .trigger(processingTime='10 seconds')
    .format("console")
    .start()
)

try:
    # Block until either query stops; a failure in any of them is raised
    # here instead of going unnoticed while only the first one is awaited.
    spark.streams.awaitAnyTermination()
finally:
    # Runs on normal return, on query failure and on a kernel interrupt, so
    # no stream keeps running in the background after the cell ends.
    for running_query in (query, query2):
        if running_query.isActive:
            running_query.stop()

23/11/23 16:37:20 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
23/11/23 16:37:20 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /private/var/folders/g6/vgc6wxj13x95m3zxhrn480540000gn/T/temporary-1438e02c-021e-4952-bcc2-77311d3487d2. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
23/11/23 16:37:20 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


-------------------------------------------
Batch: 0
-------------------------------------------
+------+
| userB|
+------+
|   383|
|  1454|
| 38034|
| 29600|
|    88|
|  4022|
|261475|
| 33476|
|146171|
| 59195|
| 20802|
| 14376|
|    88|
| 50901|
| 31957|
| 16783|
|    88|
|406395|
| 68285|
| 30978|
+------+
only showing top 20 rows

