In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [2]:
# Obtain the notebook's SparkSession (an already-running one is reused),
# pointed at the "local" master and labelled "lesson_16".
spark = (
    SparkSession.builder
    .appName("lesson_16")
    .master("local")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [3]:
# Explicit schema for the JSON payload: only these two nullable fields are read.
json_schema = StructType(
    [
        StructField("product_id", IntegerType(), nullable=True),
        StructField("date", DateType(), nullable=True),
    ]
)

In [4]:
# Batch (static) read of one day's JSON files using the explicit schema.
events_static = (
    spark.read
    .schema(json_schema)
    .json("/datalake/bronze/rd_payload/2021-07-09/")
)

In [5]:
# Keep only the rows belonging to product 17848.
events_static = events_static.filter(col("product_id") == 17848)

In [6]:
# Here display() renders only the DataFrame's schema repr (no rows);
# use .show() to see the data itself.
display(events_static)

DataFrame[product_id: int, date: date]

In [7]:
# Run the filtered read and print the matching rows as a text table.
events_static.show()

+----------+----------+
|product_id|      date|
+----------+----------+
|     17848|2021-07-09|
+----------+----------+



In [8]:
# Streaming counterpart of the static read: watch the directory for JSON files
# and take at most one new file per trigger (maxFilesPerTrigger = 1).
# A Kafka source would be configured with .format("kafka") ... .load() instead.
events_streaming = (
    spark.readStream
    .schema(json_schema)
    .option("maxFilesPerTrigger", 1)
    .json("/datalake/bronze/rd_payload/spark_stream/")
)

In [None]:
# Append mode

In [None]:
# Same product filter as the static example, applied to the streaming DataFrame.
append_df = events_streaming.filter(col("product_id") == 17848)

In [None]:
# Stream the filtered rows into an in-memory table named "event_stream".
# In "append" output mode only newly arrived rows are added to the sink.
(
    append_df.writeStream
    .outputMode("append")
    .format("memory")
    .queryName("event_stream")
    .start()
)

In [None]:
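# %sql   (the %sql cell magic is a Databricks feature)
# select * from event_stream;
# select count(*) from event_stream;
# In a plain PySpark notebook, run the same query with:
#   spark.sql("select * from event_stream").show()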

In [9]:
# Complete mode

In [10]:
# Count events per product per day: window() buckets rows into consecutive
# 1-day intervals of `date`, and every (product_id, window) pair gets a count.
daily_window = window(col("date"), "1 day")
complete_df = events_streaming.groupBy(col("product_id"), daily_window).count()

In [11]:
# Query names must be unique among the active streaming queries of a
# SparkSession. The append-mode demo earlier in this notebook (and any earlier
# run of this cell) uses the same name, so stop such a query first; otherwise
# start() is rejected because "event_stream" is already active.
for running_query in spark.streams.active:
    if running_query.name == "event_stream":
        running_query.stop()

# Publish the aggregation as an in-memory table named "event_stream".
# "complete" output mode rewrites the whole aggregated result on every trigger.
# The handle is kept so the stream can be stopped later with complete_query.stop().
complete_query = (
    complete_df.writeStream
    .format("memory")
    .queryName("event_stream")
    .outputMode("complete")
    .start()
)
complete_query

22/03/08 16:45:23 WARN streaming.StreamingQueryManager: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-f9a57739-71c9-425b-92ea-13c279c6c7cc. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.


<pyspark.sql.streaming.StreamingQuery at 0x7fa95a7b36d0>

                                                                                

In [12]:
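# %sql   (the %sql cell magic is a Databricks feature)
# select * from event_stream;
# select count(*) from event_stream;
# In a plain PySpark notebook, run the same query with:
#   spark.sql("select * from event_stream").show()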