In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [2]:
# In Databricks a ready-made `spark` session already exists (and `%fs ls` can
# be used to browse files), so this cell is only needed for a local run.
spark = (
    SparkSession.builder
    .master("local")
    .appName("lesson_16")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [3]:
# Explicit schema for the JSON payload; both fields are declared nullable.
json_schema = StructType(
    [
        StructField("product_id", IntegerType(), True),
        StructField("date", DateType(), True),
    ]
)

In [4]:
# Batch (non-streaming) read of a single day's JSON files using the fixed schema.
events_static = (
    spark.read
    .schema(json_schema)
    .json("/datalake/bronze/rd_payload/2021-07-09/")
)

In [5]:
# Keep only rows with product_id below 10000; the name is rebound, so the
# following cells operate on the filtered frame.
events_static = events_static.filter(col("product_id") < 10000)

In [6]:
# In this non-Databricks run, display() on a Spark DataFrame only prints its
# column/type summary, not the rows (Databricks renders a table instead).
display(events_static)

DataFrame[product_id: int, date: date]

In [7]:
# Print the first rows of the filtered static frame (show() stops at 20 rows by default).
events_static.show()

[Stage 0:>                                                          (0 + 1) / 1]

+----------+----------+
|product_id|      date|
+----------+----------+
|      4310|2021-07-09|
|      8251|2021-07-09|
|      6870|2021-07-09|
|      2606|2021-07-09|
|      1334|2021-07-09|
|       153|2021-07-09|
|      3867|2021-07-09|
|      3097|2021-07-09|
|      1801|2021-07-09|
|      8286|2021-07-09|
|       202|2021-07-09|
|      1038|2021-07-09|
|      7042|2021-07-09|
|      2025|2021-07-09|
|      6109|2021-07-09|
|      8280|2021-07-09|
|      4724|2021-07-09|
|      1322|2021-07-09|
|      5291|2021-07-09|
|      4317|2021-07-09|
+----------+----------+
only showing top 20 rows



                                                                                

In [8]:
# Number of rows in the static frame after the product_id filter.
events_static.count()

707

In [9]:
# Streaming read of the same JSON layout: the directory is treated as a stream
# and at most one file is consumed per micro-batch (maxFilesPerTrigger=1).
# A Kafka source would be configured with .format("kafka") instead of .json(...).
events_streaming = (
    spark.readStream
    .schema(json_schema)
    .option("maxFilesPerTrigger", 1)
    .json("/datalake/bronze/rd_payload/spark_stream/")
)

In [10]:
# Append mode

In [11]:
# Stateless row filter on the stream: only product_id values below 10000 pass.
append_df = events_streaming.filter(col("product_id") < 10000)

In [12]:
# Start an append-mode streaming query that writes every micro-batch of the
# filtered events into the in-memory table "event_stream_append".
append = (
    append_df.writeStream
    .format("memory")
    .queryName("event_stream_append")
    .outputMode("append")
    .start()
)
# start() returns immediately while micro-batches keep running in the
# background (one input file per trigger). Block until everything currently
# in the source directory has been processed, so the SQL cells that follow
# see the full result instead of whatever happened to be written so far.
append.processAllAvailable()

22/04/03 12:09:06 WARN streaming.StreamingQueryManager: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-cc2fe707-e34a-491e-af32-50d37915e4d2. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
                                                                                

In [13]:
# In Databricks the same queries can be run directly with the %sql magic:
# %sql
# select * from event_stream_append;
# select count(*) from event_stream_append;

In [14]:
# Peek at the rows the append-mode query has written to its in-memory table.
(
    spark
    .sql("select * from event_stream_append")
    .show()
)

+----------+----------+
|product_id|      date|
+----------+----------+
|      4310|2021-07-09|
|      8251|2021-07-09|
|      6870|2021-07-09|
|      2606|2021-07-09|
|      1334|2021-07-09|
|       153|2021-07-09|
|      3867|2021-07-09|
|      3097|2021-07-09|
|      1801|2021-07-09|
|      8286|2021-07-09|
|       202|2021-07-09|
|      1038|2021-07-09|
|      7042|2021-07-09|
|      2025|2021-07-09|
|      6109|2021-07-09|
|      8280|2021-07-09|
|      4724|2021-07-09|
|      1322|2021-07-09|
|      5291|2021-07-09|
|      4317|2021-07-09|
+----------+----------+
only showing top 20 rows



In [15]:
# Row count of the in-memory table fed by the append-mode query.
(
    spark
    .sql("select count(*) from event_stream_append")
    .show()
)

+--------+
|count(1)|
+--------+
|    1385|
+--------+



In [16]:
# Stop the append-mode query; the next cells start a new query under the same name.
append.stop()

In [17]:
# Rebuild the streaming filter with the opposite condition: product_id above 10000.
append_df = events_streaming.filter(col("product_id") > 10000)

In [18]:
# Start a fresh append-mode query for the re-defined filter, again writing
# into the in-memory table "event_stream_append".
append = (
    append_df.writeStream
    .format("memory")
    .queryName("event_stream_append")
    .outputMode("append")
    .start()
)
# start() is asynchronous and the source delivers one file per trigger, so
# wait until all currently available input has been processed; otherwise the
# count queried in the next cell depends on how far the stream has got.
append.processAllAvailable()

22/04/03 12:09:22 WARN streaming.StreamingQueryManager: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-223e3c35-e2d2-4120-91c8-fe629fa3be7a. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.


In [19]:
# Row count of the in-memory table as populated by the restarted query
# (product_id > 10000).
(
    spark
    .sql("select count(*) from event_stream_append")
    .show()
)

+--------+
|count(1)|
+--------+
|    5372|
+--------+



In [20]:
# Stop the second append-mode query before moving on to complete mode.
append.stop()

In [21]:
# Complete mode

In [22]:
# Streaming aggregation: number of events per `date`.
events_by_date = events_streaming.groupBy(col("date"))
complete_df = events_by_date.count()

In [23]:
# Start a complete-mode streaming query: on every trigger the whole aggregated
# result is rewritten into the in-memory table "event_stream_complete".
complete = (
    complete_df.writeStream
    .format("memory")
    .queryName("event_stream_complete")
    .outputMode("complete")
    .start()
)
# start() returns immediately and input arrives one file per trigger; wait
# until all currently available files are aggregated so the query in the
# following cell shows final per-date counts rather than a partial state.
complete.processAllAvailable()

22/04/03 12:09:41 WARN streaming.StreamingQueryManager: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-a40d459c-5639-4d21-b41f-ab7070bbf87d. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.

In [24]:
# In Databricks the same queries can be run directly with the %sql magic:
# %sql
# select * from event_stream_complete;
# select count(*) from event_stream_complete;



In [26]:
# Show the per-date counts currently held in the complete-mode result table.
(
    spark
    .sql("select * from event_stream_complete")
    .show()
)

+----------+-----+
|      date|count|
+----------+-----+
|2021-07-08| 1540|
|2021-07-07| 1881|
|2021-07-09| 3336|
+----------+-----+



In [27]:
# Stop the per-date aggregation; the next cells start another query under the same name.
complete.stop()

In [28]:
# Count events per product_id within fixed 1-day windows over `date`.
# NOTE: window() only defines the grouping buckets; it does not delay
# execution. Tolerance for late data would be configured separately with
# withWatermark(), which is not used here.
daily_window = window(col("date"), '1 day')
complete_df = (
    events_streaming
    .groupBy(col("product_id"), daily_window)
    .count()
)

In [29]:
# Start the windowed aggregation in complete mode (the full result table is
# re-emitted on every trigger, which is what this un-watermarked aggregation
# needs), writing to the in-memory table "event_stream_complete".
complete = (
    complete_df.writeStream
    .format("memory")
    .queryName("event_stream_complete")
    .outputMode("complete")
    .start()
)
# The aggregation is still running in the background right after start();
# block until all currently available input files have been processed so the
# count queried in the next cell is the final one.
complete.processAllAvailable()

22/04/03 12:11:56 WARN streaming.StreamingQueryManager: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-3ff77f34-2055-4d6a-aef4-efab7f803ce6. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
[Stage 27:====>                                                  (17 + 1) / 200]

In [31]:
# Number of (product_id, window) groups in the complete-mode result table.
(
    spark
    .sql("select count(*) from event_stream_complete")
    .show()
)

+--------+
|count(1)|
+--------+
|    6584|
+--------+



In [32]:
# Stop the windowed complete-mode query.
complete.stop()