# Chapter 8. Structured Streaming

In [None]:
from uuid import uuid1

from pyspark.sql import SparkSession
import pyspark.sql.functions as F

# Maven coordinate of the Kafka connector for Structured Streaming, in the form
# "<group>:<artifact>_<Scala version>:<version>".
# The version after the last ":" must match the *Spark* version that you run
# (not the version of the Kafka brokers), and the "_2.12" suffix must match the
# Scala version that your Spark build uses.
kafka_connector_package = "org.apache.spark:spark-sql-kafka-0-10_2.12:3.4.0"

spark = (SparkSession.builder
  # Download the connector and put it on the classpath when the session starts.
  .config("spark.jars.packages", kafka_connector_package)
  .master("local[4]")
  .appName("StructuredStreaming")
  .getOrCreate())
# Last expression of the cell: shows the session summary in the notebook.
spark

## The Fundamentals of a Structured Streaming Query

For the following streaming query to work, we need a TCP server that listens on `127.0.0.1:61080` and sends lines of text.

We can use `netcat-openbsd` for this. In a terminal, run `nc -lk -s 127.0.0.1 -p 61080` and start typing lines of text, then observe the output in this notebook. It should look something like this:

```
-------------------------------------------
Batch: 1
-------------------------------------------
+----+-----+
|word|count|
+----+-----+
| foo|    1|
+----+-----+
```

To terminate the query, interrupt the Jupyter kernel (menu Kernel -> Interrupt Kernel).

In [None]:
# A uniquely named checkpoint directory for every run of this cell.
checkpoint_dir = f"/tmp/spark-streaming-checkpoints-{uuid1()}"
# To reuse one checkpoint directory across runs instead, assign a fixed path,
# e.g. "./spark-streaming-checkpoints".

# Source: text lines received from the TCP server at 127.0.0.1:61080.
lines = (spark
         .readStream
         .format("socket")
         .option("host", "127.0.0.1")
         .option("port", "61080")
         .load())
# Split the "value" column of each row at every whitespace character and emit
# one row per resulting word.
words = lines.select(F.explode(F.split(F.col("value"), "\\s")).alias("word"))
# Number of occurrences of each word.
counts = words.groupBy("word").count()
# Sink: print the complete table of counts to the console, triggering a
# micro-batch every second.
streaming_query = (counts
                   .writeStream
                   .format("console")
                   .outputMode("complete")
                   .trigger(processingTime="1 second")
                   .option("checkpointLocation", checkpoint_dir)
                   .start())
try:
    # Keep the cell busy until the query terminates or the kernel is interrupted.
    streaming_query.awaitTermination()
finally:
    # Interrupting the kernel only raises KeyboardInterrupt on the Python side;
    # the query itself keeps running in the Spark driver until it is stopped
    # explicitly. The exception still propagates after the query is stopped.
    streaming_query.stop()

## Kafka

In [None]:
# Column layout of the input files: a string `word` and a long `count`.
schema = "`word` string, `count` long"
# Streaming DataFrame over the CSV files (with a header row) found in
# ../data/counts.
counts_sdf = (
    spark.readStream
    .schema(schema)
    .option("header", "true")
    .format("csv")
    .load("../data/counts")
)

In [None]:
# Checkpoint directory with a unique suffix for this run of the cell.
checkpoint_dir = f"/tmp/spark-streaming-checkpoints-{uuid1()}"

# Publish every (word, count) row to the "wordcounts" Kafka topic: the word
# becomes the record key and the count the record value, both cast to string.
streamingQuery = (
    counts_sdf
    .selectExpr(
        "cast(word as string) as key",
        "cast(count as string) as value",
    )
    .writeStream
    .outputMode("update")
    .format("kafka")
    .option("checkpointLocation", checkpoint_dir)
    .option("topic", "wordcounts")
    .option("kafka.bootstrap.servers", "localhost:9093,localhost:9094,localhost:9095")
    .start()
)
# The query is not awaited in this cell. If the counts do not show up in the
# Kafka topic, look for error logs in the terminal where you started the
# notebook.