In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, unix_timestamp, col
from pyspark.sql.functions import split
from pyspark.sql.types import TimestampType

In [None]:
import pyspark

# Report the PySpark release visible to this kernel.
spark_version = pyspark.__version__
print(spark_version)

In [None]:
import os

# Point Spark at the loopback interface; this runs before the SparkSession is built below.
os.environ.update(SPARK_LOCAL_IP='127.0.0.1')

In [None]:
# Build (or reuse) a local SparkSession that uses every available core, binds the driver
# to loopback, and loads the Delta Lake package together with its SQL extension and catalog.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("StructuredNetworkWordCount")
    .config("spark.driver.bindAddress", "127.0.0.1")
    .config("spark.executor.processTreeMetrics.enabled", "false")
    .config("spark.jars.packages", "io.delta:delta-spark_2.12:3.2.0")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .getOrCreate()
)

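**Before starting the streaming queries below, open a new terminal and run the following command so that Spark has a socket server to connect to (then type lines of text into it):**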

```bash
nc -lk 9999
```

In [None]:
# Streaming DataFrame fed by the text socket at localhost:9999; each received line
# arrives as one row in the `value` column.
socket_reader = spark.readStream.format("socket")
socket_reader = socket_reader.option("host", "localhost").option("port", 9999)
lines = socket_reader.load()

# Break every line on single spaces and emit one row per resulting token.
tokens = split(lines["value"], " ")
words = lines.select(explode(tokens).alias("word"))

# Running count of occurrences per distinct word.
wordCounts = words.groupBy("word").count()

In [None]:
# Start a streaming query that prints the complete word-count table to the console
# on every trigger, checkpointing its progress under ./stream_check_console.
query = (
    wordCounts.writeStream
    .outputMode("complete")
    .format("console")
    .option("checkpointLocation", "./stream_check_console")
    .start()
)

In [None]:
# Start a second streaming query that writes the complete word-count table as a Delta
# table at ./stream.parquet (the path is a Delta directory despite its name),
# checkpointing under ./stream_check.
# NOTE(review): this query opens its own connection to the socket source; a single
# `nc -lk` listener presumably serves one client at a time, so this query may not
# receive data while the console query is connected — confirm.
query2 = (
    wordCounts.writeStream
    .outputMode("complete")
    .format("delta")
    .option("checkpointLocation", "./stream_check")
    .start("./stream.parquet")
)

In [None]:
# Block this cell until any one of the active streaming queries stops or fails.
stream_manager = spark.streams
stream_manager.awaitAnyTermination()

In [None]:
# The streaming sink above writes this directory with format("delta"), so it must be read
# through the Delta reader. A plain parquet read bypasses the _delta_log transaction log and
# also scans data files left behind by superseded complete-mode batches, which yields
# stale or duplicated word counts.
spark.read.format("delta").load("./stream.parquet").show()