In [7]:
from delta import *
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [8]:
# Assemble the session builder one setting at a time: application name and
# cluster master first, then executor memory, then the two Delta Lake hooks
# (SQL extension + catalog implementation).
builder = SparkSession.builder.appName("config-streaming")
builder = builder.master("spark://spark-master:7077")
builder = builder.config("spark.executor.memory", "512m")
builder = builder.config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
builder = builder.config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")

# Pass the builder through the Delta helper before creating (or reusing) the session.
builder_with_delta = configure_spark_with_delta_pip(builder)
spark = builder_with_delta.getOrCreate()

# Only log ERROR-level messages from here on.
spark.sparkContext.setLogLevel("ERROR")

In [9]:
# Streaming DataFrame fed by a text socket on localhost:9999.
# `lines.value` (used by the next cell) holds the text of each received line.
socket_reader = spark.readStream.format("socket")
lines = socket_reader.options(host="localhost", port=9999).load()

In [10]:
# Split each incoming line into words, one output row per word.
#
# The second argument of split() is a regular expression. Splitting on r"\s+"
# treats any run of whitespace (several spaces, tabs) as a single separator,
# whereas the literal " " produced an empty-string token for every extra space.
# Leading/trailing whitespace and blank lines can still yield "" tokens, so
# those are filtered out here to keep them from being counted as a word.
words = (lines
         .select(explode(split(lines.value, r"\s+")).alias("word"))
         .where(col("word") != ""))

In [11]:
# Running tally: one row per distinct word, with its occurrence count in a
# column named "count".
wordCounts = words.groupBy(col("word")).count()

In [12]:
 # Launch the streaming query and keep its handle in `query` so a later cell can stop it.
# "complete" output mode writes the whole counts table on every batch; the console
# sink prints each batch to the cell output (top 20 rows only, per the output below).
query = wordCounts.writeStream.start(format="console", outputMode="complete")

                                                                                

-------------------------------------------
Batch: 0
-------------------------------------------
+----+-----+
|word|count|
+----+-----+
+----+-----+



                                                                                

-------------------------------------------
Batch: 1
-------------------------------------------
+------------+-----+
|        word|count|
+------------+-----+
|        Data|    2|
|    overview|    1|
|     models.|    1|
|Fundamentals|    1|
|      stream|    1|
|       solve|    1|
|         you|    1|
|   landscape|    1|
|    systems.|    1|
|replication,|    1|
|         for|    1|
|         Joe|    1|
|  tolerance,|    1|
|    provides|    1|
|        Reis|    1|
|      topics|    1|
|   practices|    1|
|     concise|    1|
| distributed|    1|
|        your|    1|
+------------+-----+
only showing top 20 rows



                                                                                

-------------------------------------------
Batch: 2
-------------------------------------------
+------------+-----+
|        word|count|
+------------+-----+
|   Dynamical|    1|
|        Data|    2|
|     complex|    1|
|    overview|    1|
|     models.|    1|
|     Science|    1|
|Fundamentals|    1|
|      stream|    1|
|      Nathan|    1|
|          by|    1|
|       solve|    2|
|         you|    2|
|   landscape|    1|
|          L.|    1|
|    systems.|    1|
|       apply|    1|
|replication,|    1|
|         for|    1|
|         Joe|    1|
|         how|    1|
+------------+-----+
only showing top 20 rows



In [14]:
# Stop the streaming word-count query started by the writeStream cell above;
# run this before shutting down the Spark session.
query.stop()

In [None]:
# Shut down the SparkSession created at the top of the notebook.
# A fresh session has to be built before any earlier cell can be re-run.
spark.stop()