In [1]:
from pyspark.sql import SparkSession

spark = (
    SparkSession
    .builder
    .appName("Reading from Sockets")
    .getOrCreate()
)
spark.conf.set("spark.sql.shuffle.partitions", 8)
spark

In [2]:
# Code to Start the batch read

df_raw = spark.read.format("text").load("data/sample_input.txt")

df_raw.printSchema()
df_raw.show()


root
 |-- value: string (nullable = true)

+--------------------+
|               value|
+--------------------+
|simson had a dog ...|
+--------------------+



In [22]:
# Split the line into words 

from pyspark.sql.functions import split, lit, explode, col

df_words = df_raw.withColumn("splited", split(df_raw.value, ' '))
df_words = df_words.withColumn("exploded", explode(col("splited")))
df_words = df_words.drop("value","splited")
df_words.show(truncate=False)

+--------+
|exploded|
+--------+
|simson  |
|had     |
|a       |
|dog     |
|long    |
|back    |
+--------+



In [24]:
# Aggregate the words to generate count 

from pyspark.sql.functions import split, lit, explode, col, count
df_agg = df_words.groupBy("exploded").agg(count(lit(1))).alias("count")
df_agg.show()


+--------+--------+
|exploded|count(1)|
+--------+--------+
|  simson|       1|
|    back|       1|
|     dog|       1|
|     had|       1|
|    long|       1|
|       a|       1|
+--------+--------+



In [None]:
# Code to Start the readStream

In [2]:
# Change the data into readStream
df_stream_raw = spark.readStream.format("socket").option("host", "localhost").option("port", "9999").load()

In [3]:
# Split the line into words 
from pyspark.sql.functions import split, lit, explode, col, count
df_steam_words = df_stream_raw.withColumn("splited", split(df_stream_raw.value, ' '))
df_steam_words = df_steam_words.withColumn("exploded", explode(col("splited")))
df_steam_words = df_steam_words.drop("value","splited")
df_stream_agg = df_steam_words.groupBy("exploded").agg(count(lit(1))).alias("count")

In [None]:
# Aggregate the words to generate count 
# available outputMode are complete, update and append. 
df_stream_agg.writeStream.format("console").outputMode("complete").start().awaitTermination()

In [None]:
# writeStream console Output, mode: complete 

25/03/04 17:18:05 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-e2097240-1b96-45d7-8d3a-a19d2ed22ce5. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
25/03/04 17:18:05 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
-------------------------------------------
Batch: 0
-------------------------------------------
+--------+--------+
|exploded|count(1)|
+--------+--------+
+--------+--------+

[I 2025-03-04 17:19:14.737 ServerApp] Saving file at /spark-streaming/reading_from_sockets.ipynb
-------------------------------------------
Batch: 1
-------------------------------------------
+--------+--------+
|exploded|count(1)|
+--------+--------+
|     cat|       1|
|     owl|       1|
|     dog|       1|
+--------+--------+

-------------------------------------------
Batch: 2
-------------------------------------------
+--------+--------+
|exploded|count(1)|
+--------+--------+
|     cat|       1|
|   mouse|       1|
|     owl|       1|
|     dog|       2|
+--------+--------+



In [None]:
df_stream_agg.writeStream.format("console").outputMode("update").start().awaitTermination()


In [None]:
# writeStream console Output, mode: update 

25/03/04 17:42:39 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-c3c93400-8bd8-49c9-89da-2f6849673b31. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
25/03/04 17:42:39 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
-------------------------------------------
Batch: 0
-------------------------------------------
+--------+--------+
|exploded|count(1)|
+--------+--------+
+--------+--------+

-------------------------------------------
Batch: 1
-------------------------------------------
+--------+--------+
|exploded|count(1)|
+--------+--------+
|     cat|       1|
|     owl|       1|
+--------+--------+