In [2]:
import threading

# Helper thread that keeps the Spark StreamingContext from blocking Jupyter
        
class StreamingThread(threading.Thread):
    """Thread that drives a streaming context so the caller is not blocked.

    ``run`` starts the wrapped context and then waits on its
    ``awaitTermination`` call; executing that on a separate thread keeps
    the wait off the thread that created this object.
    """

    def __init__(self, ssc):
        """Store the streaming context ``ssc``; nothing is started here."""
        threading.Thread.__init__(self)
        self.ssc = ssc

    def run(self):
        """Thread body: start the context, then block until it terminates."""
        context = self.ssc
        context.start()
        context.awaitTermination()

    def stop(self):
        """Print a notice and ask the wrapped context to stop.

        The context's ``stop`` is called with ``stopSparkContext=False`` and
        ``stopGraceFully=True`` (the keyword spelling is the one the context
        API expects, so it is kept as-is).
        """
        print('----- Stopping... this may take a few seconds -----')
        self.ssc.stop(stopSparkContext=False, stopGraceFully=True)

In [3]:
# Display `sc`. It is not defined in the visible cells; presumably the
# SparkContext pre-created by the notebook kernel -- TODO confirm.
sc

In [4]:
# Display `spark`. It is not defined in the visible cells; presumably the
# SparkSession pre-created by the notebook kernel -- TODO confirm.
spark

In [5]:
# Build a streaming DataFrame backed by the "socket" source at seppe.net:7778.
socketDF = (
    spark.readStream
    .format("socket")
    .option("host", "seppe.net")
    .option("port", 7778)
    .load()
)
# Print the schema of the streaming DataFrame.
socketDF.printSchema()

root
 |-- value: string (nullable = true)



In [8]:
from pyspark.sql.functions import from_json, schema_of_json

In [9]:
def process_row(df, epoch_id):
    """Parse one micro-batch of JSON text rows, display it, and save the raw rows.

    Written to be passed to ``writeStream.foreachBatch``.

    Parameters
    ----------
    df : DataFrame
        The micro-batch. Its ``value`` column is read as JSON text.
    epoch_id : int
        Batch identifier handed in by the caller; it is only printed.

    Notes
    -----
    The JSON schema is inferred from the first row of the batch only, so
    fields that are absent from that row do not become columns.
    Empty batches are skipped after printing ``epoch_id``.
    """
    print(epoch_id)
    # The batch feeds several actions below (first, show, write); caching it
    # avoids recomputing the batch from the source for each of them.
    df.persist()
    try:
        # A single first() both detects an empty batch (it returns None) and
        # fetches the sample row, instead of running count() and then first().
        sample_row = df.first()
        if sample_row is None:
            return
        schema = schema_of_json(sample_row.value)
        df_cols = (
            df.selectExpr('CAST(value AS STRING)')
            .select(from_json('value', schema).alias('temp'))
            .select('temp.*')
        )
        df_cols.show()
        # Append the raw, unparsed rows (single `value` column) as JSON under
        # the "data" directory. NOTE(review): this saves `df`, not the parsed
        # `df_cols` -- confirm that is the intended output.
        df.write.format("json").mode("append").save("data")
    finally:
        # Release the cached batch whether or not processing succeeded.
        df.unpersist()

In [10]:
# Start the streaming query: with a 5-second processing-time trigger, each
# batch of rows from socketDF is handed to process_row.
query = (
    socketDF.writeStream
    .trigger(processingTime='5 seconds')
    .foreachBatch(process_row)
    .start()
)

0
1
+--------+--------+---------------+---------+-------------------+--------------------+--------------------+--------------------+--------------------+----+-----+
|     aid|comments|         domain|frontpage|          posted_at|         source_text|        source_title|               title|                 url|user|votes|
+--------+--------+---------------+---------+-------------------+--------------------+--------------------+--------------------+--------------------+----+-----+
|40185580|       0|newrepublic.com|    false|2024-04-28 02:45:54|Inside the Meltdo...|Inside the Meltdo...|The Meltdown at t...|https://newrepubl...|cwwc|    2|
+--------+--------+---------------+---------+-------------------+--------------------+--------------------+--------------------+--------------------+----+-----+

2
+--------+--------+--------------+---------+-------------------+--------------------+--------------------+--------------------+--------------------+-----------+-----+
|     aid|comments|  

10
+--------+--------+--------------------+---------+-------------------+--------------------+--------------------+--------------------+--------------------+--------------+-----+
|     aid|comments|              domain|frontpage|          posted_at|         source_text|        source_title|               title|                 url|          user|votes|
+--------+--------+--------------------+---------+-------------------+--------------------+--------------------+--------------------+--------------------+--------------+-----+
|40185802|       0|github.com/huangy...|    false|2024-04-28 03:42:13|GitHub - huangyz0...|GitHub - huangyz0...|Show HN: Free Git...|https://github.co...|cheeseisinsane|    2|
+--------+--------+--------------------+---------+-------------------+--------------------+--------------------+--------------------+--------------------+--------------+-----+

11
+--------+--------+-----------+---------+-------------------+--------------------+--------------------+----------

20
+--------+--------+--------------------+---------+-------------------+--------------------+--------------------+--------------------+--------------------+----------+-----+
|     aid|comments|              domain|frontpage|          posted_at|         source_text|        source_title|               title|                 url|      user|votes|
+--------+--------+--------------------+---------+-------------------+--------------------+--------------------+--------------------+--------------------+----------+-----+
|40185993|       0|         cbsnews.com|    false|2024-04-28 04:27:41|South Dakota Gov....|South Dakota Gov....|South Dakota Gove...|https://www.cbsne...|      geox|    1|
|40185995|       0|tomorrowcorporati...|    false|2024-04-28 04:28:20|Tomorrow Corporat...|Tomorrow Corporat...|Retro Game Intern...|https://tomorrowc...|    iamwil|    2|
|40186012|       0|     fastcompany.com|    false|2024-04-28 04:31:50| fastcompany.com\n\n|     fastcompany.com|Weight-loss drugs...|http

29
+--------+--------+-----------+---------+-------------------+--------------------+--------------------+--------------------+--------------------+------+-----+
|     aid|comments|     domain|frontpage|          posted_at|         source_text|        source_title|               title|                 url|  user|votes|
+--------+--------+-----------+---------+-------------------+--------------------+--------------------+--------------------+--------------------+------+-----+
|40186299|       0|youtube.com|     true|2024-04-28 05:44:50|The Better Boardi...|The Better Boardi...|The Better Boardi...|https://www.youtu...|doener|    4|
+--------+--------+-----------+---------+-------------------+--------------------+--------------------+--------------------+--------------------+------+-----+

30
+--------+--------+-----------+---------+-------------------+--------------------+--------------------+--------------------+--------------------+---------+-----+
|     aid|comments|     domain|front

38
+--------+--------+----------------+---------+-------------------+--------------------+--------------------+---------------+--------------------+------+-----+
|     aid|comments|          domain|frontpage|          posted_at|         source_text|        source_title|          title|                 url|  user|votes|
+--------+--------+----------------+---------+-------------------+--------------------+--------------------+---------------+--------------------+------+-----+
|40186497|       0|imgupscaling.com|    false|2024-04-28 06:34:54|Image Upscale - F...|Image Upscale - F...|Image Upscaling|https://imgupscal...|ljhint|    1|
+--------+--------+----------------+---------+-------------------+--------------------+--------------------+---------------+--------------------+------+-----+

39
+--------+--------+-----------+---------+-------------------+--------------------+--------------------+--------------------+--------------------+-------+-----+
|     aid|comments|     domain|frontpa

47
+--------+--------+-----------+---------+-------------------+--------------------+--------------------+--------------------+--------------------+-----------+-----+
|     aid|comments|     domain|frontpage|          posted_at|         source_text|        source_title|               title|                 url|       user|votes|
+--------+--------+-----------+---------+-------------------+--------------------+--------------------+--------------------+--------------------+-----------+-----+
|40186752|       0|postgres.ai|    false|2024-04-28 07:30:40|Common DB schema ...|Common DB schema ...|Common DB schema ...|https://postgres....|thunderbong|    1|
|40186754|       0| emiruz.com|    false|2024-04-28 07:30:50|Metric learning w...|Metric learning w...|Metric Learning w...|https://emiruz.co...|    usgroup|    1|
+--------+--------+-----------+---------+-------------------+--------------------+--------------------+--------------------+--------------------+-----------+-----+

48
+--------

56
+--------+--------+--------------------+---------+-------------------+--------------------+----------------+--------------------+--------------------+---------+-----+
|     aid|comments|              domain|frontpage|          posted_at|         source_text|    source_title|               title|                 url|     user|votes|
+--------+--------+--------------------+---------+-------------------+--------------------+----------------+--------------------+--------------------+---------+-----+
|40187044|       0|medium.com/lb.wri...|    false|2024-04-28 08:46:28|Just a moment...\...|Just a moment...|Vault Dwellers an...|https://medium.co...|  leobear|    1|
|40187056|       0|          xtopics.co|    false|2024-04-28 08:48:34|Grow on X with le...|            NULL|Show HN: XTopics ...|  https://xtopics.co|tropianhs|    1|
+--------+--------+--------------------+---------+-------------------+--------------------+----------------+--------------------+--------------------+---------+--

66
+--------+--------+----------+---------+-------------------+--------------------+--------------------+--------------------+-------------------+------+-----+
|     aid|comments|    domain|frontpage|          posted_at|         source_text|        source_title|               title|                url|  user|votes|
+--------+--------+----------+---------+-------------------+--------------------+--------------------+--------------------+-------------------+------+-----+
|40187139|       0|masto.host|    false|2024-04-28 09:04:33|Masto.host - Full...|Masto.host - Full...|Managed Mastodon ...|https://masto.host/|doener|    1|
+--------+--------+----------+---------+-------------------+--------------------+--------------------+--------------------+-------------------+------+-----+

67
+--------+--------+------------+---------+-------------------+--------------------+--------------------+--------------------+--------------------+------+-----+
|     aid|comments|      domain|frontpage|      

In [11]:
# Stop the streaming query held in `query`.
query.stop()