In [1]:
import os
import pathlib
import re
import sys

import findspark
from operator import add
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.sql import SQLContext, Row
from pyspark.sql.functions import regexp_replace, col, trim, lower, desc

prj_home = pathlib.Path().resolve().parent.parent
spark_home = os.path.join(prj_home / 'spark-3.5.0-bin-hadoop3')
findspark.init(spark_home)

In [2]:
sc = SparkContext(master='', appName='PySparkSqlContext') # machine that spark runs and the number of worker threads
ssc = StreamingContext(sparkContext=sc, batchDuration=20)
sql_context = SQLContext(sparkContext=sc)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/11/17 16:32:54 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
curr_dir = pathlib.Path().resolve()
lines = ssc.textFileStream(os.path.join(curr_dir / 'data'))
words = lines.flatMap(lambda line: re.split(' ', line.lower().strip()))

In [4]:
def analyze(time, rdd):
    print('-.-.-.-.-.-.-.-. %s -.-.-.-.-.-.-.-.' % str(time))
    if not rdd.isEmpty():
        rdd = rdd.filter(lambda word: len(word) > 3)
        rdd.collect()
    
        rdd = rdd.map(lambda word: (word, 1))
        words_df = sql_context.createDataFrame(rdd, ['word', 'count'])
        words_df.show()
    
        df_transformed = words_df.select(
            lower(
                trim(
                    regexp_replace(
                        col('word'),
                        r'[.,\/#$%^&*()-_+=~!"\s]*',
                        ''
                    )
                )
            ).alias('keyword')
        )
    
        top_words = sql_context.createDataFrame(
            df_transformed.groupBy('keyword').count().sort(desc('count')).take(200)
        )
    
        top_words.show(200)
    else:
        print("No stream received!")
    
words.foreachRDD(analyze)

In [5]:
ssc.start()

-.-.-.-.-.-.-.-. 2023-11-17 16:33:00 -.-.-.-.-.-.-.-.
No stream received!
-.-.-.-.-.-.-.-. 2023-11-17 16:33:20 -.-.-.-.-.-.-.-.
No stream received!
-.-.-.-.-.-.-.-. 2023-11-17 16:33:40 -.-.-.-.-.-.-.-.


                                                                                

+---------+-----+
|     word|count|
+---------+-----+
|    seuss|    1|
|   shine.|    1|
|    play.|    1|
|    house|    1|
|     that|    1|
|    cold,|    1|
|    cold,|    1|
|     day.|    1|
|    there|    1|
|     with|    1|
|   sally.|    1|
|   there,|    1|
|     two.|    1|
|    said,|    1|
|     "how|    1|
|     wish|    1|
|something|    1|
|     do!"|    1|
|     cold|    1|
|     play|    1|
+---------+-----+
only showing top 20 rows



                                                                                

+---------+-----+
|  keyword|count|
+---------+-----+
|     like|   88|
|     them|   77|
|     with|   76|
|     will|   58|
|     this|   55|
|     that|   50|
|     then|   50|
|     said|   43|
|     they|   37|
|     have|   37|
|      sir|   37|
|    would|   37|
|     what|   35|
|     fish|   34|
|     some|   31|
|    house|   29|
|    there|   28|
|     here|   27|
|   things|   26|
|   grinch|   26|
|     down|   25|
|     look|   24|
|    could|   23|
|      now|   23|
|    thing|   22|
|    socks|   22|
|      not|   21|
|      say|   21|
|     know|   20|
|      you|   18|
|     your|   18|
|    their|   17|
|     when|   17|
|     knox|   17|
|     good|   17|
|     from|   16|
|    mouse|   16|
|   little|   16|
|   should|   15|
|     tree|   15|
|   mother|   15|
|      box|   15|
|     come|   15|
|     bump|   15|
|      cat|   14|
|    three|   14|
|     play|   14|
|     came|   14|
|christmas|   13|
|     sing|   13|
|      fox|   13|
|     call|   13|
|      hat