In [None]:
import os
import pathlib
import findspark

from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, split, current_timestamp, window
from pyspark.sql.types import StructType, TimestampType, StructField, IntegerType, FloatType, StringType

# Locate the bundled Spark distribution two directory levels above the
# current working directory and hand its path to findspark.
prj_dir = pathlib.Path().resolve().parent.parent
# os.fspath gives the plain string form of the Path; the previous
# single-argument os.path.join() call did the same thing implicitly.
spark_home = os.fspath(prj_dir / 'spark-3.5.0-bin-hadoop3')
# NOTE(review): pyspark is already imported at the top of the file, before
# this call runs -- confirm that initialising findspark afterwards is intended.
findspark.init(spark_home)

# Build (or reuse) a local Spark session named "InteractionCount" with
# event logging switched on.
# NOTE(review): "applicationHistory" is a relative event-log directory;
# presumably it has to exist before the session starts -- confirm.
spark = (
    SparkSession.builder
    .appName("InteractionCount")
    .config("spark.eventLog.enabled", "true")
    .config("spark.eventLog.dir", "applicationHistory")
    .master("local[*]")
    .getOrCreate()
)

# Explicit schema for the interaction CSV files: two user identifiers kept
# as strings, the event timestamp, and the interaction label.
interactions_schema = StructType([
    StructField('user_a', StringType()),
    StructField('user_b', StringType()),
    StructField('timestamp', TimestampType()),
    StructField('interaction', StringType()),
])

# Streaming DataFrame over every CSV file matching interactions/*.csv,
# parsed with the explicit schema.
interactions = (
    spark.readStream
    .schema(interactions_schema)
    .csv('interactions/*.csv')
)

# Count interactions per user_a inside sliding event-time windows:
# each window is 60 seconds long and a new one starts every 40 seconds,
# so consecutive windows overlap by 20 seconds.
sliding_window = window("timestamp", '60 seconds', '40 seconds')
windowed_count = interactions.groupBy(sliding_window, "user_a").count()
# NOTE: no watermark is applied. If one is added later it has to be set on
# the event-time column of the input stream, before the aggregation, i.e.
# interactions.withWatermark("timestamp", "1 minute").groupBy(...).
 
# Every 12 seconds, print the complete aggregation result to the console
# (untruncated, up to 10000 rows per batch).
query = (
    windowed_count.writeStream
    .format('console')
    .outputMode('complete')
    .option('truncate', 'false')
    .option('numRows', '10000')
    .trigger(processingTime='12 seconds')
    .start()
)

# Block until the query ends. Keeping the handle lets us stop the query when
# the wait is interrupted (e.g. the notebook cell is cancelled) or fails,
# instead of leaving it running in the background.
try:
    query.awaitTermination()
finally:
    query.stop()



Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/11/24 11:44:27 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/11/24 11:44:30 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /private/var/folders/g6/vgc6wxj13x95m3zxhrn480540000gn/T/temporary-f2dcf605-70f8-41b4-8019-f73088e3bae3. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
23/11/24 11:44:30 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
                                                                                

-------------------------------------------
Batch: 0
-------------------------------------------
+------------------------------------------+------+-----+
|window                                    |user_a|count|
+------------------------------------------+------+-----+
|{2012-07-04 00:56:40, 2012-07-04 00:57:40}|454613|4    |
|{2012-07-06 00:56:00, 2012-07-06 00:57:00}|454613|1    |
|{2012-07-06 00:56:40, 2012-07-06 00:57:40}|454613|1    |
|{2012-07-04 00:56:00, 2012-07-04 00:57:00}|454613|4    |
|{2012-07-05 00:56:40, 2012-07-05 00:57:40}|454613|3    |
|{2012-07-05 00:56:00, 2012-07-05 00:57:00}|454613|3    |
+------------------------------------------+------+-----+

